In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Install the dependencies: a Java 8 JDK, the Spark 3.0.0 distribution and findspark.
# Headless OpenJDK 8; -qq keeps apt quiet and stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.0.0 (Hadoop 2.7 build) tarball from the Apache archive into the current directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
# Unpack the tarball in the current directory (later cells expect a spark-3.0.0-bin-hadoop2.7 folder).
!tar xf spark-3.0.0-bin-hadoop2.7.tgz
# findspark is used in a later cell to locate the Spark installation before importing pyspark.
!pip install -q findspark

In [None]:
# Set the environment variables needed to run PySpark in the hosted notebook environment.
# NOTE(review): the /content/... path suggests Google Colab — adjust both paths when running elsewhere.
import os
# Home directory of the Java 8 JDK (assumed to be where the openjdk-8 package was installed — confirm).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Absolute path of the Spark distribution directory.
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop2.7"

In [None]:
# Start a local Spark session to verify the installation.
import os
import findspark
# Prefer the SPARK_HOME environment variable so findspark uses the same (absolute)
# installation path configured for the notebook. Passing only the relative
# directory name works solely when the kernel's working directory is the download
# directory, and findspark.init() would then overwrite SPARK_HOME with that
# relative path. The relative name is kept as a fallback for when SPARK_HOME is unset.
findspark.init(os.environ.get("SPARK_HOME") or 'spark-3.0.0-bin-hadoop2.7')
from pyspark.sql import SparkSession
# 'local[*]' runs Spark locally using all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
# Two separate product catalogs that share the same (id, name) layout.
catalog_columns = ['id', 'name']

catalog_product1 = spark.createDataFrame(
    [
        (100, 'furniture'),
        (101, 'electrical goods'),
        (102, 'construction and repair'),
    ],
    catalog_columns,
)

catalog_product2 = spark.createDataFrame(
    [
        (200, 'clothes'),
        (201, 'dishes'),
        (202, 'car goods'),
    ],
    catalog_columns,
)

In [None]:
# Build a single product catalog by stacking the two datasets, sorted by id (ascending).
catalog_union_sorted = catalog_product1.union(catalog_product2).sort('id')
catalog_union_sorted.show()

# Rename the key column to 'id_product' so it matches the key used by the sales table.
catalog_product_full = catalog_union_sorted.withColumnRenamed('id', 'id_product')
catalog_product_full.show()

+---+--------------------+
| id|                name|
+---+--------------------+
|100|           furniture|
|101|    electrical goods|
|102|construction and ...|
|200|             clothes|
|201|              dishes|
|202|           car goods|
+---+--------------------+

+----------+--------------------+
|id_product|                name|
+----------+--------------------+
|       100|           furniture|
|       101|    electrical goods|
|       102|construction and ...|
|       200|             clothes|
|       201|              dishes|
|       202|           car goods|
+----------+--------------------+



In [None]:
# Create the sales dataset: one tuple per sale in the order (id, id_product, region, amount).
sales_columns = ['id', 'id_product', 'region', 'amount']
sales_rows = [
    (1, 100, 'AAA', 150),
    (2, 100, 'BBB', 250),
    (3, 101, 'AAA', 50),
    (4, 102, 'DDD', 100),
    (5, 200, 'AAA', 500),
    (6, 201, 'DDD', 750),
    (7, 201, 'BBB', 550),
    (8, 100, 'CCC', 350),
    # NOTE(review): 202102 is not present in either product catalog — presumably a
    # deliberate bad key to demonstrate unmatched rows; confirm it is not a typo.
    (9, 202102, 'CCC', 400),
    (10, 101, 'AAA', 650),
]
sales = spark.createDataFrame(sales_rows, sales_columns)
sales.show()

+---+----------+------+------+
| id|id_product|region|amount|
+---+----------+------+------+
|  1|       100|   AAA|   150|
|  2|       100|   BBB|   250|
|  3|       101|   AAA|    50|
|  4|       102|   DDD|   100|
|  5|       200|   AAA|   500|
|  6|       201|   DDD|   750|
|  7|       201|   BBB|   550|
|  8|       100|   CCC|   350|
|  9|    202102|   CCC|   400|
| 10|       101|   AAA|   650|
+---+----------+------+------+



In [None]:
# Pull the product-group names into the sales table.
# A left join keeps every sale; a sale whose id_product is absent from the catalog gets a null name.
sales_new = (
    sales
    .join(catalog_product_full, on='id_product', how='left')
    .sort('id')
    .withColumnRenamed('name', 'name_product')
)
# Keep only the columns needed for the analysis (the join key is dropped).
sales_final = sales_new.select('id', 'region', 'name_product', 'amount')
sales_final.show()

+---+------+--------------------+------+
| id|region|        name_product|amount|
+---+------+--------------------+------+
|  1|   AAA|           furniture|   150|
|  2|   BBB|           furniture|   250|
|  3|   AAA|    electrical goods|    50|
|  4|   DDD|construction and ...|   100|
|  5|   AAA|             clothes|   500|
|  6|   DDD|              dishes|   750|
|  7|   BBB|              dishes|   550|
|  8|   CCC|           furniture|   350|
|  9|   CCC|                null|   400|
| 10|   AAA|    electrical goods|   650|
+---+------+--------------------+------+



In [None]:
# Remove rows that contain unfilled (null) values in any column.
sales_final = sales_final.dropna()
sales_final.show()

+---+------+--------------------+------+
| id|region|        name_product|amount|
+---+------+--------------------+------+
|  1|   AAA|           furniture|   150|
|  2|   BBB|           furniture|   250|
|  3|   AAA|    electrical goods|    50|
|  4|   DDD|construction and ...|   100|
|  5|   AAA|             clothes|   500|
|  6|   DDD|              dishes|   750|
|  7|   BBB|              dishes|   550|
|  8|   CCC|           furniture|   350|
| 10|   AAA|    electrical goods|   650|
+---+------+--------------------+------+



In [None]:
# Add the computed columns 'percent' and 'amount_percent'.
from pyspark.sql import functions as F

amount_col = F.col('amount')

# Percent tier by sale amount:
#   500 < amount < 1000  -> 15
#   250 < amount <= 500  -> 10
#   anything else        -> 5
# NOTE(review): amounts of 1000 or more also fall into the default 5 tier — confirm this is intended.
percent_by_amount = (
    F.when((amount_col > 500) & (amount_col < 1000), 15)
    .when((amount_col > 250) & (amount_col <= 500), 10)
    .otherwise(5)
)
sales_final = sales_final.withColumn('percent', percent_by_amount)

# amount_percent is the given percent of the amount (amount / 100 * percent).
sales_final = sales_final.withColumn('amount_percent', F.col('amount') / 100 * F.col('percent'))
sales_final.show()

+---+------+--------------------+------+-------+--------------+
| id|region|        name_product|amount|percent|amount_percent|
+---+------+--------------------+------+-------+--------------+
|  1|   AAA|           furniture|   150|      5|           7.5|
|  2|   BBB|           furniture|   250|      5|          12.5|
|  3|   AAA|    electrical goods|    50|      5|           2.5|
|  4|   DDD|construction and ...|   100|      5|           5.0|
|  5|   AAA|             clothes|   500|     10|          50.0|
|  6|   DDD|              dishes|   750|     15|         112.5|
|  7|   BBB|              dishes|   550|     15|          82.5|
|  8|   CCC|           furniture|   350|     10|          35.0|
| 10|   AAA|    electrical goods|   650|     15|          97.5|
+---+------+--------------------+------+-------+--------------+



In [None]:
# Replace the region codes in the 'region' column with readable names.
# The replacement is restricted to 'region' via `subset`; without it na.replace
# would also rewrite any matching value in the other string column (name_product).
region_names = {'AAA': 'north', 'BBB': 'south', 'CCC': 'east', 'DDD': 'west'}
sales_final = sales_final.na.replace(region_names, subset=['region'])
sales_final.show()

+---+------+--------------------+------+-------+--------------+
| id|region|        name_product|amount|percent|amount_percent|
+---+------+--------------------+------+-------+--------------+
|  1| north|           furniture|   150|      5|           7.5|
|  2| south|           furniture|   250|      5|          12.5|
|  3| north|    electrical goods|    50|      5|           2.5|
|  4|  west|construction and ...|   100|      5|           5.0|
|  5| north|             clothes|   500|     10|          50.0|
|  6|  west|              dishes|   750|     15|         112.5|
|  7| south|              dishes|   550|     15|          82.5|
|  8|  east|           furniture|   350|     10|          35.0|
| 10| north|    electrical goods|   650|     15|          97.5|
+---+------+--------------------+------+-------+--------------+



In [None]:
# Inspect the column names and Spark data types of the final sales table.
sales_final.dtypes


[('id', 'bigint'),
 ('region', 'string'),
 ('name_product', 'string'),
 ('amount', 'bigint'),
 ('percent', 'int'),
 ('amount_percent', 'double')]

In [None]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
sales_final.describe().show()

+-------+-----------------+------+------------+------------------+-----------------+------------------+
|summary|               id|region|name_product|            amount|          percent|    amount_percent|
+-------+-----------------+------+------------+------------------+-----------------+------------------+
|  count|                9|     9|           9|                 9|                9|                 9|
|   mean|5.111111111111111|  null|        null|372.22222222222223|9.444444444444445|              45.0|
| stddev|2.934469476943168|  null|        null|252.62510652040197|4.639803635691685|42.884291996021105|
|    min|                1|  east|     clothes|                50|                5|               2.5|
|    max|               10|  west|   furniture|               750|               15|             112.5|
+-------+-----------------+------+------------+------------------+-----------------+------------------+



In [None]:
# Group the data by product name and compute the total, count, minimum and maximum of amount.
amount_stats = [
    F.sum('amount').alias('sum_amount'),
    F.count('amount').alias('count_amount'),
    F.min('amount').alias('min_amount'),
    F.max('amount').alias('max_amount'),
]
sales_final.groupBy('name_product').agg(*amount_stats).show()

+--------------------+----------+------------+----------+----------+
|        name_product|sum_amount|count_amount|min_amount|max_amount|
+--------------------+----------+------------+----------+----------+
|              dishes|      1300|           2|       550|       750|
|    electrical goods|       700|           2|        50|       650|
|             clothes|       500|           1|       500|       500|
|           furniture|       750|           3|       150|       350|
|construction and ...|       100|           1|       100|       100|
+--------------------+----------+------------+----------+----------+



In [18]:
# Build a pivot table: one row per region, one column per product name, summed amount in the cells.
region_groups = sales_final.groupBy('region')
region_groups.pivot('name_product').sum('amount').show()

+------+-------+-----------------------+------+----------------+---------+
|region|clothes|construction and repair|dishes|electrical goods|furniture|
+------+-------+-----------------------+------+----------------+---------+
|  west|   null|                    100|   750|            null|     null|
| north|    500|                   null|  null|             700|      150|
|  east|   null|                   null|  null|            null|      350|
| south|   null|                   null|   550|            null|      250|
+------+-------+-----------------------+------+----------------+---------+

