In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Install the dependencies
# Java 8 (headless JDK); -qq keeps apt quiet and stdout is discarded
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.0.0 / Hadoop 2.7 binary distribution from the Apache archive
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
# Unpack the archive into the current working directory
!tar xf spark-3.0.0-bin-hadoop2.7.tgz
# findspark is imported by a later cell to locate the Spark installation
!pip install -q findspark

In [None]:
# Set the environment variables for running PySpark in the Colab (Colaboratory) environment
import os
# JDK location; presumably where the openjdk-8 package installs on this image — verify
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Absolute path of the unpacked Spark distribution
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop2.7"

In [None]:
# Run the local session to test the installation
import findspark
# Called without an argument, findspark takes the Spark location from the
# SPARK_HOME environment variable, so the absolute path set there is used
# instead of a path that depends on the current working directory.
findspark.init()
from pyspark.sql import SparkSession
# local[*] runs Spark in-process using all available cores
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
from pyspark.sql.types import *

In [None]:
# One StructField per CSV column: column name, Spark data type, nullable flag
idColumn = StructField(name='id', dataType=IntegerType(), nullable=True)
regionColumn = StructField(name='region', dataType=StringType(), nullable=True)
managerColumn = StructField(name='manager', dataType=StringType(), nullable=True)
productColumn = StructField(name='product', dataType=StringType(), nullable=True)
amountColumn = StructField(name='amount', dataType=IntegerType(), nullable=True)

In [None]:
# Collect the field definitions in the order the columns should appear
columnList = [
    idColumn,
    regionColumn,
    managerColumn,
    productColumn,
    amountColumn,
]

In [None]:
# Build the DataFrame schema from the list of fields
salesDfSchema = StructType(fields=columnList)

In [None]:
# Display the schema object to check its fields
salesDfSchema

StructType(List(StructField(id,IntegerType,true),StructField(region,StringType,true),StructField(manager,StringType,true),StructField(product,StringType,true),StructField(amount,IntegerType,true)))

In [None]:
# Read the semicolon-separated sales file with a header row, applying the explicit schema.
# NOTE(review): the path is relative and presumably requires Google Drive to be
# mounted under the working directory — confirm before running on a fresh kernel.
salesDf = (
    spark.read
    .schema(salesDfSchema)
    .option('header', True)
    .option('sep', ";")
    .csv('drive/My Drive/Colab Notebooks/demo_sales.csv')
)

In [49]:
# Preview the loaded rows
salesDf.show()

+---+------+-------+-------+------+
| id|region|manager|product|amount|
+---+------+-------+-------+------+
|  1|   AAA| Ivanov|     a1|   100|
|  2|   BBB|Sidorov|     a1|   150|
|  3|   DDD| Petrov|     b2|   250|
|  4|   BBB|Sidorov|     a1|   350|
|  5|   DDD| Petrov|     b2|   250|
|  6|   FFF| Ivanov|     a3|   100|
|  7|   BBB|Sidorov|     a3|   150|
|  8|   DDD| Petrov|     b2|   250|
|  9|   BBB|Sidorov|     a1|   350|
| 10|   DDD| Petrov|     b2|   250|
+---+------+-------+-------+------+



In [50]:
# Print the column names, types and nullability of the loaded DataFrame
salesDf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)



In [51]:
# Import the functions module under an alias instead of importing `round` by
# name, which would shadow Python's built-in round() for the rest of the notebook.
from pyspark.sql import functions as F

# Add 2.5% of `amount`, rounded to one decimal place, as `percent_amount`
salesDf = salesDf.withColumn('percent_amount', F.round(salesDf.amount / 100 * 2.5, 1))

In [52]:
# Show the first 5 rows, including the new percent_amount column
salesDf.show(5)

+---+------+-------+-------+------+--------------+
| id|region|manager|product|amount|percent_amount|
+---+------+-------+-------+------+--------------+
|  1|   AAA| Ivanov|     a1|   100|           2.5|
|  2|   BBB|Sidorov|     a1|   150|           3.8|
|  3|   DDD| Petrov|     b2|   250|           6.3|
|  4|   BBB|Sidorov|     a1|   350|           8.8|
|  5|   DDD| Petrov|     b2|   250|           6.3|
+---+------+-------+-------+------+--------------+
only showing top 5 rows



In [53]:
# Projection holding only the row id and the computed percentage
salesDfPercent_amount = salesDf.select(['id', 'percent_amount'])

In [54]:
# Show the first 5 rows of the two-column projection
salesDfPercent_amount.show(5)

+---+--------------+
| id|percent_amount|
+---+--------------+
|  1|           2.5|
|  2|           3.8|
|  3|           6.3|
|  4|           8.8|
|  5|           6.3|
+---+--------------+
only showing top 5 rows



In [None]:
# Keep only the rows whose manager is 'Petrov'
salesDfFilter = salesDf.where(salesDf['manager'] == 'Petrov')

In [55]:
# Show the filtered rows, then the schema of the filtered DataFrame
salesDfFilter.show()
salesDfFilter.printSchema()

+---+------+-------+-------+------+--------------+
| id|region|manager|product|amount|percent_amount|
+---+------+-------+-------+------+--------------+
|  3|   DDD| Petrov|     b2|   250|           6.3|
|  5|   DDD| Petrov|     b2|   250|           6.3|
|  8|   DDD| Petrov|     b2|   250|           6.3|
| 10|   DDD| Petrov|     b2|   250|           6.3|
+---+------+-------+-------+------+--------------+

root
 |-- id: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- percent_amount: double (nullable = true)



In [56]:
# Remove the computed column again. Dropping by name is a no-op when the column
# is already absent, so this cell can be re-run safely; the attribute form
# `salesDf.percent_amount` fails on a second run because the column no longer exists.
salesDf = salesDf.drop('percent_amount')

In [57]:
# Show the DataFrame after dropping percent_amount
salesDf.show()

+---+------+-------+-------+------+
| id|region|manager|product|amount|
+---+------+-------+-------+------+
|  1|   AAA| Ivanov|     a1|   100|
|  2|   BBB|Sidorov|     a1|   150|
|  3|   DDD| Petrov|     b2|   250|
|  4|   BBB|Sidorov|     a1|   350|
|  5|   DDD| Petrov|     b2|   250|
|  6|   FFF| Ivanov|     a3|   100|
|  7|   BBB|Sidorov|     a3|   150|
|  8|   DDD| Petrov|     b2|   250|
|  9|   BBB|Sidorov|     a1|   350|
| 10|   DDD| Petrov|     b2|   250|
+---+------+-------+-------+------+



In [63]:
# from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def fn_percent_amount(val):
  """Return 2.5 percent of ``val``.

  Args:
    val: Numeric amount, or None (the value may be missing).

  Returns:
    ``val / 100 * 2.5`` as a float, or None when ``val`` is None.
  """
  # Pass missing values through instead of raising TypeError on the division.
  if val is None:
    return None
  return val/100*2.5

# Register the Python function as a Spark UDF that returns a double. The
# function is passed directly (no lambda wrapper) so the UDF carries the
# function's own name rather than `<lambda>`.
fn_percent_amount_udf = udf(fn_percent_amount, DoubleType())

In [64]:
# Apply the UDF to `amount` and keep the result in a separate DataFrame
salesDf_new = salesDf.withColumn(
    'percent_amount',
    fn_percent_amount_udf(salesDf['amount']),
)

In [65]:
# Print the schema of the DataFrame that carries the UDF-computed column
salesDf_new.printSchema()

root
 |-- id: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- manager: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- percent_amount: double (nullable = true)



In [66]:
# Show the rows with the UDF result (no rounding is applied here)
salesDf_new.show()

+---+------+-------+-------+------+--------------+
| id|region|manager|product|amount|percent_amount|
+---+------+-------+-------+------+--------------+
|  1|   AAA| Ivanov|     a1|   100|           2.5|
|  2|   BBB|Sidorov|     a1|   150|          3.75|
|  3|   DDD| Petrov|     b2|   250|          6.25|
|  4|   BBB|Sidorov|     a1|   350|          8.75|
|  5|   DDD| Petrov|     b2|   250|          6.25|
|  6|   FFF| Ivanov|     a3|   100|           2.5|
|  7|   BBB|Sidorov|     a3|   150|          3.75|
|  8|   DDD| Petrov|     b2|   250|          6.25|
|  9|   BBB|Sidorov|     a1|   350|          8.75|
| 10|   DDD| Petrov|     b2|   250|          6.25|
+---+------+-------+-------+------+--------------+



In [69]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for the two numeric columns
salesDf_new.select('amount','percent_amount').summary().show()

+-------+-----------------+------------------+
|summary|           amount|    percent_amount|
+-------+-----------------+------------------+
|  count|               10|                10|
|   mean|            220.0|               5.5|
| stddev|91.89365834726814|2.2973414586817036|
|    min|              100|               2.5|
|    25%|              150|              3.75|
|    50%|              250|              6.25|
|    75%|              250|              6.25|
|    max|              350|              8.75|
+-------+-----------------+------------------+



In [73]:
# Sort by region, manager and product, all ascending.
# NOTE(review): sortWithinPartitions orders rows inside each partition only; the
# output looks globally sorted presumably because this small dataset sits in a
# single partition — use orderBy if a global sort is intended.
salesDf_new.sortWithinPartitions('region','manager','product',ascending=[True,True,True]).show()

+---+------+-------+-------+------+--------------+
| id|region|manager|product|amount|percent_amount|
+---+------+-------+-------+------+--------------+
|  1|   AAA| Ivanov|     a1|   100|           2.5|
|  2|   BBB|Sidorov|     a1|   150|          3.75|
|  4|   BBB|Sidorov|     a1|   350|          8.75|
|  9|   BBB|Sidorov|     a1|   350|          8.75|
|  7|   BBB|Sidorov|     a3|   150|          3.75|
|  3|   DDD| Petrov|     b2|   250|          6.25|
|  5|   DDD| Petrov|     b2|   250|          6.25|
|  8|   DDD| Petrov|     b2|   250|          6.25|
| 10|   DDD| Petrov|     b2|   250|          6.25|
|  6|   FFF| Ivanov|     a3|   100|           2.5|
+---+------+-------+-------+------+--------------+



In [79]:
# Count how many different regions occur in the data
salesDf_new.select('region').distinct().count()

4

In [83]:
# List the frequently occurring `region` values.
# NOTE(review): freqItems is documented as an approximate, exploratory helper
# whose result may contain false positives — do not rely on it for exact counts.
salesDf_new.select('region').freqItems(cols=['region']).show()

+--------------------+
|    region_freqItems|
+--------------------+
|[FFF, DDD, BBB, AAA]|
+--------------------+



In [87]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Number the rows inside each region from the largest amount down. `id` is added
# as a tie-breaker so rows with equal amounts always get the same row number.
win = Window.partitionBy('region').orderBy(col('amount').desc(), col('id'))
# withColumn already names the new column, so no extra alias is needed.
salesDf_new = salesDf_new.withColumn('rank', row_number().over(win))
salesDf_new.show()

+---+------+-------+-------+------+--------------+----+
| id|region|manager|product|amount|percent_amount|rank|
+---+------+-------+-------+------+--------------+----+
|  4|   BBB|Sidorov|     a1|   350|          8.75|   1|
|  9|   BBB|Sidorov|     a1|   350|          8.75|   2|
|  2|   BBB|Sidorov|     a1|   150|          3.75|   3|
|  7|   BBB|Sidorov|     a3|   150|          3.75|   4|
|  3|   DDD| Petrov|     b2|   250|          6.25|   1|
|  5|   DDD| Petrov|     b2|   250|          6.25|   2|
|  8|   DDD| Petrov|     b2|   250|          6.25|   3|
| 10|   DDD| Petrov|     b2|   250|          6.25|   4|
|  6|   FFF| Ivanov|     a3|   100|           2.5|   1|
|  1|   AAA| Ivanov|     a1|   100|           2.5|   1|
+---+------+-------+-------+------+--------------+----+

