In [1]:
%%capture
%%bash
pip install pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist,row_number,col,avg,sum,min,max

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists:
# local master ('local[*]'), application name "SparkTest".
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("SparkTest")
    .getOrCreate()
)

In [4]:
# Directory containing the input CSV files; the table definition below reads
# every `*.csv` found directly under it.
# NOTE(review): absolute path, looks environment-specific - adjust when running elsewhere.
file_path = '/content/files'

In [6]:
# Register the CSV files under `file_path` as the table `sales`.
# Options: the first line of each file is a header, column types are inferred
# from the data, and fields are separated by ';'.
# `if not exists` makes the cell safe to re-run: without it a second execution
# in the same session fails because the table `sales` already exists.
# If `file_path` changes, drop the table first so the new location is picked up.
spark.sql(
    f"create table if not exists sales using csv location '{file_path}/*.csv' "
    "options(header 'true', inferSchema 'true', sep ';')"
)

DataFrame[]

In [7]:
# Row count per source file: input_file_name() reports the file each row was
# read from, and the rows are grouped by that value.
df = spark.sql(
    "select input_file_name() as filename, count(*) from sales group by filename"
)

In [8]:
# Print the per-file row counts computed above.
# NOTE(review): with the default truncation the file names are cut to 20
# characters and all look alike in the output; `df.show(truncate=False)` would
# print the full paths.
df.show()

+--------------------+--------+
|            filename|count(1)|
+--------------------+--------+
|file:///content/f...|       9|
|file:///content/f...|       9|
|file:///content/f...|       9|
+--------------------+--------+



In [9]:
# Keep only the three columns used by the window-function examples below.
df_sales = spark.sql(
    "select region_id, manager_id, val from sales"
)

In [10]:
# Print up to 50 rows instead of show()'s default of 20, so the whole sales
# table is visible at once.
df_sales.show(50)

+---------+----------+---+
|region_id|manager_id|val|
+---------+----------+---+
|       r1|        m6|  5|
|       r4|        m3|  3|
|       r1|        m7| 10|
|       r4|        m3|  2|
|       r2|        m2|  1|
|       r2|        m6|  5|
|       r1|        m6|  6|
|       r4|        m2|  2|
|       r3|        m1|  2|
|       r4|        m4|  9|
|       r1|        m4|  3|
|       r2|        m4|  3|
|       r2|        m5|  1|
|       r5|        m7|  4|
|       r3|        m5| 10|
|       r2|        m3|  4|
|       r5|        m5|  8|
|       r2|        m7|  4|
|       r5|        m3|  2|
|       r1|        m1|  4|
|       r2|        m7|  4|
|       r1|        m1|  5|
|       r1|        m2|  5|
|       r5|        m4|  6|
|       r1|        m3|  9|
|       r1|        m4|  8|
|       r1|        m7|  7|
+---------+----------+---+



In [11]:
# Window over a whole region, with no ordering: aggregate functions evaluated
# over it see every row of the region.
w_r = Window.partitionBy("region_id")

# Same partitioning, ordered by manager; used for the ranking-style functions.
# Rows that share a manager_id are ties under this ordering, so their relative
# order is not fixed by the spec.
w_rm = w_r.orderBy("manager_id")

In [12]:
# Add window-function columns to every sales row and print the result:
#   row, cume_dist     - position within the region when ordered by manager (w_rm)
#   avg, sum, min, max - statistics of `val` over the whole region (w_r)
(
    df_sales
    .withColumn("row", row_number().over(w_rm))
    .withColumn("cume_dist", cume_dist().over(w_rm))
    .withColumn("avg", avg("val").over(w_r))
    .withColumn("sum", sum("val").over(w_r))
    .withColumn("min", min("val").over(w_r))
    .withColumn("max", max("val").over(w_r))
    .show()
)

+---------+----------+---+---+-------------------+-----------------+---+---+---+
|region_id|manager_id|val|row|          cume_dist|              avg|sum|min|max|
+---------+----------+---+---+-------------------+-----------------+---+---+---+
|       r1|        m1|  4|  1|                0.2|              6.2| 62|  3| 10|
|       r1|        m1|  5|  2|                0.2|              6.2| 62|  3| 10|
|       r1|        m2|  5|  3|                0.3|              6.2| 62|  3| 10|
|       r1|        m3|  9|  4|                0.4|              6.2| 62|  3| 10|
|       r1|        m4|  3|  5|                0.6|              6.2| 62|  3| 10|
|       r1|        m4|  8|  6|                0.6|              6.2| 62|  3| 10|
|       r1|        m6|  5|  7|                0.8|              6.2| 62|  3| 10|
|       r1|        m6|  6|  8|                0.8|              6.2| 62|  3| 10|
|       r1|        m7| 10|  9|                1.0|              6.2| 62|  3| 10|
|       r1|        m7|  7| 1