In [167]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window

# Build (or reuse) a Spark session running on the local master, and keep a
# handle on its SparkContext.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext

In [168]:
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DateType, DoubleType

path = "/home/jovyan/work/Table.csv"

# Column name -> Spark type for the semicolon-delimited input file.
# Price is read as a string because the CSV uses a comma as the decimal
# separator; it is converted to a double after loading.
_schema_fields = [
    ("Region", StringType()),
    ("Product", StringType()),
    ("Date", DateType()),
    ("Quantity", IntegerType()),
    ("Price", StringType()),
]
customSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _schema_fields]
)

# Swap the decimal comma for a dot and cast to double, replacing Price in place.
price_as_double = F.regexp_replace('Price', ',', '.').cast(DoubleType())

df_data = (
    spark.read
    .schema(customSchema)
    .options(delimiter=";", header=True)
    .csv(path)
    .withColumn('Price', price_as_double)
)


In [175]:
# Year-to-date window: one partition per Region / Product / calendar year,
# rows ordered by Date. Including the year in the partition key makes a
# running aggregate over this window restart at each new year instead of
# accumulating across the whole history.
w = Window.partitionBy("Region", "Product", F.year("Date")).orderBy("Date")

In [179]:
# Cumulative sum of Quantity over the window `w`, shown as an extra column.
quantity_running_total = F.sum('Quantity').over(w)
df_data.withColumn("Quantity YTD", quantity_running_total).show()

+------+--------+----------+--------+-----------+------------+
|Region| Product|      Date|Quantity|      Price|Quantity YTD|
+------+--------+----------+--------+-----------+------------+
|  East|   Books|2024-03-02|      18| 555.315656|          18|
|  East|   Books|2024-03-03|      32|131.2550527|          50|
|  East|Computer|2024-03-01|      80|362.5919681|          80|
|  East|Computer|2024-03-02|      37|584.9247303|         117|
|  East|Computer|2024-03-03|      21|72.19181479|         138|
|  East|   Music|2024-03-01|      94|663.0665039|          94|
|  East|   Music|2024-03-02|      73|890.8353521|         167|
|  East|   Music|2024-03-03|      91|542.7143176|         258|
| North|   Books|2024-03-02|      27|824.2629527|          27|
| North|   Books|2024-03-03|      54|954.3096495|          81|
| North|Computer|2024-03-01|      65|556.9622863|          65|
| North|Computer|2024-03-02|      97|39.36502872|         162|
| North|Computer|2024-03-03|      97|892.2455727|      

In [185]:
# Month-to-date window: one partition per Region / Product / calendar month,
# rows ordered by Date. The year is part of the partition key so that the
# same month of different years is not accumulated together.
# NOTE: this rebinds `w`; any cell that uses `w` afterwards gets this window.
w = Window.partitionBy(
    "Region", "Product", F.year("Date"), F.month("Date")
).orderBy("Date")

# Running sums of Quantity and Price over the month-to-date window.
MTD_columns = {
    "Quantity_MTD": F.sum('Quantity').over(w),
    "Price_MTD": F.sum('Price').over(w),
}

df_data.withColumns(MTD_columns).show()



+------+--------+----------+--------+-----------+------------+-------------+
|Region| Product|      Date|Quantity|      Price|Quantity_MTD|    Price_MTD|
+------+--------+----------+--------+-----------+------------+-------------+
|  East|   Books|2024-03-02|      18| 555.315656|          18|   555.315656|
|  East|   Books|2024-03-03|      32|131.2550527|          50|  686.5707087|
|  East|Computer|2024-03-01|      80|362.5919681|          80|  362.5919681|
|  East|Computer|2024-03-02|      37|584.9247303|         117|  947.5166984|
|  East|Computer|2024-03-03|      21|72.19181479|         138|1019.70851319|
|  East|   Music|2024-03-01|      94|663.0665039|          94|  663.0665039|
|  East|   Music|2024-03-02|      73|890.8353521|         167|  1553.901856|
|  East|   Music|2024-03-03|      91|542.7143176|         258| 2096.6161736|
| North|   Books|2024-03-02|      27|824.2629527|          27|  824.2629527|
| North|   Books|2024-03-03|      54|954.3096495|          81| 1778.5726022|