In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import to_timestamp
from pyspark.sql.window import Window

def rollingAverage3h(file_path):
    """Return a trailing 3-row average of the onshore wind column.

    Reads the CSV at ``file_path`` (header row, inferred schema) through the
    module-level ``spark`` session, keeps the "MTU" and
    "Wind Onshore  - Actual Aggregated [MW]" columns, and averages each row
    with the two rows that precede it chronologically.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        A DataFrame with two columns: "End_Time" (the
        "dd.MM.yyyy HH:mm (CET/CEST)" text extracted from "MTU") and
        "Rolling Average 3h in MWh" (the windowed average of the [MW]
        column; the column name is kept for compatibility with callers).

    Note:
        The window spans rows, not elapsed time, so it only equals three
        hours if the file has one row per hour without gaps -- TODO confirm
        against the data. The first two rows average over fewer than three
        values.
    """
    value_col = "Wind Onshore  - Actual Aggregated [MW]"

    readings = spark.read.csv(file_path, header=True, inferSchema=True).select(
        "MTU", value_col
    )

    # Only the end of the interval is followed by "(CET/CEST)" in the MTU
    # text, so this pattern picks out the end time for display.
    with_end = readings.withColumn(
        "End_Time",
        regexp_extract(
            col("MTU"), r"(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2} \(CET/CEST\))", 1
        ),
    )

    # Ordering the window by the raw "dd.MM.yyyy ..." string sorts by
    # day-of-month first (01.01, 01.02, ..., 02.01, ...), which interleaves
    # months and averages rows that are not adjacent in time. Parse the date
    # part into a real timestamp and order by that instead.
    # NOTE(review): timestamps repeated on the autumn DST change would tie
    # in this ordering -- verify whether the input contains such rows.
    with_sort_key = with_end.withColumn(
        "_end_ts",
        to_timestamp(
            regexp_extract(col("End_Time"), r"^(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2})", 1),
            "dd.MM.yyyy HH:mm",
        ),
    )

    # Current row plus the two preceding rows in chronological order.
    windowSpec = Window.orderBy("_end_ts").rowsBetween(-2, 0)

    df_with_avg = with_sort_key.withColumn(
        "Rolling Average 3h in MWh", avg(value_col).over(windowSpec)
    )

    # Drop the helper sort key; expose the same two columns as before.
    return df_with_avg.select(
        col("End_Time"),
        col("Rolling Average 3h in MWh"),
    )

# Create (or reuse) the session; rollingAverage3h reads it through the
# module-level name `spark`.
spark = (
    SparkSession.builder
    .appName("ComputeAverageForEveryRow")
    .getOrCreate()
)

# 2021 file: compute the rolling average, then print a preview.
df_average2021 = rollingAverage3h("data/production/SE1Onshore2021.csv")
df_average2021.show()

# 2022 file: same steps.
df_average2022 = rollingAverage3h("data/production/SE1Onshore2022.csv")
df_average2022.show()



23/10/05 17:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------------+-------------------------+
|            End_Time|Rolling Average 3h in MWh|
+--------------------+-------------------------+
|01.01.2021 01:00 ...|                    140.0|
|01.01.2021 02:00 ...|                    128.0|
|01.01.2021 03:00 ...|                    122.0|
|01.01.2021 04:00 ...|       112.66666666666667|
|01.01.2021 05:00 ...|       118.66666666666667|
|01.01.2021 06:00 ...|       136.66666666666666|
|01.01.2021 07:00 ...|       164.66666666666666|
|01.01.2021 08:00 ...|       173.33333333333334|
|01.01.2021 09:00 ...|       174.66666666666666|
|01.01.2021 10:00 ...|       167.33333333333334|
|01.01.2021 11:00 ...|       166.66666666666666|
|01.01.2021 12:00 ...|       167.33333333333334|
|01.01.2021 13:00 ...|       161.66666666666666|
|01.01.2021 14:00 ...|                    151.0|
|01.01.2021 15:00 ...|                    135.0|
|01.01.2021 16:00 ...|       122.66666666666667|
|01.01.2021 17:00 ...|       120.66666666666667|
|01.01.2021 18:00 ..

23/10/05 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/05 17:18:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
