In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col
from pyspark.sql.window import Window
from pyspark.sql.functions import regexp_extract

def rollingAverage3h(file_path):
    """Return a trailing three-row rolling average of onshore wind production.

    Reads the CSV at ``file_path`` with the module-level ``spark`` session,
    keeps the ``MTU`` interval column and the onshore wind column, extracts
    the end timestamp of every interval and averages each row with the two
    rows that precede it chronologically. With hourly MTU intervals this is
    a 3-hour window; the first two rows are averaged over fewer values
    because no earlier rows exist.

    Two five-row previews (raw selection, then with ``End_Time``) are printed
    as a side effect.

    Args:
        file_path: Path of a CSV file with a header row containing the columns
            ``MTU`` and ``Wind Onshore  - Actual Aggregated [MW]``.

    Returns:
        A DataFrame with the columns ``End_Time`` (string, as extracted from
        ``MTU``) and ``Rolling Average 3h in MWh``. The column name is kept
        for compatibility; the values are plain averages of the MW readings.
    """
    production_col = "Wind Onshore  - Actual Aggregated [MW]"
    df = spark.read.csv(file_path, header=True, inferSchema=True).select("MTU", production_col)

    df.show(n=5, truncate=False)
    # Keep only the end of the interval: the single timestamp in MTU that is
    # directly followed by "(UTC)".
    df_with_time = df.withColumn("End_Time", regexp_extract(df["MTU"], r"(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2} \(UTC\))", 1))
    df_with_time.show(n=5, truncate=False)

    # End_Time is a "dd.MM.yyyy HH:mm (UTC)" string. Ordering by it directly
    # compares text, which sorts by day-of-month first (01.02.2021 would come
    # before 02.01.2021) and makes the window mix non-consecutive hours.
    # Order by year, month, day and time instead; every part is fixed-width,
    # so comparing the parts as text yields chronological order.
    end_time_parts = r"^(\d{2})\.(\d{2})\.(\d{4}) (\d{2}:\d{2})"
    chronological_order = [
        regexp_extract(col("End_Time"), end_time_parts, group)
        for group in (3, 2, 1, 4)  # year, month, day, HH:mm
    ]

    # Sliding window: the current row plus the two rows before it.
    windowSpec = Window.orderBy(*chronological_order).rowsBetween(-2, Window.currentRow)

    df_with_avg = df_with_time.withColumn("Rolling Average 3h in MWh", avg(production_col).over(windowSpec))

    df_clean = df_with_avg.select(
        col("End_Time"),
        col("Rolling Average 3h in MWh"),
    )

    return df_clean

# Obtain the Spark session used by rollingAverage3h (an already running
# session is reused, otherwise a new one is started).
spark = (
    SparkSession.builder
    .appName("ComputeAverageForEveryRow")
    .getOrCreate()
)

# Rolling averages for the 2021 production data.
df_average2021 = rollingAverage3h("data/production/SE1Onshore2021UTC.csv")
df_average2021.show()

# Rolling averages for the 2022 production data.
df_average2022 = rollingAverage3h("data/production/SE1Onshore2022UTC.csv")
df_average2022.show()



+-----------------------------------------+--------------------------------------+
|MTU                                      |Wind Onshore  - Actual Aggregated [MW]|
+-----------------------------------------+--------------------------------------+
|01.01.2021 00:00 - 01.01.2021 01:00 (UTC)|116                                   |
|01.01.2021 01:00 - 01.01.2021 02:00 (UTC)|110                                   |
|01.01.2021 02:00 - 01.01.2021 03:00 (UTC)|112                                   |
|01.01.2021 03:00 - 01.01.2021 04:00 (UTC)|134                                   |
|01.01.2021 04:00 - 01.01.2021 05:00 (UTC)|164                                   |
+-----------------------------------------+--------------------------------------+
only showing top 5 rows

+--------------------+--------------------------------------+--------------------+
|                 MTU|Wind Onshore  - Actual Aggregated [MW]|            End_Time|
+--------------------+--------------------------------------+-