In [0]:
# Load the raw generation table from the catalog as a Spark DataFrame.
generation = spark.table("curlybyte_solutions_rawdata_europe_grid_load.european_grid_raw__v2.generation")

In [0]:
# Preview the first five rows to sanity-check the load.
display(generation.limit(5))

In [0]:
# Convert the Spark DataFrame to pandas so it can be fed to the scikit-learn imputers.
# NOTE(review): toPandas() collects every row onto the driver — confirm the table
# fits in driver memory.
pd_generation = generation.toPandas()

In [0]:
# Names of the columns whose Spark type string is one of the listed numeric types.
numeric_cols = [
    name
    for name, spark_type in generation.dtypes
    if spark_type in {"double", "float", "int", "bigint"}
]

In [0]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

def add_imputations(df, cols, knn_features=None, iter_features=None):
    """
    Append several imputed variants of each requested column.

    For every ``col`` in ``cols`` the returned frame gains:
      - ``{col}_zero`` / ``{col}_mean`` / ``{col}_median`` / ``{col}_freq``:
        SimpleImputer with constant 0, mean, median and most-frequent value
      - ``{col}_ffill`` / ``{col}_bfill``: forward / backward fill in row order
      - ``{col}_ffill_bfill``: row-wise mean of the two fills above
      - ``{col}_knn``  (only when ``knn_features`` is given): KNNImputer
        (3 neighbours, distance-weighted) fitted on ``knn_features + [col]``
      - ``{col}_iter`` (only when ``iter_features`` is given): IterativeImputer
        fitted on ``iter_features + [col]``

    df: pandas DataFrame (not modified; a copy is returned)
    cols: list of columns to impute
    knn_features: columns used by KNNImputer (must not have NaNs ideally)
    iter_features: columns used by IterativeImputer

    Returns: a copy of ``df`` with the imputed columns appended.
    """

    df_out = df.copy()

    for col in cols:
        target = df[[col]]

        # 1) Simple imputers. fit_transform returns an (n_rows, 1) array, so it
        #    is flattened before being stored as a single column.
        simple_imputers = {
            "zero": SimpleImputer(strategy="constant", fill_value=0),
            "mean": SimpleImputer(strategy="mean"),
            "median": SimpleImputer(strategy="median"),
            "freq": SimpleImputer(strategy="most_frequent"),
        }
        for suffix, imputer in simple_imputers.items():
            df_out[f"{col}_{suffix}"] = imputer.fit_transform(target).ravel()

        # 2) Forward/Backward fill. Series.ffill()/bfill() replace the
        #    deprecated fillna(method=...) spelling.
        df_out[f"{col}_ffill"] = df[col].ffill()
        df_out[f"{col}_bfill"] = df[col].bfill()
        # Mean of both fills; where only one side exists (leading/trailing
        # gaps) the mean skips the NaN and keeps the available value.
        df_out[f"{col}_ffill_bfill"] = df_out[[f"{col}_ffill", f"{col}_bfill"]].mean(axis=1)

        # 3) KNN Imputer — the target is placed last so it can be read back
        #    from the final output column.
        if knn_features is not None:
            imputer_knn = KNNImputer(n_neighbors=3, weights="distance")
            df_out[f"{col}_knn"] = imputer_knn.fit_transform(df[[*knn_features, col]])[:, -1]

        # 4) Iterative imputer — same layout as the KNN step.
        if iter_features is not None:
            imputer_iter = IterativeImputer()
            df_out[f"{col}_iter"] = imputer_iter.fit_transform(df[[*iter_features, col]])[:, -1]

    return df_out


In [0]:
import pandas as pd

In [0]:
# The timestamps live in the "index" column of the Spark table; toPandas() only
# gives the frame a positional RangeIndex. Passing that RangeIndex to
# pd.to_datetime() would read row numbers as nanoseconds after 1970-01-01 and
# make every calendar field below constant, so the DatetimeIndex is built from
# the "index" column instead (the column itself is kept).
pd_generation = pd_generation.set_index(
    pd.DatetimeIndex(pd.to_datetime(pd_generation["index"])).rename(None)
)

# Calendar fields derived from the timestamp of each row.
pd_generation = pd_generation.assign(
    day=pd_generation.index.day,
    month=pd_generation.index.month,
    year=pd_generation.index.year,
    hour=pd_generation.index.hour,
)

In [0]:
# Build every imputation variant for each numeric column. The day/month/year
# calendar columns are the predictor features for both the KNN and the
# iterative imputer.
df_imputed = add_imputations(
    pd_generation,
    cols=numeric_cols,
    knn_features=["day", "month", "year"],
    iter_features=["day", "month", "year"]
)

In [0]:
from sklearn.metrics import mean_absolute_error
def evaluate_imputations(df, original_cols, masked_suffix="_na"):
    """
    Compute the mean absolute error of every imputed variant of each column.

    Expected column layout for a ground-truth column ``col``:
      - ``col``                        : reference values
      - ``col<masked_suffix>``          : optional copy of ``col`` in which the
                                          values to be imputed were set to NaN
      - ``col<masked_suffix>_<method>`` : one column per imputation method

    The rows scored are those where the masked column is NaN (falling back to
    the NaNs of ``col`` itself when no masked column exists) AND both the
    reference value and the imputed value are present. A missing reference
    value cannot contribute to an error, so such rows are skipped; when no row
    can be scored the MAE is reported as NaN instead of raising.

    Parameters:
      df: pandas DataFrame holding reference, masked and imputed columns
      original_cols: iterable of ground-truth column names
      masked_suffix: suffix that marks the masked copy of a column

    Returns:
      dict where keys = column name, values = DataFrame indexed by imputed
      column name with a single ``mae`` column (empty when no imputed column
      matches the naming convention)
    """

    results = {}

    for col in original_cols:
        masked_col = f"{col}{masked_suffix}"

        # all imputation versions for this column
        prefix = f"{masked_col}_"
        imp_cols = [c for c in df.columns if c.startswith(prefix)]

        # rows whose value was hidden from the imputers
        mask_source = masked_col if masked_col in df.columns else col
        hidden = df[mask_source].isna()
        truth = df[col]

        mae_dict = {}
        for imp_col in imp_cols:
            pred = df[imp_col]
            # only rows with a known reference and an imputed value are scorable
            scorable = hidden & truth.notna() & pred.notna()
            if scorable.any():
                mae_dict[imp_col] = float((truth[scorable] - pred[scorable]).abs().mean())
            else:
                mae_dict[imp_col] = float("nan")

        # save results
        results[col] = pd.DataFrame.from_dict(mae_dict, orient="index", columns=["mae"])

    return results


In [0]:
# Score the imputed variants of every numeric column.
# NOTE(review): evaluate_imputations only picks up columns named "<col>_na_*",
# while add_imputations names its outputs "<col>_zero", "<col>_mean", ... —
# confirm the naming, otherwise every result frame stays empty.
original_columns = numeric_cols

results = evaluate_imputations(df_imputed, original_columns)

# NOTE(review): plot_mae_results is not defined in the cells shown here —
# confirm it is defined or imported before this cell runs.
plot_mae_results(results)


In [0]:
from pyspark.sql import Window
import pyspark.sql.functions as F

# One window per country: the fill value for a column is its mean over all
# rows of the same country.
w = Window.partitionBy("country")

# Replace nulls in every numeric column with the country mean of that column.
# All replacement expressions are applied in a single select(): calling
# withColumn() once per column adds a projection to the query plan on each
# iteration. coalesce(col, avg) keeps the value when present and falls back to
# the windowed mean otherwise; non-numeric columns pass through unchanged and
# the column order is preserved.
df_filled = generation.select(
    *[
        F.coalesce(F.col(name), F.avg(name).over(w)).alias(name)
        if name in numeric_cols
        else F.col(name)
        for name in generation.columns
    ]
)

In [0]:
# Inspect the frame after the per-country mean fill.
display(df_filled)

In [0]:
# Any numeric value that is still null after the per-country mean fill is set to 0.
# NOTE(review): presumably these are columns with no observed value at all for
# a country (so no mean exists) — confirm that 0 is the intended fallback.
generation_filled = df_filled.fillna(0, subset=numeric_cols)

In [0]:
# Show the Spark data type of the "index" column.
print(generation_filled.schema["index"].dataType)


In [0]:
# Add an "hour" column holding the "index" value truncated to the start of its hour;
# it is the time bucket for the hourly aggregation.
generation_filled = generation_filled.withColumn("hour",F.date_trunc("hour", F.col("index")))

In [0]:
# One mean aggregate per numeric column, aliased back to the column's own name.
agg_exprs = [F.mean(name).alias(name) for name in numeric_cols]

# Average each (country, hour) bucket, sort by time, then expose the hour
# bucket under the original "index" column name.
generation_hourly = (
    generation_filled
    .groupBy("country", "hour")
    .agg(*agg_exprs)
    .orderBy("hour")
    .withColumnRenamed("hour", "index")
)


In [0]:
# Inspect the hourly per-country aggregate before it is written out.
display(generation_hourly)

In [0]:
# Create the target schema in the "workspace" catalog; IF NOT EXISTS makes the
# statement safe to re-run.
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.schema_capstone")

In [0]:
# Save the cleaned, hourly generation data as a Delta table in the workspace schema.
# mode("overwrite") replaces the table's existing contents on every run.
generation_hourly.write.format("delta").mode("overwrite").saveAsTable(
    "workspace.schema_capstone.generation_clean_with_mean_filled"
)
