In [0]:
# Read the `workspace.default.train_set_imputed` table via `spark.table`.
# NOTE(review): `spark` is not defined in this notebook; presumably the session
# injected by the runtime (e.g. Databricks) — confirm.
load_train = spark.table("workspace.default.train_set_imputed")

In [0]:
# Convert the loaded table to a pandas DataFrame. toPandas() collects every row
# onto the driver, so the whole table has to fit in driver memory.
train = load_train.toPandas()

In [0]:
# Preview the first rows of the training frame.
train.head()

In [0]:
# List the column names; feature_engineering() below selects several of them by name.
train.columns

In [0]:
# Load the validation table and convert it to pandas, mirroring the training cells.
# toPandas() collects every row onto the driver.
load_val = spark.table("workspace.default.validation_set_imputed")
val = load_val.toPandas()

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; a bare `val.head()` mid-cell shows nothing.
display(val.head())
val.columns

In [0]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Columns summed row-wise by feature_engineering() to form the "renewables" feature.
renewables_cols = [
    "Solar__Actual_Aggregated",
    "Wind_Onshore__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Hydro_Run_of_river_and_poundage__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Geothermal__Actual_Aggregated",
    "Hydro_Pumped_Storage__Actual_Aggregated",
    "Biomass__Actual_Aggregated",
    "Other_renewable__Actual_Aggregated",
    "Marine__Actual_Aggregated",
]

# Columns summed row-wise to form "total_generation", the denominator of
# "renewables_fraction". Every entry of renewables_cols also appears here.
generation_cols = [
    "Biomass__Actual_Aggregated",
    "Fossil_Brown_coal_Lignite__Actual_Aggregated",
    "Fossil_Coal_derived_gas__Actual_Aggregated",
    "Fossil_Gas__Actual_Aggregated",
    "Fossil_Hard_coal__Actual_Aggregated",
    "Fossil_Oil__Actual_Aggregated",
    "Fossil_Oil_shale__Actual_Aggregated",
    "Fossil_Peat__Actual_Aggregated",
    "Geothermal__Actual_Aggregated",
    "Hydro_Pumped_Storage__Actual_Aggregated",
    "Hydro_Run_of_river_and_poundage__Actual_Aggregated",
    "Hydro_Water_Reservoir__Actual_Aggregated",
    "Marine__Actual_Aggregated",
    "Nuclear__Actual_Aggregated",
    "Other__Actual_Aggregated",
    "Other_renewable__Actual_Aggregated",
    "Solar__Actual_Aggregated",
    "Waste__Actual_Aggregated",
    "Wind_Offshore__Actual_Aggregated",
    "Wind_Onshore__Actual_Aggregated",
]

# Columns removed from the feature matrix X returned by feature_engineering().
columns_to_drop = [
    # Raw timestamp; the derived time features are built from it before it is dropped.
    "index",
    # NOTE(review): presumably intermediate quantities behind grid_stress_score,
    # removed to avoid target leakage — confirm.
    "net_imports",
    "reserve_margin_ml",
    "forecast_load_error",
    "load_rel_error",
    "P10_net",
    "P90_net",
    "score_reserve_margin",
    "score_load_error",
    "T7_high_exports",
    "T8_high_imports",
    "score_T7",
    "score_T8",
    # The target itself; returned separately as y.
    "grid_stress_score",
    # Raw hour/month are dropped; only their sin/cos encodings stay in X.
    "hour",
    "month",
]

# Function to apply feature engineering

def feature_engineering(df, le=None, fit_encoder=False):
    """Derive model features and the target from a raw input frame.

    Works on a copy of ``df``; the caller's frame is not modified.

    Columns added before the final drop:
      * ``renewables`` / ``total_generation`` -- row-wise sums of
        ``renewables_cols`` and ``generation_cols``.
      * ``renewables_fraction`` -- ``renewables / total_generation``; set to
        0.0 on rows where ``total_generation`` is zero.
      * ``day_of_week``, ``is_weekend`` (``day_of_week >= 5``) and sin/cos
        encodings of hour (period 24) and month (period 12), all derived
        from the ``index`` column.
      * ``country_encoded`` -- integer label for ``country``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``renewables_cols`` and
        ``generation_cols``, an ``index`` column parseable by
        ``pd.to_datetime``, a ``country`` column, and every entry of
        ``columns_to_drop`` that this function does not create itself.
    le : LabelEncoder or None, default None
        Encoder for ``country``. When None, a new ``LabelEncoder`` is created
        and fitted on this frame regardless of ``fit_encoder``.
    fit_encoder : bool, default False
        Only consulted when ``le`` is given: True refits ``le`` on this
        frame, False applies the existing mapping without refitting.

    Returns
    -------
    X : pandas.DataFrame
        The engineered frame without the columns in ``columns_to_drop``.
    y : pandas.Series
        The ``grid_stress_score`` column.
    le : LabelEncoder
        The encoder that produced ``country_encoded``.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    ValueError
        From ``le.transform`` when ``country`` holds a label the encoder was
        not fitted on.
    """
    df_copy = df.copy()

    # Renewables fraction
    df_copy["renewables"] = df_copy[renewables_cols].sum(axis=1)
    df_copy["total_generation"] = df_copy[generation_cols].sum(axis=1)
    # A zero denominator would produce NaN (0/0) or +/-inf (x/0) in the feature.
    # Mask those rows to NaN before dividing, then define the fraction as 0.0.
    # The row sums skip NaN, so the masked rows are the only NaN source here.
    total_generation = df_copy["total_generation"]
    safe_total = total_generation.where(total_generation != 0)
    df_copy["renewables_fraction"] = df_copy["renewables"].div(safe_total).fillna(0.0)

    # Time features
    datetime_series = pd.to_datetime(df_copy["index"])
    df_copy["hour"] = datetime_series.dt.hour
    df_copy["day_of_week"] = datetime_series.dt.dayofweek
    df_copy["month"] = datetime_series.dt.month
    df_copy["is_weekend"] = (df_copy["day_of_week"] >= 5).astype(int)

    # Cyclical encodings; the raw hour/month columns are removed by the drop below.
    df_copy["hour_sin"] = np.sin(2 * np.pi * df_copy["hour"] / 24)
    df_copy["hour_cos"] = np.cos(2 * np.pi * df_copy["hour"] / 24)
    df_copy["month_sin"] = np.sin(2 * np.pi * df_copy["month"] / 12)
    df_copy["month_cos"] = np.cos(2 * np.pi * df_copy["month"] / 12)

    # Country encoding
    if le is None:
        # No encoder supplied: a fresh one has nothing to transform with, so it must be fitted.
        le = LabelEncoder()
        fit_encoder = True
    if fit_encoder:
        df_copy["country_encoded"] = le.fit_transform(df_copy["country"])
    else:
        df_copy["country_encoded"] = le.transform(df_copy["country"])

    # NOTE(review): the raw "country" column is not in columns_to_drop, so it
    # stays in X next to "country_encoded" — confirm this is intended.
    X = df_copy.drop(columns=columns_to_drop)
    y = df_copy["grid_stress_score"]

    return X, y, le

In [0]:
# Apply to training: no encoder exists yet (le=None), so feature_engineering creates
# a new LabelEncoder and fits it on the training countries. The returned `le` is
# reused for the validation set.
X_train, y_train, le = feature_engineering(train, le=None, fit_encoder=True)

In [0]:
# Apply to validation: pass the encoder fitted on the training data with
# fit_encoder=False so it only transforms (no refit) and country codes match the
# training set. The returned encoder is the same object and is discarded.
X_val, y_val, _ = feature_engineering(val, le=le, fit_encoder=False)