In [0]:
%pip install xgboost
%pip install lightgbm


In [0]:
# ============================
# 1. Imports and Configuration
# ============================
import os
import pickle
from math import sqrt

import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Model types to train
model_types = ["xgb", "xgb_deep", "ridge", "mlp", "lgbm"]


# Target horizon settings
targets = [f"stress_plus_{h}h" for h in range(1, 7)]

# ===============
# Save path
country = "ALL"
model_path = f"/Workspace/workspace/streamlit/models/{country}"
# ===============

# =========
# after 2nd try correction, but didn't work
# Save path (use /dbfs for Databricks file system)
# country = "ALL"
# model_path = f"/dbfs/workspace/streamlit/models/{country}"

# ... rest of your code remains unchanged ...
# =========

# ============================
# 2. Load Spark Tables
# ============================
df_train = spark.table("workspace.default.train_imputed_timebins_lags")
df_val = spark.table("workspace.default.validation_imputed_timebins_lags")

# ============================
# 3. Create Target Columns (1h to 6h ahead)
# ============================
for h in range(1, 7):
    df_train = df_train.withColumn(f"stress_plus_{h}h", F.lead("grid_stress_score", h).over(Window.partitionBy("country").orderBy("timestamp")))
    df_val = df_val.withColumn(f"stress_plus_{h}h", F.lead("grid_stress_score", h).over(Window.partitionBy("country").orderBy("timestamp")))

# ============================
# 4. Convert to Pandas
# ============================
df_train_pd = df_train.toPandas()
df_val_pd = df_val.toPandas()

# ============================
# 5. Feature Selection
# ============================
excluded_columns = [
    "index", "timestamp", "grid_stress_score", "country", *targets
]

selected_features = [
    col for col in df_train_pd.columns if col not in excluded_columns
]

# ============================
# 6. Clean Data
# ============================
df_train_pd = df_train_pd.dropna(subset=selected_features + targets)
df_val_pd = df_val_pd.dropna(subset=selected_features + targets)

for col in selected_features:
    if df_train_pd[col].dtype == "object":
        df_train_pd[col] = df_train_pd[col].astype("category")
        df_val_pd[col] = df_val_pd[col].astype("category")


# ============================
# 7. Define Model Function
# ============================
def get_model(model_type):
    if model_type == "xgb":
        return XGBRegressor(n_estimators=100, random_state=42, enable_categorical=True)
    elif model_type == "xgb_deep":
        return XGBRegressor(n_estimators=300, max_depth=10, learning_rate=0.05, subsample=0.7, colsample_bytree=0.8, random_state=42, enable_categorical=True)
    elif model_type == "lgbm":
        return LGBMRegressor(n_estimators=200, learning_rate=0.05, num_leaves=31, random_state=42)
    elif model_type == "ridge":
        return Ridge(alpha=1.0)
    elif model_type == "mlp":
        return MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=300, random_state=42)


    else:
        raise ValueError(f"Unsupported model type: {model_type}")

# ============================
# 8. Train and Save Models
# ============================
for model_type in model_types:
    print(f"\nTraining models using: {model_type}")

    model_dir = os.path.join(model_path, model_type)
    os.makedirs(model_dir, exist_ok=True)

    for target in targets:
        print(f"Training model for target: {target}")

        # Copy feature sets
        X_train = df_train_pd[selected_features].copy()
        X_val = df_val_pd[selected_features].copy()

        # For non-XGBoost models, encode categorical columns
        if not model_type.startswith("xgb"):
            for col in X_train.select_dtypes(include="category").columns:
                X_train[col] = X_train[col].cat.codes
                X_val[col] = X_val[col].cat.codes

        y_train = df_train_pd[target]
        y_val = df_val_pd[target]

        model = get_model(model_type)
        model.fit(X_train, y_train)

        with open(os.path.join(model_dir, f"{target}.pkl"), "wb") as f:
            pickle.dump(model, f)

        y_pred = model.predict(X_val)
        rmse = sqrt(mean_squared_error(y_val, y_pred))
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        print(f"{model_type} | {target} — RMSE: {rmse:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")



/home/spark-02edffbd-6028-407c-bf63-5c/.ipykernel/27287/command-7450199544602191-2068834007:78: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_pd[col] = df_train_pd[col].astype("category")

Training models using: xgb
Training model for target: stress_plus_1h
xgb | stress_plus_1h — RMSE: 8.90, MSE: 79.17, R2: 0.71
Training model for target: stress_plus_2h
xgb | stress_plus_2h — RMSE: 10.87, MSE: 118.09, R2: 0.56
Training model for target: stress_plus_3h
xgb | stress_plus_3h — RMSE: 11.85, MSE: 140.34, R2: 0.48
Training model for target: stress_plus_4h
xgb | stress_plus_4h — RMSE: 12.49, MSE: 155.88, R2: 0.42
Training model for target: stress_plus_5h
xgb | stress_plus_5h — RMSE: 12.95, MSE: 167.61, R2: 0.38
Training model for target: stress_plus_6h
xgb | stress_plus_6h — RMSE: 13.21, MSE: 174.57, R2: 0.35

Training models using: xgb_deep
Training model for target: stress_plus_1h
xgb_deep | stress_plus_1h — RMSE: 8.57, MSE: 73.38, R2: 0.73
Training model for target: stress_plus_2h
xgb_deep | stress_plus_2h — RMSE: 10.50, MSE: 110.17, R2: 0.59
Training model for target: stress_plus_3h
xgb_deep | stress_plus_3h — RMSE: 11.51, MSE: 132.43, R2: 0.51
Training model for target: stress_plus_4h
xgb_deep | stress_plus_4h — RMSE: 12.11, MSE: 146.54, R2: 0.45
Training model for target: stress_plus_5h
xgb_deep | stress_plus_5h — RMSE: 12.46, MSE: 155.16, R2: 0.42
Training model for target: stress_plus_6h
xgb_deep | stress_plus_6h — RMSE: 12.70, MSE: 161.20, R2: 0.40

Training models using: ridge
Training model for target: stress_plus_1h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_1h — RMSE: 24.45, MSE: 597.88, R2: -1.23
Training model for target: stress_plus_2h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_2h — RMSE: 25.72, MSE: 661.64, R2: -1.46
Training model for target: stress_plus_3h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_3h — RMSE: 25.56, MSE: 653.46, R2: -1.43
Training model for target: stress_plus_4h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_4h — RMSE: 25.33, MSE: 641.76, R2: -1.39
Training model for target: stress_plus_5h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_5h — RMSE: 24.16, MSE: 583.59, R2: -1.17
Training model for target: stress_plus_6h
/databricks/python/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py:215: LinAlgWarning: Ill-conditioned matrix (rcond=8.66363e-19): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
ridge | stress_plus_6h — RMSE: 23.77, MSE: 564.88, R2: -1.10

Training models using: mlp
Training model for target: stress_plus_1h
mlp | stress_plus_1h — RMSE: 15.26, MSE: 232.99, R2: 0.13
Training model for target: stress_plus_2h
mlp | stress_plus_2h — RMSE: 16.21, MSE: 262.69, R2: 0.02
Training model for target: stress_plus_3h
mlp | stress_plus_3h — RMSE: 16.38, MSE: 268.32, R2: 0.00
Training model for target: stress_plus_4h
mlp | stress_plus_4h — RMSE: 16.18, MSE: 261.65, R2: 0.03
Training model for target: stress_plus_5h
mlp | stress_plus_5h — RMSE: 16.44, MSE: 270.42, R2: -0.01
Training model for target: stress_plus_6h
mlp | stress_plus_6h — RMSE: 16.44, MSE: 270.17, R2: -0.01

Training models using: lgbm
Training model for target: stress_plus_1h
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.392325
lgbm | stress_plus_1h — RMSE: 9.04, MSE: 81.71, R2: 0.70
Training model for target: stress_plus_2h
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.392550
lgbm | stress_plus_2h — RMSE: 11.09, MSE: 123.02, R2: 0.54
Training model for target: stress_plus_3h
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.393169
lgbm | stress_plus_3h — RMSE: 12.12, MSE: 146.96, R2: 0.45
Training model for target: stress_plus_4h
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.393619
lgbm | stress_plus_4h — RMSE: 12.60, MSE: 158.81, R2: 0.41
Training model for target: stress_plus_5h
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.393056
lgbm | stress_plus_5h — RMSE: 12.90, MSE: 166.53, R2: 0.38
Training model for target: stress_plus_6h
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20859
[LightGBM] [Info] Number of data points in the train set: 222248, number of used features: 131
[LightGBM] [Info] Start training from score 28.392494
lgbm | stress_plus_6h — RMSE: 13.06, MSE: 170.51, R2: 0.36