In [3]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

# Set random seed
np.random.seed(100)

# Define directories
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "../datasets/finalized")
MODEL_DIR = os.path.join(BASE_DIR, "../models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Load dataset
df_hourly = pd.read_csv(os.path.join(DATA_DIR, "finalized_hourly_data.csv"), parse_dates=["Start date"], low_memory=False)

# Preprocess data
df_hourly.set_index("Start date", inplace=True)
df_hourly["Avg_Price_EUR_MWh"] = df_hourly[
    ["Germany/Luxembourg [/MWh] Original resolutions", "Belgium [/MWh] Original resolutions", "France [/MWh] Original resolutions"]
].mean(axis=1)

df_hourly["Price_Movement"] = np.where(
    df_hourly["Avg_Price_EUR_MWh"].pct_change().fillna(0) > 0.05, 1, 
    np.where(df_hourly["Avg_Price_EUR_MWh"].pct_change().fillna(0) < -0.05, 2, 0)
)

df_hourly["Rolling_Mean_24"] = df_hourly["Avg_Price_EUR_MWh"].rolling(24).mean()
df_hourly["Price_Change_1"] = df_hourly["Avg_Price_EUR_MWh"].pct_change() * 100
df_hourly["Lag_1"] = df_hourly["Avg_Price_EUR_MWh"].shift(1)
df_hourly.fillna(0, inplace=True)

# Prepare dataset
features = ["Rolling_Mean_24", "Price_Change_1", "Lag_1"]
X = df_hourly[features]
y = df_hourly["Price_Movement"]

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Train models
lgb_classifier = lgb.LGBMClassifier()
xgb_classifier = xgb.XGBClassifier()

lgb_classifier.fit(X_train, y_train)
xgb_classifier.fit(X_train, y_train)

# Save models
joblib.dump(lgb_classifier, os.path.join(MODEL_DIR, "lgb_price_model.pkl"))
joblib.dump(xgb_classifier, os.path.join(MODEL_DIR, "xgb_price_model.pkl"))
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))

print("✅ Models trained and saved successfully!")


  df_hourly["Avg_Price_EUR_MWh"].pct_change().fillna(0) > 0.05, 1,
  np.where(df_hourly["Avg_Price_EUR_MWh"].pct_change().fillna(0) < -0.05, 2, 0)
  df_hourly["Price_Change_1"] = df_hourly["Avg_Price_EUR_MWh"].pct_change() * 100


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 35421, number of used features: 3
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
✅ Models trained and saved successfully!
