In [2]:
import sys
import os
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly_express as px
from datetime import datetime, timedelta
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb



module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR
from src.feature import train_test_kfolds


In [4]:
# Read the transformed data , included the topn features and the time features
wind_farms_data_features_target = pd.read_parquet(f'{TRANSFORMED_DATA_DIR}/wind_farm_topn_features.parquet')

In [5]:
# Create train test folds
time_split = TimeSeriesSplit(gap=24, test_size=93, n_splits=10, max_train_size=24*30)
target_features_required = list(wind_farms_data_features_target.columns)                                                
train_test_folds= train_test_kfolds(wind_farms_data_features_target, time_split=time_split, features_target=target_features_required)

In [6]:

result = {}
fig = px.line(
    wind_farms_data_features_target,
    x=wind_farms_data_features_target.index,
    y="CF",
    markers=True,
    hover_data=["CF"]
)
for key, value in train_test_folds.items():

    X_train, y_train, X_test, y_test = train_test_folds[key]
    model = lgb.LGBMRegressor(random_state=42, eta=0.1)
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    result[key]= (mean_absolute_error(y_test, predictions), mean_absolute_percentage_error(y_test, predictions))

    fig.add_trace(go.Scatter(x=y_test.index, y=predictions, mode='lines', name=str(key), line=dict(color='red')))

print(f"Mean abs percentage error, MAPE:{np.mean([v[1] for v in result.values()])}") # Ignoring mape based on goal and low predicted values

print(f"Mean abs error, MAE:{np.mean([v[0] for v in result.values()])}")

fig.update_layout(title="Predicted vs Actuals")
# fig.add_annotation(dict(text="Max training date"))

fig.update_layout(annotations=[
    dict(
        x=5.05,  # x and y coordinates for positioning, adjust as needed
        y=1.02,
        xref='paper',
        yref='paper',
        text='Max Training date',  # Legend title text
        showarrow=False,
        align='right'
    )
])
fig.show()

  v = v.dt.to_pydatetime()


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.158347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.160775
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training 

Hypertuning the XGBoost model 

In [8]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit



X = wind_farms_data_features_target.values[:,1:]
y = wind_farms_data_features_target.values[:,0]


# Create a time series cross-validator
tscv = TimeSeriesSplit(gap=24, test_size=93, n_splits=10, max_train_size=720)


# Parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],  
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'num_leaves': [31, 61, 91], 
    'reg_alpha': [0, 0.1, 0.5],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5]  # L2 regularization
}

# lgBMRegressor
lgbm_reg = lgb.LGBMRegressor()

# Randomized search
random_search = RandomizedSearchCV(lgbm_reg, param_grid, n_iter=50, 
                                   scoring='neg_mean_absolute_error', 
                                   cv=tscv, random_state=42)

random_search.fit(X, y)

# Best parameters
print(random_search.best_params_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.158347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.160775
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training 

There is an improvement in the accuracy of the model after   
The fold with January 5 and January 6 have interpolated data as the actuals , this could be leading to larger absolute error than realized

In [13]:
result = {}
params=random_search.best_params_
fig = px.line(
    wind_farms_data_features_target,
    x=wind_farms_data_features_target.index,
    y="CF",
    markers=True,
    hover_data=["CF"]
)
for key, value in train_test_folds.items():

    X_train, y_train, X_test, y_test = train_test_folds[key]
    model = lgb.LGBMRegressor(**params, random_state=42)
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    result[key]= (mean_absolute_error(y_test, predictions), mean_absolute_percentage_error(y_test, predictions))

    fig.add_trace(go.Scatter(x=y_test.index, y=predictions, mode='lines', name=str(key), line=dict(color='red')))

print(f"Mean abs percentage error, MAPE:{np.mean([v[1] for v in result.values()])}") # Ignoring mape based on goal and low predicted values

print(f"Mean abs error, MAE:{np.mean([v[0] for v in result.values()])}")

fig.update_layout(title="Predicted vs Actuals, After tuning MAE: 0.04161")
# fig.add_annotation(dict(text="Max training date"))

fig.update_layout(annotations=[
    dict(
        x=1.05,  # x and y coordinates for positioning, adjust as needed
        y=1.02,
        xref='paper',
        yref='paper',
        text='Max Training date',  # Legend title text
        showarrow=False,
        align='right'
    )
])
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.158347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training from score 0.160775
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2442
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 13
[LightGBM] [Info] Start training 