# Model in Production for Wind Power Predictions - PE La Castellana


El dataset que se va a trabajar corresponde al Parque Eólico La Castellana (localizado en Bahia Blanca).

In [1]:
import pandas as pd
import numpy as np

pd.options.mode.copy_on_write = True 
from datetime import datetime, timedelta

from sklearn.metrics import mean_absolute_error, mean_squared_error#, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit


# Utilities
import os
import pickle
import requests as req
from io import BytesIO


# LightGBM model
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [2]:
FOLDER = 'dist' # Should be clean before the execution

#### Carga de dataset de Históricos

In [3]:
path = 'https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_histo_02022019-13062024.csv'
#path = 'D:\\Documents\\MMA\\1.0 Tesis\\Datos\\ds_histo_02022019-13062024.csv' 

df_histo_full = pd.read_csv(path, decimal=',', parse_dates=['FechaHora'], 
                 date_format='%Y-%m-%d %H:%M:%S', delimiter=';', header=0)

In [4]:
# Drop rows where EnergiaSMEC is NaN
df_histo_full.dropna(subset=['EnergiaSMEC'], inplace=True)

#### Carga de dataset de Forecast & Actual para evaluar performance

In [5]:
path = 'https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_fc_actual_02062024-11062024.csv'

#path = 'D:\\Documents\\MMA\\1.0 Tesis\\Datos\\ds_fc_actual_02062024-11062024.csv' #lote 1
#path = 'D:\\Documents\\MMA\\1.0 Tesis\\Datos\\ds_fc_actual_17062024-26062024.csv' #lote 2


df_fc_actual = pd.read_csv(path, decimal=',', parse_dates=['FechaHora'], 
                 date_format='%Y-%m-%d %H:%M:%S', delimiter=';', header=0)

In [6]:
#El forecast de aero_disp lo estimo en base al valor medio

# Filter the subset where AerosDisp is not null
filtered_df = df_fc_actual.dropna(subset=['aeros_disp'])

# Calculate the mean of aero_disp 
mean_aeros_disp = filtered_df.tail(12)['aeros_disp'].mean()

df_fc_actual['aeros_disp'] = mean_aeros_disp.round(2)

Selección de variables relevantes

In [7]:
# Define the desired column order
main_features = ['FechaHora','EnergiaSMEC','aeros_disp','ws100_avg', 'dir100_avg', 'temp_avg','energia_fc_cammesa'] #promedio de fuentes de ecwmf y gfs
#main_features = ['FechaHora','EnergiaSMEC','aeros_disp','ws100_ecmwf', 'dir100_ecmwf', 'temp_ecmwf']

In [8]:
df_fc_actual = df_fc_actual[main_features[:-1]]

In [9]:
#df_fc_actual.head()

#### Preparación de datos historico

In [10]:
# Get the minimum FechaHora
start_date = df_fc_actual['FechaHora'].min()

# Exclude data with FechaHora >= start_date, because this is the initial date for the forecast 
df_histo_full = df_histo_full[df_histo_full['FechaHora'] < start_date]

In [11]:
# Main the DataFrame columns
df_histo = df_histo_full[main_features[:-1]]

# Set FechaHora as Index
#df_histo.set_index('FechaHora', inplace=True)

#df_histo.head()

#### Division en train y test

In [12]:
# Define the start date for the train and test sets
train_start_dt = df_histo['FechaHora'].min()
test_start_dt = df_histo['FechaHora'].max() - pd.Timedelta(days=365)


In [13]:
# Create train set containing only the model features
df_train = df_histo[
(df_histo['FechaHora'] >= train_start_dt)
& (df_histo['FechaHora'] < test_start_dt)].copy()

# Create test set containing only the model features
df_test = df_histo_full[df_histo_full['FechaHora'] >= test_start_dt][main_features].copy()

print('Training data shape: ', df_train.shape)
print('Test data shape: ', df_test.shape)

Training data shape:  (37966, 6)
Test data shape:  (8761, 7)


#### Preparación de dataset forecast - exogenos

In [14]:
exog_features = [feature for feature in main_features if feature != 'EnergiaSMEC'and feature != 'energia_fc_cammesa']

# Reorder the DataFrame columns
df_fc_actual_exog = df_fc_actual[exog_features]
# Set FechaHora as Index
df_fc_actual_exog.set_index('FechaHora', inplace=True)

#df_fc_actual_exog.head()

### Entrenamiento del modelo LightGBM

In [15]:
df_test.set_index('FechaHora', inplace=True)
df_train.set_index('FechaHora', inplace=True)

In [16]:
# Separate features and target
x_train, y_train = df_train.drop(columns=['EnergiaSMEC']), df_train['EnergiaSMEC'].values
x_test, y_test = df_test.drop(columns=['EnergiaSMEC','energia_fc_cammesa']), df_test['EnergiaSMEC'].values

In [17]:
# Separate features and target
x_train, y_train = df_train.drop(columns=['EnergiaSMEC']), df_train['EnergiaSMEC'].values
x_test, y_test = df_test.drop(columns=['EnergiaSMEC','energia_fc_cammesa']), df_test['EnergiaSMEC'].values


In [18]:
type(y_train)

numpy.ndarray

In [19]:
def fit_lgbm(x_train: pd.DataFrame, y_train: np.ndarray) -> LGBMRegressor:
    """
        Fits a LigthGBM model, saves the model and returns it.
    """
    params = {
    'num_leaves': 30,
    'n_estimators': 100,
    'max_depth': 8,
    'min_child_samples': 200,
    'learning_rate': 0.05,
    'subsample': 0.70,
    'colsample_bytree': 0.75
    }

    # Initialize the model
    model = lgb.LGBMRegressor(**params)
    
    # Parameter grid for RandomizedSearchCV
    param_grid = {
        'num_leaves': [20, 30, 40, 50],
        'n_estimators': [50,70,100, 200, 400, 600],
        'max_depth': [6, 8, 10, 12],
        'min_child_samples': [20, 50, 100, 200, 400, 600],
        'learning_rate': [0.01, 0.05, 0.1, 0.15],
        'subsample': [0.5, 0.7, 0.8, 0.9],
        'colsample_bytree': [0.6, 0.75, 0.8, 0.9, 1]
    }
    
    # Set up TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Set up RandomizedSearchCV with TimeSeriesSplit
    random_search = RandomizedSearchCV(
        model, param_distributions=param_grid, n_iter=25, cv=tscv, verbose=1, n_jobs=-1,
        random_state=14, scoring='neg_mean_squared_error'
    )

    # Fit the model
    random_search.fit(x_train, y_train)
       
    # Get the best model
    best_model = random_search.best_estimator_
    
    file_name = 'model.pkl'
    with open(os.path.join(FOLDER, file_name), 'wb') as f:
        pickle.dump(best_model, f)     

    print('Model saved')
    return best_model

In [20]:
model = fit_lgbm(x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1003
[LightGBM] [Info] Number of data points in the train set: 37966, number of used features: 4
[LightGBM] [Info] Start training from score 47.909617
Model saved


In [21]:
# Predict on test data
y_pred = model.predict(x_test)

In [22]:
# Evaluate the model
mae_t = mean_absolute_error(y_test, y_pred)
mse_t = mean_squared_error(y_test, y_pred)

rmse_t = np.sqrt(mse_t)
print(f'Best parameters found: {model}')
print(f'MAE: {mae_t}')
print(f'RMSE: {rmse_t}')

Best parameters found: LGBMRegressor(colsample_bytree=0.9, max_depth=8, min_child_samples=100,
              n_estimators=50, num_leaves=30, subsample=0.5)
MAE: 10.923722234236603
RMSE: 16.194633689456477


#### Predicción de Energia en dataset de TEST (último año) con LightGBM

In [24]:
if df_test.index.name != 'FechaHora':
    df_test.set_index('FechaHora', inplace=True)

In [25]:
# Get the minimum and maximum FechaHora
start_test_date = df_test.index.min()
end_test_date = df_test.index.max()
print(f"Start date: {start_test_date} | End date: {end_test_date}")

Start date: 2023-06-02 23:00:00 | End date: 2024-06-01 23:00:00


In [26]:
df_test_exog = df_test[exog_features[1:]]


In [27]:
# Predict on test data
pred_test_energia_lgb = model.predict(df_test_exog)

In [28]:
# Create a DataFrame with the predictions
df_pred_test_energia_lgb = pd.DataFrame({
    'FechaHora': df_test_exog.index,
    'pred_energia_lgb': pred_test_energia_lgb
})

In [29]:
df_pred_test_energia_lgb.head()

Unnamed: 0,FechaHora,pred_energia_lgb
0,2023-06-02 23:00:00,15.681673
1,2023-06-03 00:00:00,18.151282
2,2023-06-03 01:00:00,28.366471
3,2023-06-03 02:00:00,24.32143
4,2023-06-03 03:00:00,33.980329


#### Predicción de Energia a 10 dias con LightGBM

In [30]:
df_fc_actual_exog.reset_index(inplace=True)

In [31]:
# Get the minimum and maximum FechaHora
start_date_10d = df_fc_actual_exog['FechaHora'].min()
end_date_10d = df_fc_actual_exog['FechaHora'].max()
print(f"Start date: {start_date_10d} | End date: {end_date_10d}")

Start date: 2024-06-02 00:00:00 | End date: 2024-06-11 21:00:00


In [32]:
df_fc_actual_exog['FechaHora'] = pd.to_datetime(df_fc_actual_exog['FechaHora'])
df_fc_actual_exog.set_index('FechaHora', inplace=True)

In [33]:
# Predict on test data
pred_10d_energia_lgb = model.predict(df_fc_actual_exog)

In [34]:
# Create a DataFrame with the predictions
df_pred_10d_energia_lgb = pd.DataFrame({
    'FechaHora': df_fc_actual_exog.index,
    'pred_energia_lgb': pred_10d_energia_lgb
})

In [35]:
df_pred_10d_energia_lgb.head()

Unnamed: 0,FechaHora,pred_energia_lgb
0,2024-06-02 00:00:00,10.649182
1,2024-06-02 01:00:00,12.187967
2,2024-06-02 02:00:00,15.378987
3,2024-06-02 03:00:00,33.362576
4,2024-06-02 04:00:00,34.581394
