## Entrenamiento del Modelo de Prediccion de Generación Eólica - PE La Castellana


El dataset que se va a trabajar corresponde al Parque Eólico La Castellana (localizado en Bahia Blanca).

In [1]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv

pd.options.mode.copy_on_write = True 
from datetime import datetime, timedelta

from sklearn.metrics import mean_absolute_error, mean_squared_error#, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit


# Utilities
import os
import pickle
import requests as req
from io import BytesIO


# LightGBM model
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [2]:
os.environ["LOKY_MAX_CPU_COUNT"] = "6"

In [3]:
FOLDER = 'dist' # Should be clean before the execution

#### Carga de dataset de Históricos

In [4]:
# Ruta relativa al archivo .env
load_dotenv(dotenv_path='config.env') # Carga el archivo .env del directorio actual

path_histo = os.getenv('PATH_HISTO') 

print(f"PATH_HISTO: {path_histo}")

df_histo = pd.read_csv(path_histo, decimal=',', parse_dates=['FechaHora'], 
                 date_format='%d/%m/%Y %H:%M', delimiter=';', header=0)

PATH_HISTO: https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_histo_02022019-01062024.csv


In [5]:
df_histo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46727 entries, 0 to 46726
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   FechaHora           46727 non-null  datetime64[ns]
 1   EnergiaSMEC         46727 non-null  float64       
 2   aeros_disp          46727 non-null  float64       
 3   energia_fc_cammesa  34631 non-null  float64       
 4   ws100_ecmwf         46727 non-null  float64       
 5   dir100_ecmwf        46727 non-null  float64       
 6   temp_ecmwf          46727 non-null  float64       
 7   ws100_gfs           46727 non-null  float64       
 8   dir100_gfs          46727 non-null  float64       
 9   temp_gfs            46727 non-null  float64       
 10  ws100_avg           46727 non-null  float64       
 11  dir100_avg          46727 non-null  float64       
 12  temp_avg            46727 non-null  float64       
dtypes: datetime64[ns](1), float64(12)
memory usage

In [6]:
#path = 'https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_fc_actual_02062024-11062024.csv'

#path = 'D:\\Documents\\MMA\\1.0 Tesis\\Datos\\ds_fc_actual_02062024-11062024.csv' #lote 1
#path = 'D:\\Documents\\MMA\\1.0 Tesis\\Datos\\ds_fc_actual_17062024-26062024.csv' #lote 2


#df_fc_actual = pd.read_csv(path, decimal=',', parse_dates=['FechaHora'], 
#                 date_format='%Y-%m-%d %H:%M:%S', delimiter=';', header=0)

In [7]:
#El forecast de aero_disp lo estimo en base al valor medio

# Filter the subset where AerosDisp is not null
#filtered_df = df_fc_actual.dropna(subset=['aeros_disp'])

# Calculate the mean of aero_disp 
#mean_aeros_disp = filtered_df.tail(12)['aeros_disp'].mean()

#df_fc_actual['aeros_disp'] = mean_aeros_disp.round(2)

Selección de variables relevantes

In [8]:
# Define the desired column order
#main_features = ['FechaHora','EnergiaSMEC','aeros_disp','ws100_avg', 'dir100_avg', 'temp_avg','energia_fc_cammesa'] #promedio de fuentes de ecwmf y gfs
#main_features = ['FechaHora','EnergiaSMEC','aeros_disp','ws100_ecmwf', 'dir100_ecmwf', 'temp_ecmwf']

In [9]:
#df_fc_actual = df_fc_actual[main_features[:-1]]

In [10]:
#df_fc_actual.head()

#### Preparación de datos historico

In [11]:
# Define las columnas principales
main_features = ['FechaHora','EnergiaSMEC','aeros_disp','ws100_avg', 'dir100_avg', 'temp_avg'] #promedio de fuentes de ecwmf y gfs

# DataFrame con columnas principales
df_histo = df_histo[main_features]


#### Division en train y test

In [12]:
# Define el start date para el train y test datasets (ultimo año)
train_start_dt = df_histo['FechaHora'].min()
test_start_dt = df_histo['FechaHora'].max() - pd.Timedelta(days=365)


In [13]:
df_histo.head()

Unnamed: 0,FechaHora,EnergiaSMEC,aeros_disp,ws100_avg,dir100_avg,temp_avg
0,2019-02-02 01:00:00,44.5,30.83,8.868427,126.395675,19.159385
1,2019-02-02 02:00:00,50.97,32.0,8.438345,130.508031,18.227713
2,2019-02-02 03:00:00,44.83,32.0,8.316949,141.396578,17.171822
3,2019-02-02 04:00:00,67.56,32.0,8.288875,146.649944,17.000357
4,2019-02-02 05:00:00,59.14,32.0,8.174949,150.170589,16.486966


In [14]:
# Crea el train set que contiene solo las features del modelo
df_train = df_histo[
(df_histo['FechaHora'] >= train_start_dt)
& (df_histo['FechaHora'] < test_start_dt)].copy()

# Crea el test set que contiene solo las features del modelo
df_test = df_histo[df_histo['FechaHora'] >= test_start_dt][main_features].copy()


print('Training data shape: ', df_train.shape)
print('Test data shape: ', df_test.shape)

Training data shape:  (37966, 6)
Test data shape:  (8761, 6)


In [15]:
#exog_features = [feature for feature in main_features if feature != 'EnergiaSMEC'and feature != 'energia_fc_cammesa']

# Reorder the DataFrame columns
#df_fc_actual_exog = df_fc_actual[exog_features]
# Set FechaHora as Index
#df_fc_actual_exog.set_index('FechaHora', inplace=True)

#df_fc_actual_exog.head()

### Entrenamiento del modelo LightGBM

In [16]:
df_test.set_index('FechaHora', inplace=True)
df_train.set_index('FechaHora', inplace=True)

In [17]:
# Se separan features y target
x_train, y_train = df_train.drop(columns=['EnergiaSMEC']), df_train['EnergiaSMEC'].values
x_test, y_test = df_test.drop(columns=['EnergiaSMEC']), df_test['EnergiaSMEC'].values

In [18]:
def fit_lgbm(x_train: pd.DataFrame, y_train: np.ndarray) -> LGBMRegressor:
    """
        Entrenamiento del modelo LigthGBM, guarda el modelo y lo devuelve.
    """
    params = {
    'num_leaves': 30,
    'n_estimators': 100,
    'max_depth': 8,
    'min_child_samples': 200,
    'learning_rate': 0.05,
    'subsample': 0.70,
    'colsample_bytree': 0.75
    }

    # Initialización del modelo
    model = lgb.LGBMRegressor(**params)
    
    # Grilla de parámetros de RandomizedSearchCV
    param_grid = {
        'num_leaves': [20, 30, 40, 50],
        'n_estimators': [50,70,100, 200, 400, 600],
        'max_depth': [6, 8, 10, 12],
        'min_child_samples': [20, 50, 100, 200, 400, 600],
        'learning_rate': [0.01, 0.05, 0.1, 0.15],
        'subsample': [0.5, 0.7, 0.8, 0.9],
        'colsample_bytree': [0.6, 0.75, 0.8, 0.9, 1]
    }
    
    # Configuración de TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Configuración de RandomizedSearchCV con TimeSeriesSplit
    random_search = RandomizedSearchCV(
        model, param_distributions=param_grid, n_iter=25, cv=tscv, verbose=1, n_jobs=-1,
        random_state=14, scoring='neg_mean_squared_error'
    )

    # Entrena el modelo
    random_search.fit(x_train, y_train)
       
    # Obtiene el mejor modelo
    best_model = random_search.best_estimator_
    
    # Guarda el modelo en un archivo pickle
    file_name = 'model.pkl'
    with open(os.path.join(FOLDER, file_name), 'wb') as f:
        pickle.dump(best_model, f)     

    print('Model saved')
    return best_model

In [19]:
model = fit_lgbm(x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1003
[LightGBM] [Info] Number of data points in the train set: 37966, number of used features: 4
[LightGBM] [Info] Start training from score 47.909617
Model saved


In [20]:
# Predicción con datos de test
y_pred = model.predict(x_test)

In [21]:
# Evaluación del modelo
mae_t = mean_absolute_error(y_test, y_pred)
mse_t = mean_squared_error(y_test, y_pred)

rmse_t = np.sqrt(mse_t)
print(f'Best parameters found: {model}')
print(f'MAE: {mae_t}')
print(f'RMSE: {rmse_t}')

Best parameters found: LGBMRegressor(colsample_bytree=0.9, max_depth=8, min_child_samples=100,
              n_estimators=50, num_leaves=30, subsample=0.5)
MAE: 10.923722234236603
RMSE: 16.194633689456477


## Docker Comandos

In [None]:
import os
os.environ['PATH'] += ';C:\\Program Files\\Docker\\Docker\\resources\\bin'

In [None]:
#comandos
!docker

In [None]:
# !docker image ls
!docker images

In [None]:
#conteiners corridos hasta el momento:
#!docker container ls --all


In [None]:
# Para correr la imagen:

# Terminal
#!docker run --interactive --tty ubuntu bash

In [None]:
#Correr un servicio foreground (ejecutar en primer plano) para entorno de desarrollo:

# Terminal
#!docker run --publish 80:80 flask

In [None]:
#correr un servicio detached (ejecutar en modo desacoplado) para entorno productivo:

#!docker run --detach --publish 80:80 flask

In [None]:
#para ver que servicios estan corriendo:

#`uname -a` nos va a mostrar que container y host están compartiendo el kernel

#!docker ps

In [None]:
#para ver los logs:

#!docker logs 7f98c7a7f655

In [None]:
#detener el contenedor:

#!docker stop 7f98c7a7f655

In [None]:
#para nombrar el container:

#!docker run --detach --publish 80:80 --name webserver nginx

In [None]:
#listar contenedores:

#!docker container ls --all

In [None]:
#eliminar un contenedor

#!docker rm webserver

In [None]:
#cuanto recursos ocupa el contenedor

#!docker stats --no-stream

## Construccion de la imagen

In [None]:
#Archivos de requerimientos
!type requirements.txt

In [None]:
#Instalacion en el ambiente

#!pip install -r requirements.txt

In [None]:
!type Dockerfile_energypred

In [None]:
# Terminal
!docker build --file Dockerfile_energypred --tag=energy_predictor . 

In [None]:
!docker images

In [None]:
#Delete image

#!docker image rm gdesimone/energy_predictor_app:v1.0
#!docker image rm energy_predictor

In [None]:
!docker ps

In [None]:
#!docker stop energy_pred_app


In [None]:
#!docker rename webserver energy_pred_app

In [None]:
# Terminal RUN image

!docker run -d --name energy_pred_app --rm --publish 80:5000 energy_predictor

#!docker run -d --name energy_pred_app --rm --publish 80:5000 gdesimone/energy_predictor_app:v1.0

In [None]:
#cuanto recursos ocupa el contenedor
!docker stats --no-stream

In [None]:
!curl http://localhost/energy_pred/url

In [None]:
#para inspeccionar desde adentro el contenedor que esta corriendo
#Terminal

#!docker exec --interactive --tty energy_pred_app bash

In [None]:
!docker logs energy_pred_app

## Publicacion de la imagen

In [None]:
# Terminal
!docker login

In [None]:
!docker images

In [None]:
!docker tag energy_predictor gdesimone/energy_predictor_app:v1.0
!docker push gdesimone/energy_predictor_app:v1.0

In [None]:
# Terminal RUN image - para probar con la imagen publicada en dockerhub

!docker run -d --name energy_pred_app --rm --publish 80:5000 gdesimone/energy_predictor_app:v1.0

In [None]:
## para llamarla desde el browser http://localhost/energy_pred/url toma url por defecto
## especificando la url: 
## http://localhost/energy_pred/url?url=https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_exog.csv

## http://localhost/energy_pred/url?url=https://raw.githubusercontent.com/guilledesimone/MMA-Tesis/refs/heads/main/Datos/ds_exog_1911-2811.csv