# Ensemble Tree methods for forecasting $ET_0$

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for the rest
# of the session, including warnings raised by third-party libraries.
# NOTE(review): this can hide useful diagnostics from the model fits further down —
# consider narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

## Reading the data

### Dataset I - Quixadá

In [2]:
# The separate train/validation CSVs (features_treino.csv / et0_treino.csv and
# features_val.csv / et0_val.csv) are intentionally not loaded. Per the original notes,
# the "*_completo" files hold train + validation merged, for both features and ET0 labels.
features_train__dt_quixada, et0_train__dt_quixada = (
    pd.read_csv('./data-quixada/features_treino_completo.csv'),
    pd.read_csv('./data-quixada/et0_treino_completo.csv'),
)

# Test split, used only for the final evaluation.
features_test_dt_quixada, et0_test_dt_quixada = (
    pd.read_csv('./data-quixada/features_teste.csv'),
    pd.read_csv('./data-quixada/et0_teste.csv'),
)

In [3]:
# Remove the 'data' column from all four Quixadá frames.
# NOTE(review): 'data' is presumably the date stamp (Portuguese for "date") — confirm
# against the CSV headers.
#
# The frames are re-bound instead of mutated in place, and errors='ignore' makes the
# cell idempotent: the previous in-place drop raised KeyError when the cell was
# executed a second time on the same kernel, because the column was already gone.
features_train__dt_quixada = features_train__dt_quixada.drop(columns=['data'], errors='ignore')
et0_train__dt_quixada = et0_train__dt_quixada.drop(columns=['data'], errors='ignore')
features_test_dt_quixada = features_test_dt_quixada.drop(columns=['data'], errors='ignore')
et0_test_dt_quixada = et0_test_dt_quixada.drop(columns=['data'], errors='ignore')

### Dataset II - Quixeramobim

In [4]:
# The separate train/validation CSVs (features_train.csv / label_train.csv and
# features_val.csv / label_val.csv) are intentionally not loaded. Per the original notes,
# the trailing-underscore files hold train + validation merged.
features_train__dt_quixeramobim, et0_train__dt_quixeramobim = (
    pd.read_csv('./data/features_train_.csv'),
    pd.read_csv('./data/label_train_.csv'),
)

# Test split, used only for the final evaluation.
features_test_dt_quixeramobim, et0_test_dt_quixeramobim = (
    pd.read_csv('./data/features_test.csv'),
    pd.read_csv('./data/label_test.csv'),
)

### Grid Search

In [5]:
# Untuned base estimators handed to the grid searches below; the seed is fixed so
# repeated runs build the same trees.
model_rf, model_gb = (
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
)

In [6]:
# --- Random Forest search space ---
# Ensemble sizes 20..100 in steps of 10, plus two larger forests (200 and 300 trees).
n_estimators_rf = [*range(20, 101, 10), 200, 300]
max_depth_rf = list(range(3, 10, 2))

parameters_random_forest = dict(
    n_estimators=n_estimators_rf,
    max_depth=max_depth_rf,
)

# --- Gradient Boosting search space ---
# Same depth candidates as the forest, ensemble sizes capped at 100, and four
# learning rates.
n_estimators_gb = list(range(20, 101, 10))
max_depth_gb = list(range(3, 10, 2))
learning_rate_gb = [0.05, 0.1, 0.15, 0.2]

parameters_gradient_boosting = dict(
    n_estimators=n_estimators_gb,
    max_depth=max_depth_gb,
    learning_rate=learning_rate_gb,
)

#### Grid Search over Dataset Quixadá

In [7]:
# Exhaustive searches for the Quixadá data. Both use 3-fold cross-validation and rank
# candidates by negated MSE (scikit-learn maximises scores, hence the "neg_" prefix).
grid_search_rf = GridSearchCV(
    model_rf,
    parameters_random_forest,
    scoring='neg_mean_squared_error',
    cv=3,
)

grid_search_gb = GridSearchCV(
    model_gb,
    parameters_gradient_boosting,
    scoring='neg_mean_squared_error',
    cv=3,
)

In [8]:
# Slow cell: 11 ensemble sizes x 4 depths = 44 candidates, each cross-validated on 3 folds.
grid_search_rf.fit(
    X=features_train__dt_quixada,
    y=et0_train__dt_quixada,
)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [9]:
# Report the Random Forest hyperparameter combination selected by the Quixadá grid search.
# The message previously misspelled "dataset" as "datset"; the recorded output below
# still shows the old text until the cell is re-run.
print("From Quixadá's dataset, the Random Forest best hyperparameters are: \n")
print(grid_search_rf.best_params_)

From Quixadá's datset, the Random Forest best hyperparameters are: 

{'max_depth': 7, 'n_estimators': 90}


In [10]:
# Slow cell: 9 ensemble sizes x 4 depths x 4 learning rates = 144 candidates,
# each cross-validated on 3 folds.
grid_search_gb.fit(
    X=features_train__dt_quixada,
    y=et0_train__dt_quixada,
)

GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
                            

In [11]:
# Report the Gradient Boosting hyperparameter combination selected by the Quixadá grid search.
# A single print with sep="\n" emits the same bytes as the header print followed by the dict print.
print(
    "From Quixadá's dataset, the Gradient Boosting best hyperparameters are: \n",
    grid_search_gb.best_params_,
    sep="\n",
)

From Quixadá's dataset, the Gradient Boosting best hyperparameters are: 

{'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 60}


#### Grid Search over Dataset Quixeramobim

In [12]:
# Fresh searches for the Quixeramobim data, with the same protocol as before
# (3-fold CV, negated MSE).
# NOTE: this rebinds grid_search_rf / grid_search_gb, so the fitted Quixadá searches
# are no longer reachable through these names after this cell runs.
grid_search_rf = GridSearchCV(
    model_rf,
    parameters_random_forest,
    scoring='neg_mean_squared_error',
    cv=3,
)

grid_search_gb = GridSearchCV(
    model_gb,
    parameters_gradient_boosting,
    scoring='neg_mean_squared_error',
    cv=3,
)

In [13]:
# Slow cell: 11 ensemble sizes x 4 depths = 44 candidates, each cross-validated on 3 folds.
grid_search_rf.fit(
    X=features_train__dt_quixeramobim,
    y=et0_train__dt_quixeramobim,
)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [14]:
# Report the Random Forest hyperparameter combination selected by the Quixeramobim grid search.
# The message previously misspelled "dataset" as "datset"; the recorded output below
# still shows the old text until the cell is re-run.
print("From Quixeramobim's dataset, the Random Forest best hyperparameters are: \n")
print(grid_search_rf.best_params_)

From Quixeramobim's datset, the Random Forest best hyperparameters are: 

{'max_depth': 9, 'n_estimators': 30}


In [15]:
# Slow cell: 9 ensemble sizes x 4 depths x 4 learning rates = 144 candidates,
# each cross-validated on 3 folds.
grid_search_gb.fit(
    X=features_train__dt_quixeramobim,
    y=et0_train__dt_quixeramobim,
)

GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
                            

In [16]:
# Report the Gradient Boosting hyperparameter combination selected by the Quixeramobim grid search.
# A single print with sep="\n" emits the same bytes as the header print followed by the dict print.
print(
    "From Quixeramobim's dataset, the Gradient Boosting best hyperparameters are: \n",
    grid_search_gb.best_params_,
    sep="\n",
)

From Quixeramobim's dataset, the Gradient Boosting best hyperparameters are: 

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


## Final Evaluation of the Prediction Models

1. Dataset Quixadá

In [17]:
# Final Quixadá models. The hyperparameters are hard-coded copies of the grid-search
# winners printed earlier in the notebook; update them by hand if the search is re-run
# and selects different values.
models = {
    'Random_Forest': {
        'model': RandomForestRegressor(
            n_estimators=90,
            max_depth=7,
            random_state=42,
        ),
    },
    'Gradient_Boosting': {
        'model': GradientBoostingRegressor(
            n_estimators=60,
            max_depth=5,
            learning_rate=0.15,
            random_state=42,
        ),
    },
}

In [18]:
# Train every tuned model on the full Quixadá training set, predict the test set, and
# store RMSE and MAE next to the estimator inside the same `models` entry.
for key, entry in models.items():
    print('processando {}...'.format(key))
    estimator = entry['model']
    estimator.fit(features_train__dt_quixada, et0_train__dt_quixada)
    et0_pred = estimator.predict(features_test_dt_quixada)
    # RMSE is the square root of the MSE, so it is expressed in the target's own units.
    mse = mean_squared_error(et0_test_dt_quixada, et0_pred)
    entry['rmse'] = np.sqrt(mse)
    entry['mae'] = mean_absolute_error(et0_test_dt_quixada, et0_pred)

processando Random_Forest...
processando Gradient_Boosting...


In [19]:
# Build a table with one row per model, then show only the metric columns
# (the fitted estimator objects stay in df_T but are not displayed).
df = pd.DataFrame(models)
df_T = df.T
df_T.loc[:, ['rmse', 'mae']]

Unnamed: 0,rmse,mae
Random_Forest,0.00774661,0.00621343
Gradient_Boosting,0.00724261,0.00592777


2. Dataset Quixeramobim

In [20]:
# Final Quixeramobim models. The hyperparameters are hard-coded copies of the grid-search
# winners printed earlier in the notebook; update them by hand if the search is re-run
# and selects different values.
# NOTE: this rebinds `models`, replacing the Quixadá entries and their stored metrics.
models = {
    'Random_Forest': {
        'model': RandomForestRegressor(
            n_estimators=30,
            max_depth=9,
            random_state=42,
        ),
    },
    'Gradient_Boosting': {
        'model': GradientBoostingRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
        ),
    },
}

In [21]:
# Train every tuned model on the full Quixeramobim training set, predict the test set,
# and store RMSE and MAE next to the estimator inside the same `models` entry.
for key, entry in models.items():
    print('processando {}...'.format(key))
    estimator = entry['model']
    estimator.fit(features_train__dt_quixeramobim, et0_train__dt_quixeramobim)
    et0_pred = estimator.predict(features_test_dt_quixeramobim)
    # RMSE is the square root of the MSE, so it is expressed in the target's own units.
    mse = mean_squared_error(et0_test_dt_quixeramobim, et0_pred)
    entry['rmse'] = np.sqrt(mse)
    entry['mae'] = mean_absolute_error(et0_test_dt_quixeramobim, et0_pred)

processando Random_Forest...
processando Gradient_Boosting...


In [22]:
# Build a table with one row per model, then show only the metric columns
# (the fitted estimator objects stay in df_T but are not displayed).
df = pd.DataFrame(models)
df_T = df.T
df_T.loc[:, ['rmse', 'mae']]

Unnamed: 0,rmse,mae
Random_Forest,0.124894,0.0761464
Gradient_Boosting,0.10016,0.0681099
