In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score
import pandas as pd
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
import time

In [2]:
# Load the training table from the Excel workbook (path is relative to the
# kernel's working directory).
train = pd.read_excel(io='train.xlsx')

In [3]:
# Split the training table into the feature matrix (every column except the
# target) and the target vector (the PRODUCTO_1 column).
X = train.drop(columns='PRODUCTO_1')
y = train['PRODUCTO_1']

Ahora entrenamos un modelo por cada combinación de hiperparámetros y guardamos sus scores de validación cruzada (RMSE de cada fold y RMSE medio). Los hiperparámetros que den el mejor score (menor RMSE medio) se usan para reentrenar el modelo con el dataset entero, medir el tiempo de entrenamiento y guardar el modelo.

In [14]:
# Grid search over the random-forest hyperparameters: every
# (n_estimators, max_depth) pair is scored with k-fold cross-validation and
# the per-fold RMSE plus the mean RMSE are collected into one table.

# Number of CV folds. Passed explicitly (5 matches cross_val_score's default)
# so that the RMSE_<k> column names below always agree with the number of
# scores returned, instead of relying on hard-coded indexes 0..4.
N_FOLDS = 5
fold_columns = [f'RMSE_{fold}' for fold in range(1, N_FOLDS + 1)]

metrics = {}
for n_estimators in [5, 10, 25, 50, 100, 250, 500, 1000]:
    for max_depth in [2, 3, 5, 7, 10]:
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=73)
        # The scorer returns the negated RMSE (higher is better), so flip the
        # sign to get plain RMSE values.
        rmse_scores = -cross_val_score(model, X, y, cv=N_FOLDS, scoring='neg_root_mean_squared_error')

        # One row per model, keyed as rf_<n_estimators>_<max_depth>.
        row = dict(zip(fold_columns, rmse_scores))
        row['MEAN_RMSE'] = rmse_scores.mean()
        metrics[f'rf_{n_estimators}_{max_depth}'] = row

metrics_rf = pd.DataFrame.from_dict(metrics, orient='index', columns=fold_columns + ['MEAN_RMSE'])
# Display only: the sorted view (best model first) is not assigned back, so
# metrics_rf itself keeps the loop order.
metrics_rf.sort_values(by='MEAN_RMSE')

Unnamed: 0,RMSE_1,RMSE_2,RMSE_3,RMSE_4,RMSE_5,MEAN_RMSE
rf_500_3,225.184304,262.37797,321.569337,346.217636,253.836963,281.837242
rf_500_10,226.187245,263.245937,320.192933,346.321699,253.999331,281.989429
rf_500_5,226.23592,262.727778,320.333255,345.790813,255.52014,282.121581
rf_500_7,226.223884,263.386807,319.862839,346.61657,254.531472,282.124315
rf_1000_3,226.603976,262.734379,321.826268,345.056907,256.037359,282.451778
rf_500_2,226.620172,261.109919,322.726006,346.386032,255.663046,282.501035
rf_1000_2,227.518253,261.158163,322.962567,345.585063,255.943335,282.633476
rf_1000_7,227.087047,263.58816,321.068876,345.465292,256.101789,282.662233
rf_1000_10,227.161061,263.619459,321.268224,345.483074,256.103925,282.727149
rf_1000_5,227.566942,263.286963,321.666425,344.783789,256.816299,282.824084


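Guardamos la tabla de métricas en un CSV (se guarda `metrics_rf` sin ordenar; la vista ordenada de arriba es solo para visualizar).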

In [15]:
# Persist the cross-validation metrics table (model names are written as the
# index column of the CSV).
metrics_rf.to_csv(path_or_buf='../06_model_output/metrics/random_forest.csv')

Reentrenamos el modelo con todos los datos y lo guardamos. También guardamos el tiempo de entrenamiento y el RMSE que obtenemos sobre el propio train (una estimación optimista, porque se evalúa con los mismos datos usados para entrenar).

In [4]:
# Refit the selected configuration (n_estimators=500, max_depth=3, i.e. the
# rf_500_3 row with the lowest MEAN_RMSE in the grid-search table) on the full
# training set, timing only the fit.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
time1 = time.perf_counter()
rf = RandomForestRegressor(n_estimators=500, max_depth=3, random_state=73).fit(X, y)
time2 = time.perf_counter() - time1  # elapsed fit time in seconds

# RMSE on the same data the model was trained on (in-sample error, not a
# generalization estimate).
preds = rf.predict(X)
rmse = root_mean_squared_error(y, preds)
rmse

225.55590624304355

In [5]:
# Serialize the fitted forest to disk; the call is left as the cell's last
# expression so the list of written file names is shown as output.
dump(value=rf, filename='../06_model_output/models/random_forest.joblib')

['../06_model_output/models/random_forest.joblib']

In [6]:
# Append this model's summary line (label, fit time, training RMSE) to the
# shared final-metrics file. Opened in append mode, so existing lines are kept
# and re-running the cell adds another line.
final_metrics_line = f'\nRF, {time2}, {rmse}'
with open('../06_model_output/metrics/final_metrics.txt', mode='a') as metrics_file:
    metrics_file.write(final_metrics_line)