In [2]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import xgboost as xg
from sklearn.svm import SVR
from sklearn import preprocessing
from datetime import datetime as dt
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope as ho_scope
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 

In [3]:
# Importar la data
data = pd.read_csv('data.csv')

In [4]:
x = data.drop('actual_productivity', axis=1)
y = data['actual_productivity']

scaler = preprocessing.StandardScaler().fit(x)
x_scaled = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=127, random_state=10)

### Modelo simple

In [5]:
# Definir el modelo ensamblado usando boosting
ensamble_bag = GradientBoostingRegressor(random_state=10)

ensamble_bag.fit(x_train, y_train)

GradientBoostingRegressor(random_state=10)

In [6]:
# Los parámetros que el modelo utiliza son

print('Parámetros por defecto del modelo:\n')
print(ensamble_bag.get_params())

Parámetros por defecto del modelo:

{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 10, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [7]:
start = dt.now()
# Ajustar el modelo al conjunto de datos de entrenamiento
ensamble_bag.fit(x_train, y_train)
end = dt.now()

In [8]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

0

In [9]:
# Valores predichos por el modelo
y_pred_train = ensamble_bag.predict(x_train)
y_pred_test = ensamble_bag.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train = mean_absolute_error(y_train, y_pred_train)
error_test = mean_absolute_error(y_test, y_pred_test)

In [10]:
# Se observa al aplicar un modelo ensamble con boosting con sus parámetros por defecto
# el MAE para el conjunto de entrenamiento es de 0.0295 y en el conjunto de pruebas es de 0.07878

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test))

MAE del conjunto de entrenamiento: 0.0643757
MAE del conjunto de prueba: 0.0759289


### Modelo con ajuste de hiper parámetros

In [23]:
# estimators = [
#      ('SVM', SVR(kernel='rbf')),
#      ('REG', LinearRegression()),
#      ('RANDOM_FOREST', RandomForestRegressor(criterion='absolute_error', bootstrap=False)),
#      ('XGBOOST', xg.XGBRegressor(eval_metric='mae'))
# ]

# estimators = [SVR(kernel='rbf')
#     , LinearRegression()
#     , RandomForestRegressor(criterion='absolute_error', bootstrap=False)
#     , xg.XGBRegressor(eval_metric='mae')
#     , DecisionTreeRegressor(criterion='mae')
# ]

#### Grid search

In [18]:
# Crear la grilla de parámetros
param_grid = {
    'loss': ['absolute_error']
    , 'n_estimators': [50, 100, 200, 500, 1000]
    , 'subsample': [0.1, 0.3, 0.5, 0.7, 0.9]
    , 'min_samples_leaf': [2, 5, 10]
    , 'min_samples_split': [2, 5, 10]
    , 'max_depth': [3, 5, 8, 10, 15, 20, 25, 30]
}

In [19]:
# Definir el modelo de ensamble
ensamble_bag = GradientBoostingRegressor(random_state=10)

In [20]:
grid_search = GridSearchCV(estimator = ensamble_bag
             , param_grid = param_grid
             , cv = 10
             , n_jobs = -1)

In [21]:
start = dt.now()
grid_search.fit(x_train, y_train)
end = dt.now()

In [22]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

4

In [23]:
grid_search.best_params_

{'learning_rate': 0.01,
 'loss': 'absolute_error',
 'max_depth': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50,
 'subsample': 0.1}

In [25]:
best_grid = grid_search.best_estimator_

In [26]:
joblib.dump(best_grid, 'resultados/ensamble_boosting_grid.pkl')

['resultados/ensamble_boosting_grid.pkl']

In [27]:
# Valores predichos por el modelo
y_pred_train_grid = best_grid.predict(x_train)
y_pred_test_grid = best_grid.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train_grid = mean_absolute_error(y_train, y_pred_train_grid)
error_test_grid = mean_absolute_error(y_test, y_pred_test_grid)

In [28]:
# Se observa al aplicar un modelo ensamble con bagging con un ajuste de hiper parámetros
# el MAE para el conjunto de entrenamiento es de 0.0421 y en el conjunto de pruebas es de 0.065

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train_grid))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test_grid))

MAE del conjunto de entrenamiento: 0.1148050
MAE del conjunto de prueba: 0.1059305


#### Optimización bayesiana

In [41]:
def optimize(search_space):
    model = GradientBoostingRegressor(**search_space)
    scores = cross_val_score(model, X=x_train, y=y_train, cv=10, scoring='neg_mean_absolute_error').mean()
    
    return {'loss':scores, 'status': STATUS_OK, 'model': model}

In [59]:
search_space = {
    'loss': 'absolute_error'
    , 'n_estimators' : ho_scope.int(hp.quniform('n_estimators', 10, 1000, 1))
    , 'min_samples_leaf': ho_scope.int(hp.quniform('min_samples_leaf', 2, 10, 1))
    , 'min_samples_split' : ho_scope.int(hp.quniform('min_samples_split', 2, 10, 1))
    , 'max_depth': hp.quniform('max_depth', 3, 30, 1)
    , 'subsample': hp.quniform('subsample', 0.1, 0.9, 0.1)
#     , 'learning_rate': hp.quniform('learning_rate', 0.01, 10, 0.05)
}

optimization_function = partial(
    optimize
)

In [60]:
trials = Trials()
start = dt.now()

result = fmin(
    fn = optimization_function
    , space = search_space
    , max_evals = 100
    , trials=trials
    , algo = tpe.suggest
)

end = dt.now()

100%|████████████████████████████████████████| 5/5 [05:20<00:00, 64.19s/trial, best loss: -0.08316733252588646]


In [61]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

320

In [62]:
print(space_eval(search_space, result))

{'loss': 'absolute_error', 'max_depth': 20.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 971, 'subsample': 0.4}


In [63]:
best_model_bo = trials.best_trial['result']['model']

In [64]:
joblib.dump(best_model_bo, 'resultados/ensamble_boosting_bo.pkl')

['resultados/ensamble_boosting_bo.pkl']

In [65]:
best_model_bo.fit(x_train, y_train)

GradientBoostingRegressor(loss='absolute_error', max_depth=20.0,
                          min_samples_leaf=6, n_estimators=971, subsample=0.4)

In [66]:
# Valores predichos por el modelo
y_pred_train_bo = best_model_bo.predict(x_train)
y_pred_test_bo = best_model_bo.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train_bo = mean_absolute_error(y_train, y_pred_train_bo)
error_test_bo = mean_absolute_error(y_test, y_pred_test_bo)

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train_bo))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test_bo))

MAE del conjunto de entrenamiento: 0.0186384
MAE del conjunto de prueba: 0.0769835


In [67]:
# trials.losses()

In [68]:
joblib.dump(trials, 'resultados/ensamble_boosting_bo_trials.pkl')

['resultados/ensamble_boosting_bo_trials.pkl']