In [1]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import xgboost as xg
from sklearn.svm import SVR
from sklearn import preprocessing
from datetime import datetime as dt
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope as ho_scope
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Importar la data
data = pd.read_csv('data.csv')

In [3]:
x = data.drop('actual_productivity', axis=1)
y = data['actual_productivity']

scaler = preprocessing.StandardScaler().fit(x)
x_scaled = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=127, random_state=10)

### Modelo simple

In [4]:
# Definir el modelo ensamblado usando bagging
ensamble_bag = BaggingRegressor(random_state=10)

ensamble_bag.fit(x_train, y_train)

BaggingRegressor(random_state=10)

In [5]:
# Los parámetros que el modelo utiliza son

print('Parámetros por defecto del modelo:\n')
print(ensamble_bag.get_params())

Parámetros por defecto del modelo:

{'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': 10, 'verbose': 0, 'warm_start': False}


In [6]:
start = dt.now()
# Ajustar el modelo al conjunto de datos de entrenamiento
ensamble_bag.fit(x_train, y_train)
end = dt.now()

In [7]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

0

In [8]:
# Valores predichos por el modelo
y_pred_train = ensamble_bag.predict(x_train)
y_pred_test = ensamble_bag.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train = mean_absolute_error(y_train, y_pred_train)
error_test = mean_absolute_error(y_test, y_pred_test)

In [9]:
# Se observa al aplicar un modelo ensamble con bagging con sus parámetros por defecto
# el MAE para el conjunto de entrenamiento es de 0.0295 y en el conjunto de pruebas es de 0.07878

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test))

MAE del conjunto de entrenamiento: 0.0295142
MAE del conjunto de prueba: 0.0787753


### Modelo con ajuste de hiper parámetros

In [10]:
estimators = [
     ('SVM', SVR()),
     ('REG', LinearRegression()),
     ('RANDOM_FOREST', RandomForestRegressor(criterion='absolute_error', bootstrap=False)),
     ('XGBOOST', xg.XGBRegressor(eval_metric='mae'))
]

#### Grid search

In [None]:
# Crear la grilla de parámetros
param_grid = {
    'max_depth': [3, 5, 8, 10, 15, 20, 25, 30]
     ,'min_child_weight': [1, 5, 10]
     ,'gamma': [0.05, 0.5, 1, 2, 5]
     ,'eta': [0, 0.05, 0.1, 0.2, 0.5]
     ,'subsample': [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
     ,'colsample_bytree': [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
}

In [26]:
# Definir el modelo de ensamble
ensamble_bag = BaggingRegressor(random_state=10, estimators=estimators)

TypeError: BaggingRegressor.__init__() got an unexpected keyword argument 'estimators'

In [None]:
grid_search = GridSearchCV(estimator = model_xg
             , param_grid = param_grid
             , cv = 10
             , n_jobs = -1)

In [None]:
start = dt.now()
grid_search.fit(x_train, y_train)
end = dt.now()

In [None]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

In [None]:
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_

In [None]:
joblib.dump(best_grid, 'resultados/xgboost_grid.pkl')

In [None]:
# Valores predichos por el modelo
y_pred_train_grid = best_grid.predict(x_train)
y_pred_test_grid = best_grid.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train_grid = mean_absolute_error(y_train, y_pred_train_grid)
error_test_grid = mean_absolute_error(y_test, y_pred_test_grid)

In [None]:
# Se observa al aplicar un modelo XGBoost con un ajuste de hiper parámetros
# el MAE para el conjunto de entrenamiento es de 0.0421 y en el conjunto de pruebas es de 0.065

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train_grid))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test_grid))

#### Optimización bayesiana

In [None]:
def optimize(search_space):
    model = xg.XGBRegressor(**search_space)
    scores = cross_val_score(model, X=x_train, y=y_train, cv=10, scoring='neg_mean_absolute_error').mean()
    
    return {'loss':scores, 'status': STATUS_OK, 'model': model}

In [None]:
search_space = {
    'eval_metric': 'mae',
    'seed': 10,
    'max_depth': ho_scope.int(hp.quniform('max_depth', 3, 30, 1)),
    'min_child_weight': ho_scope.int(hp.quniform('min_child_weight', 1, 10, 1)),
    'gamma': hp.quniform('gamma', 0.05, 5, 0.05),
    'eta': hp.quniform('eta', 0, 0.5, 0.05),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.1),
    'colsample_bytree': hp.quniform('colsasmple_bytree', 0.1, 1, 0.1)
}

optimization_function = partial(
    optimize
)

In [None]:
trials = Trials()
start = dt.now()

result = fmin(
    fn = optimization_function
    , space = search_space
    , max_evals = 200
    , trials=trials
    , algo = tpe.suggest
)

end = dt.now()

In [None]:
# Tiempo que se requiere para ajustar el modelo
(end - start).seconds

In [None]:
print(space_eval(search_space, result))

In [None]:
best_model_bo = trials.best_trial['result']['model']

In [None]:
joblib.dump(best_model_bo, 'resultados/xgboost_bo.pkl')

In [None]:
best_model_bo.fit(x_train, y_train)

In [None]:
# Valores predichos por el modelo
y_pred_train_bo = best_model_bo.predict(x_train)
y_pred_test_bo = best_model_bo.predict(x_test)

# Calcular el error para el conjunto de entrenamiento y prueba
error_train_bo = mean_absolute_error(y_train, y_pred_train_bo)
error_test_bo = mean_absolute_error(y_test, y_pred_test_bo)

print('MAE del conjunto de entrenamiento: {:.7f}'.format(error_train_bo))
print('MAE del conjunto de prueba: {:.7f}'.format(error_test_bo))

In [None]:
# trials.losses()

In [None]:
joblib.dump(trials, 'resultados/xgoost_bo_trials.pkl')