In [None]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import xgboost as xg
from sklearn.svm import SVR
from sklearn import preprocessing
from datetime import datetime as dt
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll import scope as ho_scope

In [None]:
# Load the dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Target column first, then every remaining column as predictors
y = data['actual_productivity']
x = data.drop(columns='actual_productivity')

In [None]:
scaler = preprocessing.StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=127, random_state=10)

### Modelo simple

In [None]:
# Define the SVR (support vector regression) model with its default parameters
model_svm = SVR()

In [None]:
# Show the parameters the model is currently configured with
print('Parámetros el modelo:\n', model_svm.get_params(), sep='\n')

In [None]:
# Timestamp taken right before fitting, used to report the elapsed time
start = dt.now()
# Fit the model on the training set
model_svm.fit(x_train, y_train)
# Timestamp taken right after fitting
end = dt.now()

In [None]:
# Time needed to fit the model, in seconds.
# total_seconds() keeps the fractional part (and any whole days), whereas the
# .seconds attribute only exposes the integer seconds component.
(end - start).total_seconds()

In [None]:
# Predictions of the fitted model on the training and test partitions
y_pred_train, y_pred_test = (model_svm.predict(features) for features in (x_train, x_test))

# Mean absolute error on the training and test partitions
error_train, error_test = (
    mean_absolute_error(y_true=truth, y_pred=predicted)
    for truth, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

In [None]:
# With SVR at its default parameters, the author's run reported a MAE of
# about 0.08 on the training set and 0.0876 on the test set

print(f'MAE del conjunto de entrenamiento: {error_train:.7f}')
print(f'MAE del conjunto de prueba: {error_test:.7f}')

### Modelo con ajuste de hiperparámetros

#### Grid search

In [None]:
# Parameter grid explored by the exhaustive search (RBF kernel only)
param_grid = dict(
    C=[0.05, 10, 20, 50, 100, 200, 300],
    kernel=['rbf'],
    gamma=[0.05, 1, 10, 20, 50, 100],
)

In [None]:
# Define a fresh SVR (support vector regression) model to be tuned by the grid search
model_svm = SVR()

In [None]:
# Exhaustive search over `param_grid` with 10-fold cross-validation on all
# available cores. Candidates are ranked by negated MAE so the selection
# criterion matches the metric this notebook reports and the one used by the
# Bayesian search; without `scoring`, the estimator's default score (R² for
# regressors) would be used instead.
grid_search = GridSearchCV(
    estimator=model_svm,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
)

In [None]:
# Timestamp taken right before the grid search, used to report the elapsed time
start = dt.now()
# Run the grid search on the training set
grid_search.fit(x_train, y_train)
# Timestamp taken right after the grid search
end = dt.now()

In [None]:
# Time needed to run the grid search, in seconds.
# total_seconds() keeps the fractional part (and any whole days), whereas the
# .seconds attribute only exposes the integer seconds component.
(end - start).total_seconds()

In [None]:
# Display the parameter combination selected by the grid search
grid_search.best_params_

In [None]:
# Keep a handle on the estimator the grid search selected as best
best_grid = grid_search.best_estimator_

In [None]:
# Persist the best grid-search estimator to disk.
# NOTE(review): the file name says "smv" while the other artifacts use "svm" — confirm whether this is a typo.
joblib.dump(best_grid, 'resultados/smv_grid.pkl')

In [None]:
# Predictions of the tuned model on the training and test partitions
y_pred_train_grid, y_pred_test_grid = (best_grid.predict(features) for features in (x_train, x_test))

# Mean absolute error on the training and test partitions
error_train_grid, error_test_grid = (
    mean_absolute_error(y_true=truth, y_pred=predicted)
    for truth, predicted in ((y_train, y_pred_train_grid), (y_test, y_pred_test_grid))
)

In [None]:
# With SVR after hyperparameter tuning, the author's run reported a MAE of
# about 0.0421 on the training set and 0.065 on the test set

print(f'MAE del conjunto de entrenamiento: {error_train_grid:.7f}')
print(f'MAE del conjunto de prueba: {error_test_grid:.7f}')

#### Optimización bayesiana

In [None]:
def optimize(search_space):
    """Objective function evaluated by hyperopt for one sampled configuration.

    Args:
        search_space: dict of keyword arguments used to build the SVR.

    Returns:
        dict with the keys hyperopt expects:
        - 'loss': mean 10-fold cross-validated MAE on the training set
          (positive, lower is better);
        - 'status': STATUS_OK;
        - 'model': the SVR instance built from the sampled parameters
          (cross_val_score fits clones, so this instance is returned unfitted).
    """
    model = SVR(**search_space)
    neg_mae = cross_val_score(model, X=x_train, y=y_train, cv=10, scoring='neg_mean_absolute_error').mean()

    # scikit-learn reports the *negated* MAE, while hyperopt minimises the
    # loss: flip the sign so that a lower loss really means a lower error.
    return {'loss': -neg_mae, 'status': STATUS_OK, 'model': model}

In [None]:
# Search space for the Bayesian optimisation.
# C and gamma are sampled as continuous, strictly positive values on a log
# scale between the given bounds. Integer-cast quantised sampling is avoided
# because it can produce 0 for either parameter (SVR rejects C=0) and would
# restrict gamma to whole numbers.
search_space = {
    'C': hp.loguniform('C', np.log(0.01), np.log(300)),
    'kernel': hp.choice('kernel', ['rbf']),
    'gamma': hp.loguniform('gamma', np.log(0.01), np.log(100)),
}

# Callable handed to fmin; no extra arguments are bound at the moment
optimization_function = partial(
    optimize
)

In [None]:
# Container that records every evaluation performed during the search
trials = Trials()
start = dt.now()

# Run 200 evaluations of the objective with the TPE algorithm.
# NOTE(review): no `rstate` is passed, so results are presumably not
# reproducible across runs — confirm whether a fixed seed is wanted.
result = fmin(
    fn=optimization_function,
    space=search_space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
)

end = dt.now()

In [None]:
# Time needed to run the Bayesian optimisation, in seconds.
# total_seconds() keeps the fractional part (and any whole days), whereas the
# .seconds attribute only exposes the integer seconds component.
(end - start).total_seconds()

In [None]:
# Map the result returned by fmin back onto the search space and print the
# concrete parameter values it corresponds to
print(space_eval(search_space, result))

In [None]:
# Retrieve the SVR instance stored under 'model' in the result of the best trial
best_model_bo = trials.best_trial['result']['model']

In [None]:
joblib.dump(best_model_bo, 'resultados/svm_bo.pkl')

In [None]:
best_model_bo.fit(x_train, y_train)

In [None]:
# Predictions of the Bayesian-optimised model on the training and test partitions
y_pred_train_bo, y_pred_test_bo = (best_model_bo.predict(features) for features in (x_train, x_test))

# Mean absolute error on the training and test partitions
error_train_bo, error_test_bo = (
    mean_absolute_error(y_true=truth, y_pred=predicted)
    for truth, predicted in ((y_train, y_pred_train_bo), (y_test, y_pred_test_bo))
)

print(f'MAE del conjunto de entrenamiento: {error_train_bo:.7f}')
print(f'MAE del conjunto de prueba: {error_test_bo:.7f}')

In [None]:
# trials.losses()

In [None]:
# Persist the Trials object (the record of the evaluations) to disk
joblib.dump(trials, 'resultados/svm_bo_trials.pkl')