### Regresion Costo Educativo

En esta ocasión se busca desarrollar un proceso de GridSearch con el fin de buscar los mejores hiperparámetros de distintos modelos dentro de un rango de valores definidos por nosotros mismos. Igualmente, se busca que usted aplique y comprenda los siguientes modelos:

- Árboles de decisión
- Bosques de decisión
- AdaBoost
- XGBoost
- Bagging

In [1]:

import sklearn
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.model_selection import train_test_split

import missingno as msn

Lectura de CSV y borrado de filas incompletas.

In [3]:
# cargar bases
# quitar na's de las observaciones
data = pd.read_csv('data/La_base_definitiva_final.csv')
data = data[data['Var_y']<=1]
data

Unnamed: 0,DIRECTORIO,SECUENCIA_P.x,Max_educ,transporte,tiempo_promedio_transporte,tiempo_moda_estudio,tiempo_max_estudio,tiempo_min_estudio,beca,monto_beca,...,P1070,P4005,P4015,P4567,P8520S1A1,P8520S5,P8520S3,P8520S4,Var_y,log_y
0,7566841,1,9,4,5.00,3,4,3,2,0,...,1,1,4,3,3,1,1,1,0.018996,-3.963518
1,7566846,1,7,1,27.50,4,4,4,2,0,...,1,1,6,3,2,1,1,1,0.004717,-5.356492
2,7566901,1,5,3,20.00,6,6,3,2,0,...,1,1,6,4,8,2,2,2,0.020878,-3.869051
3,7566909,1,5,8,15.00,5,5,5,2,0,...,1,1,4,3,2,1,1,1,0.009258,-4.682259
4,7566924,1,5,4,31.50,3,4,3,2,0,...,1,1,6,4,9,1,2,1,0.010065,-4.598698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17578,7787848,1,5,3,18.75,5,5,3,2,0,...,1,1,4,1,1,2,2,1,0.046154,-3.075775
17579,7787851,1,11,4,5.00,4,4,4,2,0,...,2,1,4,1,1,1,2,1,0.006439,-5.045321
17580,7787853,1,5,4,17.00,5,5,5,2,0,...,1,1,6,4,1,2,2,2,0.004705,-5.359078
17581,7787862,1,11,3,20.00,1,1,1,2,0,...,1,5,5,4,1,2,2,1,0.007042,-4.955827


In [4]:
# Función para estandarizar las variables continuas
def estandarizacion(df):
    return df.apply(lambda x: (x-x.mean())/x.std(), axis=0)

# Codificación One-Hot 
def convert_dummy(df, feature,rank=0):
    pos = pd.get_dummies(df[feature], prefix=feature)
    mode = df[feature].value_counts().index[rank]
    biggest = feature + '_' + str(mode)
    pos.drop([biggest],axis=1,inplace=True)
    df.drop([feature],axis=1,inplace=True)
    df=df.join(pos)
    return df

def convert_dummy2(df):
    pd.get_dummies(df, drop_first=True)
    return df

In [5]:
data.columns

Index(['DIRECTORIO', 'SECUENCIA_P.x', 'Max_educ', 'transporte',
       'tiempo_promedio_transporte', 'tiempo_moda_estudio',
       'tiempo_max_estudio', 'tiempo_min_estudio', 'beca', 'monto_beca',
       'subsidio', 'monto_subsidio', 'credito', 'monto_credito',
       'ayudas_total', 'arte', 'ciencia', 'deportes', 'grupos_estudio',
       'parque', 'lectura', 'juegos', 'alguna_extra', 'moda_estado',
       'madres_jovenes', 'P5000', 'P5010', 'CANT_PERSONAS_HOGAR', 'I_HOGAR',
       'I_UGASTO', 'PERCAPITA', 'P5230', 'P9090', 'P784S1', 'P1077S1',
       'P1077S2', 'P1077S3', 'P1077S4', 'P1077S5', 'P1077S6', 'P1077S7',
       'P1077S8', 'P1077S9', 'P1077S10', 'P1077S14', 'P1077S15', 'P1077S16',
       'P1077S17', 'P1077S19', 'P1077S21', 'P1077S22', 'P1077S23', 'P1075',
       'P1913S2', 'P3353', 'P3354', 'Trabajo._10_años', 'Trabajo._totales',
       'Desempleo._10_años', 'Oficios._10_años', 'Estudiante._10_años',
       'Incapacidad._10_años', 'Otra._10_años', 'cotizan_pension_10', 'P509

In [6]:
# crear dummys a partir de variables categoricas
# variables categoricas
vars_cate = ["Max_educ", "transporte", 
                           "tiempo_moda_estudio", "tiempo_max_estudio", "tiempo_min_estudio",
                           "beca", "subsidio", "credito", "arte", "ciencia", "deportes",
                           "grupos_estudio", "parque", "lectura", "juegos", "alguna_extra",
                           "moda_estado", "madres_jovenes", "P5230", "P9090", "P784S1",
                           "P1077S1", "P1077S2", "P1077S3", "P1077S4", "P1077S5",
                           "P1077S6", "P1077S7", "P1077S8", "P1077S9", "P1077S10",
                           "P1077S14", "P1077S15", "P1077S16", "P1077S17", "P1077S19",
                           "P1077S21", "P1077S22", "P1077S23", "P1075", "P1913S2",
                           "P3353", "P3354", "P5095", "CLASE", "P2102", "P1070",
                           "P4005", "P4015", "P4567", "P8520S1A1", "P8520S5", "P8520S3",
                           "P8520S4"]

# iterar sobre variables categoricas
for var in vars_cate:
    # generar dummies a partir de la base
    data = convert_dummy(data, var)   
data



Unnamed: 0,DIRECTORIO,SECUENCIA_P.x,tiempo_promedio_transporte,monto_beca,monto_subsidio,monto_credito,ayudas_total,P5000,P5010,CANT_PERSONAS_HOGAR,...,P8520S1A1_2,P8520S1A1_3,P8520S1A1_4,P8520S1A1_5,P8520S1A1_6,P8520S1A1_8,P8520S1A1_9,P8520S5_2,P8520S3_2,P8520S4_2
0,7566841,1,5.00,0,0,0,0,3,2,4,...,0,1,0,0,0,0,0,0,0,0
1,7566846,1,27.50,0,0,0,0,4,3,9,...,1,0,0,0,0,0,0,0,0,0
2,7566901,1,20.00,0,0,0,0,3,3,7,...,0,0,0,0,0,1,0,1,1,1
3,7566909,1,15.00,0,0,0,0,4,3,4,...,1,0,0,0,0,0,0,0,0,0
4,7566924,1,31.50,0,0,0,0,2,2,3,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17578,7787848,1,18.75,0,100000,0,100000,8,6,13,...,0,0,0,0,0,0,0,1,1,0
17579,7787851,1,5.00,0,0,0,0,4,3,5,...,0,0,0,0,0,0,0,0,1,0
17580,7787853,1,17.00,0,0,0,0,3,3,5,...,0,0,0,0,0,0,0,1,1,1
17581,7787862,1,20.00,0,0,0,0,4,2,2,...,0,0,0,0,0,0,0,1,1,0


In [7]:
# ver variables de la base
data.columns

Index(['DIRECTORIO', 'SECUENCIA_P.x', 'tiempo_promedio_transporte',
       'monto_beca', 'monto_subsidio', 'monto_credito', 'ayudas_total',
       'P5000', 'P5010', 'CANT_PERSONAS_HOGAR',
       ...
       'P8520S1A1_2', 'P8520S1A1_3', 'P8520S1A1_4', 'P8520S1A1_5',
       'P8520S1A1_6', 'P8520S1A1_8', 'P8520S1A1_9', 'P8520S5_2', 'P8520S3_2',
       'P8520S4_2'],
      dtype='object', length=203)

In [8]:
data

Unnamed: 0,DIRECTORIO,SECUENCIA_P.x,tiempo_promedio_transporte,monto_beca,monto_subsidio,monto_credito,ayudas_total,P5000,P5010,CANT_PERSONAS_HOGAR,...,P8520S1A1_2,P8520S1A1_3,P8520S1A1_4,P8520S1A1_5,P8520S1A1_6,P8520S1A1_8,P8520S1A1_9,P8520S5_2,P8520S3_2,P8520S4_2
0,7566841,1,5.00,0,0,0,0,3,2,4,...,0,1,0,0,0,0,0,0,0,0
1,7566846,1,27.50,0,0,0,0,4,3,9,...,1,0,0,0,0,0,0,0,0,0
2,7566901,1,20.00,0,0,0,0,3,3,7,...,0,0,0,0,0,1,0,1,1,1
3,7566909,1,15.00,0,0,0,0,4,3,4,...,1,0,0,0,0,0,0,0,0,0
4,7566924,1,31.50,0,0,0,0,2,2,3,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17578,7787848,1,18.75,0,100000,0,100000,8,6,13,...,0,0,0,0,0,0,0,1,1,0
17579,7787851,1,5.00,0,0,0,0,4,3,5,...,0,0,0,0,0,0,0,0,1,0
17580,7787853,1,17.00,0,0,0,0,3,3,5,...,0,0,0,0,0,0,0,1,1,1
17581,7787862,1,20.00,0,0,0,0,4,2,2,...,0,0,0,0,0,0,0,1,1,0


In [9]:
# guardar variables para ver los resultados

In [10]:
# variable y: proporcion de gasto en educacion
y = data['Var_y']
X = data.drop(['DIRECTORIO','SECUENCIA_P.x'], axis = 1)
X

Unnamed: 0,tiempo_promedio_transporte,monto_beca,monto_subsidio,monto_credito,ayudas_total,P5000,P5010,CANT_PERSONAS_HOGAR,I_HOGAR,I_UGASTO,...,P8520S1A1_2,P8520S1A1_3,P8520S1A1_4,P8520S1A1_5,P8520S1A1_6,P8520S1A1_8,P8520S1A1_9,P8520S5_2,P8520S3_2,P8520S4_2
0,5.00,0,0,0,0,3,2,4,2.213166e+06,2.213166e+06,...,0,1,0,0,0,0,0,0,0,0
1,27.50,0,0,0,0,4,3,9,3.533000e+06,3.533000e+06,...,1,0,0,0,0,0,0,0,0,0
2,20.00,0,0,0,0,3,3,7,9.583333e+05,9.583333e+05,...,0,0,0,0,0,1,0,1,1,1
3,15.00,0,0,0,0,4,3,4,1.305167e+06,1.305167e+06,...,1,0,0,0,0,0,0,0,0,0
4,31.50,0,0,0,0,2,2,3,7.700000e+05,7.700000e+05,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17578,18.75,0,100000,0,100000,8,6,13,1.300000e+06,1.300000e+06,...,0,0,0,0,0,0,0,1,1,0
17579,5.00,0,0,0,0,4,3,5,2.200000e+06,2.200000e+06,...,0,0,0,0,0,0,0,0,1,0
17580,17.00,0,0,0,0,3,3,5,1.416859e+06,1.416859e+06,...,0,0,0,0,0,0,0,1,1,1
17581,20.00,0,0,0,0,4,2,2,1.242500e+06,1.242500e+06,...,0,0,0,0,0,0,0,1,1,0


## Parte 1: Prueba de Modelos y GridSearch/RandomizedSearch


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import make_scorer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.metrics import classification_report

Partición de datos de entrenamiento/validación/prueba.

In [61]:
#Se separan los datos en entrenamiento y prueba.  Es importante aclarar que usaremos validación cruzada, así que los datos
#de prueba son para probar los mejores modelos que se obtengan de acuerdo de cada conjunto de variables seleccionadas.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 100)

# dividir datos de prueba en datos de validacion y datos de prueba/evaluacion con la base de datos de prueba
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.3, random_state = 100)

# guardar valor de log_y gasto en educacion real

y_train = X_train['log_y']
y_val = X_val['log_y']
y_test = X_test['log_y']

# guardar valor de y gasto en educacion real
price_train = X_train['Var_y'].to_numpy()
price_val = X_val['Var_y'].to_numpy()
pricey_test = X_test['Var_y'].to_numpy()

# drop columnas  y, log_y
X_train = X_train.drop(['log_y','Var_y'], axis = 1)
X_val = X_val.drop(['log_y','Var_y'], axis = 1)
X_test = X_test.drop(['log_y','Var_y'], axis = 1)



num_features = np.sqrt(X_train.shape[1])
num_features

min_samples_split_05 = X_train.shape[0]*0.005
min_samples_split_1= X_train.shape[0]*0.01

type(pricey_test)

numpy.ndarray

## Parte 1A

### Definición de Parámetros de GridSearch/RandomizedSearch

In [14]:
# crear escalador Robusto() sobre los predictores X para X_train, X_val y X_test

scaler = RobustScaler() # Utilice un MinMaxScaler #
X_train_scaled = scaler.fit_transform(X_train) # Aplique el Escalamiento (función fit_transform) # 
X_val_scaled = scaler.transform(X_val) # Aplique el Escalamiento (función fit_transform) # 
X_test_scaled =  scaler.transform(X_test) # Aplique el Escalamiento (función fit_transform) #

X_test_scaled

array([[ 0. ,  0. ,  0. , ...,  1. ,  1. ,  1. ],
       [-0.5,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [-0.5,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  1. ,  1. ],
       [-0.5,  0. ,  0. , ...,  0. ,  0. ,  0. ]])

In [151]:
# Búsqueda de Hiperparámetros
# crear modelos de ML para Regresión

dt = DecisionTreeRegressor(splitter='best', random_state=100)
rf = RandomForestRegressor(n_jobs=-1, verbose = 3, random_state=100)
#ada = AdaBoostRegressor( loss= 'linear', random_state=100)

bagging = BaggingRegressor( n_jobs=-1, random_state= 100, verbose = 3)
nn = MLPRegressor(random_state=100, verbose = 3, learning_rate = 'adaptive')


# crear grilla de hiper parametros

#DecisionTreeRegressor
para_dt = { 'criterion': ['squared_error'], 
           'max_depth': np.linspace(7,20,num=6,dtype=int), 
           'min_samples_split': np.linspace(60,125,num=5,dtype=int), 
           'min_samples_leaf': np.linspace(6,20,num=5,dtype=int)}
grid_dt = GridSearchCV(dt, param_grid=para_dt, scoring= 'neg_mean_squared_error', n_jobs=-1, cv=5, verbose = 3)

#Random Forest
# hiperparametros 
params_rf = {'n_estimators':np.linspace(100,600,num=3,dtype=int),
             'criterion':['mse'], 
            'max_features':np.linspace(10,40,num=5,dtype=int),
             'max_depth': np.linspace(7,20,num=3,dtype=int),  
             'min_samples_split': np.linspace(60,125,num=3,dtype=int) }
grid_rf = GridSearchCV(rf, param_grid=params_rf, scoring= 'neg_mean_squared_error', n_jobs=-1, cv=5, verbose = 3)


#AdaBoost
params_ada = {'n_estimators':np.linspace(100,600,num=3,dtype=int),
              'learning_rate':np.linspace(0.01,1,num=3,dtype=float) }
#grid_ada =  GridSearchCV(ada, param_grid=params_ada, scoring= 'neg_mean_squared_error', cv=5)



#BaggingRegressor
params_bagg = {'n_estimators':np.linspace(100,600,num=3,dtype=int), 
               'max_features':np.linspace(10,40,num=5,dtype=int),        
    'oob_score':[True]  }
grid_bagg =  GridSearchCV(bagging, param_grid=params_bagg, scoring= 'neg_mean_squared_error', cv=5)


#MLP-Regressor - Redes Neuronales
params_nn = {'hidden_layer_sizes':[(5,5,5),(10,10,10),(15,15,15),(5,5,5,5,5),(10,10,10,10,10),(15,15,15,15,15),(50,50,50),(100,100,100)],
              'activation':['relu'],
              'alpha':np.linspace(0.01,1,num=5,dtype=float)}

grid_nn =  GridSearchCV(nn, param_grid=params_nn, scoring= 'neg_mean_squared_error', n_jobs=-1, cv=5, verbose = 3)

grid_nn

GridSearchCV(cv=5,
             estimator=MLPRegressor(learning_rate='adaptive', random_state=100,
                                    verbose=3),
             n_jobs=-1,
             param_grid={'activation': ['relu'],
                         'alpha': array([0.01  , 0.2575, 0.505 , 0.7525, 1.    ]),
                         'hidden_layer_sizes': [(5, 5, 5), (10, 10, 10),
                                                (15, 15, 15), (5, 5, 5, 5, 5),
                                                (10, 10, 10, 10, 10),
                                                (15, 15, 15, 15, 15),
                                                (50, 50, 50),
                                                (100, 100, 100)]},
             scoring='neg_mean_squared_error', verbose=3)

In [114]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [None]:
# ajustar/entrenar Decision Tree Regressor
grid_dt.fit(X_train, y_train)

In [None]:
# ajustar/entrenar Random Forest
grid_rf.fit(X_train, y_train)

In [None]:
# ajustar/entrenar Bagging 
grid_bagg.fit(X_train, y_train)

In [155]:
# ajustar/entrenar Redes Neuronales
grid_nn.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Iteration 1, loss = 317140.44976185
Iteration 2, loss = 8338.57686047
Iteration 3, loss = 6120.37276845
Iteration 4, loss = 7297.46358370
Iteration 5, loss = 5178.39538967
Iteration 6, loss = 3472.54203599
Iteration 7, loss = 2658.07614927
Iteration 8, loss = 1794.66465112
Iteration 9, loss = 46725.00666830
Iteration 10, loss = 5672.46316536
Iteration 11, loss = 3090.31869729
Iteration 12, loss = 1689.65886576
Iteration 13, loss = 918.76861845
Iteration 14, loss = 1012.49902550
Iteration 15, loss = 1390.21789679
Iteration 16, loss = 1023.03397372
Iteration 17, loss = 864.95339777
Iteration 18, loss = 634.84917659
Iteration 19, loss = 340.66574803
Iteration 20, loss = 1885.34566914
Iteration 21, loss = 367.98165973
Iteration 22, loss = 297.76592660
Iteration 23, loss = 409.24940726
Iteration 24, loss = 974.26664821
Iteration 25, loss = 2026.34441881
Iteration 26, loss = 8248.60603275
Iteration 27, loss = 1783.99248224
Iterati

GridSearchCV(cv=5,
             estimator=MLPRegressor(learning_rate='adaptive', random_state=100,
                                    verbose=3),
             n_jobs=-1,
             param_grid={'activation': ['relu'],
                         'alpha': array([0.01  , 0.2575, 0.505 , 0.7525, 1.    ]),
                         'hidden_layer_sizes': [(5, 5, 5), (10, 10, 10),
                                                (15, 15, 15), (5, 5, 5, 5, 5),
                                                (10, 10, 10, 10, 10),
                                                (15, 15, 15, 15, 15),
                                                (50, 50, 50),
                                                (100, 100, 100)]},
             scoring='neg_mean_squared_error', verbose=3)

### Búsqueda de Hiperparámetros

A continuación se debe realizar el proceso de GridSearch. Esta operación puede tardar.

In [None]:
# ajustar/entrenar los optimizador de hiperparametros GridSearchCV de cada modelo
grid_dt.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
#grid_gb.fit(X_train, y_train)
grid_bagg.fit(X_train, y_train)
grid_nn.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [None]:
###Resultados de optimizador de hiperparámetros GridSearchCV/RandomizedSearch
# retornar los resultados de la K-Fold (cv=5) sobre los datos de entrenamientot
# print los mejores hiper parametros de cada modelo
print("Mejores parámetros Árbol de Decisión:", grid_dt.best_params_)
print("Mejores parámetros Random Forest:", grid_rf.best_params_)
#print("Mejores parámetros GradientBoost:", grid_gb.best_params_)
print("Mejores parámetros Bagging:", grid_bagg.best_params_)
print("Mejores parámetros Redes Neuronales :", grid_nn.best_params_)
print('')

# mejores modelos obtenidos por sintonización mediante GridSearchCV y Randomized Search para cada modelo
dt_op = grid_dt.best_estimator_
rf_op = grid_rf.best_estimator_
#gb_op = grid_gb.best_estimator_
bagg_op = grid_bagg.best_estimator_
nn_op = grid_nn.best_estimator_


# calcular el promedio de la métrica de evaluación a maximizar 
mean_dt_mse = grid_dt.best_score_
mean_rf_mse = grid_rf.best_score_
#mean_gb_mse = grid_gb.best_score_
mean_bagg_mse = grid_bagg.best_score_
mean_nn_mse = grid_nn.best_score_

# print los mejores hiper parametros de cada modelo
print("Mejores score de Promedio K-Fold Árbol de Decisión:", mean_dt_mse)
print("Mejores score de Promedio K-Fold Random Forest:", mean_rf_mse)
#print("Mejores score de Promedio K-Fold Gradient Boosting:", mean_gb_mse)
print("Mejores score de Promedio K-Fold Bagging:", mean_bagg_mse)
print("Mejores score de Promedio K-Fold Redes Neuronales:", mean_nn_mse)



In [None]:
# construir modelos optimos
dt_op = DecisionTreeRegressor(criterion='squared_error', max_depth = 7, min_samples_leaf =  20, min_samples_split = 125, splitter='best', random_state = 100)
rf_op = RandomForestRegressor(criterion ='mse', max_depth = 20, max_features = 40, min_samples_split = 60, n_estimators = 350, n_jobs=-1, verbose = 3, random_state=100)
bagg_op = BaggingRegressor(max_features = 40, n_estimators = 600, oob_score = True, n_jobs=-1, random_state= 100, verbose = 3)
nn_op = MLPRegressor(activation = 'relu', alpha= 1.0, hidden_layer_sizes = (10, 10, 10, 10, 10), random_state=100, verbose = 3)


In [135]:
# entrenar/ajustar modelos optimos
dt_op.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=7, min_samples_leaf=20, min_samples_split=125,
                      random_state=100)

In [136]:
rf_op.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 350building tree 2 of 350
building tree 3 of 350
building tree 4 of 350

building tree 5 of 350
building tree 6 of 350
building tree 7 of 350
building tree 8 of 350
building tree 9 of 350
building tree 10 of 350
building tree 11 of 350
building tree 12 of 350
building tree 13 of 350
building tree 14 of 350
building tree 15 of 350
building tree 16 of 350
building tree 17 of 350
building tree 18 of 350
building tree 19 of 350
building tree 20 of 350
building tree 21 of 350
building tree 22 of 350
building tree 23 of 350
building tree 24 of 350
building tree 25 of 350
building tree 26 of 350
building tree 27 of 350
building tree 28 of 350
building tree 29 of 350
building tree 30 of 350
building tree 31 of 350


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.1s


building tree 32 of 350
building tree 33 of 350
building tree 34 of 350
building tree 35 of 350
building tree 36 of 350
building tree 37 of 350
building tree 38 of 350
building tree 39 of 350
building tree 40 of 350
building tree 41 of 350
building tree 42 of 350
building tree 43 of 350
building tree 44 of 350
building tree 45 of 350
building tree 46 of 350
building tree 47 of 350
building tree 48 of 350
building tree 49 of 350
building tree 50 of 350
building tree 51 of 350
building tree 52 of 350
building tree 53 of 350
building tree 54 of 350
building tree 55 of 350
building tree 56 of 350
building tree 57 of 350
building tree 58 of 350
building tree 59 of 350
building tree 60 of 350
building tree 61 of 350
building tree 62 of 350
building tree 63 of 350
building tree 64 of 350
building tree 65 of 350
building tree 66 of 350
building tree 67 of 350
building tree 68 of 350
building tree 69 of 350
building tree 70 of 350
building tree 71 of 350
building tree 72 of 350
building tree 73

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    4.9s


building tree 125 of 350
building tree 126 of 350
building tree 127 of 350
building tree 128 of 350
building tree 129 of 350
building tree 130 of 350
building tree 131 of 350
building tree 132 of 350
building tree 133 of 350
building tree 134 of 350
building tree 135 of 350
building tree 136 of 350
building tree 137 of 350
building tree 138 of 350
building tree 139 of 350
building tree 140 of 350
building tree 141 of 350
building tree 142 of 350
building tree 143 of 350
building tree 144 of 350
building tree 145 of 350
building tree 146 of 350
building tree 147 of 350
building tree 148 of 350
building tree 149 of 350
building tree 150 of 350
building tree 151 of 350
building tree 152 of 350
building tree 153 of 350
building tree 154 of 350
building tree 155 of 350
building tree 156 of 350
building tree 157 of 350
building tree 158 of 350
building tree 159 of 350
building tree 160 of 350
building tree 161 of 350
building tree 162 of 350
building tree 163 of 350
building tree 164 of 350


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   10.9s


building tree 288 of 350
building tree 289 of 350
building tree 290 of 350
building tree 291 of 350
building tree 292 of 350
building tree 293 of 350
building tree 294 of 350
building tree 295 of 350
building tree 296 of 350
building tree 297 of 350
building tree 298 of 350
building tree 299 of 350
building tree 300 of 350
building tree 301 of 350
building tree 302 of 350
building tree 303 of 350
building tree 304 of 350
building tree 305 of 350
building tree 306 of 350
building tree 307 of 350
building tree 308 of 350
building tree 309 of 350
building tree 310 of 350
building tree 311 of 350
building tree 312 of 350
building tree 313 of 350
building tree 314 of 350
building tree 315 of 350
building tree 316 of 350
building tree 317 of 350
building tree 318 of 350
building tree 319 of 350
building tree 320 of 350
building tree 321 of 350
building tree 322 of 350
building tree 323 of 350
building tree 324 of 350
building tree 325 of 350
building tree 326 of 350
building tree 327 of 350


[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:   13.3s finished


RandomForestRegressor(criterion='mse', max_depth=20, max_features=40,
                      min_samples_split=60, n_estimators=350, n_jobs=-1,
                      random_state=100, verbose=3)

In [137]:
bagg_op.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   34.8s finished


BaggingRegressor(max_features=40, n_estimators=600, n_jobs=-1, oob_score=True,
                 random_state=100, verbose=3)

In [154]:
nn_op.fit(X_train, y_train)

Iteration 1, loss = 59588195.34207927
Iteration 2, loss = 888764.25432652
Iteration 3, loss = 305284.94429044
Iteration 4, loss = 146188.06361134
Iteration 5, loss = 106859.76728142
Iteration 6, loss = 136165.10557848
Iteration 7, loss = 96894.13697137
Iteration 8, loss = 78671.15429058
Iteration 9, loss = 128966.74272109
Iteration 10, loss = 766951.82198657
Iteration 11, loss = 104603.67373607
Iteration 12, loss = 71149.91093548
Iteration 13, loss = 188358.71777501
Iteration 14, loss = 63103.24772321
Iteration 15, loss = 89331.99826532
Iteration 16, loss = 37927.62976157
Iteration 17, loss = 30557.12240806
Iteration 18, loss = 40791.60093405
Iteration 19, loss = 28304.13416595
Iteration 20, loss = 20834.28111025
Iteration 21, loss = 7733.34648889
Iteration 22, loss = 10658.18376786
Iteration 23, loss = 6010.88451634
Iteration 24, loss = 4918.91716733
Iteration 25, loss = 8025.02353050
Iteration 26, loss = 4991.90659087
Iteration 27, loss = 5332.04975754
Iteration 28, loss = 7033.75105

MLPRegressor(alpha=1.0, hidden_layer_sizes=(10, 10, 10, 10, 10),
             random_state=100, verbose=3)

In [139]:
y_real = pricey_test
y_real

array([0.00222222, 0.01340083, 0.00261905, ..., 0.00412088, 0.00365497,
       0.03518519])

In [145]:
### Resultados y Métricas de Evaluacion
##DecisionTree Regressor

# crear DataFrame de resultados
df_res1 = pd.DataFrame()

# crear de predicciones y_predict
y_predict_train1 = np.exp(dt_op.predict(X_train))
y_predict_val1 = np.exp(dt_op.predict(X_val))
y_predict_test1 =np.exp(dt_op.predict(X_test))

# crear vector de error sobre datos de prueba (X_test)
error1 = y_predict_test1-y_real

# calcular funcion de error


df_res1['y_real'] = y_real
df_res1['y_predict'] = y_predict_test1
df_res1['error'] = error1

# definir funcion del error
def fun_error(x):   
    if x>0:
        return x**2
    else:
        return np.abs(x)
        
df_res1['fun_error'] = df_res1['error'].apply(lambda x: fun_error(x))

# crear DataFrame de resultados
df_metricas1 = pd.DataFrame()
rmse1 = np.sqrt(np.mean(df_res1['error']**2))
mae1 = np.mean(df_res1['error']**2)
avg1 = np.mean(df_res1['fun_error'])

rmse1,mae1,avg1


df_res1.to_csv('data/res_dt.csv')

In [146]:
### Resultados y Métricas de Evaluacion
##RandomForest Regressor

# crear DataFrame de resultados
df_res2 = pd.DataFrame()

# crear de predicciones y_predict
y_predict_train2 = np.exp(rf_op.predict(X_train))
y_predict_val2 = np.exp(rf_op.predict(X_val))
y_predict_test2 =np.exp(rf_op.predict(X_test))

# crear vector de error sobre datos de prueba (X_test)
error2 = y_predict_test2-y_real

# calcular funcion de error
df_res2['y_real'] = y_real
df_res2['y_predict'] = y_predict_test2
df_res2['error'] = error2

# definir funcion del error
def fun_error(x):   
    if x>0:
        return x**2
    else:
        return np.abs(x)
        
df_res2['fun_error'] = df_res2['error'].apply(lambda x: fun_error(x))

# crear DataFrame de resultados
df_metricas2 = pd.DataFrame()
rmse2 = np.sqrt(np.mean(df_res2['error']**2))
mae2 = np.mean(df_res2['error']**2)
avg2 = np.mean(df_res2['fun_error'])

rmse2,mae2,avg2

df_res2.to_csv('data/res_rf.csv')

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 350 out of 350 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 350 out of 350 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 350 out of 350 | elapsed:    0.1s finished


In [147]:
### Resultados y Métricas de Evaluacion
##Bagging Regressor

# crear DataFrame de resultados
df_res3 = pd.DataFrame()

# crear de predicciones y_predict
y_predict_train3 = np.exp(bagg_op.predict(X_train))
y_predict_val3 = np.exp(bagg_op.predict(X_val))
y_predict_test3 =np.exp(bagg_op.predict(X_test))

# crear vector de error sobre datos de prueba (X_test)
error3 = y_predict_test3-y_real

# calcular funcion de error
df_res3['y_real'] = y_real
df_res3['y_predict'] = y_predict_test3
df_res3['error'] = error3

# definir funcion del error
def fun_error(x):   
    if x>0:
        return x**2
    else:
        return np.abs(x)
        
df_res3['fun_error'] = df_res3['error'].apply(lambda x: fun_error(x))

# crear DataFrame de resultados
df_metricas3 = pd.DataFrame()
rmse3 = np.sqrt(np.mean(df_res3['error']**2))
mae3 = np.mean(df_res3['error']**2)
avg3 = np.mean(df_res3['fun_error'])

rmse3,mae3,avg3


df_res3.to_csv('data/res_bagg.csv')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   27.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   14.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.7s finished


In [148]:
### Resultados y Métricas de Evaluacion
##Redes Neuronales

# crear DataFrame de resultados
df_res4 = pd.DataFrame()

# crear de predicciones y_predict
y_predict_train4 = np.exp(nn_op.predict(X_train))
y_predict_val4 = np.exp(nn_op.predict(X_val))
y_predict_test4 =np.exp(nn_op.predict(X_test))

# crear vector de error sobre datos de prueba (X_test)
error4 = y_predict_test4-y_real

# calcular funcion de error
df_res4['y_real'] = y_real
df_res4['y_predict'] = y_predict_test4
df_res4['error'] = error4

# definir funcion del error
def fun_error(x):   
    if x>0:
        return x**2
    else:
        return np.abs(x)
        
df_res4['fun_error'] = df_res4['error'].apply(lambda x: fun_error(x))

# crear DataFrame de resultados
df_metricas4 = pd.DataFrame()
rmse4 = np.sqrt(np.mean(df_res4['error']**2))
mae4 = np.mean(df_res4['error']**2)
avg4 = np.mean(df_res4['fun_error'])

rmse4,mae4,avg4

OverflowError: (34, 'Result too large')