In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb


In [2]:
# Read the training set and drop the 'ID_Customer' column in one step.
traindata = pd.read_csv('train.csv').drop(columns='ID_Customer')
print('Number of rows and columns of the training set: ', traindata.shape)

# 'Poder_Adquisitivo' is the column to predict; every other column is a feature.
y_traindata = traindata['Poder_Adquisitivo']
X_traindata = traindata.drop(columns='Poder_Adquisitivo')

Number of rows and columns of the training set:  (363834, 89)


In [3]:
# Candidate regressors to compare. Order matters: entry i of `param_grid_list`
# below is the hyper-parameter grid searched for entry i of this list.
models=[
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    HuberRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    xgb.XGBRegressor(),  # xgboost is imported as `xgb`, so the class must be qualified
]

# One grid per model, in the same order as `models`.
param_grid_list=[
    # Ridge
    {'alpha':[0.05,0.1,0.3,0.6,1,1.5,3,5,10,15,30,50,80]},
    # Lasso
    {'alpha':[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1.0]},
    # ElasticNet
    {'l1_ratio':[0.1, 0.3, 0.5, 0.7, 0.9, 1],'alpha':[0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]},
    # KNeighborsRegressor
    {'n_neighbors':[3,5,7,9]},
    # HuberRegressor
    {'epsilon':[1.0,1.2,1.35,1.5,1.7,2.0], 'alpha':[0.00005,0.0001,0.0003,0.0006,0.0009,0.0012]},
    # GradientBoostingRegressor
    {'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [4, 6, 8, 10, 12], 'n_estimators': range(50, 200, 25)},
    # AdaBoostRegressor
    {'learning_rate': [0.5, 1, 3,5], 'n_estimators': range(50, 200, 25)},
    # RandomForestRegressor. `None` means "use every feature", which is what
    # 'auto' meant for regressors; recent scikit-learn releases reject 'auto'.
    {'max_features':[None, 'sqrt', 40], 'max_depth': [4, 6, 8, 10, None], 'n_estimators': range(50, 300, 50)},
    # ExtraTreesRegressor (same grid as the random forest)
    {'max_features':[None, 'sqrt', 40], 'max_depth': [4, 6, 8, 10, None], 'n_estimators': range(50, 300, 50)},
    # XGBRegressor: empty grid, so only the default configuration is evaluated
    {},
]

# The two lists are paired by position, so a length mismatch is always a mistake.
assert len(models) == len(param_grid_list), 'each model needs exactly one parameter grid'


In [4]:
# Input: the results of a model comparison
# Output: the same results arranged as a table
def get_results(model_names,errors,params=[]):
    """Build a DataFrame summarising a model comparison.

    Parameters
    ----------
    model_names : sequence
        Names of the compared models; becomes the 'Regressor' index.
    errors : sequence
        One error value per model; becomes the 'error' column.
    params : sequence, optional
        One entry per model. The 'params' column is only added when this
        sequence is non-empty.

    Returns
    -------
    pandas.DataFrame
        Table indexed by 'Regressor'.
    """
    columns = {'Regressor': model_names, 'error': errors}
    if len(params) != 0:
        columns['params'] = params
    table = pd.DataFrame(columns)
    return table.set_index('Regressor')

In [5]:
# Input: a model
# Output: the name of the model
def get_model_name(model):
    """Return the class name of `model`, e.g. ``'Ridge'`` for a ``Ridge()`` instance.

    The name is read from the class itself rather than parsed out of
    ``str(model.__class__)``: the string-parsing approach returns ``'<class '``
    for any class whose repr contains no dot (for example builtins).

    Parameters
    ----------
    model : object
        Any object; in this notebook, an estimator instance.

    Returns
    -------
    str
        The unqualified name of the object's class.
    """
    return model.__class__.__name__

In [6]:
# Input: a model and a dataset
# Output: the cross-validation error of the model on the dataset
def mae_cv(model,X, y, verbose=0):
    """Return the mean absolute error of `model`, averaged over the CV folds.

    Parameters
    ----------
    model : estimator
        Model handed to ``cross_val_score``.
    X, y :
        Features and target passed through to ``cross_val_score``.
    verbose : int, optional
        Verbosity level forwarded to ``cross_val_score``.

    Returns
    -------
    The mean of the negated per-fold 'neg_mean_absolute_error' scores.
    """
    fold_scores = cross_val_score(
        model, X, y,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        cv=8,
        verbose=verbose,
    )
    # The scorer reports negated MAE, so flip the sign before averaging.
    fold_errors = -fold_scores
    return fold_errors.mean()

In [7]:
# Input: a model, a dataset and a parameter grid
# Output: tunes the model's parameters with cross validation and returns the
#         error of the best combination together with that combination
def mae_cv_param_grid(model,X, y,param_grid, verbose=0):
    """Grid-search `param_grid` for `model` and report the best CV error.

    Parameters
    ----------
    model : estimator
        Model handed to ``GridSearchCV``.
    X, y :
        Features and target passed to the search's ``fit``.
    param_grid : dict
        Grid of parameter values handed to ``GridSearchCV``.
    verbose : int, optional
        Verbosity level forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_error, best_params)``: the negated ``best_score_`` and the
        ``best_params_`` of the fitted search.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        cv=8,
        refit=False,  # only the scores are needed, so skip the final refit
        verbose=verbose,
    )
    fitted = search.fit(X, y)
    # The scorer is negated MAE; negate again to report a plain error.
    best_error = -fitted.best_score_
    return best_error, fitted.best_params_


In [None]:
# Input: a list of models, a dataset and (optionally) a list of parameter grids
# Output: a table comparing the models by their error on the dataset (plus
#         their best parameters when estimate_params=True)
def compare_models(models, X, y, param_grid_list=[], estimate_params=False, verbose=0):
    """Score every model with cross validation and tabulate the results.

    Parameters
    ----------
    models : sequence of estimators
        Models to compare.
    X, y :
        Features and target forwarded to the scoring helpers.
    param_grid_list : sequence of dict, optional
        Parameter grids paired with `models` by position. Only read when
        `estimate_params` is true.
    estimate_params : bool, optional
        If true, each model is tuned with ``mae_cv_param_grid``; otherwise it
        is scored as given with ``mae_cv``.
    verbose : int, optional
        Forwarded to the scoring helpers; any value above 0 also prints each
        model's score as soon as it is available.

    Returns
    -------
    The table produced by ``get_results`` from the collected names, errors
    and (when tuning) best parameters.

    Raises
    ------
    ValueError
        If `estimate_params` is true and there are fewer grids than models.
    """
    # Fail before any (potentially very long) search starts, rather than with
    # an IndexError part-way through the loop.
    if estimate_params and len(param_grid_list) < len(models):
        raise ValueError(
            'param_grid_list has %d entries but %d models were given'
            % (len(param_grid_list), len(models))
        )

    model_names=[]
    errors=[]
    params=[]

    for position, model in enumerate(models):
        model_name=get_model_name(model)

        if estimate_params:
            score,param = mae_cv_param_grid(model,X, y,param_grid_list[position],verbose=verbose)
            params.append(param)
        else:
            score=mae_cv(model,X, y,verbose=verbose)

        if verbose>0:
            print(model_name,': ',score)

        model_names.append(model_name)
        errors.append(score)

    return get_results(model_names,errors,params)
    

In [None]:
# Compare the models, each one tuned to its best parameter combination
compare_models(
    models,
    X_traindata,
    y_traindata,
    param_grid_list=param_grid_list,
    estimate_params=True,
    verbose=4,
)

Fitting 8 folds for each of 54 candidates, totalling 432 fits


In [None]:
# Compare the models, each one with its default parameter combination
compare_models(
    models,
    X=X_traindata,
    y=y_traindata,
    verbose=4,
)