In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb


In [None]:
def get_models_table(models):
    """Build the bookkeeping table used to compare a collection of models.

    Parameters
    ----------
    models : sequence
        Model instances to compare.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``'Regressor'`` (the model's class
        name as given by ``get_model_name``), with the columns:

        * ``'model'``  -- the model instance itself,
        * ``'error'``  -- initialised to NaN (not evaluated yet),
        * ``'params'`` -- initialised to a separate empty dict per row.
    """
    n_models = len(models)
    columns = {
        'Regressor': [get_model_name(candidate) for candidate in models],
        'model': models,
        'error': np.full(n_models, np.nan),
        # A fresh dict per row so rows never share the same object.
        'params': [{} for _ in range(n_models)],
    }
    frame = pd.DataFrame(columns)
    return frame.set_index('Regressor')


def get_model_name(model):
    """Return the bare class name of ``model`` (e.g. ``'Ridge'``).

    Parameters
    ----------
    model : object
        Any object; typically an estimator instance.

    Returns
    -------
    str
        The unqualified name of the object's class.
    """
    # Read the name straight from the class instead of parsing its repr:
    # parsing "<class 'int'>" (no module dot) used to yield "<class ".
    return model.__class__.__name__


def fit_predict(model,X,y,X_test):
    """Fit ``model`` on ``(X, y)`` and predict on ``X_test``.

    Predictions that fall below the smallest training target are raised
    to that minimum, in place, before being returned.

    Parameters
    ----------
    model : object
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : training features and targets (``y`` must provide ``.min()``).
    X_test : features to predict on.

    Returns
    -------
    The array returned by ``model.predict`` after flooring at ``y.min()``.
    """
    model.fit(X, y)
    predictions = model.predict(X_test)

    # Floor the predictions at the lowest target seen during training.
    floor = y.min()
    too_low = predictions < floor
    predictions[too_low] = floor
    return predictions


def mae_cv(model,X, y, verbose=0):
    """Cross-validated mean absolute error of ``model`` on ``(X, y)``.

    Runs ``cross_val_score`` with 8 folds and the
    ``'neg_mean_absolute_error'`` scorer on all available cores
    (``n_jobs=-1``).

    Parameters
    ----------
    model : estimator to evaluate.
    X, y : features and targets.
    verbose : int, forwarded to ``cross_val_score``.

    Returns
    -------
    The mean over the folds of the sign-flipped scores, i.e. a positive MAE.
    """
    neg_mae_per_fold = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=8,
        n_jobs=-1,
        verbose=verbose,
    )
    # The scorer reports negated errors; flip the sign before averaging.
    mae_per_fold = -neg_mae_per_fold
    return mae_per_fold.mean()


def mae_cv_param_grid(model,X, y,param_grid, verbose=0):
    """Grid-search ``param_grid`` for ``model`` and report the best result.

    Uses ``GridSearchCV`` with 8 folds, the ``'neg_mean_absolute_error'``
    scorer and ``n_jobs=-1``. ``refit=False`` is passed, so the search is
    used only for its best score and best parameter combination.

    Parameters
    ----------
    model : estimator to tune.
    X, y : features and targets.
    param_grid : parameter grid handed to ``GridSearchCV``.
    verbose : int, forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_error, best_params)`` where ``best_error`` is the
        sign-flipped ``best_score_`` and ``best_params`` is ``best_params_``.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        cv=8,
        refit=False,
        verbose=verbose,
    )
    search.fit(X, y)

    best_error = -search.best_score_
    return best_error, search.best_params_


def compare_models(models_table, X, y, param_grid_list=None, estimate_params=False, verbose=0):
    """Evaluate every model in ``models_table`` and record the results in it.

    The table is updated in place; nothing is returned.

    Parameters
    ----------
    models_table : pandas.DataFrame
        Table with a ``'model'`` column (as built by ``get_models_table``).
        Its ``'error'`` column is overwritten with the cross-validated MAE
        of each model and, when ``estimate_params`` is true, its
        ``'params'`` column with the best parameter combination found.
    X, y : features and targets used for cross-validation.
    param_grid_list : sequence of dict, optional
        One parameter grid per row of ``models_table``, in the same order.
        Only read when ``estimate_params`` is true.
    estimate_params : bool
        If true, tune each model with ``mae_cv_param_grid``; otherwise
        score it as-is with ``mae_cv``.
    verbose : int
        Forwarded to the scoring helpers; when greater than 0 the score of
        each model is also printed.

    Raises
    ------
    ValueError
        If ``estimate_params`` is true and ``param_grid_list`` has fewer
        entries than ``models_table`` has rows.
    """
    if param_grid_list is None:
        param_grid_list = []

    n_models = models_table.shape[0]
    # Fail before any (expensive) search rather than part-way through the loop.
    if estimate_params and len(param_grid_list) < n_models:
        raise ValueError(
            'param_grid_list has %d entries but models_table has %d models'
            % (len(param_grid_list), n_models)
        )

    errors = []
    params = []
    for i in range(n_models):
        model = models_table['model'].iloc[i]

        if estimate_params:
            score, param = mae_cv_param_grid(model, X, y, param_grid_list[i], verbose=verbose)
            params.append(param)
        else:
            # Score the model stored in the table, not a notebook-level
            # global list that may be missing or out of sync with the table.
            score = mae_cv(model, X, y, verbose=verbose)

        if verbose > 0:
            print(models_table.index[i], ': ', score)

        errors.append(score)

    models_table['error'] = errors
    if estimate_params:
        models_table['params'] = params

In [None]:
# Load the training and test sets.
traindata=pd.read_csv('train.csv')
testdata=pd.read_csv('test.csv')

# The customer id is not used as a feature; keep the test ids for the submission file.
traindata=traindata.drop('ID_Customer',axis=1)
test_ids=testdata['ID_Customer']
testdata=testdata.drop('ID_Customer',axis=1)

print('Number of rows and columns of the training set: ',traindata.shape)
# Report the shape of the test set (the training shape was printed here by mistake).
print('Number of rows and columns of the test set: ',testdata.shape)

# Split the training data into features and the target column.
X_traindata=traindata.drop('Poder_Adquisitivo',axis=1)
y_traindata=traindata['Poder_Adquisitivo']

In [None]:
# Candidate regressors, all with default parameters.
# The order must match the order of the parameter grids used for tuning.
models=[
    Ridge(),
    Lasso(),
    ElasticNet(),
    KNeighborsRegressor(),
    HuberRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    # xgboost is imported as `xgb`, so the class must be reached through it.
    xgb.XGBRegressor(),
]


models_table=get_models_table(models)

    
# One parameter grid per candidate model, in the same order as the models list:
# Ridge, Lasso, ElasticNet, KNeighbors, Huber, GradientBoosting, AdaBoost,
# RandomForest, ExtraTrees, XGBoost (empty grid: default parameters only).
param_grid_list=[
    {'alpha':[0.05,0.1,0.3,0.6,1,1.5,3,5,10,15,30,50,80]},
    {'alpha':[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1.0]},
    {'l1_ratio':[0.1, 0.3, 0.5, 0.7, 0.9, 1],'alpha':[0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]},
    {'n_neighbors':[3,5,7,9]},
    {'epsilon':[1.0,1.2,1.35,1.5,1.7,2.0], 'alpha':[0.00005,0.0001,0.0003,0.0006,0.0009,0.0012]},
    {'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [4, 6, 8, 10, 12], 'n_estimators': range(50, 200, 25)},
    {'learning_rate': [0.5, 1, 3,5], 'n_estimators': range(50, 200, 25)},
    # max_features=1.0 (use every feature) replaces the former 'auto' value, which
    # meant the same thing for regressors but is rejected by current scikit-learn.
    {'max_features':[1.0, 'sqrt', 40], 'max_depth': [4, 6, 8, 10, None], 'n_estimators': range(50, 300, 50)},
    {'max_features':[1.0, 'sqrt', 40], 'max_depth': [4, 6, 8, 10, None], 'n_estimators': range(50, 300, 50)},
    {}
]


# Split the training data into a training part and a held-out validation part.
# A fixed random_state makes the split (and the validation error) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X_traindata, y_traindata, test_size=0.33, random_state=42)

In [None]:
# Compare the models, each one tuned over its own parameter grid.
compare_models(
    models_table,
    X_train,
    y_train,
    param_grid_list=param_grid_list,
    estimate_params=True,
    verbose=4,
)

# Faster alternative: compare every model with its default parameters instead:
# compare_models(models_table, X_traindata, y_traindata, verbose=4)
models_table

In [None]:
# Pick the model with the lowest cross-validated error and apply the parameters
# recorded for it in the table, then measure its error on the held-out validation set.
# idxmin() returns the index label (the regressor name) that .loc expects;
# argmin() would return an integer position, which is not a label of this index.
key=models_table['error'].idxmin()

best_model=models_table.loc[key,'model'].set_params(**models_table.loc[key,'params'])
print('Validation mean absolute error: ',mean_absolute_error(y_val,fit_predict(best_model,X_train,y_train,X_val)))

In [None]:
# Train the selected model on the full training data and predict the test set.
# fit_predict already fits the model, so no separate fit call is needed beforehand.
test_predictions=fit_predict(best_model,X_traindata,y_traindata,testdata)

# Assemble the submission: one row per test customer with its predicted target.
submision=pd.DataFrame({
    'ID_Customer': test_ids,
    'Poder_Adquisitivo': test_predictions,
})
print('The description of the submision:\n',submision.describe())
submision.to_csv('submision.csv',index=False)