In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from Utilidades import *
from Utilidades_selection_validation import *

In [None]:
# Directory that holds the competition CSV files.
folder='Total'

# Load the labelled training set and the unlabelled test set.
traindata = pd.read_csv(folder+'/traindata.csv')
testdata = pd.read_csv(folder+'/TEST.csv')

# The customer identifier is dropped from both frames; the test-set ids are
# kept aside first because the submission file needs them.
test_ids = testdata['ID_Customer']
traindata = traindata.drop(columns='ID_Customer')
testdata = testdata.drop(columns='ID_Customer')

print('Number of rows and columns of the training set: ',traindata.shape)
print('Number of rows and columns of the test set: ',testdata.shape)

# Separate the target column from the predictors.
y_traindata = traindata['Poder_Adquisitivo']
X_traindata = traindata.drop(columns='Poder_Adquisitivo')

In [None]:
# Candidate regressors, grouped by family. The order matters: the search
# spaces in `param_grid_list` appear to be listed in this same order.
models = []

# Regularised linear models.
models += [Ridge(), Lasso(), ElasticNet()]

# Randomised tree ensembles.
models += [RandomForestRegressor(), ExtraTreesRegressor()]

# Gradient-boosting implementations (scikit-learn, XGBoost, LightGBM).
models += [
    GradientBoostingRegressor(loss='huber'),
    xgb.XGBRegressor(),
    lgb.LGBMRegressor(objective='huber'),
]

# Table that the later cells read model, params and error from.
# `get_models_table` is not defined in this notebook; presumably it comes
# from one of the star-imported Utilidades modules.
models_table = get_models_table(models)

    
# Hyper-parameter search space for each entry of `models`, in the same order.
# Each value is a (lower, upper) pair; presumably these are the bounds that
# `compare_models` searches within — TODO confirm against the helper.
param_grid_list=[
    # --- Linear models ---
    # Ridge
    {'alpha':(0.05,100)},
    # Lasso
    {'alpha':(0.0001,1.0)},
    # ElasticNet
    {'l1_ratio':(0.1,1),'alpha':(0.001,1)},

    # --- Ensemble models ---
    # RandomForestRegressor
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,12)},
    # ExtraTreesRegressor
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,12)},

    # GradientBoostingRegressor(loss='huber'). Here 'alpha' is the quantile of
    # the huber loss, which scikit-learn only accepts strictly inside (0, 1);
    # the previous range (0, 1.5) allowed invalid values to be proposed.
    {'n_estimators':(100,3000),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(5,15),
     'min_samples_leaf':(5, 20),'min_samples_split':(2, 12),'alpha':(0.01,0.99)},

    # XGBRegressor
    {'n_estimators':(100,3000),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(5,15),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'min_child_weight':(1,10),'colsample_bytree':(0.1,1),'gamma':(0,1.4)},

    # LGBMRegressor
    {'n_estimators':(100,3000),'learning_rate':(0.005,0.1),'subsample':(0.5,1),'max_depth':(5,15),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'colsample_bytree':(0.6,0.8),'max_bin':(128,512),'num_leaves':(2,32),'min_data_in_leaf':(20,200)}
]

# Model log10 of the target; per the original author this makes the target
# approximately normally distributed. The value is derived from `traindata`
# rather than from `y_traindata` itself so that re-running this cell does not
# apply the logarithm twice.
y_traindata = np.log10(traindata['Poder_Adquisitivo'])

# Hold out a third of the labelled rows for validation. The seed is fixed so
# the split (and every score computed from it) is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_traindata, y_traindata, test_size=0.33, random_state=42
)

# Learn the scaling statistics from the training fold only and apply that
# same transformation to the validation and test features. Fitting a separate
# scaler on each set would put the three sets on different scales and would
# use information from held-out data.
feature_scaler = RobustScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_val = feature_scaler.transform(X_val)
# NOTE: `testdata` is overwritten with its scaled version, so re-run the
# data-loading cell before re-running this one.
testdata = feature_scaler.transform(testdata)


In [None]:
# Compare the candidate models, each one tuned over its own search space.
# The results are read back from `models_table` by the following cells.
compare_models(
    models_table,
    X_train,
    y_train,
    param_grid_list,
    verbose=1,
    cv=4,
)

# Faster alternative: compare the models using only their default parameters.
#compare_models(models_table, X_traindata, y_traindata, verbose=1, cv=4)#, metric=scorer)

# Display the comparison table.
models_table

# Run everything up to this point.

In [None]:
# Choose the model (and its parameters) with the lowest error in the
# comparison table, then check it on the held-out validation fold.
#
# idxmin() returns the index *label* of the minimum, which is what the
# label-based .loc lookups below require. argmin() returns an integer
# position and only coincides with the label on a default RangeIndex.
key = models_table['error'].idxmin()

best_model = models_table.loc[key,'model'].set_params(**models_table.loc[key,'params'])

# y_val is on the log10 scale, so this error is in log10 units of the target.
print(
    'Validation mean absolute error: ',
    mean_absolute_error(y_val, fit_predict(best_model, X_train, y_train, X_val)),
)

In [None]:
# Refit the selected model on every labelled row. The scaled train and
# validation folds are stacked back together (instead of using the unscaled
# X_traindata) so the training features share the preprocessing that was
# applied to `testdata`.
X_full = np.vstack([X_train, X_val])
y_full = np.concatenate([y_train, y_val])
best_model.fit(X_full, y_full)

# The model was trained on log10(target); raise 10 to the predictions so the
# submitted estimates are expressed in the original units of the target.
submision=pd.DataFrame()
submision['ID_Customer']=test_ids
submision['PA_Est']=np.power(10.0, best_model.predict(testdata))
print('The description of the submision:\n',submision.describe())
submision.to_csv('Test_Mission.txt',index=False)

In [None]:
# Stacking ensemble built from three Ridge regressors (a list of two plus a
# third one; presumably base learners and meta-learner — TODO confirm against
# Stacking_model). Trained on the training fold, predicted on the validation fold.
y_pred6 = fit_predict(
    Stacking_model([Ridge(), Ridge()], Ridge()),
    X_train,
    y_train,
    X_val,
)

In [None]:
# Apply the project's exponential transform to the stacking predictions
# (presumably the inverse of the log transform applied to the target — TODO confirm).
# NOTE(review): this overwrites y_pred6 in place, so running the cell twice
# applies the transform twice; re-run the prediction cell first.
y_pred6=transformacion_exponencial(y_pred6)

In [None]:
# y_pred6 has already been passed through transformacion_exponencial, while
# y_val is still on the log10 scale. Apply the same transform to y_val so
# both sides of the error are on the same scale (y_val itself is left untouched).
mean_absolute_error(transformacion_exponencial(y_val), y_pred6)

In [None]:
# Evaluate the Ridge stacking ensemble with the project's error_cv helper
# (presumably a cross-validated error — TODO confirm).
# NOTE(review): X_traindata is never scaled in this notebook, unlike X_train;
# y_traindata is the log10-transformed target.
error_cv(Stacking_model([Ridge(),Ridge()],Ridge()),X_traindata,y_traindata,verbose=4)

In [None]:
# Gradient boosting with huber loss and 3000 trees: train on the training
# fold and predict the validation fold (predictions are presumably on the
# log10 scale of y_train — TODO confirm against fit_predict).
y_pred=fit_predict(GradientBoostingRegressor(n_estimators=3000,loss='huber'),X_train,y_train,X_val)

In [None]:
# XGBoost regressor with hand-picked hyper-parameters: train on the training
# fold and predict the validation fold.
y_pred2 = fit_predict(
    xgb.XGBRegressor(
        colsample_bytree=0.4603,
        gamma=0.0468,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=1.7817,
        n_estimators=2200,
        reg_alpha=0.4640,
        reg_lambda=0.8571,
        subsample=0.5213,
        silent=1,
        random_state=7,
        nthread=-1,
    ),
    X_train,
    y_train,
    X_val,
)

In [None]:
# LightGBM regressor with hand-picked hyper-parameters: train on the training
# fold and predict the validation fold.
y_pred3 = fit_predict(
    lgb.LGBMRegressor(
        objective='regression',
        num_leaves=5,
        learning_rate=0.05,
        n_estimators=720,
        max_bin=55,
        bagging_fraction=0.8,
        bagging_freq=5,
        feature_fraction=0.2319,
        feature_fraction_seed=9,
        bagging_seed=9,
        min_data_in_leaf=6,
        min_sum_hessian_in_leaf=11,
    ),
    X_train,
    y_train,
    X_val,
)

In [None]:
# Validation MAE of the GradientBoostingRegressor predictions (y_pred).
# y_val comes from the log10-transformed target, so the error is in log10 units.
mean_absolute_error(y_val,y_pred)

In [None]:
# Validation MAE of the XGBoost predictions (y_pred2).
# y_val comes from the log10-transformed target, so the error is in log10 units.
mean_absolute_error(y_val,y_pred2)

In [None]:
# Validation MAE of the LightGBM predictions (y_pred3).
# y_val comes from the log10-transformed target, so the error is in log10 units.
mean_absolute_error(y_val,y_pred3)

In [None]:
# NOTE(review): y_pred4 is not assigned in any visible cell of this notebook,
# so on a fresh kernel this cell is expected to raise NameError unless a
# star import provides the name — TODO restore the missing cell or remove this one.
mean_absolute_error(y_val,y_pred4)

In [None]:
# Score the hand-tuned XGBoost configuration on the training fold with the
# project's mae_cv helper (presumably a cross-validated MAE — TODO confirm).
mae_cv(
    xgb.XGBRegressor(
        colsample_bytree=0.4603,
        gamma=0.0468,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=1.7817,
        n_estimators=2200,
        reg_alpha=0.4640,
        reg_lambda=0.8571,
        subsample=0.5213,
        silent=1,
        random_state=7,
        nthread=-1,
    ),
    X_train,
    y_train,
)

In [None]:
# Evaluate a default Ridge model with the project's error_cv helper
# (presumably a 4-fold cross-validated error — TODO confirm).
# NOTE(review): X_traindata is never scaled in this notebook, unlike X_train;
# y_traindata is the log10-transformed target.
error_cv(Ridge(),X_traindata,y_traindata,cv=4,verbose=1)