In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from Utilidades import *
from Utilidades_selection_validation import *

In [7]:
# LOAD THE DATASET

folder='Total'

# Read the raw training and test tables from the data folder.
traindata=pd.read_csv(f'{folder}/traindata.csv')
testdata=pd.read_csv(f'{folder}/TEST.csv')

# Keep the test customer ids for the submission file, then remove the
# identifier column from both tables.
test_ids=testdata['ID_Customer']
traindata=traindata.drop(columns='ID_Customer')
testdata=testdata.drop(columns='ID_Customer')

print('Number of rows and columns of the training set: ',traindata.shape)
print('Number of rows and columns of the test set: ',testdata.shape)

# Separate the target column from the remaining training columns.
y_traindata=traindata['Poder_Adquisitivo']
X_traindata=traindata.drop(columns='Poder_Adquisitivo')

Number of rows and columns of the training set:  (363834, 146)
Number of rows and columns of the test set:  (156315, 145)


In [8]:
# MODEL DEFINITIONS AND THEIR HYPERPARAMETER SEARCH RANGES

# Fixed seed for the train/validation split so the validation score is
# reproducible across kernel restarts.
SPLIT_SEED=42

models=[
    Ridge(),
    Lasso(),
    ElasticNet(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(),
    xgb.XGBRegressor(),
    lgb.LGBMRegressor()
]


models_table=get_models_table(models)


# One dict per entry of `models`, in the same order. Each value is a 2-tuple;
# presumably (lower, upper) search bounds consumed by compare_models -- TODO confirm
# against the helper module.
param_grid_list=[
    # LINEAR MODELS
    {'alpha':(0.05,100)},
    {'alpha':(0.0001,1.0)},
    {'l1_ratio':(0.1,1),'alpha':(0.001,1)},

    # ENSEMBLE MODELS
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,8)},
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,8)},

    {'n_estimators':(50,300),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(4,8),
     'min_samples_leaf':(5, 20),'min_samples_split':(2, 12)},

    {'n_estimators':(50,300),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(4,10),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'min_child_weight':(1,10),'colsample_bytree':(0.1,1),'gamma':(0,1.4)},

    {'n_estimators':(100,3000),'learning_rate':(0.005,0.1),'subsample':(0.5,1),'max_depth':(5,15),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'colsample_bytree':(0.6,0.8),'max_bin':(128,512),'num_leaves':(2,32),'min_data_in_leaf':(20,200)}
]

# Guard against the two parallel lists drifting out of sync when one is edited.
assert len(param_grid_list)==len(models), 'param_grid_list must have one entry per model'

# Work with log10 of the target (the original author notes this makes the target
# distribution approximately normal). The value is recomputed from the raw column
# of `traindata` rather than from `y_traindata` itself, so re-running this cell
# does not apply the logarithm twice.
y_traindata = np.log10(traindata['Poder_Adquisitivo'])

# Split the training data into a training part and a held-out validation part.
X_train, X_val, y_train, y_val = train_test_split(
    X_traindata, y_traindata, test_size=0.33, random_state=SPLIT_SEED
)


In [None]:
# MODEL COMPARISON, WITH OR WITHOUT HYPERPARAMETER SEARCH
# Running this cell is NOT recommended: it compares many models on a very large
# dataset, so it will take several hours.
# NOTE(review): compare_models comes from the star-imported helper modules; it
# presumably records its results into models_table in place -- confirm.
compare_models(models_table, X_train, y_train, param_grid_list, verbose=1, cv=4)

# Alternatively, compare the models using each one's default parameters (faster):
#compare_models(models_table, X_traindata, y_traindata, verbose=1, cv=4)#, metric=scorer)
models_table

In [9]:
# KEEP THE BEST MODEL WITH ITS BEST PARAMETER COMBINATION
lightg=lgb.LGBMRegressor(
    objective='huber',
    n_estimators=804,
    learning_rate=0.1,
    subsample=1,
    max_depth=15,
    reg_alpha=1.4,
    reg_lambda=0,
    colsample_bytree=0.8,
    max_bin=357,
    num_leaves=32,
    min_data_in_leaf=200,
)

In [10]:
# COMPUTE THE ERROR OF OUR MODEL ON THE VALIDATION SET
# Fit on the training part and predict the held-out part, then score it.
val_predictions=fit_predict(lightg,X_train,y_train,X_val)
log_scoring(y_val,val_predictions)

4030.2613299009704

In [11]:
# GENERATE THE RESULTS
# Refit on the whole training set and predict the test set. The target passed in
# is y_traindata as it stands at this point, and the predictions are mapped
# through 10**x before being stored.
test_predictions_log=fit_predict(lightg,X_traindata,y_traindata,testdata)
test_predictions=np.power(10,test_predictions_log)

# Start the frame from the saved customer ids, then attach the estimates.
submision=pd.DataFrame({'ID_Customer':test_ids})
submision['PA_Est']=test_predictions

summary=submision.describe()
print('The description of the submision:\n',summary)
submision.to_csv('Test_Mission.txt',index=False)

The description of the submision:
               PA_Est
count  156315.000000
mean    15262.960413
std     11254.956463
min      4477.449743
25%      9504.925282
50%     12722.722337
75%     18008.632730
max    633027.644677
