In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split,KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.linear_model import Ridge, ElasticNet, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.pipeline import make_pipeline
from bayes_opt import BayesianOptimization
from functools import partial

In [None]:
def get_models_table(models):
    """Build the comparison table for a list of regressors.

    Parameters
    ----------
    models : list
        Estimators (or pipelines) to be compared.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Regressor' (the name given by ``get_model_name``) with the
        columns 'model' (the estimator itself), 'error' (NaN placeholders) and
        'params' (one empty dict per model).
    """
    n_models = len(models)
    columns = {
        'Regressor': [get_model_name(candidate) for candidate in models],
        'model': models,
        'error': np.full(n_models, np.nan),
        'params': [dict() for _ in range(n_models)],
    }
    return pd.DataFrame(columns).set_index('Regressor')



def remove_outlier_predictions(y_pred,y_train):
    """Clamp predictions to the value range observed in the training target.

    Returns a copy of ``y_pred`` in which every value below ``y_train.min()``
    is replaced by that minimum and every value above ``y_train.max()`` by
    that maximum. ``y_pred`` itself is left untouched.
    """
    lower_bound = y_train.min()
    upper_bound = y_train.max()

    # Masks are computed on the untouched input, then applied to the copy.
    too_low = y_pred < lower_bound
    too_high = y_pred > upper_bound

    clamped = y_pred.copy()
    clamped[too_low] = lower_bound
    clamped[too_high] = upper_bound
    return clamped


def get_model_name(model):
    """Return a short display name for an estimator.

    Parameters
    ----------
    model : object
        An estimator or a pipeline. Anything whose type string contains
        'pipeline' is treated as a pipeline and must expose ``steps`` as a
        list of ``(name, estimator)`` pairs.

    Returns
    -------
    str
        For a pipeline, the name of its final step; otherwise the class name
        of ``model``.
    """
    model_type = type(model)
    if 'pipeline' in str(model_type):
        # The estimator is the last step regardless of how many preprocessing
        # steps come before it (a fixed index 1 only works for two-step pipelines).
        return model.steps[-1][0]
    # __name__ is reliable even for classes whose repr has no module prefix.
    return model_type.__name__


def fit_predict(model,X,y,X_test):
    """Fit ``model`` on (X, y) and predict ``X_test``.

    The raw predictions are passed through ``remove_outlier_predictions`` so
    that they stay inside the range of the training target ``y``.
    Note that ``model`` is fitted in place.
    """
    model.fit(X, y)
    raw_predictions = model.predict(X_test)
    return remove_outlier_predictions(raw_predictions, y)


def error_cv(model,X, y, verbose=0, metric='neg_mean_absolute_error', cv=8):
    """Cross-validated error of ``model`` on the dataset (X, y).

    Runs ``cross_val_score`` with the given scoring ``metric`` on all
    available cores and returns the mean of the absolute fold scores, so the
    negated sklearn scorers come back as positive errors.
    """
    fold_scores = cross_val_score(
        model,
        X,
        y,
        scoring=metric,
        n_jobs=-1,
        cv=cv,
        verbose=verbose,
    )
    return np.abs(fold_scores).mean()

def cast_to_int(params):
    """Return a copy of ``params`` with integer-valued hyperparameters cast to int.

    The Bayesian optimiser samples every hyperparameter as a float; the keys
    listed below must be integers for the estimators used in this notebook.
    All other entries (learning rate, subsample, ...) are passed through
    unchanged so that they are not lost when the model is built.

    Parameters
    ----------
    params : dict
        Hyperparameter name -> sampled value. Not modified.

    Returns
    -------
    dict
        New dict with the same keys as ``params``.
    """
    integer_keys = (
        'n_estimators',
        'min_samples_split',
        # Searched over (5, 20) in this notebook; sklearn reads a float here as a fraction.
        'min_samples_leaf',
        'max_depth',
        'max_bin',
        'num_leaves',
        'min_data_in_leaf',
    )
    result = dict(params)
    for key in integer_keys:
        if key in result:
            result[key] = int(result[key])
    return result

def model_evaluate(model,X,y,cv,**params):
    """Objective function for the Bayesian optimiser.

    ``model`` is called with the hyperparameters in ``params`` (after
    ``cast_to_int``) to build a candidate estimator, which is scored with
    ``error_cv``. The error is negated because the optimiser maximises.
    """
    candidate = model(**cast_to_int(params))
    cv_error = error_cv(candidate, X, y, cv=cv)
    return -cv_error

def error_cv_param_grid(model,X, y,param_grid, verbose=0, metric='neg_mean_absolute_error', cv=8):
    """Tune ``model`` with cross-validation and report the best result.

    Ensemble / xgboost / lightgbm models are tuned with Bayesian optimisation,
    where ``param_grid`` maps each hyperparameter to a ``(low, high)`` range.
    Every other model is tuned with an exhaustive ``GridSearchCV`` over
    ``param_grid``.

    Parameters
    ----------
    model : estimator
        The model to tune; its current configuration is kept for all
        hyperparameters that are not being searched.
    X, y : training data.
    param_grid : dict
        Search space (ranges for the Bayesian branch, value lists otherwise).
    verbose, metric : forwarded to ``GridSearchCV`` only; the Bayesian branch
        goes through ``model_evaluate``, which uses the ``error_cv`` defaults.
    cv : number of folds / CV splitter.

    Returns
    -------
    (float, dict)
        Absolute value of the best cross-validation score and the best
        hyperparameter combination.
    """
    model_type = str(type(model))
    uses_bayes_search = any(tag in model_type for tag in ('ensemble', 'xgboost', 'lightgbm'))

    if not uses_bayes_search:
        grid_model = GridSearchCV(model, param_grid, scoring=metric, n_jobs=-1, cv=cv,
                                  refit=False, verbose=verbose).fit(X, y)
        return abs(grid_model.best_score_), grid_model.best_params_

    # Build candidates from the model's own configuration (e.g. loss='huber'),
    # not from the bare class, so non-searched settings are preserved. Keyword
    # arguments supplied at call time override the ones stored in the partial.
    model_factory = partial(type(model), **model.get_params(deep=False))
    objective = partial(model_evaluate, model=model_factory, X=X, y=y, cv=cv)

    rfcBO = BayesianOptimization(objective, param_grid)
    rfcBO.maximize(n_iter=1)

    results = rfcBO.res
    if isinstance(results, dict):
        # bayes_opt < 1.0: res is a dict holding the best point under 'max'.
        best_value = results['max']['max_val']
        best_params = results['max']['max_params']
    else:
        # bayes_opt >= 1.0: the best point is exposed through the `max` property.
        best_value = rfcBO.max['target']
        best_params = rfcBO.max['params']

    # The optimiser reports raw floats; return the params in the same form that
    # was actually passed to the evaluated model so set_params can use them.
    return abs(best_value), cast_to_int(best_params)


def compare_models(models_table, X, y, estimate_params=False, verbose=0, metric='neg_mean_absolute_error', cv=8):
    """Fill a models table with the cross-validation error of every model.

    Parameters
    ----------
    models_table : pandas.DataFrame
        Table with a 'model' column, as built by ``get_models_table``.
        It is updated in place.
    X, y : dataset used for cross-validation.
    estimate_params : False or list of dict
        When a list is given, entry ``i`` is the search space for the model in
        row ``i`` and the hyperparameters are tuned with
        ``error_cv_param_grid``. When falsy, each model is scored as
        configured with ``error_cv``.
    verbose : int
        When > 0 the error of each model is printed as it becomes available.
    metric, cv : forwarded to the scoring helpers.

    Returns
    -------
    pandas.DataFrame
        The same ``models_table`` object, with 'error' filled in (and 'params'
        when ``estimate_params`` is given).
    """
    errors = []
    params = []
    for i, model in enumerate(models_table['model']):
        if estimate_params:
            score, best_params = error_cv_param_grid(model, X, y, estimate_params[i],
                                                     verbose=verbose, metric=metric, cv=cv)
            params.append(best_params)
        else:
            # Score the model stored in the table, not a global list.
            score = error_cv(model, X, y, verbose=verbose, metric=metric, cv=cv)

        if verbose>0:
            print(models_table.index[i],': ',score)

        errors.append(score)

    models_table['error'] = errors
    if estimate_params:
        models_table['params'] = params

    return models_table
        
         
class Stacking_model(BaseEstimator, RegressorMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta-model.

    Parameters
    ----------
    base_models : list
        Estimators used as first level; each one is cloned and fitted once
        per fold.
    meta_model : estimator
        Cloned and fitted on the out-of-fold predictions of the base models.
    n_folds : int, default 5
        Number of KFold splits used to build the out-of-fold predictions.
    metric : callable, default mean_absolute_error
        Called as ``metric(y_true, y_pred)`` by ``score``.
    """

    def __init__(self, base_models, meta_model, n_folds=5, metric=mean_absolute_error):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.metric = metric

    @staticmethod
    def _as_array(data):
        """Return the underlying array of a pandas object; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            # `.values` replaces `as_matrix()`, which newer pandas no longer provides.
            return data.values
        return data

    def fit(self, X, y):
        """Fit the base models fold by fold, then the meta-model on their out-of-fold predictions."""
        X_matrix = self._as_array(X)
        y_matrix = self._as_array(y)

        # One list of fitted fold-clones per base model.
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        out_of_fold_predictions = np.zeros((X_matrix.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X_matrix, y_matrix):
                instance = clone(model)
                self.base_models_[i].append(instance)

                instance.fit(X_matrix[train_index], y_matrix[train_index])
                # Each row is predicted by the clone that did not see it during training.
                out_of_fold_predictions[holdout_index, i] = instance.predict(X_matrix[holdout_index])

        self.meta_model_.fit(out_of_fold_predictions, y_matrix)
        return self

    def predict(self, X_test):
        """Average each base model's fold-clones and feed the averages to the meta-model."""
        # Base models were fitted on plain arrays, so predict on plain arrays too.
        X_matrix = self._as_array(X_test)
        meta_features = np.column_stack([
            np.mean([model.predict(X_matrix) for model in fold_models], axis=0)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)

    def score(self,X_test,y_test):
        """Return ``self.metric(y_test, predictions)``.

        With the default metric this is an error, i.e. lower is better.
        """
        return self.metric(y_test,self.predict(X_test))

In [None]:
folder='Total'

# Read the training and test sets.
traindata=pd.read_csv(folder+'/traindata.csv')
testdata=pd.read_csv(folder+'/TEST.csv')

# The customer id is not a feature; keep the test ids for the submission file.
traindata=traindata.drop('ID_Customer',axis=1)
test_ids=testdata['ID_Customer']
testdata=testdata.drop('ID_Customer',axis=1)

# Keep only training rows whose target is below 1,000,000.
traindata=traindata[traindata['Poder_Adquisitivo']<1000000]

print('Number of rows and columns of the training set: ',traindata.shape)
print('Number of rows and columns of the test set: ',testdata.shape)

# Split the training frame into features and target.
X_traindata=traindata.drop('Poder_Adquisitivo',axis=1)
y_traindata=traindata['Poder_Adquisitivo']

In [None]:
# Candidate regressors. The order must match param_grid_list below, because
# compare_models pairs them by position.
models=[
    make_pipeline(RobustScaler(), Ridge()),
    make_pipeline(RobustScaler(), Lasso()),
    make_pipeline(RobustScaler(), ElasticNet()),
    HuberRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(loss='huber'),
    xgb.XGBRegressor(),
    lgb.LGBMRegressor(objective='huber')
]


models_table=get_models_table(models)


# One search space per model: lists of values for the grid-searched linear
# models, (low, high) ranges for the Bayesian-optimised ensemble models.
param_grid_list=[
    #LINEAR MODELS
    {'ridge__alpha':[0.05,0.1,0.3,0.6,1,1.5,3,5,10,15,30,50,80,100]},
    {'lasso__alpha':[0.0001,0.0003,0.0006,0.001,0.003,0.006,0.01,0.03,0.06,0.1,0.3,0.6,1.0]},
    {'elasticnet__l1_ratio':[0.1,0.3,0.6,0.9,1],'elasticnet__alpha':[0.001,0.005,0.01,0.05,0.1,0.5,1]},
    {'epsilon':[1.0,1.2,1.35,1.5,1.7,2.0], 'alpha':[0.00005,0.0001,0.0003,0.0006,0.0009,0.0012]},
    
    #ENSEMBLE MODELS
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,12)},
    {'n_estimators': (10, 300),'min_samples_split': (2, 25),'max_features': (0.1, 0.999),'max_depth': (4,12)},
    
    # GradientBoostingRegressor: alpha is the huber quantile and must lie strictly between 0 and 1.
    {'n_estimators':(100,3000),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(5,15),
     'min_samples_leaf':(5, 20),'min_samples_split':(2, 12),'alpha':(0.01,0.99)},
    
    {'n_estimators':(100,3000),'learning_rate':(0.05,0.5),'subsample':(0.5,1),'max_depth':(5,15),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'min_child_weight':(1,10),'colsample_bytree':(0.1,1),'gamma':(0,1.4)},
    
    {'n_estimators':(100,3000),'learning_rate':(0.005,0.1),'subsample':(0.5,1),'max_depth':(5,15),'reg_alpha':(0,1.4),
     'reg_lambda':(0,1.4),'colsample_bytree':(0.6,0.8),'max_bin':(128,512),'num_leaves':(2,32),'min_data_in_leaf':(20,200)}
]


# Split the training data into train and validation parts.
# The seed is fixed so that re-running the notebook reproduces the same split.
X_train, X_val, y_train, y_val = train_test_split(X_traindata, y_traindata, test_size=0.33, random_state=156)

In [None]:
# Compare the models, each one tuned over its own search space from param_grid_list.
# compare_models writes the errors (and best params) into models_table in place.
compare_models(models_table, X_train, y_train, param_grid_list, verbose=1, cv=4)

# Alternatively, compare the models with their default parameters (faster):
#compare_models(models_table, X_traindata, y_traindata, verbose=1, cv=4)#, metric=scorer)
models_table

# Run everything up to here

In [None]:
# Pick the model with the lowest cross-validation error from the table above,
# configure it with its tuned parameters, and check it on the validation split.
# idxmin() returns the index label (the regressor name), which is what .loc
# needs; argmin() returns an integer position on current pandas.
key=models_table['error'].idxmin()

best_model=models_table.loc[key,'model'].set_params(**models_table.loc[key,'params'])
print('Validation mean absolute error: ',mean_absolute_error(y_val,fit_predict(best_model,X_train,y_train,X_val)))

In [None]:
# fit_predict fits best_model on the full training data before predicting the
# test set, so a separate best_model.fit(...) call beforehand would only train
# the model twice.
submision=pd.DataFrame()
submision['ID_Customer']=test_ids
submision['PA_Est']=fit_predict(best_model,X_traindata,y_traindata,testdata)
print('The description of the submision:\n',submision.describe())
submision.to_csv('Test_Mission.txt',index=False)

In [None]:
# Predict the validation split with a stacking ensemble made of two Ridge base
# models and a Ridge meta-model, trained on the train split.
y_pred6=fit_predict(Stacking_model([Ridge(),Ridge()],Ridge()),X_train,y_train,X_val)

In [None]:
# NOTE(review): transformacion_exponencial is not defined in any cell visible
# here - confirm it exists before running this cell on a fresh kernel.
# y_pred6 is overwritten with the transformed values, so re-running this cell
# applies the transformation again to its own output.
y_pred6=transformacion_exponencial(y_pred6)

In [None]:
# Validation mean absolute error of the stacking predictions (y_pred6).
mean_absolute_error(y_val,y_pred6)

In [None]:
# Cross-validated error of the Ridge stacking ensemble on the full training
# data, using the error_cv defaults (8 folds, mean absolute error).
error_cv(Stacking_model([Ridge(),Ridge()],Ridge()),X_traindata,y_traindata,verbose=4)

In [None]:
# Validation predictions of a gradient-boosting regressor with Huber loss and 3000 trees.
y_pred=fit_predict(GradientBoostingRegressor(n_estimators=3000,loss='huber'),X_train,y_train,X_val)

In [None]:
# Validation predictions of an XGBoost regressor with hand-set hyperparameters.
y_pred2=fit_predict(xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1),X_train,y_train,X_val)

In [None]:
# Validation predictions of a LightGBM regressor with hand-set hyperparameters.
y_pred3=fit_predict(lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11),X_train,y_train,X_val)

In [None]:
# Validation mean absolute error of the gradient-boosting predictions (y_pred).
mean_absolute_error(y_val,y_pred)

In [None]:
# Validation mean absolute error of the XGBoost predictions (y_pred2).
mean_absolute_error(y_val,y_pred2)

In [None]:
# Validation mean absolute error of the LightGBM predictions (y_pred3).
mean_absolute_error(y_val,y_pred3)

In [None]:
# NOTE(review): y_pred4 is not assigned in any cell visible here - confirm
# where it comes from; on a fresh kernel this cell cannot run as shown.
mean_absolute_error(y_val,y_pred4)

In [None]:
# Cross-validated mean absolute error of the hand-tuned XGBoost regressor on
# the train split. error_cv is this notebook's CV helper; no mae_cv function
# is defined in it.
error_cv(xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1),X_train,y_train)