In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import os

In [2]:
#Cargo los sets
X_train = pd.read_csv('data/X_train_ohc.csv', dtype={'id':'int',\
                                           'tipodepropiedad':'category',\
                                           'ciudad':'category', 'provincia':'category',\
                                           'antiguedad':'float', 'habitaciones':'float',\
                                           'garages':'float','banos':'float',\
                                           'metroscubiertos':'float', 'metrostotales':'float',\
                                           'idzona':'float',\
                                           'gimnasio':'bool', 'usosmultiples':'bool',\
                                           'piscina':'bool', 'escuelascercanas':'bool',\
                                           'centroscomercialescercanos':'bool'\
                                           }, index_col='id')
y_train = pd.read_csv('data/y_train.csv',header=None,squeeze=True,index_col=0)
X_test = pd.read_csv('data/X_test_ohc.csv', dtype={'id':'int',\
                                           'tipodepropiedad':'category',\
                                           'ciudad':'category', 'provincia':'category',\
                                           'antiguedad':'float', 'habitaciones':'float',\
                                           'garages':'float','banos':'float',\
                                           'metroscubiertos':'float', 'metrostotales':'float',\
                                           'idzona':'float',\
                                           'gimnasio':'bool', 'usosmultiples':'bool',\
                                           'piscina':'bool', 'escuelascercanas':'bool',\
                                           'centroscomercialescercanos':'bool'\
                                           }, index_col='id')
y_test = pd.read_csv('data/y_test.csv',header=None,squeeze=True,index_col=0)

In [3]:
features_n = pd.read_csv('data/features_importancia_primera_etapa.csv',index_col=0)\
             .head(70).index.tolist()
features_n[0:5]

['metroscubiertos',
 'metrostotales',
 'banos',
 'provincia__Distrito Federal',
 'idzona']

In [4]:
X_train_n = X_train[features_n]
X_test_n = X_test[features_n]

In [5]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [6]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X_train_n):
    rmse = np.sqrt(-cross_val_score(model, X, y_train_log, scoring="neg_mean_squared_error", cv=kfolds, n_jobs=5))
    return (rmse)

In [7]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [8]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [9]:
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)  

In [10]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=1000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       n_jobs=5
                                       )

In [11]:
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=1000,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', n_jobs=5,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [12]:
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [13]:
score = cv_rmse(ridge)
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )


LASSO: 0.4312 (0.0024)
 2019-11-11 11:42:09.872189
elastic net: 0.4313 (0.0024)
 2019-11-11 11:45:08.834746


KeyboardInterrupt: 

In [None]:
print('START Fit')

print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X_train_n), np.array(y_train_log))

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X_train_n, y_train_log)

print('Lasso')
lasso_model_full_data = lasso.fit(X_train_n, y_train_log)

print('Ridge')
ridge_model_full_data = ridge.fit(X_train_n, y_train_log)

print('Svr')
svr_model_full_data = svr.fit(X_train_n, y_train_log)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X_train_n, y_train_log)

print('xgboost')
xgb_model_full_data = xgboost.fit(X_train_n, y_train_log)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X_train_n, y_train_log)

In [None]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X_train_n)) + \
            (0.05 * lasso_model_full_data.predict(X_train_n)) + \
            (0.1 * ridge_model_full_data.predict(X_train_n)) + \
            (0.1 * svr_model_full_data.predict(X_train_n)) + \
            (0.1 * gbr_model_full_data.predict(X_train_n)) + \
            (0.15 * xgb_model_full_data.predict(X_train_n)) + \
            (0.1 * lgb_model_full_data.predict(X_train_n)) + \
            (0.3 * stack_gen_model.predict(np.array(X_train_n))))

In [None]:
print('RMSLE score on train data:')
print(rmsle(y_train_log, blend_models_predict(X_train_n)))