#### Libraries

In [1]:
%%javascript
utils.load_extension('collapsible_headings/main')
utils.load_extension('hide_input/main')
utils.load_extension('autosavetime/main')
utils.load_extension('execute_time/ExecuteTime')
utils.load_extension('code_prettify/code_prettify')
utils.load_extension('scroll_down/main')
utils.load_extension('jupyter-js-widgets/extension')

<IPython.core.display.Javascript object>

In [2]:
from sklearn import *
import sklearn
import time
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import joblib
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn import pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import zipfile
import os

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor




from pmlb import fetch_data,regression_dataset_names
from tqdm import tqdm
import time

from category_encoders.target_encoder import TargetEncoder

import warnings
warnings.filterwarnings('ignore')

import sktools


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
class TypeSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that filters a type of columns of a given data frame.
    '''
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        #print("Type Selector out shape {}".format(X.select_dtypes(include=[self.dtype]).shape))
        #print(X.select_dtypes(include=[self.dtype]).dtypes)
        return X.select_dtypes(include=[self.dtype])

def elapsed_time_mins (time1,time2):
    elapsed = np.round(np.abs(time1-time2)/60,decimals=2)

    return elapsed



def fit_pipe(pipe,pipe_grid,X,y,subsample=False,n_max=20_000,best_params=True):
    
    if subsample:
        X = X[0:n_max]
        y = y[0:n_max]
    
    # Instantiate the grid
    pipe_cv = GridSearchCV(pipe, param_grid=pipe_grid, n_jobs = n_jobs, cv=cv, scoring="neg_mean_absolute_error")
    
    pipe_cv.fit(X,y)
    
    best_estimator = pipe_cv.best_estimator_.fit( X_tr, y_tr)
    grid_results = pd.DataFrame(pipe_cv.cv_results_)
    
    return best_estimator,grid_results,pipe_cv.best_params_

## Define the data

In [7]:
data = [
    'data/house_kaggle.csv',
    'data/stackoverflow.csv'
]

In [8]:
drop = [
    ['Id'],
    ['Respondent','Salary']
]

In [9]:
cols_enc = [
    ['MSSubClass','MSZoning','LotShape','LandContour','Utilities','LotConfig','Neighborhood','BldgType','HouseStyle','YearBuilt','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterQual','MasVnrType','Heating','HeatingQC'],
    ['Country','Employment','FormalEducation','UndergradMajor','CompanySize','DevType','YearsCoding','LanguageWorkedWith','LanguageDesireNextYear','RaceEthnicity']
]

In [10]:
target = [
    ['SalePrice'],
    ['ConvertedSalary']
    
]

# Loop

In [11]:
n_jobs = 1
float_eltype = np.float32
resultados = []
tic=time.time()

n_max = 20_000
cv = 4
filter_size = 2_000
columns =['NameDataset',
          # Scores
          'enet_te_train_mae','enet_te_test_mae',
          'enet_te_train_mse','enet_te_test_mse',
          
          'enet_pe_train_mae','enet_pe_test_mae',
          'enet_pe_train_mse','enet_pe_test_mse',
          
          'xgb_te_train_mae','xgb_te_test_mae',
          'xgb_te_train_mse','xgb_te_test_mse',
          
          'xgb_pe_train_mae','xgb_pe_test_mae',
          'xgb_pe_train_mse','xgb_pe_test_mse',
          
          
          'size',
          
          # Params
          'enet_te_best_params','enet_pe_best_params',
          # Time
          'time_train_m']    


In [12]:
print('---------------------------------------------------------------------------')
print('|   Data        |  Model        |      Train         |       Test         |')
print('|---------------|---------------|--------------------|--------------------|')

for i in range(0,len(data)):
    
    # Read data
    df = pd.read_csv(data[i])
    df = df.sample(frac=0.2)
    
    # Drop columns 
    df = df.drop(columns=drop[i])
    
    # Fillna
    df.fillna(df.mean(), inplace=True)
    # if its just NaNs
    df.fillna(0,inplace=True)
    
    # Train-Test Split
    X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(df.drop(columns=target[i]), df[target[i]])
        
    # Elastic Net + target encoding
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = sklearn.linear_model.ElasticNet()
    te = TargetEncoder(cols=cols_enc[i])
        

    pipe = Pipeline([
            ('te',te),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('scaler', scaler),
            ('clf',clf )])
        
    pipe_grid = {}
    
    # Train model
    enet_te,grid_results,enet_te_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)


    score_enet_te_train = mean_absolute_error(y_tr, enet_te.predict(X_tr))
    score_enet_te_test = mean_absolute_error(y_te, enet_te.predict(X_te))
    
    score_enet_te_train_mse = mean_squared_error(y_tr, enet_te.predict(X_tr))
    score_enet_te_test_mse = mean_squared_error(y_te, enet_te.predict(X_te))

    print('| {0:}    |   enet_te     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][:10],score_enet_te_train,score_enet_te_test))

    
    # Elastic Net + percentile encoding
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = sklearn.linear_model.ElasticNet()
    pe = sktools.PercentileEncoder(cols= cols_enc[i],percentile=50,m=0)
        

    pipe = Pipeline([
            ('pe',pe),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('scaler', scaler),
            ('clf',clf )])
        
    pipe_grid = { 
        "pe__m":[0,1,100],
        "pe__percentile":[25,50,75],
        }
    
    # Train model
    enet_pe,grid_results,enet_pe_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)


    score_enet_pe_train = mean_absolute_error(y_tr, enet_pe.predict(X_tr))
    score_enet_pe_test = mean_absolute_error(y_te, enet_pe.predict(X_te))
    
    score_enet_pe_train_mse = mean_squared_error(y_tr, enet_pe.predict(X_tr))
    score_enet_pe_test_mse = mean_squared_error(y_te, enet_pe.predict(X_te))
    print('| {0:}    |   enet_pe     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][:10],score_enet_pe_train,score_enet_pe_test))
    
         
    # xgb + target encoding
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = LGBMRegressor()
    te = TargetEncoder(cols=cols_enc[i])
        

    pipe = Pipeline([
            ('te',te),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('scaler', scaler),
            ('clf',clf )])
        
    pipe_grid = {}
    
    # Train model
    xgb_te,grid_results,xgb_te_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)


    score_xgb_te_train = mean_absolute_error(y_tr, xgb_te.predict(X_tr))
    score_xgb_te_test = mean_absolute_error(y_te, xgb_te.predict(X_te))
    
    score_xgb_te_train_mse = mean_squared_error(y_tr, xgb_te.predict(X_tr))
    score_xgb_te_test_mse = mean_squared_error(y_te, xgb_te.predict(X_te))

    print('| {0:}    |   xgb_te     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][:10],score_xgb_te_train,score_xgb_te_test))

    # xgb + percentile encoding
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = LGBMRegressor()
    pe = sktools.PercentileEncoder(cols= cols_enc[i],percentile=50,m=0)
        

    pipe = Pipeline([
            ('pe',pe),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('scaler', scaler),
            ('clf',clf )])
        
    pipe_grid = { 
        "pe__m":[0,1,100],
        "pe__percentile":[25,50,75],
        }
    
    # Train model
    xgb_pe,grid_results,xgb_pe_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)


    score_xgb_pe_train = mean_absolute_error(y_tr, xgb_pe.predict(X_tr))
    score_xgb_pe_test = mean_absolute_error(y_te, xgb_pe.predict(X_te))
    
    score_xgb_pe_train_mse = mean_squared_error(y_tr, xgb_pe.predict(X_tr))
    score_xgb_pe_test_mse = mean_squared_error(y_te, xgb_pe.predict(X_te))
    
    print('| {0:}    |   xgb_pe     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][:10],score_xgb_pe_train,score_xgb_pe_test))

    # Add Results
    resultados.append([data[i].split('/')[1],
                       #Scores
                       score_enet_te_train,score_enet_te_test,
                       score_enet_te_train_mse,score_enet_te_test_mse,
                       
                       score_enet_pe_train,score_enet_pe_test,
                       score_enet_pe_train_mse,score_enet_pe_test_mse,
                       
                       score_xgb_te_train,score_xgb_te_test,
                       score_xgb_te_train_mse,score_xgb_te_test_mse,
                       
                       score_xgb_pe_train,score_xgb_pe_test,
                       score_xgb_pe_train_mse,score_xgb_pe_test_mse,
                       
                       # Shape
                       df.shape,
                       
                       # params
                       enet_te_params,
                       enet_pe_params,
                       
                       # Time
                       elapsed_time_mins(tic,time.time())])
    print('|---------------|---------------|--------------------|--------------------|')

    
print('|-----------------------------------------------------------------|')  

---------------------------------------------------------------------------
|   Data        |  Model        |      Train         |       Test         |
|---------------|---------------|--------------------|--------------------|
| data/house    |   enet_te     |     19913.07752        |      28497.79219       |
| data/house    |   enet_pe     |     20789.79583        |      28081.39420       |
| data/house    |   xgb_te     |     7993.95809        |      26006.88782       |
| data/house    |   xgb_pe     |     8291.61565        |      28063.79100       |
|---------------|---------------|--------------------|--------------------|
| data/stack    |   enet_te     |     73973.73198        |      76926.43126       |
| data/stack    |   enet_pe     |     45679.04494        |      63967.78923       |
| data/stack    |   xgb_te     |     44344.95849        |      74524.76223       |
| data/stack    |   xgb_pe     |     23041.81891        |      67886.65787       |
|---------------|-------------