#### Libraries

In [1]:
%%javascript
utils.load_extension('collapsible_headings/main')
utils.load_extension('hide_input/main')
utils.load_extension('autosavetime/main')
utils.load_extension('execute_time/ExecuteTime')
utils.load_extension('code_prettify/code_prettify')
utils.load_extension('scroll_down/main')
utils.load_extension('jupyter-js-widgets/extension')

<IPython.core.display.Javascript object>

In [2]:
from sklearn import *
import sklearn
import time
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import joblib
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn import pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor, BraggingRegressor
from sklearn.tree import DecisionTreeRegressor

import zipfile
import os

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor




from pmlb import fetch_data,regression_dataset_names
from tqdm import tqdm
import time

from category_encoders.target_encoder import TargetEncoder

import warnings
warnings.filterwarnings('ignore')


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
class TypeSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that filters a type of columns of a given data frame.
    '''
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        #print("Type Selector out shape {}".format(X.select_dtypes(include=[self.dtype]).shape))
        #print(X.select_dtypes(include=[self.dtype]).dtypes)
        return X.select_dtypes(include=[self.dtype])

def elapsed_time_mins (time1,time2):
    elapsed = np.round(np.abs(time1-time2)/60,decimals=2)

    return elapsed



def fit_pipe(pipe,pipe_grid,X,y,subsample=False,n_max=20_000,best_params=True):
    
    if subsample:
        X = X[0:n_max]
        y = y[0:n_max]
    
    # Instantiate the grid
    pipe_cv = GridSearchCV(pipe, param_grid=pipe_grid, n_jobs = n_jobs, cv=cv, scoring="neg_mean_absolute_error")
    
    pipe_cv.fit(X,y)
    
    best_estimator = pipe_cv.best_estimator_.fit( X_tr, y_tr)
    grid_results = pd.DataFrame(pipe_cv.cv_results_)
    
    return best_estimator,grid_results,pipe_cv.best_params_

In [4]:
cd ..

/Users/cmougan/Desktop/sktools


In [5]:
import sktools

In [6]:
cd CARLOS_TEST/

/Users/cmougan/Desktop/sktools/CARLOS_TEST


## Define the data

In [7]:
data = [
    'data/house_kaggle.csv',
    'data/stackoverflow.csv'
]

In [8]:
drop = [
    ['Id'],
    ['Respondent','Salary']
]

In [9]:
cols_enc = [
    ['MSSubClass','MSZoning','LotShape','LandContour','Utilities','LotConfig','Neighborhood','BldgType','HouseStyle','YearBuilt','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterQual','MasVnrType','Heating','HeatingQC'],
    ['Country','Employment','FormalEducation','UndergradMajor','CompanySize','DevType','YearsCoding','LanguageWorkedWith','LanguageDesireNextYear','RaceEthnicity']
]

In [10]:
target = [
    ['SalePrice'],
    ['ConvertedSalary']
    
]

# Loop

In [11]:
n_jobs = 1
float_eltype = np.float32
resultados = []
tic=time.time()

n_max = 20_000
cv = 4
filter_size = 2_000
columns =['NameDataset',
          # Scores
          'bag_train_mae','bag_te_test_mae',
          'bag_te_train_mse','bag_te_test_mse',
          
          'brag_train_mae','brag_te_test_mae',
          'brag_te_train_mse','brag_te_test_mse',
         
          
          
          'size',
          
          # Params
          #'enet_te_best_params','enet_pe_best_params',
          # Time
          'time_train_m']        


In [12]:
print('---------------------------------------------------------------------------')
print('|   Data        |  Model        |      Train         |       Test         |')
print('|---------------|---------------|--------------------|--------------------|')

for i in range(0,len(data)):
    
    # Read data
    df = pd.read_csv(data[i])
    #df = df.sample(frac=0.2,random_state=0)
    
    # Drop columns 
    df = df.drop(columns=drop[i])
    
    # Fillna
    df.fillna(df.mean(), inplace=True)
    # if its just NaNs
    df.fillna(0,inplace=True)
    
    # Train-Test Split
    X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(df.drop(columns=target[i]), df[target[i]])
        
        
    # Bagging
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',max_features='auto'),
                           n_estimators=50,max_samples=0.7,random_state=0)
    pe = sktools.PercentileEncoder(cols= cols_enc[i],percentile=50,m=0)
        

    pipe = Pipeline([
            ('pe',pe),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('clf',clf )])
        
    pipe_grid = { 
        "pe__m":[0,1,100],
        "pe__percentile":[25,50,75],
        }
    
    # Train model
    #bag,grid_results,bag_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)
    bag = pipe.fit(X_tr,y_tr)


    score_bag_train = mean_absolute_error(y_tr, bag.predict(X_tr))
    score_bag_test = mean_absolute_error(y_te,  bag.predict(X_te))
    
    score_bag_train_mse = mean_squared_error(y_tr, bag.predict(X_tr))
    score_bag_test_mse = mean_squared_error(y_te, bag.predict(X_te))
    print('| {0:}    |   bag     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][5:10],score_bag_train,score_bag_test))
    
    # Bragging
    scaler  = sklearn.preprocessing.StandardScaler()
    clf = BraggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mae',max_features='auto'),
                            n_estimators=50,max_samples=0.7,random_state=0)
    pe = sktools.PercentileEncoder(cols= cols_enc[i],percentile=50,m=0)
        

    pipe = Pipeline([
            ('pe',pe),
            ('selector', TypeSelector(np.number)), # Selects Numerical Columns only
            ('clf',clf )])
        
    pipe_grid = { 
        "pe__m":[0,1,100],
        "pe__percentile":[25,50,75],
        }
    
    # Train model
    #brag,grid_results,brag_params = fit_pipe(pipe,pipe_grid,X_tr,y_tr)
    brag = pipe.fit(X_tr,y_tr)


    score_brag_train = mean_absolute_error(y_tr, brag.predict(X_tr))
    score_brag_test = mean_absolute_error(y_te,  brag.predict(X_te))
    
    score_brag_train_mse = mean_squared_error(y_tr, brag.predict(X_tr))
    score_brag_test_mse = mean_squared_error(y_te, brag.predict(X_te))
    print('| {0:}    |   brag     |     {1:.5f}        |      {2:.5f}       |'.format(data[i][5:10],score_brag_train,score_brag_test))
    
     
    # Add Results
    resultados.append([data[i].split('/')[1],
                       #Scores
                       score_bag_train,score_bag_test,
                       score_bag_train_mse,score_bag_test_mse,
                       
                       score_brag_train,score_brag_test,
                       score_brag_train_mse,score_brag_test_mse,
                       
                       
                       # Shape
                       df.shape,
                       
                       # params
                       #enet_te_params,
                       #enet_pe_params,
                       
                       # Time
                       elapsed_time_mins(tic,time.time())])
    print('|---------------|---------------|--------------------|--------------------|')

    
print('|-----------------------------------------------------------------|')  

---------------------------------------------------------------------------
|   Data        |  Model        |      Train         |       Test         |
|---------------|---------------|--------------------|--------------------|
| data/house    |   bag     |     8987.49470        |      16723.65708       |
| data/house    |   brag     |     1412.60913        |      17383.84795       |
|---------------|---------------|--------------------|--------------------|
| data/stack    |   bag     |     23097.17075        |      78586.88307       |
| data/stack    |   brag     |     6417.58174        |      66191.41818       |
|---------------|---------------|--------------------|--------------------|
|-----------------------------------------------------------------|


In [13]:
pd.DataFrame(resultados,columns=columns)

Unnamed: 0,NameDataset,bag_train_mae,bag_te_test_mae,bag_te_train_mse,bag_te_test_mse,brag_train_mae,brag_te_test_mae,brag_te_train_mse,brag_te_test_mse,size,time_train_m
0,house_kaggle.csv,8987.494697,16723.657078,241900900.0,683134200.0,1412.609132,17383.847945,64507390.0,729282000.0,"(1460, 80)",0.1
1,stackoverflow.csv,23097.170753,78586.883067,4796642000.0,47911000000.0,6417.581738,66191.418183,5529915000.0,48557420000.0,"(47702, 127)",275.64


---------------------------------------------------------------------------
|   Data        |  Model        |      Train         |       Test         |
|---------------|---------------|--------------------|--------------------|
| data/house    |   bag         |     8987.49470     |      16723.65708   |
| data/house    |   brag        |     1412.60913     |      17383.84795   |
| data/stack    |   bag         |     23097.17075    |      78586.88307   |
| data/stack    |   brag     |     6417.58174        |      66191.41818       |