In [1]:
%%javascript
// Load the notebook front-end extensions used while working in this notebook.
// NOTE(review): `utils` is not defined in this cell; it is presumably a global
// supplied by the classic Jupyter Notebook front end — confirm before relying on it.
utils.load_extension('collapsible_headings/main')
utils.load_extension('hide_input/main')
utils.load_extension('autosavetime/main')
utils.load_extension('execute_time/ExecuteTime')
utils.load_extension('code_prettify/code_prettify')
utils.load_extension('scroll_down/main')
utils.load_extension('jupyter-js-widgets/extension')

<IPython.core.display.Javascript object>

In [2]:
from sklearn import *
import sklearn
import time
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error
from sklearn import pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import zipfile
import os




from pmlb import fetch_data,regression_dataset_names
from tqdm import tqdm
import time

from category_encoders.target_encoder import TargetEncoder

import warnings
warnings.filterwarnings('ignore')


In [4]:
import sktools

In [7]:
def elapsed_time_mins(time1, time2):
    """Return the absolute gap between two timestamps, expressed in minutes.

    Both arguments are treated as seconds. Their order does not matter
    because the absolute difference is taken. The result is rounded to two
    decimal places.
    """
    gap_seconds = np.abs(time1 - time2)
    gap_minutes = gap_seconds / 60
    return np.round(gap_minutes, decimals=2)


def fit_cv_subsample(pipe_cv, X, y, n_max=20_000):
    """Fit a search object on at most the first ``n_max`` rows of the data.

    Parameters
    ----------
    pipe_cv : object
        Must expose ``fit(X, y)`` and, once fitted, ``best_estimator_``.
    X, y : sliceable
        Features and target; only the ``[0:n_max]`` slice of each is used.
    n_max : int
        Upper bound on the number of leading rows passed to ``fit``.

    Returns
    -------
    tuple
        ``(pipe_cv, pipe_cv.best_estimator_)``. This function never fits
        anything on the full ``X``/``y``; the caller does that if needed.
    """
    head = slice(0, n_max)
    pipe_cv.fit(X[head], y[head])
    return pipe_cv, pipe_cv.best_estimator_

In [8]:
# --- Experiment configuration ----------------------------------------------
n_jobs = 1                  # forwarded to GridSearchCV(n_jobs=...)
float_eltype = np.float32   # not referenced by the cells visible in this notebook

# One row per processed dataset; filled by the experiment loop.
resultados = []

n_max = 20_000              # row cap handed to fit_cv_subsample
cv = 4                      # forwarded to GridSearchCV(cv=...)
filter_size = 2_000         # datasets need more rows than this to be processed

# Column names of the final results table, in the order rows are appended.
columns = [
    'NameDataset',
    'enet_te_train', 'enet_te_test',
    'enet_pe_train', 'enet_pe_test',
    'size',
    'enet_te_best_params', 'enet_pe_best_params',
    'time_train_m',
]


In [9]:
# Benchmark: ElasticNet with target encoding vs. percentile encoding on every
# PMLB regression dataset that has more than `filter_size` rows.
# Relies on names defined in earlier cells: n_jobs, cv, n_max, filter_size,
# columns, resultados, fit_cv_subsample and elapsed_time_mins.

split_seed = 0  # fixed seed so the train/test split (and the scores) are reproducible
datasets_dir = './results_regression/datasets/'
grid_results_dir = './results_regression/grid_results/'
final_results_path = './results_regression/pipeline_stracking_probs.csv'

# DataFrame.to_csv does not create missing directories, so make sure they exist.
os.makedirs(grid_results_dir, exist_ok=True)

# Start from an empty list so re-running this cell does not duplicate rows.
resultados.clear()


def _fit_and_score(encoder_step, extra_grid, X_tr, y_tr, X_te, y_te):
    """Tune and evaluate an ``encoder -> StandardScaler -> ElasticNet`` pipeline.

    Parameters
    ----------
    encoder_step : tuple
        ``(step_name, encoder)`` placed first in the pipeline.
    extra_grid : dict
        Encoder-specific grid entries, merged with the ElasticNet grid.
    X_tr, y_tr, X_te, y_te :
        Train/test features and targets.

    Returns
    -------
    tuple
        ``(search, train_mae, test_mae)`` where ``search`` is the fitted
        ``GridSearchCV`` and the scores are mean absolute errors.
    """
    pipe = Pipeline([
        encoder_step,
        ('scaler', sklearn.preprocessing.StandardScaler()),
        ('clf', sklearn.linear_model.ElasticNet())])

    pipe_grid = dict(extra_grid)
    pipe_grid["clf__alpha"] = list(np.logspace(-10, 0, 5))
    pipe_grid["clf__l1_ratio"] = [0, 0.5, 1]

    search = GridSearchCV(pipe, param_grid=pipe_grid, n_jobs=n_jobs, cv=cv,
                          scoring="neg_mean_absolute_error")
    # Hyper-parameters are searched on at most n_max rows ...
    search, best_estimator = fit_cv_subsample(search, X_tr, y_tr, n_max=n_max)
    # ... and the winning configuration is then fitted on the whole training set.
    model = best_estimator.fit(X_tr, y_tr)

    train_mae = mean_absolute_error(y_tr, model.predict(X_tr))
    test_mae = mean_absolute_error(y_te, model.predict(X_te))
    return search, train_mae, test_mae


row_separator = '|---------------|---------------|--------------------|--------------------|'

print('---------------------------------------------------------------------------')
print('|   Data        |  Model        |      Train         |       Test         |')
print(row_separator)

for regression_dataset in regression_dataset_names:

    X, y = fetch_data(regression_dataset, return_X_y=True, local_cache_dir=datasets_dir)
    tic = time.time()

    # Filter by size: only datasets with more than filter_size rows are processed.
    if len(X) <= filter_size:
        continue

    X = pd.DataFrame(X)
    y = pd.DataFrame(y)

    # Encode every feature with fewer than 50 distinct values.
    cols_enc = [col for col in X.columns if X[col].nunique() < 50]

    # Train-Test Split
    X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(
        X, y, random_state=split_seed)

    # Impute missing values with means computed on the training rows only, so
    # that no statistic of the test rows leaks into the fitted models.
    train_means = X_tr.mean()
    X_tr = X_tr.fillna(train_means)
    X_te = X_te.fillna(train_means)

    # Elastic Net + target encoding
    pipe_enet_te, score_enet_te_train, score_enet_te_test = _fit_and_score(
        ('te', TargetEncoder(cols=cols_enc)),
        {},
        X_tr, y_tr, X_te, y_te)

    print('| {0:}    |   enet_te     |     {1:.5f}        |      {2:.5f}       |'.format(regression_dataset[:10],score_enet_te_train,score_enet_te_test))

    # Elastic Net + percentile encoding
    pipe_enet_pe, score_enet_pe_train, score_enet_pe_test = _fit_and_score(
        ('pe', sktools.PercentileEncoder(cols=cols_enc, percentile=50)),
        {"pe__m": [0, 1, 100], "pe__percentile": [25, 50, 75]},
        X_tr, y_tr, X_te, y_te)

    print('| {0:}    |   enet_pe     |     {1:.5f}        |      {2:.5f}       |'.format(regression_dataset[:10],score_enet_pe_train,score_enet_pe_test))

    # Add Results
    resultados.append([regression_dataset,
                       score_enet_te_train, score_enet_te_test,
                       score_enet_pe_train, score_enet_pe_test,
                       X.shape,
                       pipe_enet_te.best_params_,
                       pipe_enet_pe.best_params_,
                       elapsed_time_mins(tic, time.time())])

    # Save partial result: rewrite the summary after every dataset so that an
    # interrupted run keeps the rows that were already computed.
    pd.DataFrame(resultados, columns=columns).set_index('NameDataset').to_csv(final_results_path)

    # Grid Results (one file per encoder and dataset)
    pd.DataFrame(pipe_enet_te.cv_results_).to_csv('./results_regression/grid_results/enet_te_cv_results_{}.csv'.format(regression_dataset))
    pd.DataFrame(pipe_enet_pe.cv_results_).to_csv('./results_regression/grid_results/enet_pe_cv_results_{}.csv'.format(regression_dataset))
    print(row_separator)


print('|-----------------------------------------------------------------|')

# Final Results
final_df = pd.DataFrame(resultados,columns=columns).set_index('NameDataset')
final_df.to_csv(final_results_path)



---------------------------------------------------------------------------
|   Data        |  Model        |      Train         |       Test         |
|---------------|---------------|--------------------|--------------------|
| 1191_BNG_p    |   enet_te     |     720.94928        |      721.45684       |
| 1191_BNG_p    |   enet_pe     |     720.63114        |      721.11185       |
|---------------|---------------|--------------------|--------------------|
| 1193_BNG_l    |   enet_te     |     383.32045        |      384.29923       |
| 1193_BNG_l    |   enet_pe     |     380.97269        |      382.24874       |
|---------------|---------------|--------------------|--------------------|
| 1196_BNG_p    |   enet_te     |     227.34589        |      227.19753       |
| 1196_BNG_p    |   enet_pe     |     227.70487        |      227.58865       |
|---------------|---------------|--------------------|--------------------|
| 1199_BNG_e    |   enet_te     |     9.19208        |      9.25

In [4]:
# Load the summary CSV written by the experiment loop and show it as the cell output.
pd.read_csv('./results_regression/pipeline_stracking_probs.csv')

Unnamed: 0,NameDataset,enet_te_train,enet_te_test,enet_pe_train,enet_pe_test,size,enet_te_best_params,enet_pe_best_params,time_train_m
0,1191_BNG_pbc,720.949283,721.456843,720.63114,721.111848,"(1000000, 18)","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1}","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1, 'pe_...",2.6
1,1193_BNG_lowbwt,383.320452,384.299226,380.972689,382.248743,"(31104, 9)","{'clf__alpha': 0.0031622776601683794, 'clf__l1...","{'clf__alpha': 0.0031622776601683794, 'clf__l1...",1.81
2,1196_BNG_pharynx,227.345894,227.197527,227.704875,227.588652,"(1000000, 10)","{'clf__alpha': 1.0, 'clf__l1_ratio': 1}","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1, 'pe_...",2.25
3,1199_BNG_echoMonths,9.192075,9.258661,9.192075,9.258661,"(17496, 9)","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1}","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1, 'pe_...",0.98
4,1201_BNG_breastTumor,7.54197,7.535178,7.538207,7.532956,"(116640, 9)","{'clf__alpha': 0.0031622776601683794, 'clf__l1...","{'clf__alpha': 0.0031622776601683794, 'clf__l1...",2.29
5,1203_BNG_pwLinear,2.395596,2.400112,2.410536,2.41549,"(177147, 10)","{'clf__alpha': 0.0031622776601683794, 'clf__l1...","{'clf__alpha': 0.0031622776601683794, 'clf__l1...",2.7
6,1595_poker,0.618548,0.617811,0.618587,0.617855,"(1025010, 10)","{'clf__alpha': 1e-10, 'clf__l1_ratio': 0}","{'clf__alpha': 1e-10, 'clf__l1_ratio': 0, 'pe_...",2.75
7,197_cpu_act,5.58188,5.346392,5.58188,5.346392,"(8192, 21)","{'clf__alpha': 1.0, 'clf__l1_ratio': 0.5}","{'clf__alpha': 1.0, 'clf__l1_ratio': 0.5, 'pe_...",0.48
8,201_pol,26.581048,26.439517,26.595679,26.438678,"(15000, 48)","{'clf__alpha': 0.0031622776601683794, 'clf__l1...","{'clf__alpha': 0.0031622776601683794, 'clf__l1...",5.9
9,215_2dplanes,1.92403,1.914455,1.924609,1.91452,"(40768, 10)","{'clf__alpha': 1e-05, 'clf__l1_ratio': 1}","{'clf__alpha': 1e-10, 'clf__l1_ratio': 1, 'pe_...",2.32
