#### Libraries

In [1]:
%%javascript
// Load each notebook extension, one call per entry, in the listed order.
[
    'collapsible_headings/main',
    'hide_input/main',
    'autosavetime/main',
    'execute_time/ExecuteTime',
    'code_prettify/code_prettify',
    'scroll_down/main',
    'jupyter-js-widgets/extension'
].forEach(function (extensionPath) {
    utils.load_extension(extensionPath);
});

<IPython.core.display.Javascript object>

In [2]:
from sklearn import *
import sklearn
import time
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
# Show every column when displaying wide frames. The fully qualified key is
# required: the bare 'max_columns' pattern can match more than one option
# (e.g. the Styler render option in newer pandas) and then raises OptionError.
pd.set_option('display.max_columns', None)
import joblib
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn import pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,RepeatedKFold
from scipy.stats import wilcoxon


from sklearn.feature_selection import VarianceThreshold
import zipfile
import os

from lightgbm import LGBMRegressor


import random
random.seed(0)
# Seeding the stdlib generator alone does not affect NumPy's global RNG, which
# is what scikit-learn uses when no random_state is passed. Seed it as well so
# a fresh "Restart & Run All" yields the same splits.
np.random.seed(0)

from tqdm import tqdm
import time

from category_encoders.target_encoder import TargetEncoder
from category_encoders.m_estimate import MEstimateEncoder

import warnings
warnings.filterwarnings('ignore')

import sktools
from tabulate import tabulate

In [3]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only the columns of a data frame whose dtype
    matches ``dtype``.
    """

    def __init__(self, dtype):
        # Forwarded unchanged to DataFrame.select_dtypes(include=[dtype]).
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by ``self.dtype``.

        ``X`` must be a pandas DataFrame; anything else trips the assertion.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)


def elapsed_time_mins(time1, time2):
    """Return the absolute gap between two timestamps, in minutes.

    Both arguments are treated as values in seconds (the difference is divided
    by 60); their order does not matter. The result is rounded to two decimals.
    """
    gap_seconds = np.abs(time1 - time2)
    gap_minutes = gap_seconds / 60
    return np.round(gap_minutes, decimals=2)


def fit_pipe(
    pipe,
    pipe_grid,
    X,
    y,
    subsample=False,
    n_max=20_000,
    best_params=True,
    n_jobs=None,
    cv=None,
):
    """Grid-search ``pipe`` over ``pipe_grid`` and return the refitted winner.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator) handed to ``GridSearchCV``.
    pipe_grid : dict
        Parameter grid for the search.
    X, y
        Training data; sliced to their first ``n_max`` rows when
        ``subsample`` is true.
    subsample : bool
        Whether to truncate ``X`` and ``y`` to ``n_max`` rows before searching.
    n_max : int
        Row cap used when ``subsample`` is true.
    best_params : bool
        Not used by this function; kept so existing calls keep working.
    n_jobs, cv
        Forwarded to ``GridSearchCV``. When left as ``None`` the module-level
        variable of the same name is used if one exists (the previous
        behaviour of this function), otherwise ``GridSearchCV``'s own default.

    Returns
    -------
    tuple
        ``(best_estimator, grid_results, best_params)`` where ``grid_results``
        is ``cv_results_`` as a DataFrame.
    """
    if subsample:
        X = X[0:n_max]
        y = y[0:n_max]

    # Earlier versions read `n_jobs` and `cv` straight from the notebook
    # globals; keep that as the fallback so calls without the keywords behave
    # as before.
    if n_jobs is None:
        n_jobs = globals().get("n_jobs")
    if cv is None:
        cv = globals().get("cv")

    # Instantiate the grid
    pipe_cv = GridSearchCV(
        pipe,
        param_grid=pipe_grid,
        n_jobs=n_jobs,
        cv=cv,
        scoring="neg_mean_absolute_error",
    )

    pipe_cv.fit(X, y)

    # GridSearchCV refits the best candidate on the data passed to fit()
    # (refit=True is its default), so the estimator is already trained on
    # exactly the X / y used for the search. The former extra fit on the
    # globals X_tr / y_tr ignored the arguments and any subsampling.
    best_estimator = pipe_cv.best_estimator_
    grid_results = pd.DataFrame(pipe_cv.cv_results_)

    return best_estimator, grid_results, pipe_cv.best_params_


def compare_results(grid_1_res, grid_2_res):
    """One-sided Wilcoxon signed-rank comparison of two grid-result frames.

    Both frames are melted to long form and joined on the column name; only
    rows whose column name contains "split" are kept. The test is run with the
    values of ``grid_2_res`` as the first sample and those of ``grid_1_res`` as
    the second, using ``alternative="greater"``.

    Returns the p-value rounded to three decimals.
    """
    long_te = grid_1_res.melt()
    long_pe = grid_2_res.melt()
    paired = long_te.merge(long_pe, on="variable", suffixes=("_te", "_pe"))

    is_split_row = paired["variable"].str.contains("split")
    paired = paired[is_split_row]

    outcome = wilcoxon(paired["value_pe"], paired["value_te"], alternative="greater")

    return outcome.pvalue.round(3)

In [4]:
# Check directories


# Make sure each results directory exists. exist_ok=True creates missing
# directories and leaves existing ones alone, without the gap between a
# separate existence check and the creation call.
for directory in (
    './results_regression/pickle',
    './results_regression/grid_results/',
    './results_regression/partial/',
    './results_regression/datasets/',
):
    os.makedirs(directory, exist_ok=True)


## Define the data

In [5]:
# CSV path of each dataset. The position in this list is the dataset index
# used to look up the matching entries of the other per-dataset lists.
data = [
    f"data/{stem}.csv"
    for stem in (
        "house_kaggle",
        "stackoverflow",
        "so2019",
        "ks",
        "medical_payments_sample",
        "cauchy",
    )
]

In [6]:
# Per-dataset lists of column names to drop. There are six entries, one per
# path in `data`; presumably they follow the same order — TODO confirm.
# NOTE(review): `drop` is not referenced by any cell visible in this section
# (the experiment cell selects columns through `keep` instead).
drop = [
    # Entry 0 (presumably data/house_kaggle.csv)
    [
        "Id",
        "BsmtQual",
        "BsmtCond",
        "BsmtExposure",
        "BsmtFinType1",
        "BsmtFinSF1",
        "BsmtFinType2",
        "BsmtFinSF2",
        "BsmtUnfSF",
        "LowQualFinSF",
        "FullBath",
        "HalfBath",
    ],
    # Entry 1 (presumably data/stackoverflow.csv)
    ["Respondent", "Salary"],
    # Entries 2-5: nothing to drop
    [],
    [],
    [],
    [],
]

In [7]:
# Categorical columns handed to the encoders (`cols=cols_enc[i]`), one list per
# dataset, indexed by the same `i` that selects `data[i]`.
# NOTE(review): the SummaryEncoder repr printed further down lists a different
# set of columns than entry 1 here, which suggests `cols_enc` is rebound by a
# later star import before it is used — confirm which definition is in effect.
cols_enc = [
    # Entry 0 -> data[0]
    [
        "MSSubClass",
        "MSZoning",
        "LotShape",
        "LandContour",
        "Utilities",
        "LotConfig",
        "Neighborhood",
        "BldgType",
        "HouseStyle",
        "YearBuilt",
        "RoofStyle",
        "RoofMatl",
        "Exterior1st",
        "Exterior2nd",
        "ExterQual",
        "MasVnrType",
        "Heating",
        "HeatingQC",
    ],
    # Entry 1 -> data[1]
    [
        "Country",
        "Employment",
        "FormalEducation",
        "UndergradMajor",
        "CompanySize",
        "DevType",
        "YearsCoding",
        "LanguageWorkedWith",
        "LanguageDesireNextYear",
        "RaceEthnicity",
    ],
    # Entry 2 -> data[2]
    ["yearscode", "country"],
    # Entry 3 -> data[3]
    ["category", "main_category", "currency", "state", "country"],
    # Entry 4 -> data[4]
    [
        "Recipient_City",
        "Recipient_State",
        "Recipient_Zip_Code",
        "Recipient_Country",
        "Physician_Primary_Type",
        "Physician_License_State_code1",
        "Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name",
        "Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country",
        "Form_of_Payment_or_Transfer_of_Value",
        "Nature_of_Payment_or_Transfer_of_Value",
    ],
    # Entry 5 -> data[5]
    ["value_1", "value_2"],
]

In [8]:
# Target column of each dataset, wrapped in a one-element list so it can be
# concatenated with other column lists and passed to `drop(columns=...)`.
target = [
    [column]
    for column in (
        "SalePrice",
        "ConvertedSalary",
        "convertedcomp",
        "goal",
        "Total_Amount_of_Payment_USDollars",
        "target",
    )
]

# Loop

In [9]:
# Run-wide settings.
n_jobs = 1
float_eltype = np.float32
resultados = []  # "results"; presumably an accumulator for result rows
tic = time.time()

n_max = 20_000
cv = 4
filter_size = 2_000

# Result-table header. The sixteen score names are generated as
# <model>_<encoder>_<split>_<metric>, in the same order as the hand-written list.
columns = [
    "NameDataset",
    # Scores
    *[
        f"{model}_{encoder_tag}_{split}_{metric}"
        for model in ("enet", "xgb")
        for encoder_tag in ("te", "pe")
        for metric in ("mae", "mse")
        for split in ("train", "test")
    ],
    "size",
    # Params
    "enet_te_best_params",
    "enet_pe_best_params",
    # Time
    "time_train_m",
]

In [10]:
# Pick the dataset to run: `i` indexes `data` and the other per-dataset lists.
i = 1
data_i = data[i]
# Replaces the n_jobs = 1 assigned in the configuration cell.
n_jobs = -1

# NOTE(review): the two star imports can silently rebind names defined earlier
# in this notebook (e.g. the per-dataset lists or helper functions). The next
# cell uses `keep`, `QuantileEncoder` and `SummaryEncoder`, none of which are
# defined in the visible cells — presumably they come from these modules; confirm.
from constants import *
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from lightgbm import LGBMRegressor
from category_encoders import MEstimateEncoder
from sktools import TypeSelector  # rebinds the TypeSelector class defined earlier in this notebook
from tabulate import tabulate

# Nested results store: results_dict[dataset_path][learner_name][encoder_name] -> metrics
results_dict = {}

In [11]:
# Columns retained for dataset `i`: the entries of `keep[i]` plus the target.
dataset_keep = keep[i] + target[i]

tic = time.time()

# Repeated 4-fold CV (3 repeats). Seeded so the folds are the same on every run.
cv = RepeatedKFold(n_repeats=3, n_splits=4, random_state=0)

# Read data
df = pd.read_csv(data_i)

# Keep only the selected columns and replace missing values with 0.
# Chained instead of fillna(inplace=True), which would act on a .loc slice.
df = df.loc[:, dataset_keep].fillna(0)

print(df.shape)

# Train-Test Split (seeded for reproducibility)
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop(columns=target[i]), df[target[i]], random_state=0
)

results_dict[data_i] = {}

# Building blocks shared by every learner / encoder combination.
scaler = StandardScaler()
lm = ElasticNet()
lgbm = LGBMRegressor(verbose=-1)
te = MEstimateEncoder(cols=cols_enc[i])
pe = QuantileEncoder(cols=cols_enc[i], quantile=0.50)
se = SummaryEncoder(cols=cols_enc[i], quantiles=[0.1, 0.50, 0.9], m=15)

encoders = {"te": te, "pe": pe, "se": se}
learners = {"lm": lm, "lg": lgbm}

for learner_name, learner in learners.items():

    results_dict[data_i][learner_name] = {}

    for encoder_name, encoder in encoders.items():

        pipe = Pipeline(
            [
                ("enc", encoder),
                (
                    "selector",
                    TypeSelector(np.number),
                ),  # Selects Numerical Columns only
                ("scaler", scaler),
                ("learner", learner),
            ]
        )

        # Empty grid: a single candidate (the pipeline defaults) is cross-validated.
        pipe_grid = {}

        # Train model. The `enet_te*` names are kept for compatibility with
        # later cells even though they hold whichever learner/encoder pair is
        # current.
        # NOTE(review): n_jobs / cv are passed as keywords, so the `fit_pipe`
        # in scope must accept them — confirm which definition is active.
        enet_te, enet_te_grid_results, enet_te_params = fit_pipe(
            pipe, pipe_grid, X_tr, y_tr, n_jobs=-1, cv=cv
        )

        # Predict once per split and reuse for both metrics.
        pred_tr = enet_te.predict(X_tr)
        pred_te = enet_te.predict(X_te)

        run_scores = {
            "grid_results": enet_te_grid_results,
            "cv_mae": -enet_te_grid_results["mean_test_score"],
            "train_mae": mean_absolute_error(y_tr, pred_tr),
            "test_mae": mean_absolute_error(y_te, pred_te),
            "train_mse": mean_squared_error(y_tr, pred_tr),
            "test_mse": mean_squared_error(y_te, pred_te),
        }
        results_dict[data_i][learner_name][encoder_name] = run_scores

        # One-row table: dataset tag, learner_encoder, train / CV / test MAE.
        print(
            tabulate(
                tabular_data=[
                    [
                        data_i[5:10],
                        f"{learner_name}_{encoder_name}",
                        run_scores["train_mae"],
                        run_scores["cv_mae"],
                        run_scores["test_mae"],
                    ]
                ],
                tablefmt="psql",
            )
        )


(47702, 6)
+-------+-------+---------+---------+---------+
| stack | lm_te | 60024.4 | 79898.2 | 80149.9 |
+-------+-------+---------+---------+---------+
+-------+-------+---------+-------+---------+
| stack | lm_pe | 57187.7 | 71877 | 72627.4 |
+-------+-------+---------+-------+---------+
+-------+-------+-------+---------+---------+
| stack | lm_se | 70972 | 82118.8 | 82712.6 |
+-------+-------+-------+---------+---------+
+-------+-------+---------+---------+---------+
| stack | lg_te | 46731.3 | 74414.9 | 74601.5 |
+-------+-------+---------+---------+---------+
+-------+-------+-------+---------+---------+
| stack | lg_pe | 46902 | 70689.9 | 70215.3 |
+-------+-------+-------+---------+---------+
+-------+-------+-------+---------+---------+
| stack | lg_se | 43825 | 77068.4 | 75439.7 |
+-------+-------+-------+---------+---------+


In [12]:
# Reload the selected dataset and rebuild the train/test split for the cells below.
df = pd.read_csv(data_i)

# Keep only the selected columns and replace missing values with 0.
# Chained instead of fillna(inplace=True), which would act on a .loc slice.
df = df.loc[:, dataset_keep].fillna(0)

print(df.shape)

# Train-Test Split (seeded so re-running the cell gives the same partition)
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop(columns=target[i]), df[target[i]], random_state=0
)

(47702, 6)


In [13]:
# Select the summary encoder and the elastic-net learner for the cells below.
encoder, learner = se, lm


In [17]:
# Fit the summary encoder on its own (outside any pipeline) using the training split.
se.fit(X_tr, y_tr)

SummaryEncoder(cols=['Country', 'Employment', 'FormalEducation', 'DevType',
                     'LanguageWorkedWith'],
               m=15, quantiles=[0.1, 0.5, 0.9])

In [18]:
# Display the encoder's output for the training split.
se.transform(X_tr)

Unnamed: 0,Country,Employment,FormalEducation,DevType,LanguageWorkedWith,Country_10,Employment_10,FormalEducation_10,DevType_10,LanguageWorkedWith_10,Country_50,Employment_50,FormalEducation_50,DevType_50,LanguageWorkedWith_50,Country_90,Employment_90,FormalEducation_90,DevType_90,LanguageWorkedWith_90
19842,India,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Back-end developer;Designer;Front-end develope...,PHP;CSS,2281.597136,10618.415922,7344.137308,6105.855319,7559.250000,9618.969411,59355.825447,58337.126218,48959.000000,52160.812500,47425.981614,140896.465323,140001.595323,126073.138298,133477.218750
30110,China,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Back-end developer;Desktop or enterprise appli...,Java;JavaScript;HTML;Bash/Shell,9224.800000,10618.415922,7344.137308,12275.475000,7591.636364,28110.066667,59355.825447,58337.126218,55542.976562,62863.136364,61814.166667,140896.465323,140001.595323,130461.425781,145039.404545
40863,Germany,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Back-end developer;Designer;Front-end develope...,C;C++;C#;Go;Java;JavaScript;Matlab;PHP;Python;...,13739.082353,10618.415922,7344.137308,7405.263158,7367.250000,61158.005882,59355.825447,58337.126218,50212.473684,51968.812500,114958.499412,140896.465323,140001.595323,231274.121053,133285.218750
1524,United Kingdom,Employed full-time,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",DevOps specialist;Full-stack developer,C#;JavaScript;Python;HTML;CSS;Bash/Shell,30435.909253,10618.415922,13295.627203,33536.432787,8328.900000,62467.327402,59355.825447,59350.348142,76935.450820,55720.000000,232873.500534,140896.465323,149985.372499,126672.087705,117092.850000
43586,Germany,"Independent contractor, freelancer, or self-em...","Master’s degree (MA, MS, M.Eng., MBA, etc.)",Front-end developer,JavaScript;TypeScript;HTML;CSS,13739.082353,6007.447865,13295.627203,8846.680275,7263.849541,61158.005882,57935.227408,59350.348142,54917.355250,43365.252294,114958.499412,178574.556604,149985.372499,134804.698234,132436.786697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17747,Malta,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Back-end developer;Designer;Front-end develope...,C#;JavaScript;SQL;HTML;CSS,17293.304348,10618.415922,7344.137308,6105.855319,14576.024034,47651.847826,59355.825447,58337.126218,48959.000000,66611.995708,244195.728261,140896.465323,140001.595323,126073.138298,131917.006438
22693,United Kingdom,Employed full-time,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",Back-end developer;DevOps specialist;System ad...,JavaScript;Python;R;SQL;Bash/Shell,30435.909253,10618.415922,13295.627203,21152.400000,19344.000000,62467.327402,59355.825447,59350.348142,73520.000000,63005.681818,232873.500534,140896.465323,149985.372499,179518.402778,241526.704545
3251,Portugal,Employed full-time,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",Mobile developer,Java;JavaScript;SQL;Bash/Shell,12359.014847,10618.415922,13295.627203,4835.518480,13464.000000,28448.820961,59355.825447,59350.348142,38396.528316,63117.416667,66474.687773,140896.465323,149985.372499,140945.005887,135118.458333
37781,India,Employed full-time,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Front-end developer;Mobile developer,Java;JavaScript;HTML;CSS,2281.597136,10618.415922,7344.137308,4878.371681,5178.282353,9618.969411,59355.825447,58337.126218,45668.694690,39116.382353,47425.981614,140896.465323,140001.595323,116779.590708,112888.497059


In [16]:
# Stand-alone run of one combination: summary encoder + elastic net.
encoder, learner = se, lm

pipe_steps = [
    ("enc", encoder),
    ("selector", TypeSelector(np.number)),  # Selects Numerical Columns only
    ("scaler", scaler),
    ("learner", learner),
]
pipe = Pipeline(pipe_steps)

# Empty grid: only the pipeline's default configuration is evaluated.
pipe_grid = {}

# Train model
enet_te, enet_te_grid_results, enet_te_params = fit_pipe(
    pipe,
    pipe_grid,
    X_tr,
    y_tr,
    n_jobs=-1,
    cv=cv,
)