# target encoder & ohe & LogisticRegression

In [1]:
%load_ext autoreload
%autoreload 2

import os
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(os.path.abspath('..'))
# ---------------------------------
import numpy as np
import pandas as pd
import scipy
import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import roc_auc_score

from hyperopt import hp
# ---------------------------------
from tools import CV, Tuning, CVGetScore, IdxValEncoder, linear_regression

In [2]:
# Load the raw train/test tables, using the `id` column as the row index.
train_df = pd.read_csv('../data/train.csv', index_col='id')
test_df = pd.read_csv('../data/test.csv', index_col='id')

# ord_5: expose the characters at positions 0 and 1 as separate features.
# Missing ord_5 values stay missing here and are imputed by the loop below.
for i in range(2):
    train_df[f'ord_5_{i}'] = train_df['ord_5'].str[i]
    test_df[f'ord_5_{i}'] = test_df['ord_5'].str[i]

# fillna: impute every column present in the test frame with the most frequent
# value of that column; each frame uses its own mode.
# Assign the result back instead of calling fillna(inplace=True) on
# `df[col]`: the chained in-place form operates on a temporary object under
# pandas Copy-on-Write and then leaves the frame unchanged.
for col in test_df.columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# target: keep the label as a Series and as a plain array for training.
target = train_df['target']
y_train = target.values

# drop: remove the label and the original ord_5 column (replaced by the two
# per-character columns above) from the feature frames.
train_df = train_df.drop(['target', 'ord_5'], axis=1)
test_df = test_df.drop(['ord_5'], axis=1)

In [3]:
# Every column left in the training frame is treated as a model feature.
feature_col = train_df.columns

# Columns handled as binary features: bin_0 .. bin_4.
bin_col = [f'bin_{k}' for k in range(5)]

# Columns handled as categorical features: nom_0 .. nom_9, ord_0 .. ord_4,
# the two calendar columns and the two characters split out of ord_5.
class_col = (
    [f'nom_{k}' for k in range(10)]
    + [f'ord_{k}' for k in range(5)]
    + ['day', 'month', 'ord_5_0', 'ord_5_1']
)

In [4]:
# Map each binary column onto {0., 1.}.
# The mapping is learned from the training frame and applied to both frames.
for col in bin_col:
    # Sort the non-null levels so the 0/1 assignment does not depend on which
    # value happens to appear first in the data. This also makes the cell
    # idempotent: re-running it on already-mapped 0./1. columns is a no-op
    # instead of possibly inverting them.
    levels = sorted(train_df[col].dropna().unique())
    if len(levels) != 2:
        # zip() below would silently drop extra levels, turning them into NaN.
        raise ValueError(
            f'{col} is expected to have exactly 2 distinct values, '
            f'found {len(levels)}: {levels}'
        )
    map_dict = dict(zip(levels, [0., 1.]))
    train_df[col] = train_df[col].map(map_dict)
    test_df[col] = test_df[col].map(map_dict)

In [5]:
# Build the project encoder from the full feature list plus the binary and
# categorical column groups, then fit it on the training frame.
# NOTE(review): the meaning of the three positional arguments is defined in
# tools.IdxValEncoder — confirm there.
ecd = IdxValEncoder(feature_col, bin_col, class_col)
# verbose=1: the recorded run shows a per-row progress counter for this call.
ecd.fit(train_df, verbose=1)
# Last expression of the cell: display the vocabulary value reported by the encoder.
ecd.get_vocabulary()

600000it [00:11, 51306.98it/s]


5607

In [6]:
# Fit the same encoder instance on the test frame as well.
# NOTE(review): in the recorded run the reported vocabulary goes from 5607 to
# 5608 after this call, which suggests fit() extends the existing vocabulary
# rather than rebuilding it — confirm in tools.IdxValEncoder.
ecd.fit(test_df, verbose=1)
# Display the vocabulary value again so the change is visible.
ecd.get_vocabulary()

400000it [00:07, 52989.04it/s]


5608

In [7]:
# Encode both frames with the fitted encoder. Each call returns a pair that is
# unpacked into an "idx" part and a "val" part; the pairs are later passed to
# the model as x=[idx, val].
# presumably idx holds vocabulary indices and val the matching feature values — TODO confirm
idx, val = ecd.transform(train_df, verbose=0)
idx_test, val_test = ecd.transform(test_df, verbose=0)

# Tuning

In [None]:
# Objective for the search: hyperopt-style fmin minimises, so AUC is negated.
def neg_auc(y_true, y_pred):
    """Return the ROC AUC of the predictions with its sign flipped."""
    auc = roc_auc_score(y_true, y_pred)
    return -auc


# Keras fit settings: stop once the validation AUC has not improved for 2 epochs.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=2,
    mode='max',
)
fit_param = {
    'batch_size': 8192,
    'epochs': 200,
    'verbose': 0,
    'callbacks': [callback],
}

# Settings handed to the cross-validation helper for every trial.
cv_fit_param = {
    'fit_params': fit_param,
    'eval_param': {'batch_size': 8192},
    'use_proba': False,
    'fit_use_valid': True,
}

# Model arguments that stay fixed across trials.
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
model_fix_param = {
    'vocabulary_size': ecd.get_vocabulary(),
    'feature_number': len(feature_col),
    'activation': 'sigmoid',
    'metrics': ['AUC'],
    'optimizer': opt,
}

# Model arguments that are searched: the loss function and the l1/l2 values,
# the latter two drawn log-uniformly with bounds (-10, 0) in log space.
model_search_space = {
    'loss': hp.choice('loss', ['MSE', tf.keras.losses.BinaryCrossentropy()]),
    'l1': hp.loguniform('l1', -10, 0),
    'l2': hp.loguniform('l2', -10, 0),
}

# Cross-validated scorer: 5 stratified folds, scored with the negated AUC.
gs = CVGetScore(
    x=[idx, val],
    y=y_train,
    metrics_func=neg_auc,
    split_method=StratifiedKFold,
    nfolds=5,
    random_state=2333,
    model=linear_regression,
    cv_fit_params=cv_fit_param,
    model_fix_params=model_fix_param,
    model_search_space=model_search_space,
)

# Run 100 evaluations over the scorer's search space.
tuning = Tuning(gs, verbose=1)
tuning.fmin(gs.GET_SEARCH_SPACE(), max_evals=100)

# CV training with a low learning rate

In [8]:
# Training hyper-parameters for the first cross-validated fit.
batch_size, epochs = 8192, 20
learning_rate = 1e-3
patience = 1

# Keras fit settings: early stopping on validation AUC, progress shown per epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=patience,
    mode='max',
)
fit_param = {
    'batch_size': batch_size,
    'epochs': epochs,
    'verbose': 1,
    'callbacks': [callback],
}

# Model arguments: sigmoid output, binary cross-entropy loss, l1 and l2 set to 0.
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model_param = {
    'vocabulary_size': ecd.get_vocabulary(),
    'feature_number': len(feature_col),
    'activation': 'sigmoid',
    'metrics': ['AUC'],
    'optimizer': opt,
    'loss': tf.keras.losses.BinaryCrossentropy(),
    'l1': 0.,
    'l2': 0.,
}

# Build the model and wrap it in the project CV helper (second argument: 10).
model = linear_regression(**model_param)
cv = CV(model, 10)

# Fit with stratified splits and score with ROC AUC; the call's return value
# is the cell output.
cv.fit(
    x=[idx, val],
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param={'batch_size': batch_size},
    use_proba=False,
    verbose=True,
    fit_use_valid=True,
)

Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
folds 0 is done, score is 0.7822568046392454
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
folds 1 is done, score is 0.7875284172596831
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
folds 2 is done, score is 0.7868724026319043
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
folds 3 is done, score is 0.7850101446215958
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
folds 4 is done, score is 0.7889532938469997
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
folds 5 is done, score is 0.7828637286193343
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
folds 6 is done, score is 0.7825971129120095


Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
folds 7 is done, score is 0.7870060404248734
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
folds 8 is done, score is 0.7870930275098302
Train on 540000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
folds 9 is done, score is 0.7820211601024375


0.7852202132567914

In [9]:
# Prepare fine-tuning: take the models held by the CV helper and recompile each
# one with a much smaller learning rate (2e-5).
model = cv.model
for submodel in model:
    # A fresh optimizer per model: an optimizer instance keeps per-variable
    # state and a step counter, so sharing one across several models mixes
    # their state and is rejected outright by newer Keras versions.
    opt = tf.keras.optimizers.Adam(learning_rate=2e-5)
    submodel.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        # Name the metric explicitly so the logged key is always 'val_AUC',
        # which is what the EarlyStopping callbacks in this notebook monitor;
        # the bare string 'AUC' is reported as 'auc' by newer TensorFlow.
        metrics=[tf.keras.metrics.AUC(name='AUC')],
        optimizer=opt,
    )

In [10]:
# Fine-tuning run: same batch size, a high epoch cap, and early stopping after
# 5 epochs without validation-AUC improvement.
batch_size, epochs = 8192, 400

callback = tf.keras.callbacks.EarlyStopping(monitor='val_AUC', patience=5, mode='max')
fit_param = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1, 'callbacks': [callback]}

# Wrap the recompiled models in a new CV helper (second argument: 10).
cv = CV(model, 10)

# Same fit call as the first CV run; its return value is the cell output.
cv.fit(
    x=[idx, val],
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param={'batch_size': batch_size},
    use_proba=False,
    verbose=True,
    fit_use_valid=True,
)

Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
folds 0 is done, score is 0.7823374427371925
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
folds 1 is done, score is 0.7875244584248183
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
folds 2 is done, score is 0.786901427089409
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
folds 3 is done, score is 0.7850263359549675
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epo

Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
folds 5 is done, score is 0.78287394473181
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
folds 6 is done, score is 0.7826016532035976
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
folds 7 is done, score is 0.7870336463174951
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
folds 8 is done, score is 0.7871102345287656
Train on 540000 samples, validate on 60000 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
folds 9 is done, score is 0.7820439713594518


0.7852418518377157

# predict

In [13]:
# Predict on the encoded test set with the CV helper from the most recent fit,
# reusing the current batch_size for inference.
# NOTE(review): how cv.predict combines the per-fold models is defined in tools.CV — confirm there.
score = cv.predict(x=[idx_test, val_test], pred_param={'batch_size': batch_size})

In [15]:
# Fill the sample submission (indexed by `id`) with the test predictions.
submission = pd.read_csv('../data/sample_submission.csv', index_col='id')
submission['target'] = score

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would discard the whole training run.
submission_path = '../tmp/submission/main_1_lr.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path)