# target encoder & ohe & FM

In [1]:
%load_ext autoreload
%autoreload 2

import os
import warnings
# Suppress every warning for the rest of the session (keeps training logs short,
# but also hides deprecation notices from pandas / TensorFlow).
warnings.filterwarnings('ignore')

import sys
# Put the parent directory on the import path; presumably this is where the
# project-local `tools` package imported below lives — confirm.
sys.path.append(os.path.abspath('..'))
# ---------------------------------
import numpy as np
import pandas as pd
import scipy
import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import roc_auc_score

from hyperopt import hp
# ---------------------------------
from tools import CV, Tuning, CVGetScore, IdxValEncoder, fm

In [2]:
# Load the raw train/test tables, using the `id` column as the index.
train_df = pd.read_csv('../data/train.csv', index_col='id')
test_df = pd.read_csv('../data/test.csv', index_col='id')

# ord_5: expose the characters at positions 0 and 1 as two separate columns
for pos in range(2):
    train_df[f'ord_5_{pos}'] = train_df['ord_5'].str[pos]
    test_df[f'ord_5_{pos}'] = test_df['ord_5'].str[pos]

# fillna: impute every column with that frame's own most frequent value.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the `df[col]` selection: the in-place form is
# chained assignment, which pandas warns about and which does not update the
# parent frame when Copy-on-Write is active.
for col in test_df.columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# target: pull the label out before it is removed from the feature frame
target = train_df['target']
y_train = target.values

# drop: rebind to the reduced frames rather than mutating in place
train_df = train_df.drop(['target', 'ord_5'], axis=1)
test_df = test_df.drop(['ord_5'], axis=1)

In [3]:
# Every column left in train_df at this point (a pandas Index); it is passed to
# the encoder and its length is used as the model's `feature_number` below.
feature_col = train_df.columns

# Columns bin_0 .. bin_4.
bin_col = [f'bin_{n}' for n in range(5)]

# Columns nom_0 .. nom_9 and ord_0 .. ord_4, followed by day/month and the two
# columns derived from ord_5.
class_col = (
    [f'nom_{n}' for n in range(10)]
    + [f'ord_{n}' for n in range(5)]
    + ['day', 'month', 'ord_5_0', 'ord_5_1']
)

In [4]:
# Encode each column listed in bin_col as 0. / 1.
# The distinct non-null training values are sorted before being paired with
# [0., 1.], so the encoding no longer depends on which value happens to appear
# first in train_df, and re-running the cell on already-encoded columns leaves
# them unchanged. The mapping is built from train_df and applied to both frames.
for col in bin_col:
    levels = sorted(train_df[col].dropna().unique())
    map_dict = dict(zip(levels, [0., 1.]))
    train_df[col] = train_df[col].map(map_dict)
    test_df[col] = test_df[col].map(map_dict)

In [5]:
# Build the project-local encoder from the full feature list plus the binary
# and categorical column groups, then fit it on the training frame.
# verbose=1 is passed; the captured output shows a per-row progress counter.
ecd = IdxValEncoder(feature_col, bin_col, class_col)
ecd.fit(train_df, verbose=1)

600000it [00:11, 52322.58it/s]


In [6]:
# Encode train_df into two outputs; they are later fed to the model together
# as x=[idx, val].
idx, val = ecd.transform(train_df, verbose=1)

600000it [00:12, 48506.24it/s]


# Fit one model (fixed hyperparameters)

In [7]:
# --- settings handed to cv.fit as `fit_params` ---
batch_size = 8192

# Early stopping on `val_AUC` (maximised) with a patience of 2 epochs.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=2,
    mode='max',
)
fit_param = dict(
    batch_size=batch_size,
    epochs=10,
    verbose=1,
    callbacks=[callback],
)

# --- settings handed to the `fm` constructor ---
opt = tf.keras.optimizers.Adam(learning_rate=5e-4)

model_param = dict(
    vocabulary_size=ecd.get_vocabulary(),
    feature_number=len(feature_col),
    activation='sigmoid',
    metrics=['AUC'],
    optimizer=opt,
    k=5,
    loss=tf.keras.losses.BinaryCrossentropy(),
)

In [8]:
# Instantiate the FM model from model_param and wrap it in the project's CV
# helper. The second argument (5) is presumably the fold count — the run below
# reports folds 0-4; confirm against `tools.CV`.
model = fm(**model_param)
cv = CV(model, 5)

In [9]:
# Cross-validated fit on the encoded training data; each fold is scored with
# roc_auc_score and splits come from StratifiedKFold.
cv_fit_kwargs = dict(
    x=[idx, val],
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param={'batch_size': batch_size},
    use_proba=False,
    verbose=True,
    fit_use_valid=True,
)
cv.fit(**cv_fit_kwargs)

Train on 480000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
folds 0 is done, score is 0.7828379608215226
Train on 480000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
folds 1 is done, score is 0.7832072990286207
Train on 480000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
folds 2 is done, score is 0.7827803931341122
Train on 480000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
folds 3 is done, score is 0.7823051974779471
Train on 480000 samples, validate on 120000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
folds 4 is done, score is 0.7821357944509547


0.7826533289826314

In [11]:
# --- second pass: re-compile the models from the previous CV run with a
# smaller learning rate (1e-5) and fit again for up to 200 epochs ---
batch_size = 8192

callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=2,
    mode='max',
)
fit_param = dict(
    batch_size=batch_size,
    epochs=200,
    verbose=1,
    callbacks=[callback],
)

# `opt` is rebound here but not passed to any model in this cell; the loop
# below creates a separate Adam instance for every model.
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Take the models held by the previous CV object and compile each one again.
model = cv.model
for fold_model in model:
    fold_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['AUC'],
    )

cv = CV(model, 5)

cv_fit_kwargs = dict(
    x=[idx, val],
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param={'batch_size': batch_size},
    use_proba=False,
    verbose=True,
    fit_use_valid=True,
)
cv.fit(**cv_fit_kwargs)

Train on 480000 samples, validate on 120000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
folds 0 is done, score is 0.7829645524849089
Train on 480000 samples, validate on 120000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
folds 1 is done, score is 0.7832149572335567
Train on 480000 samples, validate on 120000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
folds 2 is done, score is 0.7827846327394261
Train on 480000 samples, validate on 120000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
folds 3 is done, score is 0.7822922990737937
Train on 480000 samples, validate on 120000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
folds 4 is done, score is 0.7821688546554154


0.7826850592374202

# Tuning

In [None]:
# --- per-fold training settings used during the search (silent, batch 4096) ---
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=2,
    mode='max',
)
fit_param = dict(
    batch_size=4096,
    epochs=200,
    verbose=0,
    callbacks=[callback],
)

cv_fit_param = dict(
    fit_params=fit_param,
    eval_param={'batch_size': 4096},
    use_proba=False,
    fit_use_valid=True,
)

# --- model arguments that stay fixed for every trial ---
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

model_fix_param = dict(
    vocabulary_size=ecd.get_vocabulary(),
    feature_number=len(feature_col),
    activation='sigmoid',
    metrics=['AUC'],
    optimizer=opt,
    k=10,
)

# --- model arguments explored by hyperopt: the loss, plus l1 / l2 drawn
# log-uniformly with exponent bounds -10 and 0 ---
model_search_space = dict(
    loss=hp.choice('loss', ['MSE', tf.keras.losses.BinaryCrossentropy()]),
    l1=hp.loguniform('l1', -10, 0),
    l2=hp.loguniform('l2', -10, 0),
)

# cv get score
def neg_auc(y_true, y_pred):
    """Return ROC AUC with its sign flipped.

    Presumably negated because the search below minimises its objective —
    confirm against `tools.Tuning`.
    """
    auc = roc_auc_score(y_true, y_pred)
    return -auc

# Bundle data, metric, split strategy and parameter sets into the scoring
# object, then hand it to the tuner for 100 evaluations.
gs_kwargs = dict(
    x=[idx, val],
    y=y_train,
    metrics_func=neg_auc,
    split_method=StratifiedKFold,
    nfolds=5,
    random_state=2333,
    model=fm,
    cv_fit_params=cv_fit_param,
    model_fix_params=model_fix_param,
    model_search_space=model_search_space,
)
gs = CVGetScore(**gs_kwargs)

tuning = Tuning(gs, verbose=1)
search_space = gs.GET_SEARCH_SPACE()
tuning.fmin(search_space, max_evals=100)

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

# NOTE(review): `linear_regression` is not among the names imported in the
# visible cells of this notebook, so this call raises NameError on a fresh
# kernel until it is imported — confirm which module provides it.
# The stray second comma after `feature_number=...` was a SyntaxError and has
# been removed.
model = linear_regression(vocabulary_size=ecd.get_vocabulary(),
                          feature_number=len(feature_col),
                          activation='sigmoid',
                          loss='mse',
                          metrics=['AUC'],
                          optimizer=opt,
                          l1=0., l2=0.)

In [None]:
# Early stopping on `val_AUC` (maximised), patience 5; fit for at most 20
# epochs with batches of 1024.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=5,
    mode='max',
)
fit_param = dict(
    batch_size=1024,
    epochs=20,
    verbose=1,
    callbacks=[callback],
)

In [None]:
# Batch size used when evaluating each fold.
pred_param = dict(batch_size=1024)

cv = CV(model, 5)

# Cross-validated fit of the model built above, scored with roc_auc_score.
cv_fit_kwargs = dict(
    x=[idx, val],
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param=pred_param,
    use_proba=False,
    verbose=True,
    fit_use_valid=True,
)
cv.fit(**cv_fit_kwargs)

In [None]:
# Scratch check: e**-10, presumably the smallest value the
# hp.loguniform(..., -10, 0) ranges used for l1 / l2 above can produce.
np.exp(-10)