# target encoder & ohe & DeepFM - predict - stacking

In [1]:
%load_ext autoreload
%autoreload 2

import os
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(os.path.abspath('..'))
# ---------------------------------
from time import sleep
import numpy as np
import pandas as pd
import scipy
import tqdm
import tensorflow as tf
from tensorflow.keras.layers import Activation
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import roc_auc_score

from hyperopt import hp
# ---------------------------------
from tools import CV, Tuning, CVGetScore, IdxValEncoder, deepfm, CyclicLR, MaxLrFinder
# ---------------------------------
from tools import focal_loss, gelu, mish
from tensorflow.keras.utils import get_custom_objects

# Register the project's custom loss and activations with Keras in one call,
# so they can be looked up by their string names.
get_custom_objects().update({
    'focal_loss': focal_loss(),
    'mish': mish,
    'gelu': gelu,
})

In [2]:
# Load the raw train/test tables, indexed by their `id` column.
train_df = pd.read_csv('../data/train.csv', index_col='id')
test_df = pd.read_csv('../data/test.csv', index_col='id')

# ord_5: expose the first two characters of the string as separate features.
# Rows where ord_5 is missing stay missing in both derived columns.
for i in range(2):
    train_df[f'ord_5_{i}'] = train_df['ord_5'].str[i]
    test_df[f'ord_5_{i}'] = test_df['ord_5'].str[i]

# null: number of missing values in each row.
# NOTE(review): this is computed after ord_5_0 / ord_5_1 were added, so a
# missing ord_5 contributes 3 to the count -- confirm that is intended.
train_df['null'] = train_df.isna().sum(axis=1)
test_df['null'] = test_df.isna().sum(axis=1)

# Turn the remaining NaNs into an explicit 'isnull' category.
# The result is assigned back rather than using `df[col].fillna(..., inplace=True)`:
# that form is chained assignment on a temporary Series, which warns in recent
# pandas and leaves the frame unchanged once Copy-on-Write is enabled.
for col in test_df.columns:
    train_df[col] = train_df[col].fillna('isnull')
    test_df[col] = test_df[col].fillna('isnull')

# target
target = train_df['target']
y_train = target.values

# drop: the label and the raw ord_5 string are not model inputs.
train_df = train_df.drop(columns=['target', 'ord_5'])
test_df = test_df.drop(columns=['ord_5'])

In [3]:
# Every column left in train_df is a model feature.
feature_col = train_df.columns

# Columns handed to the encoder as `bin_col`: only the missing-value count.
bin_col = ['null']

# Columns handed to the encoder as `class_col`:
# bin_0..bin_4, nom_0..nom_9, ord_0..ord_4, then the date and ord_5 parts.
class_col = (
    [f'bin_{n}' for n in range(5)]
    + [f'nom_{n}' for n in range(10)]
    + [f'ord_{n}' for n in range(5)]
    + ['day', 'month', 'ord_5_0', 'ord_5_1']
)

In [4]:
# for col in bin_col:
#     map_dict = dict(zip(train_df[col].unique(), [0., 1.]))
#     train_df[col] = train_df[col].map(map_dict)
#     test_df[col] = test_df[col].map(map_dict)

In [5]:
# Build the index/value encoder over all feature columns, then fit it on train.
ecd = IdxValEncoder(
    feature_col,
    bin_col=bin_col,
    class_col=class_col,
)
ecd.fit(train_df, verbose=1)

600000it [00:10, 55411.43it/s]


In [6]:
# Fit the same encoder instance on the test frame as well.
# NOTE(review): presumably this extends the vocabulary with categories that
# only occur in test; confirm IdxValEncoder.fit is incremental and does not
# reset what was learned from train_df in the previous cell.
ecd.fit(test_df, verbose=1)

400000it [00:07, 53877.34it/s]


In [7]:
# Encode both frames with the fitted encoder; each call yields a pair that is
# later fed to the model as x=[idx, val].
train_encoded = ecd.transform(train_df, verbose=1)
idx, val = train_encoded

test_encoded = ecd.transform(test_df, verbose=1)
idx_test, val_test = test_encoded

600000it [00:12, 46665.01it/s]
400000it [00:08, 46405.70it/s]


# pred

In [8]:
# Hyper-parameter tuning log: one row per trial, with its score and parameters.
tuning_log_path = '../tmp/deepfm/03051921.csv'
log = pd.read_csv(tuning_log_path)

In [44]:
# Ten lowest-score trials. The scores shown are negative (looks like a negated
# AUC -- TODO confirm), so ascending order lists the best trials first.
log.sort_values(by='score', ascending=True).head(n=10)

Unnamed: 0,score,update,usetime,deep_activation,deep_dropout,l2_deep,l2_pair,num_neuron
37,-0.788738,True,184.576666,gelu,0.036688,3.9e-05,0.001919,256
22,-0.788713,True,161.217921,gelu,0.101581,0.000104,0.000939,128
59,-0.788669,False,135.140283,gelu,0.091591,8e-06,0.003207,128
40,-0.788648,False,130.189999,gelu,0.04467,1.2e-05,0.001696,128
53,-0.788645,False,180.608015,gelu,0.155234,0.00011,0.00071,256
68,-0.788643,False,171.826066,gelu,0.128133,4.5e-05,0.00163,128
13,-0.788623,True,221.660247,gelu,0.407878,0.000107,0.001208,256
85,-0.788617,False,194.85345,gelu,0.367318,6e-06,0.002529,256
55,-0.788603,False,214.681732,gelu,0.118396,8.5e-05,0.001304,256
96,-0.788595,False,224.605684,gelu,0.176778,0.000157,0.00069,256


In [10]:
# Ensemble / cross-validation sizes.
nmodel = 10  # number of top tuning-log rows turned into models
nflod = 20   # number of CV folds (spelling kept: later cells use this name)

# Settings forwarded to Keras fit / evaluate / predict.
epochs = 200       # upper bound; the fit callbacks include early stopping
batch_size = 8192

In [11]:
# model params
model_tuning_param = log.sort_values('score').head(nmodel).reset_index(drop=True).to_dict()

model_fix_param = {'vocabulary_size':ecd.get_vocabulary(), 
                   'feature_number': len(feature_col),
                   'activation': 'sigmoid',
                   'metrics': ['AUC'],
                   'use_fm': True,
                   'k': 5,
                   'deep_use_bn': False,
                   'optimizer': 'Adam',
                   'loss': 'binary_crossentropy',
                   'num_deep_layer':2}

In [12]:
# callbacks
clr = CyclicLR(
    base_lr=1e-5,
    max_lr = 1e-4, 
    step_size= int(4.0*(train_df.shape[0]*((nflod-1)/nflod)) / batch_size),
    mode='exp_range',
    gamma=1.0)

es = tf.keras.callbacks.EarlyStopping(monitor='val_AUC', 
                                      patience=3,
                                      mode='max',
                                      restore_best_weights=True)

# fit
fit_param = {
    'batch_size': batch_size, 
    'epochs':epochs, 
    'verbose': 0,
    'callbacks':[es, clr]
}

# stacking 1

In [14]:
# Collected per model: test predictions, CV score, out-of-fold predictions.
pred_lst, score_lst, pred_arr_lst = [], [], []

# Tuning-log columns that are bookkeeping rather than deepfm arguments.
non_param_cols = ['score', 'update', 'usetime', 'index']

for i in range(nmodel):
    # Shared settings first, then the i-th ranked tuned values on top.
    model_params = dict(model_fix_param)
    model_params.update({
        name: by_rank[i]
        for name, by_rank in model_tuning_param.items()
        if name not in non_param_cols
    })

    # cv
    model = deepfm(**model_params)
    cv = CV(model, nflod)

    # Cross-validated fit; also returns the out-of-fold predictions.
    score, pred_arr = cv.fit(
        x=[idx, val],
        y=y_train,
        metrics_func=roc_auc_score,
        split_method=StratifiedKFold,
        fit_params=fit_param,
        eval_param={'batch_size': batch_size},
        use_proba=False,
        verbose=True,
        fit_use_valid=True,
        output_oof_pred=True,
    )

    # Test-set predictions from the fitted CV object.
    pred = cv.predict(
        x=[idx_test, val_test],
        pred_param={'batch_size': batch_size},
    )

    pred_lst.append(pred)
    score_lst.append(score)
    pred_arr_lst.append(pred_arr)

    print('score: ', score)
    # Release the Keras graph/session state before building the next model.
    tf.keras.backend.clear_session()

folds 0 is done, score is 0.7860738197447208
folds 1 is done, score is 0.789516510902717
folds 2 is done, score is 0.7913933436689137
folds 3 is done, score is 0.7897472716334469
folds 4 is done, score is 0.786214431863218
folds 5 is done, score is 0.7943257075128896
folds 6 is done, score is 0.7890275724331868
folds 7 is done, score is 0.7882536416680219
folds 8 is done, score is 0.7912310396546987
folds 9 is done, score is 0.7950927155656691
folds 10 is done, score is 0.7881046061858787
folds 11 is done, score is 0.7859464469485385
folds 12 is done, score is 0.7855004723801886
folds 13 is done, score is 0.7879865403484888
folds 14 is done, score is 0.7920841471171606
folds 15 is done, score is 0.7893588220300921
folds 16 is done, score is 0.7897881506882285
folds 17 is done, score is 0.7935690075134796
folds 18 is done, score is 0.7847190761641609
folds 19 is done, score is 0.7870710119153563
score:  0.7892502167969526
folds 0 is done, score is 0.786082166432755
folds 1 is done, scor

folds 16 is done, score is 0.7899782076227183
folds 17 is done, score is 0.7935550544643145
folds 18 is done, score is 0.7848024877987303
folds 19 is done, score is 0.787047665565432
score:  0.7892706194961752
folds 0 is done, score is 0.7860479472025466
folds 1 is done, score is 0.7895519350353977
folds 2 is done, score is 0.7914977137815372
folds 3 is done, score is 0.7900414723049443
folds 4 is done, score is 0.7862170242291429
folds 5 is done, score is 0.7944560889140326
folds 6 is done, score is 0.7889292596939005
folds 7 is done, score is 0.7881996985494668
folds 8 is done, score is 0.7911747889653483
folds 9 is done, score is 0.7950696289040593
folds 10 is done, score is 0.7880324033180415
folds 11 is done, score is 0.7857922157808865
folds 12 is done, score is 0.7856399416669471
folds 13 is done, score is 0.7882466203727071
folds 14 is done, score is 0.7917522293470661
folds 15 is done, score is 0.7893809009550888
folds 16 is done, score is 0.7899655232857846
folds 17 is done, 

In [2]:
# Mean CV score over all stacked models. Computed from the scores collected in
# the training loop rather than from literals copied by hand out of its printed
# output, so it stays correct when the models or `nmodel` change.
sum(score_lst) / len(score_lst)

0.7892235489811552

In [39]:
# One-off random draw; the value shown in this cell's output is the number that
# appears as the prefix of the .npy file names saved in the following cells.
np.random.randint(low=0, high=2**32)

1615107092

In [41]:
# Stack the out-of-fold predictions as (n_train_rows, n_models), one column per
# model. Each model's output is flattened on its own so the result stays 2-D
# even with a single model; a blanket `.squeeze()` on the stacked array would
# collapse that case to 1-D.
pred_arr = np.column_stack([np.ravel(oof) for oof in pred_arr_lst])
np.save('../tmp/deepfm/1615107092stacking1.npy', pred_arr)
pred_arr.shape

(600000, 10)

In [43]:
# Stack the test-set predictions as (n_test_rows, n_models), one column per
# model. Each model's output is flattened on its own so the result stays 2-D
# even with a single model; a blanket `.squeeze()` on the stacked array would
# collapse that case to 1-D.
pred = np.column_stack([np.ravel(p) for p in pred_lst])
np.save('../tmp/deepfm/1615107092predict.npy', pred)
pred.shape

(400000, 10)

# stacking 2

In [45]:
# Display the stacked test predictions (rows = test samples, columns = models).
pred

array([[0.11874194, 0.12003708, 0.1162231 , ..., 0.11542787, 0.12152357,
        0.12602474],
       [0.24645662, 0.25164986, 0.25201955, ..., 0.24638703, 0.24964896,
        0.24954316],
       [0.14907558, 0.15091169, 0.15238895, ..., 0.15072805, 0.15095451,
        0.15080431],
       ...,
       [0.53353614, 0.53340095, 0.5322024 , ..., 0.53753656, 0.5293952 ,
        0.5285688 ],
       [0.27679983, 0.27474433, 0.26973158, ..., 0.26627794, 0.27561447,
        0.27187088],
       [0.19893648, 0.20642264, 0.19504796, ..., 0.19918235, 0.20271268,
        0.20293233]], dtype=float32)

# Submission

In [None]:
# Start from the sample submission so ids and column layout match.
sample_path = '../data/sample_submission.csv'
submission = pd.read_csv(sample_path, index_col='id')

# Blend: plain average of the per-model test predictions.
blended = np.mean(pred_lst, axis=0)
submission['target'] = blended

submission.to_csv('../tmp/submission/main_3_deepfm030601.csv')