# deepctr

In [1]:
%load_ext autoreload
%autoreload 2

import os
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(os.path.abspath('..'))
# ---------------------------------
from time import sleep
import numpy as np
import pandas as pd
import scipy
import tqdm
from copy import deepcopy
import tensorflow as tf
from tensorflow.keras.layers import Activation
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from hyperopt import hp
from deepctr.models import xDeepFM
from deepctr.inputs import  SparseFeat, DenseFeat, get_feature_names
# ---------------------------------
from tools import CV, Tuning, CVGetScore, IdxValEncoder, LE, CyclicLR, MaxLrFinder
# ---------------------------------
from tools import focal_loss, gelu, mish
from tensorflow.keras.utils import get_custom_objects

# Register the project's custom loss and activations with Keras in one call so
# they can be referred to by string name (e.g. dnn_activation='mish').
# `focal_loss` is called once here and its return value is what gets registered.
get_custom_objects().update({
    'focal_loss': focal_loss(),
    'mish': mish,
    'gelu': gelu,
})

In [2]:
# Load the raw train/test tables; the `id` column becomes the row index.
train_df = pd.read_csv('../data/train.csv', index_col='id')
test_df = pd.read_csv('../data/test.csv', index_col='id')

# ord_5: split the code into one column per character position (0 and 1).
for i in range(2):
    train_df[f'ord_5_{i}'] = train_df['ord_5'].str[i]
    test_df[f'ord_5_{i}'] = test_df['ord_5'].str[i]

# null: number of missing cells per row.
# NOTE(review): this is computed after the ord_5 split above, so a missing
# ord_5 presumably also counts through ord_5_0 / ord_5_1 — confirm intended.
train_df['null'] = train_df.isna().sum(axis=1)
test_df['null'] = test_df.isna().sum(axis=1)

# Replace missing values with the sentinel 'isnull' in every column the test
# set has (so `target` is left alone). The column is reassigned instead of
# calling `df[col].fillna(..., inplace=True)`: that chained in-place form
# operates on a temporary selection, is deprecated by pandas and does not
# update the frame when Copy-on-Write is enabled.
for col in test_df.columns:
    train_df[col] = train_df[col].fillna('isnull')
    test_df[col] = test_df[col].fillna('isnull')

# target
target = train_df['target']
y_train = target.values

# drop the label and the raw ord_5 column (replaced by ord_5_0 / ord_5_1)
train_df = train_df.drop(columns=['target', 'ord_5'])
test_df = test_df.drop(columns=['ord_5'])

In [3]:
# All remaining training columns, in frame order.
feature_col = train_df.columns

# Column passed to the encoder as `bin_col` and to the model as a dense feature.
bin_col = ['null']

# Columns passed to the encoder as `class_col` and to the model as sparse
# features, generated from their name patterns: bin_0..bin_4, nom_0..nom_9,
# ord_0..ord_4, followed by the calendar and split ord_5 columns.
class_col = (
    [f'bin_{idx}' for idx in range(5)]
    + [f'nom_{idx}' for idx in range(10)]
    + [f'ord_{idx}' for idx in range(5)]
    + ['day', 'month', 'ord_5_0', 'ord_5_1']
)

In [4]:
# Build the project encoder over the feature columns.
ecd = LE(feature_col, bin_col=bin_col, class_col=class_col)

# Fit on the training frame first, then on the test frame.
# NOTE(review): whether the second fit extends or replaces the first depends on
# `tools.LE`, which is not visible here — confirm.
for frame in (train_df, test_df):
    ecd.fit(frame, verbose=1)

# Encode both frames (train first, then test).
x_train_arr, x_test_arr = [
    ecd.transform(frame, verbose=1) for frame in (train_df, test_df)
]

# Drop every reference to the raw frames, including the loop variable.
del frame, train_df, test_df

600000it [00:09, 62562.61it/s]
400000it [00:06, 61641.16it/s]
600000it [00:09, 60983.68it/s]
400000it [00:06, 60945.46it/s]


In [5]:
# x_train_df = pd.DataFrame(data=x_train_arr, columns=feature_col)
# x_test_df = pd.DataFrame(data=x_test_arr, columns=feature_col)

In [6]:
def col_func(vocabulary, sparse_features, dense_features, k=5):
    """Build DeepCTR feature columns for the given sparse and dense features.

    Args:
        vocabulary: mapping from sparse feature name to its vocabulary size
            (looked up as ``vocabulary[name]``).
        sparse_features: iterable of sparse feature names; each becomes a
            ``SparseFeat`` with ``embedding_dim=k``.
        dense_features: iterable of dense feature names; each becomes a
            one-dimensional ``DenseFeat``.
        k: embedding dimension used for every sparse feature.

    Returns:
        ``(dnn_f, linear_f, fn)``: the feature columns for the deep part, the
        feature columns for the linear part, and the result of
        ``get_feature_names`` over both. ``dnn_f`` and ``linear_f`` hold the
        same column objects but are independent lists, so a caller mutating
        one list does not silently change the other.
    """
    # Sparse columns first, then dense columns (same order as before).
    columns = [
        SparseFeat(name, vocabulary_size=vocabulary[name], embedding_dim=k)
        for name in sparse_features
    ]
    columns.extend(DenseFeat(name, 1) for name in dense_features)

    # Separate list objects; previously both names aliased one list.
    dnn_f = list(columns)
    linear_f = list(columns)
    fn = get_feature_names(linear_f + dnn_f)
    return dnn_f, linear_f, fn

In [7]:
def xdeepfm(vocabulary, k, loss, metrics, optimizer, 
            num_deep_layer=2, num_neuron=256,
            num_cin_layer=2, num_cin=128,**kwargs):
    """Create and compile an xDeepFM model.

    Feature columns are built with ``col_func`` from the notebook-level
    ``class_col`` (sparse) and ``bin_col`` (dense) lists.

    Args:
        vocabulary: mapping passed to ``col_func`` for the sparse vocabulary sizes.
        k: embedding dimension passed to ``col_func``.
        loss, metrics, optimizer: forwarded to ``model.compile``.
        num_deep_layer: number of DNN hidden layers.
        num_neuron: width of each DNN hidden layer.
        num_cin_layer: number of CIN layers.
        num_cin: size of each CIN layer.
        **kwargs: forwarded unchanged to ``xDeepFM``.

    Returns:
        The compiled model.
    """
    deep_columns, linear_columns, _ = col_func(
        vocabulary, sparse_features=class_col, dense_features=bin_col, k=k)

    # Fix TensorFlow's global seed before the model is constructed.
    tf.random.set_seed(1024)

    # One entry per layer, every layer with the same size.
    cin_sizes = []
    for _ in range(num_cin_layer):
        cin_sizes.append(num_cin)
    hidden_sizes = []
    for _ in range(num_deep_layer):
        hidden_sizes.append(num_neuron)

    model = xDeepFM(
        linear_feature_columns=linear_columns,
        dnn_feature_columns=deep_columns,
        cin_layer_size=tuple(cin_sizes),
        dnn_hidden_units=tuple(hidden_sizes),
        **kwargs,
    )
    model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
    return model

In [8]:
def mkinput(input_arr, feature_col):
    """Map each feature name to the matching column of ``input_arr``.

    Args:
        input_arr: array whose transpose (``input_arr.T``) is iterated, so each
            item paired with a name is one column of the array.
        feature_col: iterable of feature names, in column order.

    Returns:
        dict of ``name -> column``. Pairing stops at the shorter of the two
        inputs, as with ``zip``.
    """
    named_inputs = {}
    for name, column in zip(feature_col, input_arr.T):
        named_inputs[name] = column
    return named_inputs

# Fit one model

In [10]:
# Identifier used to build the file name of the log to load.
seed = 4293006264
# Presumably a hyper-parameter tuning log (later cells sort it by `score`).
# NOTE(review): this reads from the absolute path `/data/`, whereas the
# train/test CSVs are read from the relative `../data/` — confirm the location.
log = pd.read_csv(f'/data/{seed}.csv')

In [43]:
# Take the single log row with the smallest `score` and flatten it from
# {column: {row_index: value}} to {column: value}.
# NOTE(review): ascending sort picks the lowest score; confirm that lower is
# better for the value stored in `score`.
best_trial = log.sort_values('score').head(1).to_dict()
model_param = {
    name: list(cell.values())[0]
    for name, cell in best_trial.items()
}

# Settings that are not read from the log.
model_fix_param = {
    'vocabulary': ecd.get_vocabulary(),
    'loss': 'binary_crossentropy',
    'metrics': ['AUC'],
    'optimizer': 'Adam',
    'dnn_activation': 'mish',
    'cin_activation': 'linear',
    'dnn_use_bn': False,
    'num_deep_layer': 2,
    'num_neuron': 256,
    'num_cin_layer': 2,
}

# Fixed settings first; values from the log override them on a key clash.
model_params = {**model_fix_param, **model_param}

# Remove log bookkeeping columns so they are not passed to the model builder.
for col in ('score', 'update', 'usetime', 'index'):
    model_params.pop(col, None)

In [None]:
# fit
def fit(model, epoch=100, batch_size=8192):
    """Placeholder for a single-model training helper; not implemented yet.

    The original definition had no body, which is a SyntaxError and prevents
    the notebook from running top to bottom. The stub keeps the intended
    signature and fails loudly if it is called.

    Args:
        model: the model to train.
        epoch: number of epochs (default 100).
        batch_size: batch size (default 8192).

    Raises:
        NotImplementedError: always, until the training logic is written.
    """
    raise NotImplementedError('fit() is an unfinished stub')

In [52]:
# Settings for this cross-validated run.
batch_size = 8192
epochs = 1
nflod = 5
# Learning-rate bounds are given as exponents: the rate used is 0.1 ** value.
# NOTE(review): 0.1 ** 3.5 is larger than 0.1 ** 4.5, so the value passed as
# `base_lr` is above the one passed as `max_lr` — confirm this is intended.
base_lr = 3.5
max_lr = 4.5
verbose = 1

# Number of rows in the training part of each fold (kept as a float).
rows_per_train_fold = x_train_arr.shape[0] * ((nflod - 1) / nflod)

clr = CyclicLR(
    base_lr=0.1 ** base_lr,
    max_lr=0.1 ** max_lr,
    # Half-cycle length in batches: four passes over one fold's training rows.
    step_size=int(4.0 * rows_per_train_fold / batch_size),
    mode='triangular2',
    gamma=1.0,
)

es = tf.keras.callbacks.EarlyStopping(
    monitor='val_AUC',
    patience=2,
    mode='max',
    restore_best_weights=True,
)
# NOTE(review): `SampleWeight` is neither imported nor defined in the visible
# cells; on a fresh kernel this line would raise NameError — confirm its source.
sw = SampleWeight()

fit_param = {
    'batch_size': batch_size,
    'epochs': epochs,
    'verbose': verbose,
    'callbacks': [es, clr, sw],
}

model = xdeepfm(**model_params)

cv = CV(model, nflod)

score = cv.fit(
    x=mkinput(x_train_arr, feature_col),
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    fit_params=fit_param,
    eval_param={'batch_size': batch_size},
    use_proba=False,
    verbose=verbose,
    fit_use_valid=True,
)

# Release the Keras/TensorFlow global state once the run has finished.
tf.keras.backend.clear_session()

Train on 479999 samples, validate on 120001 samples
folds 0 is done, score is 0.6022806690894351
Train on 479999 samples, validate on 120001 samples
folds 1 is done, score is 0.6848844192161267
Train on 480000 samples, validate on 120000 samples
folds 2 is done, score is 0.7398930147157022
Train on 480001 samples, validate on 119999 samples
folds 3 is done, score is 0.746228401399702
Train on 480001 samples, validate on 119999 samples
folds 4 is done, score is 0.7458381322261116


In [55]:
# Display the first entry of `cv.model`. The dangling trailing dot made this
# cell a SyntaxError; the recorded traceback shows the earlier attempt to read
# a `sample_weight` attribute failed because the object has no such attribute.
cv.model[0]

AttributeError: 'Model' object has no attribute 'sample_weight'