In [1]:
"""

Draft.

Train/test split by time:
train [2017-11-06, 2017-11-09]
test  [2017-11-10]

K-fold cannot be used when ROC-AUC is optimized for each fold separately. This notebook therefore compares optimizing cross-entropy per fold against optimizing ROC-AUC directly without folds.
"""

import os
import random
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import keras

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

SEED = 42

# Seed the stdlib and NumPy global RNGs; only these two generators are seeded here.
random.seed(SEED)
np.random.seed(SEED)

Using TensorFlow backend.


In [2]:
# Compact column dtypes passed to pd.read_csv when loading the click data.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

def add_mean(df_train, df_val, df_test=None):
    """Add target-mean encoding columns to the train and validation frames.

    For each key set below, the mean of ``is_attributed`` is computed on
    ``df_train`` only and left-joined onto both frames:

    * ``mean1`` -- grouped by ``app``
    * ``mean2`` -- grouped by ``app`` and ``channel``
    * ``mean3`` -- grouped by ``app`` and ``device``

    Key combinations present in ``df_val`` but absent from ``df_train`` get
    NaN in the new columns (left join).

    Note that the statistics are joined back onto the same training rows they
    were computed from, so each training row's own label contributes to its
    encoding.

    Args:
        df_train: DataFrame with ``is_attributed``, ``app``, ``channel`` and
            ``device`` columns; source of the statistics.
        df_val: DataFrame with ``app``, ``channel`` and ``device`` columns.
        df_test: Accepted for interface compatibility but currently ignored;
            no test frame is returned.

    Returns:
        Tuple ``(df_train, df_val)`` of new frames with ``mean1``, ``mean2``
        and ``mean3`` appended.
    """
    encodings = [
        ('mean1', ['app']),
        ('mean2', ['app', 'channel']),
        ('mean3', ['app', 'device']),
    ]

    for column, keys in encodings:
        # Use the mean (attribution rate), not the median: for a binary target
        # the median is 0 or 1 and carries almost no information.
        stats = (
            df_train[['is_attributed'] + keys]
            .groupby(keys)
            .mean()
            .rename(columns={'is_attributed': column})
            .reset_index()
        )
        df_train = pd.merge(df_train, stats, on=keys, how='left')
        df_val = pd.merge(df_val, stats, on=keys, how='left')

    return df_train, df_val


In [3]:
# Load the click sample with compact dtypes, parsing both timestamp columns.
sample = pd.read_csv(
    'input/train_sample.csv',
    dtype=dtypes,
    parse_dates=['click_time', 'attributed_time'],
)
# Hour of day of each click, taken from the parsed click timestamp.
sample['click_hour'] = sample['click_time'].dt.hour

print(sample.columns.values)
print(sample.shape)


['ip' 'app' 'device' 'os' 'channel' 'click_time' 'attributed_time'
 'is_attributed' 'click_hour']
(100000, 9)


In [4]:
import keras
from keras.models import Model
from keras.layers import *
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.engine.topology import Layer
import keras.backend as K

# One entry per categorical feature that gets its own embedding.
# 'dim' is [input_dim, output_dim] as handed to embed() by build_model().
EMBEDS = [
    dict(name='ip', dim=[1024, 100]),
    dict(name='app', dim=[1024, 100]),
    dict(name='device', dim=[1024, 50]),
    dict(name='os', dim=[1024, 50]),
    dict(name='channel', dim=[1024, 50]),
    dict(name='click_hour', dim=[24, 5]),
]

def embed(input_dim, output_dim, x):
    """Embed a single-index input tensor and flatten it to ``(output_dim,)``.

    Args:
        input_dim: Number of distinct indices the Embedding layer accepts.
        output_dim: Size of each embedding vector.
        x: Keras tensor to embed (the layer is built with ``input_length=1``).

    Returns:
        The embedded tensor reshaped to ``(output_dim,)`` per sample.
    """
    embedding = Embedding(
        input_dim,
        output_dim,
        input_length=1,
        embeddings_regularizer=l2(1e-8),
    )
    embedded = embedding(x)
    # Reshape to (output_dim,) per sample so it can be concatenated with other features.
    return Reshape((output_dim,))(embedded)

def build_model(features):
    """Build the (uncompiled) embedding + MLP binary classifier.

    The network takes a ``misc`` input of width ``features`` plus one
    single-index input per entry of ``EMBEDS``. All of them are concatenated,
    batch-normalized and passed through Dense(32, relu) -> Dropout(0.5) ->
    Dense(16, relu) -> Dense(1, sigmoid).

    Side effect: the created input tensor and embedded tensor are stored on
    each ``EMBEDS`` entry under the keys ``'input'`` and ``'layer'``.

    Args:
        features: Width of the ``misc`` input.

    Returns:
        A ``keras.models.Model`` whose inputs are ``misc`` followed by the
        embedding inputs in ``EMBEDS`` order.
    """
    misc = Input(shape=(features,), name='misc')

    for spec in EMBEDS:
        input_dim, output_dim = spec['dim']
        spec['input'] = Input(shape=(1,), name=spec['name'])
        spec['layer'] = embed(input_dim, output_dim, spec['input'])

    embed_inputs = [spec['input'] for spec in EMBEDS]
    embed_outputs = [spec['layer'] for spec in EMBEDS]

    hidden = concatenate([misc] + embed_outputs)
    hidden = BatchNormalization()(hidden)

    hidden = Dense(32, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    hidden = Dense(16, activation='relu')(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[misc] + embed_inputs, outputs=output)
    
    

In [5]:
import tensorflow as tf

def roc_auc_score_fn(y_true, y_pred):
    """Differentiable ROC AUC surrogate, used as a loss (lower is better).

    Source:
    https://github.com/tflearn/tflearn/blob/1b0e4d3539e1c4c6b31735499328c27da3a874b0/tflearn/objectives.py

    Approximation of the Area Under Curve score based on the
    Wilcoxon-Mann-Whitney U statistic:
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003).
    Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic.

    Every (positive, negative) pair whose score difference falls below the
    margin ``gamma`` contributes ``(gamma - (pos - neg)) ** p`` to the result.

    `y_pred` and `y_true` must have the same type and shape.
    """
    # original paper suggests performance is robust to exact parameter choice
    gamma = 0.2
    p = 3

    is_positive = tf.cast(y_true, tf.bool)
    pos_scores = tf.boolean_mask(y_pred, is_positive)
    neg_scores = tf.boolean_mask(y_pred, ~is_positive)

    # Positives along axis 1, negatives along axis 0, so the subtraction
    # below broadcasts over every positive/negative pair.
    pos_scores = tf.expand_dims(pos_scores, 0)
    neg_scores = tf.expand_dims(neg_scores, 1)

    margin = tf.zeros_like(pos_scores * neg_scores) + pos_scores - neg_scores - gamma

    # Only pairs that violate the margin are penalized.
    violations = tf.boolean_mask(margin, margin < 0.0)

    return tf.reduce_sum(tf.pow(-violations, p))
      

In [6]:
def hashed(s, size=10):
    """Map every element of Series ``s`` to ``hash(element) % 2 ** size``.

    Args:
        s: pandas Series of hashable values.
        size: Exponent of the bucket count; values land in ``[0, 2 ** size)``.

    Returns:
        A Series of bucket indices with the same index as ``s``.
    """
    buckets = 2 ** size

    def to_bucket(value):
        return hash(value) % buckets

    return s.apply(to_bucket)

def build_input(df):
    """Turn a feature frame into the dict of named inputs the model expects.

    Columns listed in ``EMBEDS`` are hashed with ``hashed()`` and stored as
    arrays under their own name; every remaining column is kept together as a
    DataFrame under the key ``'misc'``.

    Args:
        df: DataFrame containing at least every column named in ``EMBEDS``.

    Returns:
        Dict with ``'misc'`` first, followed by one entry per embedding name.
    """
    embed_names = [spec['name'] for spec in EMBEDS]
    misc_columns = [column for column in df.columns if column not in embed_names]

    inputs = {'misc': df[misc_columns]}
    for name in embed_names:
        inputs[name] = hashed(df.loc[:, name]).values
    return inputs


In [7]:
class CustomMetric(keras.callbacks.Callback):
    """Keras callback that records validation ROC AUC after every epoch.

    The score is written into the epoch ``logs`` dict under the key
    ``'val-auc'`` so that callbacks placed later in the callback list can
    monitor it.
    """

    def __init__(self, val_x, val_y):
        """Store the validation inputs and labels to score against.

        Args:
            val_x: Model inputs accepted by ``model.predict``.
            val_y: True binary labels aligned with ``val_x``.
        """
        # Initialize the base Callback so its own attributes are set up.
        super(CustomMetric, self).__init__()
        self.val_x = val_x
        self.val_y = val_y

    def on_epoch_end(self, epoch, logs=None):
        """Compute validation AUC and publish it in ``logs['val-auc']``."""
        if logs is not None:
            predictions = self.model.predict(self.val_x).ravel()
            logs['val-auc'] = roc_auc_score(self.val_y, predictions)
        


In [19]:
class FoldDataset:
    """Holds one train/dev split and the model inputs derived from it."""

    def __init__(self, X_train, y_train, X_dev, y_dev):
        """Keep the raw frames and labels of the split."""
        self.X_train, self.y_train = X_train, y_train
        self.X_dev, self.y_dev = X_dev, y_dev

    def build(self):
        """Create ``train_input`` and ``dev_input`` from the raw frames.

        The timestamp columns and the target column are dropped before the
        frames are converted with ``build_input``.
        """
        # Fold-specific feature engineering would go here, before the drop.
        dropped = ['attributed_time', 'click_time', 'is_attributed']

        train_features = self.X_train.drop(dropped, axis=1)
        dev_features = self.X_dev.drop(dropped, axis=1)

        self.train_input = build_input(train_features)
        self.dev_input = build_input(dev_features)

    def parts(self):
        """Return ``(train_input, y_train, dev_input, y_dev)``; call ``build()`` first."""
        return (self.train_input, self.y_train,
                self.dev_input, self.y_dev)
        

In [24]:
class Classifier:
    """Trains the embedding model on one FoldDataset with a configurable loss."""

    def __init__(self, loss='binary_crossentropy'):
        """Configure the training objective.

        Args:
            loss: Keras loss name or a callable loss such as
                ``roc_auc_score_fn``.
        """
        self.loss = loss
        self.optimizer = 'sgd'

    def train(self, fold_no, ds, epochs=1):
        """Fit a fresh model on ``ds`` and return it with its best weights loaded.

        The weights of the epoch with the highest validation AUC are
        checkpointed to ``tmp/weights.hdf5`` and restored after training.

        Args:
            fold_no: Fold identifier, used only in the printed summary.
            ds: Dataset whose ``parts()`` returns
                ``(train_input, y_train, dev_input, y_dev)``.
            epochs: Number of training epochs.

        Returns:
            The trained Keras model.
        """
        # Use the labels carried by the dataset itself; relying on notebook
        # globals would silently train/evaluate against whatever split was
        # left in the kernel.
        train_input, y_train, dev_input, y_dev = ds.parts()

        model = build_model(train_input['misc'].shape[1])
        model.compile(optimizer=self.optimizer, loss=self.loss)

        weights_path = 'tmp/weights.hdf5'

        # eval_callback must run before ckpt: it is what puts 'val-auc' into
        # the epoch logs that the checkpoint monitors.
        eval_callback = CustomMetric(dev_input, y_dev)
        ckpt = keras.callbacks.ModelCheckpoint(filepath=weights_path,
                                               monitor='val-auc', mode='max',
                                               verbose=0, save_best_only=True)

        model.fit(train_input, y_train,
                  validation_data=(dev_input, y_dev),
                  callbacks=[eval_callback, ckpt],
                  epochs=epochs)

        # Restore the best checkpoint rather than keeping last-epoch weights.
        model.load_weights(weights_path)

        print('fold {}, best train-auc: {} val-auc: {}'.format(
            fold_no,
            roc_auc_score(y_train, model.predict(train_input).ravel()),
            roc_auc_score(y_dev, model.predict(dev_input).ravel())))

        return model

    def predict(self, model, X_test):
        """Placeholder: prediction is not implemented yet; only prints a marker."""
        print('.')
        

In [27]:

# 10-fold stratified cross-validation on the sample, optimizing binary
# cross-entropy in every fold and collecting out-of-fold predictions.
X_train = sample.copy()
y_train = sample.is_attributed.values.copy()

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
# Out-of-fold predictions: each slot is filled by the fold that held that row out.
oof_pred = np.zeros(y_train.shape, dtype=np.float32)

for fold_no, (train_index, dev_index) in enumerate(skf.split(X_train, y_train)):
    # X_train still contains the target column; FoldDataset.build() drops it.
    X_train1, X_dev1 = X_train.iloc[train_index], X_train.iloc[dev_index]
    y_train1, y_dev1 = y_train[train_index], y_train[dev_index]

    ds = FoldDataset(X_train1, y_train1, X_dev1, y_dev1)
    ds.build()
    
    c = Classifier(loss='binary_crossentropy')
    model = c.train(fold_no, ds, epochs=10)
    
    # Score the held-out rows with this fold's model.
    _, _, dev_input, _ = ds.parts()
    oof_pred[dev_index] = model.predict(dev_input).ravel()

# AUC over all out-of-fold predictions combined.
print('final val auc', roc_auc_score(y_train, oof_pred))


Train on 89999 samples, validate on 10001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 0, best train-auc: 0.997122203188308 val-auc: 0.9744829930194254
Train on 89999 samples, validate on 10001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 1, best train-auc: 0.9899739220817788 val-auc: 0.9560816404786182
Train on 89999 samples, validate on 10001 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 2, best train-auc: 0.9945358381673288 val-auc: 0.9675939240241576
Train on 90000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 3, best train-auc: 0.9626210205004982 val-auc: 0.9474094765787398
Train on 90000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
E

In [None]:
#10-fold final val auc 0.9624749061426707

In [28]:
#
# Without k-fold: a single stratified 80/20 train/dev split, optimizing the
# ROC AUC surrogate loss (roc_auc_score_fn) directly.
# Recorded result of this run: final val auc 0.9705715637963195

X_train = sample.copy()
y_train = sample.is_attributed.values.copy()

X_train1, X_dev1, y_train1, y_dev1 = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED, stratify=y_train)

ds = FoldDataset(X_train1, y_train1, X_dev1, y_dev1)
ds.build()

c = Classifier(loss=roc_auc_score_fn)
model = c.train(0, ds, epochs=10)

# Re-score the dev split with the returned model.
_, _, dev_input, _ = ds.parts()
print('final val auc', roc_auc_score(y_dev1, model.predict(dev_input).ravel()))

Train on 80000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 0, best train-auc: 0.9859320407223136 val-auc: 0.9705715637963195
final val auc 0.9705715637963195
