In [1]:
"""

Draft.

"""

import os
import random
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import keras

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

SEED=42

D=12

np.random.seed(SEED)
random.seed(SEED)

Using TensorFlow backend.


In [2]:
dtypes = {
    'ip':'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}   




In [3]:
def load_sample():
    sample = pd.read_csv('input/train_sample.csv', dtype=dtypes, parse_dates=['click_time', 'attributed_time'])
    sample['app_channel'] = sample[['app', 'channel']].apply(lambda row: ' '.join([str(row['app']), str(row['channel'])]), axis=1)
    sample['app_device'] = sample[['app', 'device']].apply(lambda row: ' '.join([str(row['app']), str(row['device'])]), axis=1)
    sample['ip_device'] = sample[['ip', 'device']].apply(lambda row: ' '.join([str(row['ip']), str(row['device'])]), axis=1)
    sample['click_hour'] = sample.click_time.dt.hour
    sample['click_minute'] = sample.click_time.dt.minute

    print(sample.columns.values)
    print(sample.shape)
    return sample


In [4]:
import keras
from keras.models import Model
from keras.layers import *
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.engine.topology import Layer
import keras.backend as K

# Important to keep same ordering as in train.npz & test.npz            
EMBEDS = [
    {'name': 'ip', 'dim': [2**D, 100]},
    {'name': 'app', 'dim': [2**D, 100]},
    {'name': 'device', 'dim': [2**D, 50]},
    {'name': 'os', 'dim': [2**D, 50]},
    {'name': 'channel', 'dim': [2**D, 50]},
    {'name': 'app_channel', 'dim': [2**D, 100]},
    {'name': 'app_device', 'dim': [2**D, 100]},
    {'name': 'ip_device', 'dim': [2**D, 100]},
    {'name': 'click_hour', 'dim': [24, 5]},
    {'name': 'click_minute', 'dim': [60, 5]}
]

def embed(input_dim, output_dim, x):
    e = Embedding(input_dim, output_dim, input_length=1, embeddings_regularizer=l2(1e-8))
    r = e(x)
    r = Reshape((output_dim,))(r)
    return r

def build_model(features):    
    #misc = Input(shape=(features,), name='misc')
    
    for e in EMBEDS:
        e['input'] = Input(shape=(1,), name=e['name'])
        e['layer'] = embed(e['dim'][0], e['dim'][1], e['input'])
        
    # removed [misc]
    h = concatenate([e['layer'] for e in EMBEDS])
    h = BatchNormalization()(h)
    
    h = Dense(128, activation='relu')(h)
    h = BatchNormalization()(h)
    h = Dropout(0.5)(h)
    
    h = Dense(64, activation='relu')(h)
    #h = BatchNormalization()(h)
    h = Dropout(0.2)(h)
    
    h = Dense(32, activation='relu')(h)
    #h = BatchNormalization()(h)
    
    h = Dense(1, activation='sigmoid')(h)
    
    # removed [misc]
    model = Model(inputs=[e['input'] for e in EMBEDS], 
                  outputs=h)
    return model    
    
    

In [5]:
import tensorflow as tf

def roc_auc_score_fn(y_true, y_pred):
    """ ROC AUC Score.
    
    Source:
    https://github.com/tflearn/tflearn/blob/1b0e4d3539e1c4c6b31735499328c27da3a874b0/tflearn/objectives.py
    
    Approximates the Area Under Curve score, using approximation based on
    the Wilcoxon-Mann-Whitney U statistic.
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003).
    Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic.
    Measures overall performance for a full range of threshold levels.
    `y_pred` and `y_true` must have the same type and shape.
    """
    pos = tf.boolean_mask(y_pred, tf.cast(y_true, tf.bool))
    neg = tf.boolean_mask(y_pred, ~tf.cast(y_true, tf.bool))

    pos = tf.expand_dims(pos, 0)
    neg = tf.expand_dims(neg, 1)

    # original paper suggests performance is robust to exact parameter choice
    gamma = 0.2
    p     = 3

    difference = tf.zeros_like(pos * neg) + pos - neg - gamma

    masked = tf.boolean_mask(difference, difference < 0.0)

    return tf.reduce_sum(tf.pow(-masked, p))
      

In [6]:
def hashed(s, size=D):
    return s.apply(lambda x: hash(x) % 2 ** size)

def build_input(df):
    embed_names = [e['name'] for e in EMBEDS]
    h = {'misc': df[[x for x in df.columns if x not in embed_names]] }
    h.update(dict([(name, hashed(df.loc[:, name]).values) for name in embed_names]))
    return h


In [7]:
class CustomMetric(keras.callbacks.Callback):    
    def __init__(self, val_x, val_y):
        self.val_x = val_x
        self.val_y = val_y
        
    def on_epoch_end(self, epoch, logs=None):
        if logs is not None:
            logs['val-auc'] = roc_auc_score(self.val_y, self.model.predict(self.val_x).ravel())
        


In [8]:
class FoldDataset:
    def __init__(self, X_train, y_train, X_dev, y_dev):
        self.X_train = X_train
        self.y_train = y_train
        self.X_dev = X_dev
        self.y_dev = y_dev
        
    def build(self):
        #build features inside fold.
        #X_train1, X_dev1 = add_meanx2(X_train1, X_dev1)    

        X_train1 = self.X_train.drop(['attributed_time', 'click_time', 'is_attributed'], axis=1)
        X_dev1 = self.X_dev.drop(['attributed_time', 'click_time', 'is_attributed'], axis=1)

        self.train_input = build_input(X_train1)
        self.dev_input = build_input(X_dev1)        
    
    def parts(self):
        return self.train_input, self.y_train, self.dev_input, self.y_dev
        

In [18]:
class Classifier:
    def __init__(self, optimizer='sgd', loss='binary_crossentropy'):
        # roc_auc_score_fn
        self.loss = loss
        self.optimizer = optimizer
            
    def train(self, fold_no, ds, **fit_args):
        
        train1_input, y_train, dev_input, y_dev = ds.parts()

        if len(train1_input['misc'].shape) == 1:
            features = 1
        else:
            features = train1_input['misc'].shape[1]
        model = build_model(features)
        model.compile(optimizer=self.optimizer, loss=self.loss)

        # callbacks
        ckpt = keras.callbacks.ModelCheckpoint(filepath='tmp/weights.hdf5', 
                                               # val-auc is calculated by the custom eval_callback
                                               monitor='val-auc', mode='max',
                                               verbose=0, save_best_only=True)    
        eval_callback = CustomMetric(dev_input, y_dev)
        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val-auc', mode='max', patience=2)
        
        fit_args['callbacks']: [eval_callback, ckpt, reduce_lr]        
        
        if dev_input is None:
            fit_args['validation_data'] = None
            fit_args['validation_split'] = 0.05
        else:
            fit_args['validation_data'] = (dev_input, y_dev)

        model.fit(train1_input, y_train, **fit_args)

        model.load_weights('tmp/weights.hdf5')

        print('fold {}, best train-auc: {} val-auc: {}'.format(
            fold_no, 
            roc_auc_score(y_train, model.predict(train1_input).ravel()),
            roc_auc_score(y_dev, model.predict(dev_input).ravel())))
        
        return model
    
    def predict(self, model, X_test):
        print('.')        
        

In [10]:
def run_kfold(splits):
    X_train = sample.copy()
    y_train = sample.is_attributed.values.copy()

    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)
    oof_pred = np.zeros(y_train.shape, dtype=np.float32)

    for fold_no, (train_index, dev_index) in enumerate(skf.split(X_train, y_train)):
        X_train1, X_dev1 = X_train.iloc[train_index], X_train.iloc[dev_index]
        y_train1, y_dev1 = y_train[train_index], y_train[dev_index]

        ds = FoldDataset(X_train1, y_train1, X_dev1, y_dev1)
        ds.build()

        c = Classifier(loss='binary_crossentropy')
        model = c.train(fold_no, ds, epochs=10)

        _, _, dev_input, _ = ds.parts()
        oof_pred[dev_index] = model.predict(dev_input).ravel()

    print('final val auc', roc_auc_score(y_train, oof_pred))


In [11]:
#10-fold final val auc 0.9624749061426707

In [12]:

#without kfold, optimizing roc-auc directly.
#final val auc varies between 0.958 and 0.970

def run_single(test_size=0.1, epochs=4):
    X_train = sample.copy()
    y_train = sample.is_attributed.values.copy()

    X_train1, X_dev1, y_train1, y_dev1 = train_test_split(X_train, y_train, test_size=test_size, random_state=1+SEED, stratify=y_train)

    ds = FoldDataset(X_train1, y_train1, X_dev1, y_dev1)
    ds.build()

    c = Classifier(loss=roc_auc_score_fn)
    model = c.train(0, ds, epochs=epochs)

    _, _, dev_input, _ = ds.parts()
    print('final val auc', roc_auc_score(y_dev1, model.predict(dev_input).ravel()))

In [13]:
#run_single()

In [14]:
# ! added app_channel, app_device, ip_device and val-auc increased from 0.97 to 0.99

In [15]:
# run_kfold(10)

In [12]:

def run_single_split_by_time(test_size=0.1, epochs=6):
    
    train_sample = sample[sample.click_time < pd.datetime(2017,11,9)]
    dev_sample = sample[sample.click_time >= pd.datetime(2017,11,9)]
    
    X_train = train_sample.copy()
    y_train = train_sample.is_attributed.values.copy()
    
    X_dev = dev_sample.copy()
    y_dev = dev_sample.is_attributed.values.copy()

    ds = FoldDataset(X_train, y_train, X_dev, y_dev)
    ds.build()

    c = Classifier(loss=roc_auc_score_fn)
    model = c.train(0, ds, epochs=epochs)

    _, _, dev_input, _ = ds.parts()
    print('final val auc', roc_auc_score(y_dev, model.predict(dev_input).ravel()))

In [12]:
#run_single_split_by_time()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 71439 samples, validate on 28561 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
fold 0, best train-auc: 0.99162356324719 val-auc: 0.9739991484391818
final val auc 0.9739991484391818


In [16]:


class CachedDataset:
    def __init__(self, fname):
        self.arr = np.load(fname)
        self.is_train = False
        
    def build(self):
        self.X = self.arr['x']
        if 'y' in self.arr:
            self.is_train = True
            self.y = self.arr['y']         
        self.X_prepared = self.build_input()
            
    def build_input(self):
        embed_names = [e['name'] for e in EMBEDS]
        #misc = np.concatenate([self.X[:, 0], self.X[:, idx], ...]) #TODO
        misc = self.X[:, 0]
        h = {'misc': misc} 
        h.update(dict([(name, self.X[:, idx]) for idx, name in enumerate(embed_names)]))
        return h
            
    def parts(self):
        if self.is_train:
            return self.X_prepared, self.y, None, None
        else:
            return self.X_prepared
        

In [17]:
ds = CachedDataset('tmp/train.npz')
ds.build()


In [None]:
c = Classifier(optimizer='sgd', loss=roc_auc_score_fn)
model = c.train(0, ds, epochs=10, batch_size=64, shuffle=True)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 175658695 samples, validate on 9245195 samples
Epoch 1/10
  6874432/175658695 [>.............................] - ETA: 4:53:59 - loss: 0.0169

In [19]:
x, y, _, _ = ds.parts()

In [20]:

x.keys()

dict_keys(['misc', 'ip', 'app', 'device', 'os', 'channel', 'app_channel', 'app_device', 'ip_device', 'click_hour', 'click_minute'])

8