In [12]:
import os
import random

os.environ['KERAS_BACKEND'] = 'tensorflow'

import pandas as pd
import numpy as np
import keras

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

SEED=42

np.random.seed(SEED)
random.seed(SEED)

In [2]:
dtypes = {
    'ip':'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}   


def add_meanx(df):
    df1 = sample[['is_attributed', 'app']].groupby(['app']).median().rename(columns={'is_attributed': 'mean1'}).reset_index()
    df = pd.merge(df, df1, on=['app'], how='left')
    
    #df6 = sample[['is_attributed', 'app']].groupby(['app']).std().rename(columns={'is_attributed': 'mean6'}).reset_index()
    #df = pd.merge(df, df6, on=['app'], how='left')

    #df7 = sample[['is_attributed', 'app']].groupby(['app']).var().rename(columns={'is_attributed': 'mean7'}).reset_index()
    #df = pd.merge(df, df7, on=['app'], how='left')

    df2 = sample[['is_attributed', 'app', 'channel']].groupby(['app', 'channel']).median().rename(columns={'is_attributed': 'mean2'}).reset_index()    
    df = pd.merge(df, df2, on=['app', 'channel'], how='left')
                   
    #df4 = sample[['is_attributed', 'app', 'channel']].groupby(['app', 'channel']).var().rename(columns={'is_attributed': 'mean4'}).reset_index()    
    #df = pd.merge(df, df4, on=['app', 'channel'], how='left')
    
    #df5 = sample[['is_attributed', 'app', 'channel']].groupby(['app', 'channel']).std().rename(columns={'is_attributed': 'mean5'}).reset_index()    
    #df = pd.merge(df, df5, on=['app', 'channel'], how='left')
    
    df3 = sample[['is_attributed', 'app', 'device']].groupby(['app', 'device']).median().rename(columns={'is_attributed': 'mean3'}).reset_index()    
    df = pd.merge(df, df3, on=['app', 'device'], how='left')
    
    return df


In [3]:
sample = pd.read_csv('input/train_sample.csv', dtype=dtypes, parse_dates=['click_time', 'attributed_time'])
sample['click_wd'] = sample.click_time.dt.weekday
sample['click_hour'] = sample.click_time.dt.hour
sample = add_meanx(sample)


In [4]:
sample_y = sample.is_attributed.values
sample = sample.drop(['attributed_time', 'click_time', 'is_attributed'], axis=1)

len(sample)

100000

In [5]:

X_train, X_val, y_train, y_val = train_test_split(sample, sample_y, test_size=0.2, random_state=SEED)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(80000, 10) (20000, 10) (80000,) (20000,)


In [26]:
import keras
from keras.models import Model
from keras.layers import *
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.engine.topology import Layer
import keras.backend as K

EMBEDS = [
    {'name': 'ip', 'dim': [1024, 100]},
    {'name': 'app', 'dim': [1024, 100]},
    {'name': 'device', 'dim': [1024, 50]},
    {'name': 'os', 'dim': [1024, 50]},
    {'name': 'channel', 'dim': [1024, 50]},
    {'name': 'click_wd', 'dim': [3, 3]},
    {'name': 'click_hour', 'dim': [24, 5]}
]

def embed(input_dim, output_dim, x):
    e = Embedding(input_dim, output_dim, input_length=1, embeddings_regularizer=l2(1e-8))
    r = e(x)
    r = Reshape((output_dim,))(r)
    return r

def build_model(features):
    misc = Input(shape=(features,), name='misc')
    
    for e in EMBEDS:
        e['input'] = Input(shape=(1,), name=e['name'])
        e['layer'] = embed(e['dim'][0], e['dim'][1], e['input'])
        
    h = concatenate([misc] + [e['layer'] for e in EMBEDS])
    h = BatchNormalization()(h)
    
    h = Dense(32, activation='relu')(h)
    h = Dropout(0.5)(h)
    
    h = Dense(16, activation='relu')(h)
    
    h = Dense(1, activation='sigmoid')(h)
    
    model = Model(
            inputs=[misc] + [e['input'] for e in EMBEDS],
            outputs=h)
    
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model    
    
    

In [27]:
def hashed(s, size=10):
    return s.apply(lambda x: hash(x) % 2 ** size)

def build_input(df):
    embed_names = [e['name'] for e in EMBEDS]
    h = {'misc': df[[x for x in df.columns if x not in embed_names]] }
    h.update(dict([(name, hashed(df.loc[:, name]).values) for name in embed_names]))
    return h

train_input = build_input(X_train)
val_input = build_input(X_val)

model = build_model(train_input['misc'].shape[1])
model.fit(train_input, y_train, 
          validation_data=(val_input, y_val),
          epochs=3)

Train on 80000 samples, validate on 20000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc46089b2b0>

In [28]:

val_input = build_input(X_val)



In [29]:
print('train auc:', roc_auc_score(y_train, model.predict(train_input).ravel()))
print('val auc:', roc_auc_score(y_val, model.predict(val_input).ravel()))


train auc: 0.5599002464512838
val auc: 0.5319284764384474
