In [103]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from sklearn.metrics import roc_auc_score
os.environ['OMP_NUM_THREADS'] = '4'
import gc

In [2]:
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
train_df = pd.read_csv("../train.csv", dtype=dtypes, skiprows = range(1, 131886954), usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
print('load train....')

load train....


In [6]:
len(train_df)

53016937

In [7]:
test_df = pd.read_csv("../test.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
len_train = len(train_df)
print(len_train)

53016937


In [8]:
train_df=train_df.append(test_df)
del test_df; gc.collect()

28

In [9]:
print('hour, day, wday....')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
train_df['wday']  = pd.to_datetime(train_df.click_time).dt.dayofweek.astype('uint8')

hour, day, wday....


In [10]:
print('grouping by ip-day-hour combination....')
gp = train_df[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'qty'})
train_df = train_df.merge(gp, on=['ip','day','hour'], how='left')

grouping by ip-day-hour combination....


In [11]:
del gp; gc.collect()
print('group by ip-app combination....')
gp = train_df[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_df = train_df.merge(gp, on=['ip','app'], how='left')
del gp; gc.collect()
print('group by ip-app-os combination....')
gp = train_df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')
del gp; gc.collect()

group by ip-app combination....
group by ip-app-os combination....


117

In [12]:
print("vars and data type....")
train_df['qty'] = train_df['qty'].astype('uint16')
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

vars and data type....


In [13]:
from sklearn.preprocessing import LabelEncoder
train_df[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
print ('final part of preparation....')

final part of preparation....


In [14]:
## Look for train_df
train_df.shape, len_train

((71807406, 14), 53016937)

In [15]:
test_df = train_df[len_train:]
train_df = train_df[:len_train]

In [16]:
train_df.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os,hour,day,wday,qty,ip_app_count,ip_app_os_count
0,11,487,,2017-11-09 00:00:00,1,201143,0.0,13,0,9,3,70,56,13
1,2,469,,2017-11-09 00:00:00,1,34684,0.0,13,0,9,3,54,68,8
2,26,477,,2017-11-09 00:00:00,1,207368,0.0,19,0,9,3,101,27,13
3,18,121,,2017-11-09 00:00:00,1,110176,0.0,8,0,9,3,146,214,10
4,12,265,,2017-11-09 00:00:00,1,109644,0.0,19,0,9,3,393,1423,351


In [18]:
test_df.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os,hour,day,wday,qty,ip_app_count,ip_app_os_count
53016937,9,107,0.0,2017-11-10 04:00:00,1,5744,,3,4,10,4,34,87,3
53016938,9,466,1.0,2017-11-10 04:00:00,1,119901,,3,4,10,4,403,1089,14
53016939,21,128,2.0,2017-11-10 04:00:00,1,72287,,19,4,10,4,229,417,52
53016940,15,111,3.0,2017-11-10 04:00:00,1,78477,,13,4,10,4,239,193,45
53016941,12,328,4.0,2017-11-10 04:00:00,1,123080,,13,4,10,4,60,79,27


In [19]:
def reg_target_encoding(train, test, col, target='is_attributed'):
    """ Computes regularize mean encoding.
    Inputs:
       train: training dataframe
    """
    
    new_col_name='%s_count_enc'%col 
    
    
    temp = train.groupby(col)[target].count()
    train[new_col_name]= train[col].map(temp)
    test[new_col_name]= test[col].map(temp)
    global_mean = train[target].count()
    train[new_col_name] = train[new_col_name].fillna(global_mean)
    test[new_col_name] = test[new_col_name].fillna(global_mean)

reg_target_encoding(train_df,test_df, col = "ip")
reg_target_encoding(train_df,test_df, col = "app")
reg_target_encoding(train_df,test_df, col = "device")
reg_target_encoding(train_df,test_df, col = "os")
reg_target_encoding(train_df,test_df, col = "hour")
reg_target_encoding(train_df,test_df, col = "channel")

In [74]:
test_df.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os,hour,day,wday,qty,ip_app_count,ip_app_os_count,ip_count_enc,app_count_enc,device_count_enc,os_count_enc,hour_count_enc,channel_count_enc
53016937,9,107,0.0,2017-11-10 04:00:00,1,5744,,3,4,10,4,34,87,3,378.0,6213966.0,49891688.0,856404.0,4032691,2954839.0
53016938,9,466,1.0,2017-11-10 04:00:00,1,119901,,3,4,10,4,403,1089,14,5835.0,6213966.0,49891688.0,856404.0,4032691,1171913.0
53016939,21,128,2.0,2017-11-10 04:00:00,1,72287,,19,4,10,4,229,417,52,3584.0,1098062.0,49891688.0,12537977.0,4032691,746233.0
53016940,15,111,3.0,2017-11-10 04:00:00,1,78477,,13,4,10,4,239,193,45,3312.0,3432186.0,49891688.0,11284964.0,4032691,160937.0
53016941,12,328,4.0,2017-11-10 04:00:00,1,123080,,13,4,10,4,60,79,27,375.0,6349572.0,49891688.0,11284964.0,4032691,493720.0


In [75]:
target = train_df.is_attributed
train_df.drop(['click_id', 'click_time', 'is_attributed'], axis=1, inplace=True)
test_df.drop(['click_id', 'click_time', 'is_attributed'], axis=1, inplace=True)

In [88]:
train_df.drop('ip', axis=1, inplace=True)
test_df.drop('ip', axis=1, inplace=True)

In [76]:
print ('neural network....')
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate, Reshape, Merge, Activation
from keras.layers import BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.models import Sequential

from keras.models import Model
from keras.optimizers import Adam

neural network....


In [21]:
max_app = np.max([train_df['app'].max(), test_df['app'].max()])+1
max_ch = np.max([train_df['channel'].max(), test_df['channel'].max()])+1
max_dev = np.max([train_df['device'].max(), test_df['device'].max()])+1
max_os = np.max([train_df['os'].max(), test_df['os'].max()])+1
max_h = np.max([train_df['hour'].max(), test_df['hour'].max()])+1
max_d = np.max([train_df['day'].max(), test_df['day'].max()])+1
max_wd = np.max([train_df['wday'].max(), test_df['wday'].max()])+1
max_qty = np.max([train_df['qty'].max(), test_df['qty'].max()])+1
max_c1 = np.max([train_df['ip_app_count'].max(), test_df['ip_app_count'].max()])+1
max_c2 = np.max([train_df['ip_app_os_count'].max(), test_df['ip_app_os_count'].max()])+1

In [77]:
def get_emb_models(name, in_dim, emb_n):
    if in_dim<50:
        emb_n=5
    
    in_app= Sequential(name=name)
    in_app.add(Embedding(in_dim, emb_n, input_length=1))
    in_app.add(Reshape(target_shape=(emb_n,)))
    return in_app

In [91]:
def get_num_models(name):
    in_app= Sequential(name=name)
    in_app.add(Dense(1, input_dim=1))
    return in_app

In [179]:
for name, in_dim in zip(['app', 'ch', 'dev', 'os', 'h', 'd', 'wd', 'qty','c1','c2'], [max_app, max_ch, max_dev, max_os, max_h, max_d, max_wd, max_qty, max_c1, max_c2]):
    print(name, in_dim)

app 769
ch 501
dev 4228
os 957
h 17
d 11
wd 5
qty 43959
c1 65409
c2 16654


In [93]:
def get_model():
    models = []
    emb_n = 50
    dense_n = 1000
    
    for name, in_dim in zip(['app', 'ch', 'dev', 'os', 'h', 'd', 'wd', 'qty','c1','c2'], [max_app, max_ch, max_dev, max_os, max_h, max_d, max_wd, max_qty, max_c1, max_c2]):
        models.append(get_emb_models(name, in_dim, emb_n))
    
    for name in ['ip_count_enc','app_count_enc','device_count_enc','os_count_enc','hour_count_enc','channel_count_enc']:
        models.append(get_num_models(name))
    
    self_model = Sequential()
    self_model.add(Merge(models, mode='concat',  concat_axis=1))
    self_model.add(Dense(250, init='uniform'))
    self_model.add(Activation('relu'))
    
    self_model.add(Dropout(0.2))
    self_model.add(Dense(125, init='uniform'))
    self_model.add(Activation('relu'))

    self_model.add(Dense(1))
    self_model.add(Activation('sigmoid'))

    sgd = Adam(lr = 0.0001)
    self_model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return self_model


In [94]:
get_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_13 (Merge)             (None, 371)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 250)               93000     
_________________________________________________________________
activation_19 (Activation)   (None, 250)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 125)               31375     
_________________________________________________________________
activation_20 (Activation)   (None, 125)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 1)                 126       
__________

In [96]:
def preprocess(X):
    final_x=list()
    for cols in X.columns:
        temp=X[cols] #1
        final_x.append(temp)
    return final_x

In [173]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, target,stratify=target, test_size=0.25)

In [174]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39762702, 16), (13254235, 16), (39762702,), (13254235,))

In [175]:
import random
def generator(data, labels, num):
    idx = np.arange(0 , data.shape[0])
    np.random.shuffle(idx)
    idx = idx[:num]
    while True:
        data_shuffle = preprocess(data.iloc[idx])
        labels_shuffle = labels[idx]
        yield data_shuffle, labels_shuffle

In [172]:
train_df.head()

Unnamed: 0,app,channel,device,os,hour,day,wday,qty,ip_app_count,ip_app_os_count,ip_count_enc,app_count_enc,device_count_enc,os_count_enc,hour_count_enc,channel_count_enc
0,11,487,1,13,0,9,3,70,56,13,973,1164213,49891688,11284964,3318301,108393
1,2,469,1,13,0,9,3,54,68,8,836,6059213,49891688,11284964,3318301,715197
2,26,477,1,19,0,9,3,101,27,13,1079,1025214,49891688,12537977,3318301,1973977
3,18,121,1,8,0,9,3,146,214,10,1864,4773490,49891688,1511197,3318301,1294239
4,12,265,1,19,0,9,3,393,1423,351,8677,6349572,49891688,12537977,3318301,1598648


In [164]:
len(X_train)/20000

1988.1351

In [None]:
from keras import backend as K
K.clear_session()
self_model=get_model()
self_model.fit_generator(generator(X_train, y_train, 4000), epochs=1, steps_per_epoch= 20,
               callbacks=[roc_callback(training_data=(X_train, y_train),validation_data=(X_test, y_test))])

Epoch 1/1


In [180]:
gc.collect()

122152

In [181]:
del train_df
gc.collect()

1207

In [110]:
from sklearn.metrics import roc_auc_score
class roc_callback(Callback):
    def __init__(self,training_data,validation_data):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return