In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [7]:
dat = pd.read_csv('../data/train_sample.csv')

In [8]:
dat.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [9]:
dat.loc[:, 'click_datetime'] = pd.to_datetime(dat.click_time, format='%Y-%m-%d %H:%M:%S')
dat.loc[:, 'date'] = dat.click_datetime.dt.date
dat.loc[:, 'minute'] = dat.click_datetime.dt.hour * 60 + dat.click_datetime.dt.minute

In [10]:
dat.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_datetime,date,minute
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,2017-11-07 09:30:38,2017-11-07,570
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,2017-11-07 13:40:27,2017-11-07,820
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,2017-11-07 18:05:24,2017-11-07,1085
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,2017-11-07 04:58:08,2017-11-07,298
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,2017-11-09 09:00:09,2017-11-09,540


In [11]:
gp_idm = dat.groupby(['ip', 'date', 'minute'])

In [12]:
def get_sum(col):
    res = gp_idm[col].nunique()
    res.name = col + 's'
    return res

In [13]:
def get_entropy(col):
    values = gp_idm[col].value_counts(normalize=True)
    values_log = pd.Series(np.log(values), index=values.index)
    entropy_prep = - values * values_log
    entropy = entropy_prep.groupby(level=[0, 1, 2]).sum()
    entropy.name = col + '_ent'
    return entropy

In [14]:
def create_feats():
    print('clicks ...')
    clicks = gp_idm.size()
    clicks.name = 'clicks'
    print('apps ...')
    apps = get_sum('app')
    devices = get_sum('device')
    oss = get_sum('os')
    channels = get_sum('channel')
    print('app entropy ...')
    app_entropy = get_entropy('app')
    device_entropy = get_entropy('device')
    os_entropy = get_entropy('os')
    channel_entropy = get_entropy('channel')
    return pd.concat([clicks, apps, app_entropy, devices, device_entropy, oss, os_entropy, channels, channel_entropy], axis=1)

In [15]:
feats = create_feats()

clicks ...
apps ...
app entropy ...


In [16]:
feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
ip,date,minute,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9,2017-11-07,966,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-07,97,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-07,453,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-08,679,1,1,0.0,1,0.0,1,0.0,1,0.0
19,2017-11-08,552,1,1,0.0,1,0.0,1,0.0,1,0.0


In [17]:
dat = pd.merge(dat, feats.reset_index(), on=['ip', 'date', 'minute'], how='left')

In [18]:
dat.isnull().sum()

ip                     0
app                    0
device                 0
os                     0
channel                0
click_time             0
attributed_time    99773
is_attributed          0
click_datetime         0
date                   0
minute                 0
clicks                 0
apps                   0
app_ent                0
devices                0
device_ent             0
oss                    0
os_ent                 0
channels               0
channel_ent            0
dtype: int64

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rfc = RandomForestClassifier(n_estimators=50)

In [21]:
train_features = ['app', 'apps', 'app_ent',
                  'device', 'devices', 'device_ent',
                  'os', 'oss', 'os_ent',
                  'channel', 'channels', 'channel_ent',
                  'minute', 'ip', 'is_attributed']

In [22]:
train = dat.loc[dat.click_datetime < '2017-11-09', train_features]

In [23]:
train.head()

Unnamed: 0,app,apps,app_ent,device,devices,device_ent,os,oss,os_ent,channel,channels,channel_ent,minute,ip,is_attributed
0,12,1,0.0,1,1,0.0,13,1,0.0,497,1,0.0,570,87540,0
1,25,1,0.0,1,1,0.0,17,1,0.0,259,1,0.0,820,105560,0
2,12,1,0.0,1,1,0.0,19,1,0.0,212,1,0.0,1085,101424,0
3,13,1,0.0,1,1,0.0,13,1,0.0,477,1,0.0,298,94584,0
7,9,1,0.0,1,1,0.0,25,1,0.0,442,1,0.0,601,121505,0


In [24]:
valid = dat.loc[dat.click_datetime >= '2017-11-09', train_features]

In [44]:
rfc.fit(train.iloc[:, :-1], train.iloc[:, -1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
pred = rfc.predict(valid.iloc[:, :-1])

In [27]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [46]:
precision_score(valid.iloc[:, -1], pred)

0.4642857142857143

In [47]:
recall_score(valid.iloc[:, -1], pred)

0.22033898305084745

In [48]:
pred_prob = rfc.predict_proba(valid.iloc[:, :-1])

In [49]:
roc_auc_score(valid.iloc[:, -1], pred_prob[:, 1])

0.94244620359677422

In [50]:
rfc_bm = RandomForestClassifier(n_estimators=50)

In [51]:
rfc_bm.fit(train.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']], train.iloc[:, -1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
pred = rfc_bm.predict(valid.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']])

In [53]:
precision_score(valid.iloc[:, -1], pred)

0.54761904761904767

In [54]:
recall_score(valid.iloc[:, -1], pred)

0.38983050847457629

In [55]:
pred_prob = rfc_bm.predict_proba(valid.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']])

In [56]:
roc_auc_score(valid.iloc[:, -1], pred_prob[:, 1])

0.92679252957568248