In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gc

# Prepare data

In [2]:
samples = pd.read_csv('../data/train_sample.csv')

In [2]:
train = pd.read_csv('../data/train.csv')

In [3]:
test = pd.read_csv('../data/test.csv')

In [4]:
submit = pd.read_csv('../data/sample_submission.csv')

In [4]:
samples.loc[:, 'click_datetime'] = pd.to_datetime(samples.click_time, format='%Y-%m-%d %H:%M:%S')
samples.loc[:, 'date'] = samples.click_datetime.dt.date
samples.loc[:, 'minute'] = samples.click_datetime.dt.hour * 60 + samples.click_datetime.dt.minute

In [16]:
samples = samples.drop('click_time', axis=1)

In [5]:
train.loc[:, 'click_datetime'] = pd.to_datetime(train.click_time, format='%Y-%m-%d %H:%M:%S')
train.loc[:, 'date'] = train.click_datetime.dt.date
train.loc[:, 'minute'] = train.click_datetime.dt.hour * 60 + train.click_datetime.dt.minute

In [24]:
train = train.drop(['click_time', 'click_datetime', 'attributed_time'], axis=1)

In [7]:
test.loc[:, 'click_datetime'] = pd.to_datetime(test.click_time, format='%Y-%m-%d %H:%M:%S')
test.loc[:, 'date'] = test.click_datetime.dt.date
test.loc[:, 'minute'] = test.click_datetime.dt.hour * 60 + test.click_datetime.dt.minute

In [21]:
del test

In [8]:
test = test.drop('click_time', axis=1)

In [5]:
samples_gp_idm = samples.groupby(['ip', 'date', 'minute'])

In [9]:
train_gp_idm = train.groupby(['ip', 'date', 'minute'])
test_gp_idm = test.groupby(['ip', 'date', 'minute'])

In [25]:
del train_gp_idm
del test_gp_idm

In [26]:
gc.collect()

5115

In [10]:
def get_sum(group_obj, col):
    res = group_obj[col].nunique()
    res.name = col + 's'
    return res

In [11]:
def get_entropy(group_obj, col):
    values = group_obj[col].value_counts(normalize=True)
    values_log = pd.Series(np.log(values), index=values.index)
    entropy_prep = - values * values_log
    entropy = entropy_prep.groupby(level=[0, 1, 2]).sum()
    entropy.name = col + '_ent'
    return entropy

In [12]:
def create_feats(group_obj, feats=['app', 'device', 'os', 'channel']):
    print('clicks ...')
    clicks = group_obj.size()
    clicks.name = 'clicks'
    print('others ...')
    series = [clicks]
    for feat in feats:
        series.append(get_sum(group_obj, feat))
        series.append(get_entropy(group_obj, feat))
    return pd.concat(series, axis=1)

In [11]:
samples_feats = create_feats(samples_gp_idm)

clicks ...
others ...


In [12]:
samples_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
ip,date,minute,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9,2017-11-07,966,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-07,97,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-07,453,1,1,0.0,1,0.0,1,0.0,1,0.0
10,2017-11-08,679,1,1,0.0,1,0.0,1,0.0,1,0.0
19,2017-11-08,552,1,1,0.0,1,0.0,1,0.0,1,0.0


In [13]:
train_feats = create_feats(train_gp_idm)

clicks ...
others ...


In [14]:
train_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
ip,date,minute,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2017-11-08,1045,4,4,1.386294,1,0.0,1,0.0,3,1.039721
1,2017-11-08,1067,1,1,0.0,1,0.0,1,0.0,1,0.0
1,2017-11-08,1177,1,1,0.0,1,0.0,1,0.0,1,0.0
1,2017-11-08,1199,1,1,0.0,1,0.0,1,0.0,1,0.0
1,2017-11-08,1205,1,1,0.0,1,0.0,1,0.0,1,0.0


In [15]:
test_feats = create_feats(test_gp_idm)

clicks ...
others ...


In [16]:
test_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
ip,date,minute,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2017-11-10,291,1,1,0.0,1,0.0,1,0.0,1,0.0
0,2017-11-10,294,1,1,0.0,1,0.0,1,0.0,1,0.0
1,2017-11-10,294,1,1,0.0,1,0.0,1,0.0,1,0.0
2,2017-11-10,637,1,1,0.0,1,0.0,1,0.0,1,0.0
2,2017-11-10,783,4,4,1.386294,1,0.0,1,0.0,3,1.039721


In [14]:
samples = pd.merge(samples, samples_feats.reset_index(), on=['ip', 'date', 'minute'], how='left')

In [15]:
samples.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_datetime,date,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,2017-11-07 09:30:38,2017-11-07,570,1,1,0.0,1,0.0,1,0.0,1,0.0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,2017-11-07 13:40:27,2017-11-07,820,1,1,0.0,1,0.0,1,0.0,1,0.0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,2017-11-07 18:05:24,2017-11-07,1085,1,1,0.0,1,0.0,1,0.0,1,0.0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,2017-11-07 04:58:08,2017-11-07,298,1,1,0.0,1,0.0,1,0.0,1,0.0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,2017-11-09 09:00:09,2017-11-09,540,1,1,0.0,1,0.0,1,0.0,1,0.0


In [18]:
train.head()

Unnamed: 0,ip,app,device,os,channel,attributed_time,is_attributed,click_datetime,date,minute
0,83230,3,1,13,379,,0,2017-11-06 14:32:21,2017-11-06,872
1,17357,3,1,19,379,,0,2017-11-06 14:33:34,2017-11-06,873
2,35810,3,1,13,379,,0,2017-11-06 14:34:12,2017-11-06,874
3,45745,14,1,13,478,,0,2017-11-06 14:34:52,2017-11-06,874
4,161007,3,1,13,379,,0,2017-11-06 14:35:08,2017-11-06,875


In [27]:
train = pd.merge(train, train_feats.reset_index(), on=['ip', 'date', 'minute'], how='left')

In [28]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,date,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
0,83230,3,1,13,379,0,2017-11-06,872,1,1,0.0,1,0.0,1,0.0,1,0.0
1,17357,3,1,19,379,0,2017-11-06,873,1,1,0.0,1,0.0,1,0.0,1,0.0
2,35810,3,1,13,379,0,2017-11-06,874,1,1,0.0,1,0.0,1,0.0,1,0.0
3,45745,14,1,13,478,0,2017-11-06,874,1,1,0.0,1,0.0,1,0.0,1,0.0
4,161007,3,1,13,379,0,2017-11-06,875,1,1,0.0,1,0.0,1,0.0,1,0.0


In [30]:
del train_feats

In [31]:
gc.collect()

654

In [44]:
%who

create_feats	 gc	 get_entropy	 get_sum	 np	 pd	 train	 


In [34]:
del test_feats

In [35]:
del submit

In [56]:
gc.collect()

131

In [46]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (8 entries)


In [47]:
Out

{}

In [48]:
train.drop(['date', 'ip'], axis=1, inplace=True)

MemoryError: 

In [None]:
test = pd.merge(test, test_feats.reset_index(), on=['ip', 'date', 'minute'], how='left')

In [17]:
samples.isnull().sum()

ip                     0
app                    0
device                 0
os                     0
channel                0
attributed_time    99773
is_attributed          0
click_datetime         0
date                   0
minute                 0
clicks                 0
apps                   0
app_ent                0
devices                0
device_ent             0
oss                    0
os_ent                 0
channels               0
channel_ent            0
dtype: int64

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

# Random forest

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
rfc = RandomForestClassifier(n_estimators=25, n_jobs=4, class_weight={0: 1, 1: 10})

In [52]:
train_features = ['app', 'apps', 'app_ent',
                  'device', 'devices', 'device_ent',
                  'os', 'oss', 'os_ent',
                  'channel', 'channels', 'channel_ent',
                  'minute', 'ip', 'is_attributed']

In [22]:
train = train.loc[train.click_trainetime < '2017-11-09', train_features]

In [62]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,date,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
0,83230,3,1,13,379,0,2017-11-06,872,1,1,0.0,1,0.0,1,0.0,1,0.0
1,17357,3,1,19,379,0,2017-11-06,873,1,1,0.0,1,0.0,1,0.0,1,0.0
2,35810,3,1,13,379,0,2017-11-06,874,1,1,0.0,1,0.0,1,0.0,1,0.0
3,45745,14,1,13,478,0,2017-11-06,874,1,1,0.0,1,0.0,1,0.0,1,0.0
4,161007,3,1,13,379,0,2017-11-06,875,1,1,0.0,1,0.0,1,0.0,1,0.0


In [97]:
train.to_csv('../derived_data/aggregated_clicks.csv', index=False)

In [4]:
read_cols = ['app', 'device', 'os', 'channel', 'minute', 'clicks', 'apps', 'app_ent',
             'devices', 'device_ent', 'oss', 'os_ent', 'channels', 'channel_ent', 'is_attributed']

In [5]:
train = pd.read_csv('../derived_data/aggregated_clicks.csv', usecols=read_cols, nrows=10000000)

In [6]:
train.head()

Unnamed: 0,app,device,os,channel,is_attributed,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
0,3,1,13,379,0,872,1,1,0.0,1,0.0,1,0.0,1,0.0
1,3,1,19,379,0,873,1,1,0.0,1,0.0,1,0.0,1,0.0
2,3,1,13,379,0,874,1,1,0.0,1,0.0,1,0.0,1,0.0
3,14,1,13,478,0,874,1,1,0.0,1,0.0,1,0.0,1,0.0
4,3,1,13,379,0,875,1,1,0.0,1,0.0,1,0.0,1,0.0


In [9]:
train = train.reindex(columns=read_cols)

In [10]:
train.head()

Unnamed: 0,app,device,os,channel,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent,is_attributed
0,3,1,13,379,872,1,1,0.0,1,0.0,1,0.0,1,0.0,0
1,3,1,19,379,873,1,1,0.0,1,0.0,1,0.0,1,0.0,0
2,3,1,13,379,874,1,1,0.0,1,0.0,1,0.0,1,0.0,0
3,14,1,13,478,874,1,1,0.0,1,0.0,1,0.0,1,0.0,0
4,3,1,13,379,875,1,1,0.0,1,0.0,1,0.0,1,0.0,0


In [11]:
train.is_attributed.value_counts()

0    9981283
1      18717
Name: is_attributed, dtype: int64

In [24]:
valid = train.loc[train.click_trainetime >= '2017-11-09', train_features]

In [57]:
train.shape

(184903890, 17)

In [58]:
len(train) // 10000

18490

In [63]:
features = [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [72]:
batch_size = 10000

In [12]:
%%time
rfc.fit(train.iloc[:, :-1], train.iloc[:, -1])

CPU times: user 18min 13s, sys: 20.2 s, total: 18min 33s
Wall time: 4min 59s


RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 10},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=4, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [13]:
train.head(1000000).is_attributed.value_counts()

0    998307
1      1693
Name: is_attributed, dtype: int64

In [74]:
%%time
for i in range(len(train)//batch_size+1):
    rfc.fit(train.iloc[i*batch_size:(i+1)*batch_size, features], train.iloc[i*batch_size:(i+1)*batch_size, 5])

CPU times: user 19min 46s, sys: 18.5 s, total: 20min 4s
Wall time: 27min 28s


In [14]:
pred = rfc.predict(train.iloc[:1000000, :-1])

In [15]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [16]:
precision_score(train.iloc[:1000000, -1], pred)

0.6704035874439462

In [17]:
recall_score(train.iloc[:1000000, -1], pred)

0.8830478440637921

In [18]:
pred_prob = rfc.predict_proba(train.iloc[:1000000, :-1])

In [19]:
roc_auc_score(train.iloc[:1000000, -1], pred_prob[:, 1])

0.9996733140204594

In [20]:
import pickle

In [21]:
with open('../derived_data/random_forest_aggregated_clicks.pk', 'wb') as f:
    pickle.dump(rfc, f)

In [50]:
rfc_bm = RandomForestClassifier(n_estimators=50)

In [51]:
rfc_bm.fit(train.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']], train.iloc[:, -1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
pred = rfc_bm.predict(valid.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']])

In [53]:
precision_score(valid.iloc[:, -1], pred)

0.54761904761904767

In [54]:
recall_score(valid.iloc[:, -1], pred)

0.38983050847457629

In [55]:
pred_prob = rfc_bm.predict_proba(valid.loc[:, ['ip', 'app', 'device', 'os', 'channel', 'minute']])

In [56]:
roc_auc_score(valid.iloc[:, -1], pred_prob[:, 1])

0.92679252957568248