In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import pickle

# Prepare data

In [11]:
# Compact column dtypes for the raw click data; the same mapping is reused
# further down when the sample submission file is loaded.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [3]:
# Columns read from test.csv; click_time is loaded as text here and converted
# to datetime in a later cell.
usecols = ['click_id', 'ip', 'app', 'device', 'os', 'channel', 'click_time']

In [4]:
%%time
test = pd.read_csv('../data/test.csv', dtype=dtypes, usecols=usecols)

CPU times: user 8.42 s, sys: 659 ms, total: 9.08 s
Wall time: 9.57 s


In [5]:
# Replace the column outright. Assigning through `.loc[:, col]` writes into the
# existing object-dtype column on pandas >= 2.0, which leaves the values stored
# as objects and makes the `.dt` accessor used in the following cells fail.
test['click_time'] = pd.to_datetime(test['click_time'], format='%Y-%m-%d %H:%M:%S')

In [6]:
%%time
# Calendar date of each click; used below as one of the grouping keys.
test['date'] = test['click_time'].dt.date

CPU times: user 27.5 s, sys: 869 ms, total: 28.4 s
Wall time: 28.4 s


In [7]:
# Minute of the day: the minute component plus 60 minutes per elapsed hour.
test['minute'] = test['click_time'].dt.minute + 60 * test['click_time'].dt.hour

In [8]:
# Group the clicks by `ip`, `date` and `minute`; the aggregate features
# computed below are all per-group statistics over this grouping.
test_gp_idm = test.groupby(['ip', 'date', 'minute'])

In [9]:
def get_sum(group_obj, col):
    """Count the distinct values of ``col`` inside each group.

    Despite the name, this is a per-group ``nunique``, not a sum.

    Parameters
    ----------
    group_obj : pandas GroupBy object
        Grouped frame that contains the column ``col``.
    col : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    The per-group distinct counts, named ``col + 's'`` (e.g. ``'apps'``).
    """
    distinct_counts = group_obj[col].nunique()
    distinct_counts.name = f'{col}s'
    return distinct_counts

In [10]:
def get_entropy(group_obj, col):
    """Shannon entropy (natural log) of ``col``'s value distribution per group.

    Parameters
    ----------
    group_obj : pandas GroupBy object
        Grouped frame that contains the column ``col``. Any number of
        grouping keys is supported.
    col : str
        Name of the column whose within-group distribution is measured.

    Returns
    -------
    pandas.Series
        One entropy value per group, indexed by the grouping keys and named
        ``col + '_ent'`` (e.g. ``'app_ent'``). A group with a single distinct
        value has entropy 0.
    """
    # Relative frequency of every distinct value inside each group. The result
    # is indexed by the grouping keys followed by one trailing level that
    # holds the value itself.
    probs = group_obj[col].value_counts(normalize=True)
    # Sum -p * ln(p) over that trailing level, i.e. group by every index level
    # except the last. Deriving the levels from the index (instead of the
    # former hard-coded [0, 1, 2]) keeps the result correct for groupings with
    # fewer or more than three keys.
    key_levels = list(range(probs.index.nlevels - 1))
    entropy = (-probs * np.log(probs)).groupby(level=key_levels).sum()
    entropy.name = col + '_ent'
    return entropy

In [11]:
def create_feats(group_obj, feats=['app', 'device', 'os', 'channel']):
    """Build the per-group feature table.

    Parameters
    ----------
    group_obj : pandas GroupBy object
        Grouped click data.
    feats : list of str
        Columns for which a distinct-value count (``get_sum``) and an entropy
        (``get_entropy``) are computed, in that order.

    Returns
    -------
    pandas.DataFrame
        Column ``clicks`` (group size) followed by ``<feat>s`` and
        ``<feat>_ent`` for every entry of ``feats``.
    """
    print('clicks ...')
    group_sizes = group_obj.size()
    group_sizes.name = 'clicks'
    columns = [group_sizes]
    print('others ...')
    for name in feats:
        # Keep the count/entropy pair adjacent so the column order is stable.
        columns.extend((get_sum(group_obj, name), get_entropy(group_obj, name)))
    return pd.concat(columns, axis=1)

In [12]:
%%time
test_feats = create_feats(test_gp_idm)

clicks ...
others ...
CPU times: user 1min 12s, sys: 7.07 s, total: 1min 19s
Wall time: 1min 1s


In [13]:
test_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
ip,date,minute,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2017-11-10,291,1,1,0.0,1,0.0,1,0.0,1,0.0
0,2017-11-10,294,1,1,0.0,1,0.0,1,0.0,1,0.0
1,2017-11-10,294,1,1,0.0,1,0.0,1,0.0,1,0.0
2,2017-11-10,637,1,1,0.0,1,0.0,1,0.0,1,0.0
2,2017-11-10,783,4,4,1.386294,1,0.0,1,0.0,3,1.039721


In [14]:
%who

create_feats	 dtypes	 gc	 get_entropy	 get_sum	 lgb	 np	 pd	 plt	 
precision_score	 recall_score	 roc_auc_score	 test	 test_feats	 test_gp_idm	 usecols	 


In [15]:
del test_gp_idm
gc.collect()

41

In [16]:
%%time
# Broadcast the per-(ip, date, minute) aggregates back onto every click row.
test = test.merge(test_feats, how='left', left_on=['ip', 'date', 'minute'], right_index=True)

CPU times: user 7.12 s, sys: 2.38 s, total: 9.5 s
Wall time: 8.18 s


In [17]:
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,date,minute,clicks,apps,app_ent,devices,device_ent,oss,os_ent,channels,channel_ent
0,0,5744,9,1,3,107,2017-11-10 04:00:00,2017-11-10,240,2,2,0.693147,1,0.0,1,0.0,2,0.693147
1,1,119901,9,1,3,466,2017-11-10 04:00:00,2017-11-10,240,15,8,1.771624,1,0.0,3,0.853236,9,2.048883
2,2,72287,21,1,19,128,2017-11-10 04:00:00,2017-11-10,240,5,4,1.332179,1,0.0,4,1.332179,5,1.609438
3,3,78477,15,1,13,111,2017-11-10 04:00:00,2017-11-10,240,9,6,1.676988,1,0.0,2,0.348832,8,2.043192
4,4,123080,12,1,13,328,2017-11-10 04:00:00,2017-11-10,240,4,4,1.386294,1,0.0,1,0.0,4,1.386294


In [18]:
del test_feats
gc.collect()

126

In [19]:
# The grouping keys and the raw timestamp are no longer needed once the
# aggregates have been merged in.
test.drop(columns=['ip', 'click_time', 'date'], inplace=True)

In [20]:
gc.collect()

0

In [21]:
%%time
# Persist the feature-augmented test set; the row index is not written.
test.to_csv('../derived_data/agg_clicks_test.csv', index=False)

CPU times: user 2min 48s, sys: 971 ms, total: 2min 49s
Wall time: 2min 50s


In [22]:
# Downcast targets for the engineered columns: the minute and the count
# columns become uint16, the entropy columns float16.
post_dtypes = {
    **dict.fromkeys(['minute', 'clicks', 'apps', 'devices', 'oss', 'channels'], 'uint16'),
    **dict.fromkeys(['app_ent', 'device_ent', 'os_ent', 'channel_ent'], 'float16'),
}

In [23]:
test = test.astype(post_dtypes)

In [12]:
# NOTE(review): `dtypes` maps is_attributed to uint8, so that column is loaded
# as an integer placeholder here; it is overwritten with the model's
# predictions further down.
submit = pd.read_csv('../data/sample_submission.csv', dtype=dtypes)

In [25]:
%who

create_feats	 dtypes	 gc	 get_entropy	 get_sum	 lgb	 np	 pd	 plt	 
post_dtypes	 precision_score	 recall_score	 roc_auc_score	 submit	 test	 usecols	 


In [26]:
gc.collect()

2532

In [27]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (6 entries)


# LightGBM

In [2]:
# Read back the feature file written by the to_csv cell in the data-preparation
# section. The path used previously ('aggregated_clicks_test.csv') is never
# written by the visible cells of this notebook, so a fresh run could not find it.
test = pd.read_csv('../derived_data/agg_clicks_test.csv')

In [3]:
# Load the previously trained model from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load model files that you produced yourself or otherwise trust.
with open('../results/lgm.pickle', 'rb') as f:
    lgb_model = pickle.load(f)

In [4]:
# Model inputs: for each raw categorical column the value itself, its
# per-group distinct count (`<col>s`) and its per-group entropy (`<col>_ent`),
# followed by the minute of day and the per-group click count.
predictors = [
    name
    for base in ('app', 'device', 'os', 'channel')
    for name in (base, base + 's', base + '_ent')
] + ['minute', 'clicks']

In [5]:
%%time
# Score every test row using the model's recorded best iteration.
pred = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)

CPU times: user 6min 38s, sys: 2.89 s, total: 6min 41s
Wall time: 1min 46s


In [6]:
pred.shape

(18790469,)

In [7]:
# Keep the predictions next to their click_id.
test['pred'] = pred

In [8]:
test.sort_values('click_id', inplace=True)

In [13]:
submit.sort_values('click_id', inplace=True)

In [14]:
# Attach predictions by click_id rather than by row label, so the result does
# not depend on both frames happening to share the same index. Assigning a
# whole new column (instead of writing through `.loc[:, ...]`) also avoids
# pushing float values into the uint8 column produced by `dtype=dtypes` when
# the sample submission was loaded.
pred_by_click = pd.Series(test['pred'].values, index=test['click_id'].values)
submit['is_attributed'] = submit['click_id'].map(pred_by_click)
# Every submission row must have received a prediction.
assert submit['is_attributed'].notna().all(), 'some click_ids have no prediction'

In [15]:
submit.head()

Unnamed: 0,click_id,is_attributed
0,0,0.158271
1,1,0.106318
2,2,0.040968
3,3,0.083967
4,4,0.035841


In [17]:
submit = submit.astype({'is_attributed': 'float32'})

In [18]:
submit.to_csv('../results/agg_full_lgb_float32.csv', index=False)