In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
from itertools import combinations

# Prepare data

In [2]:
# Relative path of the HDF5 file that pd.read_hdf loads the 'test' table from.
hdf_path = '../derived_data/source.hdf'

In [3]:
%%time
# Load the object stored under key 'test' in the HDF5 file into memory.
test = pd.read_hdf(hdf_path, key='test')

CPU times: user 203 ms, sys: 959 ms, total: 1.16 s
Wall time: 4.47 s


In [4]:
# Day of month of each click, stored as uint8 to keep the frame small.
test['day'] = test['click_time'].dt.day.astype('uint8')

In [5]:
# Hour of day (0-23) of each click, stored as uint8 to keep the frame small.
test['hour'] = test['click_time'].dt.hour.astype('uint8')

In [6]:
# Preview the first rows to check the new 'day' and 'hour' columns.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,day,hour
0,0,5744,9,1,3,107,2017-11-10 04:00:00,10,4
1,1,119901,9,1,3,466,2017-11-10 04:00:00,10,4
2,2,72287,21,1,19,128,2017-11-10 04:00:00,10,4
3,3,78477,15,1,13,111,2017-11-10 04:00:00,10,4
4,4,123080,12,1,13,328,2017-11-10 04:00:00,10,4


In [7]:
# Print the index range, per-column dtypes and total memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18790469 entries, 0 to 18790468
Data columns (total 9 columns):
click_id      uint32
ip            uint32
app           uint16
device        uint16
os            uint16
channel       uint16
click_time    datetime64[ns]
day           uint8
hour          uint8
dtypes: datetime64[ns](1), uint16(4), uint32(2), uint8(2)
memory usage: 609.3 MB


In [8]:
%%time
test.drop(['click_time'], axis=1, inplace=True)

CPU times: user 576 ms, sys: 414 ms, total: 990 ms
Wall time: 262 ms


In [9]:
# Force a garbage-collection pass after dropping the timestamp column;
# the cell output is gc.collect()'s return value.
gc.collect()

17

In [10]:
# Hour-of-day groups used to build the 'in_test_hh' feature.
# NOTE(review): judging by the names these are the hours that occur most /
# least often in the test set; how they were derived is not shown here.
most_freq_hours_in_test_data = [4, 5, 9, 10, 13, 14]
least_freq_hours_in_test_data = [6, 11, 15]

In [11]:
# Encode the hour group as a small integer:
#   hour in most_freq_hours_in_test_data  -> 3 - 2 = 1
#   hour in least_freq_hours_in_test_data -> 3 - 1 = 2
#   any other hour                        -> 3
test['in_test_hh'] = (
    3
    - test['hour'].isin(most_freq_hours_in_test_data) * 2
    - test['hour'].isin(least_freq_hours_in_test_data) * 1
).astype('uint8')

In [12]:
# Preview the first rows to check the new 'in_test_hh' column.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,day,hour,in_test_hh
0,0,5744,9,1,3,107,10,4,1
1,1,119901,9,1,3,466,10,4,1
2,2,72287,21,1,19,128,10,4,1
3,3,78477,15,1,13,111,10,4,1
4,4,123080,12,1,13,328,10,4,1


In [13]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

38

In [14]:
# Largest value representable by each unsigned integer width.
uint8_max, uint16_max, uint32_max = (
    np.iinfo(int_dtype).max for int_dtype in (np.uint8, np.uint16, np.uint32)
)


def choose_int_type(n):
    """Return the name of the narrowest unsigned integer dtype that can hold n.

    Candidates are tried from 'uint8' up to 'uint32'; any value that does not
    compare <= the uint32 maximum yields 'uint64'.
    """
    candidates = (
        ('uint8', uint8_max),
        ('uint16', uint16_max),
        ('uint32', uint32_max),
    )
    for dtype_name, upper_bound in candidates:
        if n <= upper_bound:
            return dtype_name
    return 'uint64'

In [15]:
def process(dat, itercols, combo_max=None):
    """Append group-size ("count") features to ``dat``.

    For each time key in ``('hour', 'in_test_hh')`` and for every combination
    of 1..``combo_max`` columns drawn from ``itercols``, the number of rows
    sharing the same ``('day', <time key>, *combination)`` values is computed
    and merged back as a new column named ``n_day_<time key>_<col>[_<col>...]``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns 'day', 'hour', 'in_test_hh' and every column
        named in ``itercols``.
    itercols : sequence of str
        Columns to combine into group keys.
    combo_max : int, optional
        Largest combination size to generate. Defaults to ``len(itercols)``,
        i.e. all non-empty combinations.

    Returns
    -------
    pandas.DataFrame
        The result of the successive left merges. The frame passed in is not
        modified; if no combination is generated it is returned as is.
    """
    if combo_max is None:
        combo_max = len(itercols)
    combos = [
        combo
        for size in range(1, combo_max + 1)
        for combo in combinations(itercols, size)
    ]
    for hour in ['hour', 'in_test_hh']:
        for combo in combos:
            print('Process {hour}, {combo}'.format(hour=hour, combo=combo))
            feats = ['day', hour] + list(combo)
            count_col = '_'.join(['n'] + feats)
            # size() depends only on the group keys, so group the frame
            # directly instead of first copying a dat[feats] sub-frame.
            gp = dat.groupby(feats).size()
            # Store the counts in the narrowest unsigned dtype that fits.
            gp = gp.astype(choose_int_type(gp.max()))
            # Name the count column explicitly rather than relying on the
            # implicit ``0`` label of an unnamed Series.
            gp = gp.reset_index(name=count_col)
            dat = dat.merge(gp, on=feats, how='left')
            # Each merge builds a new frame; release the temporary promptly.
            del gp
            gc.collect()
    return dat

In [16]:
# Force a garbage-collection pass before the memory-heavy feature step;
# the cell output is gc.collect()'s return value.
gc.collect()

0

In [17]:
%%time
# Build count features for every single column and every pair (combo_max=2)
# of the five listed columns. `test` is rebound to the returned frame, so
# re-running this cell feeds the already-augmented frame back into process().
test = process(test, ['ip', 'app', 'device', 'os', 'channel'], combo_max=2)

Process hour, ('ip',)
Process hour, ('app',)
Process hour, ('device',)
Process hour, ('os',)
Process hour, ('channel',)
Process hour, ('ip', 'app')
Process hour, ('ip', 'device')
Process hour, ('ip', 'os')
Process hour, ('ip', 'channel')
Process hour, ('app', 'device')
Process hour, ('app', 'os')
Process hour, ('app', 'channel')
Process hour, ('device', 'os')
Process hour, ('device', 'channel')
Process hour, ('os', 'channel')
Process in_test_hh, ('ip',)
Process in_test_hh, ('app',)
Process in_test_hh, ('device',)
Process in_test_hh, ('os',)
Process in_test_hh, ('channel',)
Process in_test_hh, ('ip', 'app')
Process in_test_hh, ('ip', 'device')
Process in_test_hh, ('ip', 'os')
Process in_test_hh, ('ip', 'channel')
Process in_test_hh, ('app', 'device')
Process in_test_hh, ('app', 'os')
Process in_test_hh, ('app', 'channel')
Process in_test_hh, ('device', 'os')
Process in_test_hh, ('device', 'channel')
Process in_test_hh, ('os', 'channel')
CPU times: user 7min 24s, sys: 4min 41s, total: 12

In [18]:
# Preview the first rows to check the new n_day_* count columns.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,day,hour,in_test_hh,n_day_hour_ip,...,n_day_in_test_hh_ip_app,n_day_in_test_hh_ip_device,n_day_in_test_hh_ip_os,n_day_in_test_hh_ip_channel,n_day_in_test_hh_app_device,n_day_in_test_hh_app_os,n_day_in_test_hh_app_channel,n_day_in_test_hh_device_os,n_day_in_test_hh_device_channel,n_day_in_test_hh_os_channel
0,0,5744,9,1,3,107,10,4,1,34,...,28,91,3,4,2605722,45011,269100,274004,1182849,29931
1,1,119901,9,1,3,466,10,4,1,403,...,289,2069,43,38,2605722,45011,372385,274004,396504,6854
2,2,72287,21,1,19,128,10,4,1,229,...,312,2092,197,158,615512,173657,219945,4213080,486787,134873
3,3,78477,15,1,13,111,10,4,1,239,...,42,1190,328,2,1048609,230437,67035,3759433,66694,14169
4,4,123080,12,1,13,328,10,4,1,60,...,24,203,49,2,2139490,524679,191979,3759433,190644,43693


In [19]:
# Force a garbage-collection pass after the merges in process();
# the cell output is gc.collect()'s return value.
gc.collect()

119

In [20]:
# 'ip' and 'day' were only needed as group keys for the count features;
# remove them in place so they are not passed to the model.
test.drop(columns=['ip', 'day'], inplace=True)

# LightGBM prediction

In [21]:
# List the remaining columns (the id column plus the model inputs).
test.columns

Index(['click_id', 'app', 'device', 'os', 'channel', 'hour', 'in_test_hh',
       'n_day_hour_ip', 'n_day_hour_app', 'n_day_hour_device', 'n_day_hour_os',
       'n_day_hour_channel', 'n_day_hour_ip_app', 'n_day_hour_ip_device',
       'n_day_hour_ip_os', 'n_day_hour_ip_channel', 'n_day_hour_app_device',
       'n_day_hour_app_os', 'n_day_hour_app_channel', 'n_day_hour_device_os',
       'n_day_hour_device_channel', 'n_day_hour_os_channel',
       'n_day_in_test_hh_ip', 'n_day_in_test_hh_app',
       'n_day_in_test_hh_device', 'n_day_in_test_hh_os',
       'n_day_in_test_hh_channel', 'n_day_in_test_hh_ip_app',
       'n_day_in_test_hh_ip_device', 'n_day_in_test_hh_ip_os',
       'n_day_in_test_hh_ip_channel', 'n_day_in_test_hh_app_device',
       'n_day_in_test_hh_app_os', 'n_day_in_test_hh_app_channel',
       'n_day_in_test_hh_device_os', 'n_day_in_test_hh_device_channel',
       'n_day_in_test_hh_os_channel'],
      dtype='object')

In [22]:
# Start from every column of the frame, as a plain Python list.
predictors = list(test.columns)

In [23]:
# 'click_id' is the row identifier, not a model input. Filtering (instead of
# list.remove) keeps this cell safe to re-run: list.remove raises ValueError
# once the element is already gone.
predictors = [col for col in predictors if col != 'click_id']

In [24]:
# Display the final list of feature columns passed to the model.
predictors

['app',
 'device',
 'os',
 'channel',
 'hour',
 'in_test_hh',
 'n_day_hour_ip',
 'n_day_hour_app',
 'n_day_hour_device',
 'n_day_hour_os',
 'n_day_hour_channel',
 'n_day_hour_ip_app',
 'n_day_hour_ip_device',
 'n_day_hour_ip_os',
 'n_day_hour_ip_channel',
 'n_day_hour_app_device',
 'n_day_hour_app_os',
 'n_day_hour_app_channel',
 'n_day_hour_device_os',
 'n_day_hour_device_channel',
 'n_day_hour_os_channel',
 'n_day_in_test_hh_ip',
 'n_day_in_test_hh_app',
 'n_day_in_test_hh_device',
 'n_day_in_test_hh_os',
 'n_day_in_test_hh_channel',
 'n_day_in_test_hh_ip_app',
 'n_day_in_test_hh_ip_device',
 'n_day_in_test_hh_ip_os',
 'n_day_in_test_hh_ip_channel',
 'n_day_in_test_hh_app_device',
 'n_day_in_test_hh_app_os',
 'n_day_in_test_hh_app_channel',
 'n_day_in_test_hh_device_os',
 'n_day_in_test_hh_device_channel',
 'n_day_in_test_hh_os_channel']

In [25]:
# Force a garbage-collection pass before loading the model;
# the cell output is gc.collect()'s return value.
gc.collect()

0

In [26]:
import pickle

# Load the previously saved model object; it is used for .predict() and
# .best_iteration in the prediction cell.
# NOTE(review): presumably a trained LightGBM model — confirm against the
# notebook that wrote this file.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load pickles that come from a trusted source.
with open('../results/kaggle_popular9.pickle', 'rb') as f:
    lgb_model = pickle.load(f)

In [28]:
%%time
pred = lgb_model.predict(test.loc[:, predictors], num_iteration=lgb_model.best_iteration)

CPU times: user 11min 53s, sys: 6.48 s, total: 12min
Wall time: 3min 5s


In [29]:
# Pair each click_id with its predicted score.
submit = pd.DataFrame({'click_id': test['click_id'], 'is_attributed': pred})

# Write the submission file without the index, with 9 decimal places.
submit.to_csv(
    '../results/kaggle_popular_day9_single_combo2.csv',
    float_format='%.9f',
    index=False,
)