In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
from itertools import combinations

# Prepare data

In [2]:
# Relative path of the HDF5 file from which the 'test' table is read below.
hdf_path = '../derived_data/source.hdf'

In [3]:
%%time
# Load the table stored under key 'test' from the HDF5 file at hdf_path.
test = pd.read_hdf(hdf_path, key='test')

CPU times: user 307 ms, sys: 999 ms, total: 1.31 s
Wall time: 6.24 s


In [4]:
# Preview the first rows of the freshly loaded frame.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [5]:
# Index by click_time: rolling_count() groups on index level 0 and process()
# merges its results back on the 'click_time' column.
test.set_index('click_time', inplace=True)

In [6]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

0

In [7]:
# Upper bounds of the unsigned integer dtypes considered by choose_int_type().
uint8_max = np.iinfo(np.uint8).max
uint16_max = np.iinfo(np.uint16).max
uint32_max = np.iinfo(np.uint32).max


def choose_int_type(n):
    """Return the name of the narrowest NumPy integer dtype able to hold ``n``.

    Parameters
    ----------
    n : int or float
        The value that must fit (callers pass the maximum of a column).

    Returns
    -------
    str
        'uint8', 'uint16', 'uint32' or 'uint64' for non-negative ``n``;
        'int8', 'int16', 'int32' or 'int64' for negative ``n``.
    """
    if n < 0:
        # An unsigned dtype would silently wrap a negative value on astype(),
        # so pick the narrowest signed dtype instead.
        for dtype_name in ('int8', 'int16', 'int32'):
            if n >= np.iinfo(dtype_name).min:
                return dtype_name
        return 'int64'

    unsigned_limits = (
        ('uint8', uint8_max),
        ('uint16', uint16_max),
        ('uint32', uint32_max),
    )
    for dtype_name, upper in unsigned_limits:
        if n <= upper:
            return dtype_name
    # Anything larger (and NaN, which fails every comparison) falls through here.
    return 'uint64'

In [8]:
def rolling_count(df, time_win='1h'):
    """Count rows per index value, then apply a rolling sum over ``time_win``.

    ``df`` is grouped on its first index level (groups keep their order of
    first appearance), the size of every group is taken, and those sizes are
    summed with a rolling window of width ``time_win``.

    Returns a Series indexed by the distinct values of index level 0.
    """
    rows_per_stamp = df.groupby(level=0, sort=False).size()
    window = rows_per_stamp.rolling(time_win)
    return window.sum()

In [9]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

0

In [10]:
def process(df, features, time_win='1h'):
    """Append rolling-window row counts for each feature or feature combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is named ``click_time`` (the merge below joins on
        that name) and whose columns include every requested feature.
    features : iterable
        Each entry is a column name (``str``) or a ``list``/``tuple`` of column
        names. Tuples are accepted so the output of ``itertools.combinations``
        can be passed directly. Any other entry is reported and skipped.
    time_win : str
        Window width forwarded to ``rolling_count``.

    Returns
    -------
    pandas.DataFrame
        ``df.reset_index()`` plus one ``n_<col>[_<col>...]`` column per valid
        entry, holding the rolling count cast to the narrowest integer dtype
        that fits the column maximum.
    """
    res = df.reset_index()
    for feature in features:
        print('Processing {feature}'.format(feature=feature))
        if isinstance(feature, str):
            feature = [feature]
        elif isinstance(feature, (list, tuple)):
            # Normalise to a fresh list: groupby and the `on=` concatenation
            # below both expect a list of column names.
            feature = list(feature)
        else:
            print('Skip invalid feature!')
            continue
        # Per feature group: number of rows per click_time, rolled over time_win.
        tmp = df.groupby(feature, sort=False).apply(rolling_count, time_win=time_win)
        # Rolling sums come back as floats; shrink to the smallest fitting int dtype.
        tmp = tmp.astype(choose_int_type(tmp.max()))
        tmp.name = 'n_' + '_'.join(feature)
        tmp = tmp.reset_index()
        res = res.merge(tmp, on=['click_time']+feature, how='left')
        # Release the intermediate before the next (potentially large) group-by.
        del tmp
        gc.collect()
    return res

In [13]:
# Base columns to count over.
features = ['ip', 'app', 'device', 'os', 'channel']
# Every single column first, then every unordered pair, each as a list of names.
feature_combos = [
    list(combo)
    for size in (1, 2)
    for combo in combinations(features, size)
]

In [14]:
%%time
# Add 30-minute rolling count columns for every entry of feature_combos.
# NOTE: the recorded run of this cell took about 1h 20min of wall time.
test = process(test, feature_combos, time_win='30min')

Processing ['ip']
Processing ['app']
Processing ['device']
Processing ['os']
Processing ['channel']
Processing ['ip', 'app']
Processing ['ip', 'device']
Processing ['ip', 'os']
Processing ['ip', 'channel']
Processing ['app', 'device']
Processing ['app', 'os']
Processing ['app', 'channel']
Processing ['device', 'os']
Processing ['device', 'channel']
Processing ['os', 'channel']
CPU times: user 1h 21min 5s, sys: 1min 11s, total: 1h 22min 17s
Wall time: 1h 20min 28s


In [15]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

73

In [16]:
# Inspect the frame returned by process(): the original columns plus the new n_* counts.
test.head()

Unnamed: 0,click_time,click_id,ip,app,device,os,channel,n_ip,n_app,n_device,...,n_ip_app,n_ip_device,n_ip_os,n_ip_channel,n_app_device,n_app_os,n_app_channel,n_device_os,n_device_channel,n_os_channel
0,2017-11-10 04:00:00,0,5744,9,1,3,107,1,49,408,...,1,1,1,1,45,3,7,4,30,1
1,2017-11-10 04:00:00,1,119901,9,1,3,466,1,49,408,...,1,1,1,1,45,3,5,4,7,1
2,2017-11-10 04:00:00,2,72287,21,1,19,128,1,10,408,...,1,1,1,1,10,3,3,100,6,2
3,2017-11-10 04:00:00,3,78477,15,1,13,111,1,30,408,...,1,1,1,1,30,10,1,73,1,1
4,2017-11-10 04:00:00,4,123080,12,1,13,328,1,40,408,...,1,1,1,1,39,5,3,73,3,1


In [17]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

72

In [18]:
# Drop ip and click_time in place; the remaining columns are what gets saved
# and later used to build the predictor list.
test.drop(['ip', 'click_time'], axis=1, inplace=True)

In [19]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

0

In [20]:
# Persist the engineered test features under key 'test'.
# mode='a' opens an existing file for read/write and creates it when missing;
# the previous mode='r+' raised if the file did not exist yet, so a run on a
# clean checkout failed at this cell.
test.to_hdf('../derived_data/rolling_window.hdf', key='test', mode='a')

In [2]:
# Reload the engineered test features from rolling_window.hdf (key 'test').
# NOTE(review): this cell's execution count suggests it was run in a fresh
# kernel; it needs `pd` from the import cell — confirm the intended run order.
test = pd.read_hdf('../derived_data/rolling_window.hdf', key='test')

# LightGBM prediction

In [23]:
# List the columns available in the reloaded frame.
test.columns

Index(['click_id', 'app', 'device', 'os', 'channel', 'n_ip', 'n_app',
       'n_device', 'n_os', 'n_channel', 'n_ip_app', 'n_ip_device', 'n_ip_os',
       'n_ip_channel', 'n_app_device', 'n_app_os', 'n_app_channel',
       'n_device_os', 'n_device_channel', 'n_os_channel'],
      dtype='object')

In [24]:
# Name of the row-identifier column; it is removed from the predictor list below.
res_id = 'click_id'

In [25]:
# Start from every column of the frame; the identifier column is removed afterwards.
predictors = test.columns.tolist()

In [26]:
# Exclude the identifier column from the model inputs.
# Filtering (instead of list.remove) keeps this cell idempotent: re-running it
# no longer raises ValueError once res_id has already been removed.
predictors = [col for col in predictors if col != res_id]

In [27]:
# Display the final list of model input columns.
predictors

['app',
 'device',
 'os',
 'channel',
 'n_ip',
 'n_app',
 'n_device',
 'n_os',
 'n_channel',
 'n_ip_app',
 'n_ip_device',
 'n_ip_os',
 'n_ip_channel',
 'n_app_device',
 'n_app_os',
 'n_app_channel',
 'n_device_os',
 'n_device_channel',
 'n_os_channel']

In [28]:
# Columns holding categorical codes.
# NOTE(review): not referenced by any later cell shown here — presumably kept
# to mirror the training setup; confirm it is still needed.
categorical = ['app', 'device', 'os', 'channel']

In [29]:
# Explicit garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

0

In [30]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import pickle

# Load the pickled model object used for prediction below.
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# files produced by a trusted source.
with open('../results/rolling_window_combo2_day8.pickle', 'rb') as f:
    lgb_model = pickle.load(f)

In [31]:
# Score every test row on the predictor columns, limited to the model's recorded best_iteration.
pred = lgb_model.predict(test.loc[:, predictors], num_iteration=lgb_model.best_iteration)

In [29]:
# Pair each click_id with its predicted value.
submit = pd.DataFrame({'click_id': test.click_id, 'is_attributed': pred})

# Write the submission CSV without the index, floats formatted to 10 decimal places.
submit.to_csv('../results/rolling_window.csv', index=False, float_format='%.10f')

In [30]:
# Preview the first rows of the submission frame.
submit.head()

Unnamed: 0,click_id,is_attributed
0,0,0.254864
1,1,0.537364
2,2,0.179967
3,3,0.352443
4,4,0.028241
