In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
from itertools import combinations

# Prepare data

In [2]:
# HDF5 store that the next cell reads with key='test'.
hdf_path = '../derived_data/source.hdf'

In [3]:
%%time
# Load the 'test' table from the HDF5 store; the preview in the next cell shows
# the columns click_id, ip, app, device, os, channel and click_time.
test = pd.read_hdf(hdf_path, key='test')

CPU times: user 348 ms, sys: 1.05 s, total: 1.4 s
Wall time: 5.79 s


In [4]:
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [5]:
# Index by click_time: rolling_count() applies its rolling window to the index,
# and process() merges the counts back on a 'click_time' column.
# This is done in place, so re-running the cell without reloading `test` raises
# KeyError because click_time is no longer a column.
test.set_index('click_time', inplace=True)

In [6]:
gc.collect()

0

In [7]:
# Largest value representable by each unsigned integer width.
uint8_max = np.iinfo(np.uint8).max
uint16_max = np.iinfo(np.uint16).max
uint32_max = np.iinfo(np.uint32).max

def choose_int_type(n):
    """Return the name of the narrowest unsigned integer dtype that can hold ``n``.

    The candidates are tried from smallest to largest and the first one whose
    maximum is >= ``n`` wins. Anything that fits none of the first three limits
    falls through to ``'uint64'``. No lower bound is checked, so a negative
    ``n`` yields ``'uint8'``.

    Parameters
    ----------
    n : number
        Largest value that has to be stored.

    Returns
    -------
    str
        One of ``'uint8'``, ``'uint16'``, ``'uint32'``, ``'uint64'``.
    """
    candidates = (
        (uint8_max, 'uint8'),
        (uint16_max, 'uint16'),
        (uint32_max, 'uint32'),
    )
    for limit, dtype_name in candidates:
        if n <= limit:
            return dtype_name
    return 'uint64'

In [8]:
def rolling_count(df, time_win='1h'):
    """Count rows per index value, then sum those counts over a rolling window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level is grouped on. ``time_win`` is handed
        straight to ``Series.rolling``, so an offset such as ``'1h'`` needs a
        datetime-like index.
    time_win : str, default '1h'
        Window specification passed to ``rolling``.

    Returns
    -------
    pandas.Series
        Rolling sum of the per-index-value row counts, indexed by the distinct
        index values in order of first appearance (``sort=False``).
    """
    rows_per_stamp = df.groupby(level=0, sort=False).size()
    windowed = rows_per_stamp.rolling(time_win)
    return windowed.sum()

In [9]:
gc.collect()

0

In [10]:
def process(df, features, time_win='1h'):
    """Append a rolling row-count column for each requested feature grouping.

    For every entry of ``features`` the rows of ``df`` are grouped by that
    column (or list of columns), ``rolling_count`` is applied to each group,
    and the result is left-merged back as a new column named
    ``'n_' + '_'.join(columns)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The merge key is hard-coded as ``'click_time'``, so the
        index is expected to carry that name.
    features : iterable
        Each item is a column name (str) or a list of column names. Items of
        any other type are reported and skipped.
    time_win : str, default '1h'
        Window forwarded to ``rolling_count``.

    Returns
    -------
    pandas.DataFrame
        ``df.reset_index()`` plus one count column per processed feature.
    """
    merged = df.reset_index()
    for feature in features:
        print('Processing {feature}'.format(feature=feature))
        if isinstance(feature, str):
            group_cols = [feature]
        elif isinstance(feature, list):
            group_cols = feature
        else:
            print('Skip invalid feature!')
            continue

        counts = df.groupby(group_cols, sort=False).apply(rolling_count, time_win=time_win)
        # Shrink to the narrowest unsigned dtype that fits the largest count.
        counts = counts.astype(choose_int_type(counts.max()))
        counts.name = 'n_' + '_'.join(group_cols)

        merged = merged.merge(
            counts.reset_index(),
            on=['click_time'] + group_cols,
            how='left',
        )

        # Release the intermediate before the next feature is processed.
        del counts
        gc.collect()
    return merged

In [11]:
%%time
# Add rolling counts (default 1-hour window) per ip, app and channel. The result
# gains the columns n_ip, n_app and n_channel, and click_time becomes a regular
# column again because process() starts from df.reset_index().
test = process(test, ['ip', 'app', 'channel'])

Processing ip
Processing app
Processing channel
CPU times: user 1min 51s, sys: 7.08 s, total: 1min 58s
Wall time: 1min 41s


In [12]:
gc.collect()

55

In [13]:
test.head()

Unnamed: 0,click_time,click_id,ip,app,device,os,channel,n_ip,n_app,n_channel
0,2017-11-10 04:00:00,0,5744,9,1,3,107,1,49,31
1,2017-11-10 04:00:00,1,119901,9,1,3,466,1,49,7
2,2017-11-10 04:00:00,2,72287,21,1,19,128,1,10,6
3,2017-11-10 04:00:00,3,78477,15,1,13,111,1,30,1
4,2017-11-10 04:00:00,4,123080,12,1,13,328,1,40,3


In [14]:
gc.collect()

0

In [15]:
# ip and click_time are not among the predictor columns listed further down, so
# they are dropped before the table is cached. This is done in place:
# re-running the cell raises KeyError once the columns are gone.
test.drop(['ip', 'click_time'], axis=1, inplace=True)

In [16]:
gc.collect()

0

In [17]:
# Cache the feature table under key 'test'; mode='w' replaces any existing file
# at this path.
test.to_hdf('../derived_data/rolling_window.hdf', key='test', mode='w')

In [2]:
# Resume point: reload the cached feature table so the prediction cells can run
# without repeating the feature computation.
# NOTE(review): the out-of-order execution count suggests this cell was run after
# a kernel restart, not as part of a single top-to-bottom run; confirm.
test = pd.read_hdf('../derived_data/rolling_window.hdf', key='test')

# LightGBM

In [18]:
test.columns

Index(['click_id', 'app', 'device', 'os', 'channel', 'n_ip', 'n_app',
       'n_channel'],
      dtype='object')

In [20]:
res_id = 'click_id'

In [21]:
predictors = test.columns.tolist()

In [22]:
predictors.remove(res_id)

In [23]:
predictors

['app', 'device', 'os', 'channel', 'n_ip', 'n_app', 'n_channel']

In [24]:
# NOTE(review): `categorical` is not referenced by any later cell shown here.
# Presumably it mirrors the categorical feature list used when the model was
# trained; confirm or remove.
categorical = ['app', 'device', 'os', 'channel']

In [25]:
gc.collect()

0

In [27]:
import pickle

# Load the saved model object used for prediction below.
# pickle can execute arbitrary code while loading, so this file must come from
# a trusted source.
with open('../results/rolling_window.pickle', 'rb') as model_file:
    lgb_model = pickle.load(model_file)

In [28]:
# Score every test row using only the predictor columns, limited to the
# iteration count stored on the model as best_iteration.
pred = lgb_model.predict(
    test[predictors],
    num_iteration=lgb_model.best_iteration,
)

In [29]:
# Pair each click_id with its model output and write the two-column result
# without the index, formatting floats to 10 decimal places.
submit = test[['click_id']].assign(is_attributed=pred)

submit.to_csv(
    '../results/rolling_window.csv',
    index=False,
    float_format='%.10f',
)

In [30]:
submit.head()

Unnamed: 0,click_id,is_attributed
0,0,0.254864
1,1,0.537364
2,2,0.179967
3,3,0.352443
4,4,0.028241
