In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc

In [23]:
%time train = pd.read_feather('data/train.feather')

CPU times: user 1.5 s, sys: 1.18 s, total: 2.68 s
Wall time: 2.68 s


In [24]:
%time test = pd.read_feather('data/test.feather')

CPU times: user 124 ms, sys: 92 ms, total: 216 ms
Wall time: 216 ms


In [25]:
len_train = len(train)
print('The initial size of the train set is', len_train)
print('Binding the training and test set together...')
train=train.append(test)

The initial size of the train set is 184903890
Binding the training and test set together...


In [26]:
del test
gc.collect()

3342

In [27]:
print("Creating new time features: 'hour' and 'day'...")
train['hour'] = pd.to_datetime(train.click_time).dt.hour.astype('uint8')
train['day'] = pd.to_datetime(train.click_time).dt.day.astype('uint8')

Creating new time features: 'hour' and 'day'...


In [28]:
gc.collect()

14

In [29]:
print("Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...")

print('Computing the number of channels associated with ')
print('a given IP address within each hour...')
n_chans = train[['ip','day','hour','channel']].groupby(by=['ip','day',
          'hour'])[['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})

print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','day','hour'], how='left')
del n_chans
gc.collect()

print('Computing the number of channels associated with ')
print('a given IP address and app...')
n_chans = train[['ip','app', 'channel']].groupby(by=['ip', 
          'app'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
          
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','app'], how='left')
del n_chans
gc.collect()

print('Computing the number of channels associated with ')
print('a given IP address, app, and os...')
n_chans = train[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 
          'os'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
          
print('Merging the channels data with the main data set...')       
train = train.merge(n_chans, on=['ip','app', 'os'], how='left')
del n_chans
gc.collect()

Creating new count features: 'n_channels', 'ip_app_count', 'ip_app_os_count'...
Computing the number of channels associated with 
a given IP address within each hour...
Merging the channels data with the main data set...
Computing the number of channels associated with 
a given IP address and app...
Merging the channels data with the main data set...
Computing the number of channels associated with 
a given IP address, app, and os...
Merging the channels data with the main data set...


117

In [30]:
print("Adjusting the data types of the new count features... ")
train.info()
train['n_channels'] = train['n_channels'].astype('uint16')
train['ip_app_count'] = train['ip_app_count'].astype('uint16')
train['ip_app_os_count'] = train['ip_app_os_count'].astype('uint16')

Adjusting the data types of the new count features... 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 203694359 entries, 0 to 203694358
Data columns (total 14 columns):
app                uint16
attributed_time    datetime64[ns]
channel            uint16
click_id           float64
click_time         datetime64[ns]
device             uint16
ip                 uint32
is_attributed      float64
os                 uint16
hour               uint8
day                uint8
n_channels         int64
ip_app_count       int64
ip_app_os_count    int64
dtypes: datetime64[ns](2), float64(2), int64(3), uint16(4), uint32(1), uint8(2)
memory usage: 14.8 GB


In [31]:
test = train[len_train:]
print('The size of the test set is ', len(test))

r = 0.1 # the fraction of the train data to be used for validation
val = train[(len_train-round(r*len_train)):len_train]
print('The size of the validation set is ', len(val))

train = train[:(len_train-round(r*len_train))]
print('The size of the train set is ', len(train))

target = 'is_attributed'
train[target] = train[target].astype('uint8')
train.info()

predictors = ['ip', 'device', 'app', 'os', 'channel', 'hour', 'n_channels', 'ip_app_count', 'ip_app_os_count']
categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour']
gc.collect()

The size of the test set is  18790469
The size of the validation set is  18490389
The size of the train set is  166413501
<class 'pandas.core.frame.DataFrame'>
Int64Index: 166413501 entries, 0 to 166413500
Data columns (total 14 columns):
app                uint16
attributed_time    datetime64[ns]
channel            uint16
click_id           float64
click_time         datetime64[ns]
device             uint16
ip                 uint32
is_attributed      uint8
os                 uint16
hour               uint8
day                uint8
n_channels         uint16
ip_app_count       uint16
ip_app_os_count    uint16
dtypes: datetime64[ns](2), float64(1), uint16(7), uint32(1), uint8(3)
memory usage: 8.2 GB


17

In [17]:
print("Preparing the datasets for training...")

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 255,  
    'max_depth': 8,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
   # 'nthread': 8,
    'verbose': 0,
    'scale_pos_weight':99 
    }
    
dtrain = lgb.Dataset(train[predictors].values, label=train[target].values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
dvalid = lgb.Dataset(val[predictors].values, label=val[target].values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )

Preparing the datasets for training...


In [18]:
evals_results = {}

print("Training the model...")

lgb_model = lgb.train(params, 
                 dtrain, 
                 valid_sets=[dtrain, dvalid], 
                 valid_names=['train','valid'], 
                 evals_result=evals_results, 
                 num_boost_round=350,
                 early_stopping_rounds=30,
                 verbose_eval=True, 
                 feval=None)

del train
del val
gc.collect()


Training the model...




[1]	train's auc: 0.941001	valid's auc: 0.94027
Training until validation scores don't improve for 30 rounds.
[2]	train's auc: 0.960152	valid's auc: 0.957416
[3]	train's auc: 0.966549	valid's auc: 0.9642
[4]	train's auc: 0.968111	valid's auc: 0.966882
[5]	train's auc: 0.969138	valid's auc: 0.968428
[6]	train's auc: 0.969609	valid's auc: 0.968831
[7]	train's auc: 0.970116	valid's auc: 0.969378
[8]	train's auc: 0.970055	valid's auc: 0.969044
[9]	train's auc: 0.970572	valid's auc: 0.969754
[10]	train's auc: 0.970964	valid's auc: 0.970297
[11]	train's auc: 0.97128	valid's auc: 0.97062
[12]	train's auc: 0.971518	valid's auc: 0.970929
[13]	train's auc: 0.97177	valid's auc: 0.971008
[14]	train's auc: 0.971549	valid's auc: 0.970554
[15]	train's auc: 0.971605	valid's auc: 0.970617
[16]	train's auc: 0.971847	valid's auc: 0.970821
[17]	train's auc: 0.971897	valid's auc: 0.970894
[18]	train's auc: 0.972192	valid's auc: 0.971095
[19]	train's auc: 0.972424	valid's auc: 0.971447
[20]	train's auc: 0.97

[167]	train's auc: 0.985981	valid's auc: 0.976832
[168]	train's auc: 0.98599	valid's auc: 0.976835
[169]	train's auc: 0.985994	valid's auc: 0.976837
[170]	train's auc: 0.986	valid's auc: 0.976844
[171]	train's auc: 0.986004	valid's auc: 0.976843
[172]	train's auc: 0.986013	valid's auc: 0.976842
[173]	train's auc: 0.986016	valid's auc: 0.97684
[174]	train's auc: 0.98602	valid's auc: 0.976842
[175]	train's auc: 0.986032	valid's auc: 0.976839
[176]	train's auc: 0.986072	valid's auc: 0.97685
[177]	train's auc: 0.986079	valid's auc: 0.976849
[178]	train's auc: 0.986084	valid's auc: 0.976846
[179]	train's auc: 0.986087	valid's auc: 0.976848
[180]	train's auc: 0.98609	valid's auc: 0.976847
[181]	train's auc: 0.986095	valid's auc: 0.976845
[182]	train's auc: 0.986097	valid's auc: 0.976847
[183]	train's auc: 0.986105	valid's auc: 0.976845
[184]	train's auc: 0.986109	valid's auc: 0.976847
[185]	train's auc: 0.986111	valid's auc: 0.976842
[186]	train's auc: 0.986148	valid's auc: 0.976831
[187]	tr

[332]	train's auc: 0.987752	valid's auc: 0.977143
[333]	train's auc: 0.98778	valid's auc: 0.977159
[334]	train's auc: 0.987782	valid's auc: 0.977162
[335]	train's auc: 0.987785	valid's auc: 0.977164
[336]	train's auc: 0.987787	valid's auc: 0.977163
[337]	train's auc: 0.98779	valid's auc: 0.977163
[338]	train's auc: 0.987803	valid's auc: 0.977157
[339]	train's auc: 0.987808	valid's auc: 0.977155
[340]	train's auc: 0.98781	valid's auc: 0.977152
[341]	train's auc: 0.987834	valid's auc: 0.977151
[342]	train's auc: 0.987845	valid's auc: 0.97716
[343]	train's auc: 0.987847	valid's auc: 0.977157
[344]	train's auc: 0.987853	valid's auc: 0.977155
[345]	train's auc: 0.987877	valid's auc: 0.97716
[346]	train's auc: 0.987894	valid's auc: 0.977159
[347]	train's auc: 0.987898	valid's auc: 0.977158
[348]	train's auc: 0.987923	valid's auc: 0.977148
[349]	train's auc: 0.987936	valid's auc: 0.977144
[350]	train's auc: 0.987962	valid's auc: 0.977139
Did not meet early stopping. Best iteration is:
[350]	t

229

In [33]:
test.head()

Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os,hour,day,n_channels,ip_app_count,ip_app_os_count
184903890,9,NaT,107,0.0,2017-11-10 04:00:00,1,5744,,3,4,10,34,113,3
184903891,9,NaT,466,1.0,2017-11-10 04:00:00,1,119901,,3,4,10,403,2302,27
184903892,21,NaT,128,2.0,2017-11-10 04:00:00,1,72287,,19,4,10,229,701,121
184903893,15,NaT,111,3.0,2017-11-10 04:00:00,1,78477,,13,4,10,239,827,261
184903894,12,NaT,328,4.0,2017-11-10 04:00:00,1,123080,,13,4,10,60,205,38


In [34]:
# Feature names:
print('Feature names:', lgb_model.feature_name())

# Feature importances:
print('Feature importances:', list(lgb_model.feature_importance()))

print("Preparing data for submission...")

submit = pd.read_csv('data/test.csv', dtype='int', usecols=['click_id'])

print("Predicting the submission data...")

submit['is_attributed'] = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)

print("Writing the submission data into a csv file...")

submit.to_csv('output/submission.csv', index=False)

print("All done...")

Feature names: ['ip', 'device', 'app', 'os', 'channel', 'hour', 'n_channels', 'ip_app_count', 'ip_app_os_count']
Feature importances: [3625, 527, 5433, 6333, 6751, 5952, 12066, 13853, 9855]
Preparing data for submission...
Predicting the submission data...
Writing the submission data into a csv file...
All done...
