In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  
from skopt import BayesSearchCV
import gc
import os
import psutil
import time



In [2]:
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

In [3]:
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [4]:
#68941878(2017-11-08 00:00:00) [68941878:131886952]
#131886953(2017-11-09 00:00:00)[131886953:]
#62945076   2017-11-08 23:59:59
#62945077   2017-11-09 00:00:00

In [5]:
skip = range(1,131886952)
print("Loading Data")
train = pd.read_csv('train.csv', skiprows=skip, dtype=dtypes,
        header=0,usecols=train_cols,parse_dates=["click_time"])#.sample(1000)
test = pd.read_csv('test.csv', dtype=dtypes, header=0,
        usecols=test_cols,parse_dates=["click_time"])#.sample(1000)

Loading Data


In [6]:
len_train = len(train)
print('The initial size of the train set is', len_train)
print('Binding the training and test set together...')
train=train.append(test)
del test
gc.collect()

('The initial size of the train set is', 53016939)
Binding the training and test set together...


14

In [7]:
print("Creating new time features: 'hour' and 'day'...")
train['hour'] = train["click_time"].dt.hour.astype('uint8')
train['day'] = train["click_time"].dt.day.astype('uint8')

Creating new time features: 'hour' and 'day'...


In [8]:
print("Creating new time features: 'ip_count'...")
n_chans = train[['ip','channel']].groupby(by=['ip'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip'], how='left')
del n_chans
gc.collect()

Creating new time features: 'ip_count'...
Merging the channels data with the main data set...


61

In [9]:
print("Creating new time features: 'app_count'...")
n_chans = train[['app','channel']].groupby(by=['app'])[['channel']].count().reset_index().rename(columns={'channel': 'app_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['app'], how='left')
del n_chans
gc.collect()

Creating new time features: 'app_count'...
Merging the channels data with the main data set...


85

In [10]:
print("Creating new time features: 'user_count'...")
n_chans = train[['app','ip','device','os','channel']].groupby(by=['app','ip','device','os'])[['channel']].count().reset_index().rename(columns={'channel': 'user_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['app','ip','device','os'], how='left')
del n_chans
gc.collect()

Creating new time features: 'user_count'...
Merging the channels data with the main data set...


133

In [11]:
print("Creating new time features: 'os_count'...")
n_chans = train[['ip','device','os','channel']].groupby(by=['ip','device','os'])[['channel']].count().reset_index().rename(columns={'channel': 'os_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','device','os'], how='left')
del n_chans
gc.collect()

Creating new time features: 'os_count'...
Merging the channels data with the main data set...


117

In [12]:
print("Creating new time features: 'channel_count'...")
n_chans = train[['app','channel']].groupby(by=['channel'])[['app']].count().reset_index().rename(columns={'app': 'channel_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['channel'], how='left')
del n_chans
gc.collect()

Creating new time features: 'channel_count'...
Merging the channels data with the main data set...


85

In [13]:
print("Creating new count features: 'n_channels'")
n_chans = train[['ip','day','hour','channel']].groupby(by=['ip','day',
          'hour'])[['channel']].count().reset_index().rename(columns={'channel': 'n_channels'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','day','hour'], how='left')
del n_chans
gc.collect()

Creating new count features: 'n_channels'
Merging the channels data with the main data set...


117

In [14]:
print("Creating new count features: 'ip_day_ch_hour_var'")
n_chans = train[['ip','day','hour','channel']].groupby(by=['ip','day',
          'channel'])[['hour']].var().reset_index().rename(columns={'hour': 'ip_day_ch_hour_var'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','day','channel'], how='left')
del n_chans
gc.collect()

Creating new count features: 'ip_day_ch_hour_var'
Merging the channels data with the main data set...


117

In [15]:
print("Creating new count features: 'ip_app_day_hour_count'")
n_chans = train[['ip','app','day','hour','channel']].groupby(by=['ip','day','hour', 
          'app'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_day_hour_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','day','hour','app'], how='left')
del n_chans
gc.collect()

Creating new count features: 'ip_app_day_hour_count'
Merging the channels data with the main data set...


133

In [16]:
print("Creating new count features: 'ip_app_count'")
n_chans = train[['ip','app', 'channel']].groupby(by=['ip', 
          'app'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_count'})
print('Merging the channels data with the main data set...')
train = train.merge(n_chans, on=['ip','app'], how='left')
del n_chans
gc.collect()

Creating new count features: 'ip_app_count'
Merging the channels data with the main data set...


100

In [17]:
print("Creating new count features: 'ip_app_os_count'")
n_chans = train[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 
          'os'])[['channel']].count().reset_index().rename(columns={'channel': 'ip_app_os_count'})
print('Merging the channels data with the main data set...')       
train = train.merge(n_chans, on=['ip','app', 'os'], how='left')
del n_chans
gc.collect()

Creating new count features: 'ip_app_os_count'
Merging the channels data with the main data set...


116

In [18]:
print("Creating new count features: 'ip_day_channel_var'")
gp = train[['ip','day','hour','channel']].groupby(by=['ip',
        'day','channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_day_channel_var'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','day','channel'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_day_channel_var'
Merging the channels data with the main data set...


116

In [19]:
print("Creating new count features: 'ip_app_os_var'")
gp = train[['ip','app', 'os', 'hour']].groupby(by=['ip', 
    'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','app', 'os'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_app_os_var'
Merging the channels data with the main data set...


115

In [20]:
print("Creating new count features: 'ip_app_channel_var_day'")
gp = train[['ip','app', 'channel', 'day']].groupby(by=['ip', 
    'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','app','channel'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_app_channel_var_day'
Merging the channels data with the main data set...


115

In [21]:
print("Creating new count features: 'ip_app_channel_count_day'")
gp = train[['ip','app', 'channel', 'day']].groupby(by=['ip', 
    'app', 'channel'])[['day']].count().reset_index().rename(index=str, columns={'day': 'ip_app_channel_count_day'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','app','channel'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_app_channel_count_day'
Merging the channels data with the main data set...


115

In [22]:
print("Creating new count features: 'ip_app_channel_mean_hour'")
gp = train[['ip','app', 'channel','hour']].groupby(by=['ip', 
    'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','app', 'channel'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_app_channel_mean_hour'
Merging the channels data with the main data set...


115

In [23]:
print("Creating new count features: 'ip_dev'")
gp = train[['ip', 'device', 'hour', 'channel']].groupby(by=['ip', 'device', 
            'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_dev'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','device','hour'], how='left')
del gp
gc.collect()

Creating new count features: 'ip_dev'
Merging the channels data with the main data set...


115

In [24]:
print('grouping by ip-day-hour combination...')
gp = train[['ip','day','hour','channel']].groupby(by=['ip','day',
                'hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_tcount'})
train = train.merge(gp, on=['ip','day','hour'], how='left')
del gp
gc.collect()

grouping by ip-day-hour combination...


115

In [25]:
print("Creating new count features: 'in_test_hh'")
most_freq_hours_in_test_data = [4, 5, 9, 10, 13, 14]
least_freq_hours_in_test_data = [6, 11, 15]
train['in_test_hh'] = (3 - 2*train['hour'].isin(most_freq_hours_in_test_data) - 1*train['hour'].isin( least_freq_hours_in_test_data ) ).astype('uint8')
gp = train[['ip', 'day', 'in_test_hh', 'channel']].groupby(by=['ip', 'day', 'in_test_hh'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'nip_day_test_hh'})
print('Merging the channels data with the main data set...')   
train = train.merge(gp, on=['ip','day','in_test_hh'], how='left')
train.drop(['in_test_hh'], axis=1, inplace=True)
del gp
gc.collect()

Creating new count features: 'in_test_hh'
Merging the channels data with the main data set...


79

In [26]:
naddfeat=9
for i in range(0,naddfeat):
    if i==0: selcols=['ip', 'channel']; QQ=4;
    if i==1: selcols=['ip', 'device', 'os', 'app']; QQ=5;
    if i==2: selcols=['ip', 'day', 'hour']; QQ=4;
    if i==3: selcols=['ip', 'app']; QQ=4;
    if i==4: selcols=['ip', 'app', 'os']; QQ=4;
    if i==5: selcols=['ip', 'device']; QQ=4;
    if i==6: selcols=['app', 'channel']; QQ=4;
    if i==7: selcols=['ip', 'os']; QQ=5;
    if i==8: selcols=['ip', 'device', 'os', 'app']; QQ=4;
    print('selcols',selcols,'QQ',QQ)
        
    if QQ==4:
        gp = train[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].nunique().reset_index().rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})
        train = train.merge(gp, on=selcols[0:len(selcols)-1], how='left')
    if QQ==5:
        gp = train[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].cumcount() + 1
        train['X'+str(i)]=gp.values
    
    del gp
    gc.collect()

('selcols', ['ip', 'channel'], 'QQ', 4)
('selcols', ['ip', 'device', 'os', 'app'], 'QQ', 5)
('selcols', ['ip', 'day', 'hour'], 'QQ', 4)
('selcols', ['ip', 'app'], 'QQ', 4)
('selcols', ['ip', 'app', 'os'], 'QQ', 4)
('selcols', ['ip', 'device'], 'QQ', 4)
('selcols', ['app', 'channel'], 'QQ', 4)
('selcols', ['ip', 'os'], 'QQ', 5)
('selcols', ['ip', 'device', 'os', 'app'], 'QQ', 4)


In [27]:
# print("Creating new count features: 'UsrappNewness'")
# train['UsrappNewness'] = train.groupby(['ip','app', 'device', 'os']).cumcount() + 1
# gc.collect()

In [28]:
print("Creating new count features: 'UsrNewness'")
train['UsrNewness'] = train.groupby(['ip', 'device', 'os']).cumcount() + 1
gc.collect()

Creating new count features: 'UsrNewness'


14

In [29]:
print("Creating new count features: 'time_delta_user'")
train['time_delta_user'] = train.sort_values(['click_time']).groupby(by=['ip','device', 'os'])[['click_time']].diff().astype('timedelta64[s]')
gc.collect()

# print("Creating new count features: 'time_delta_ip'")
# train['time_delta_ip'] = train.sort_values(['click_time']).groupby(by=['ip'])[['click_time']].diff().astype('timedelta64[s]')
# gc.collect()

print("Creating new count features: 'time_delta_user_app'")
train['time_delta_ip_app'] = train.sort_values(['click_time']).groupby(by=['ip','app'])[['click_time']].diff().astype('timedelta64[s]')
gc.collect()

# print("Creating new count features: 'time_delta_ip_cha'")
# train['time_delta_ip_cha'] = train.sort_values(['click_time']).groupby(by=['ip','channel'])[['click_time']].diff().astype('timedelta64[s]')
# gc.collect()

# print("Creating new count features: 'time_delta_ip_dev'")
# train['time_delta_ip_dev'] = train.sort_values(['click_time']).groupby(by=['ip','device'])[['click_time']].diff().astype('timedelta64[s]')
# gc.collect()

Creating new count features: 'time_delta_user'
Creating new count features: 'time_delta_user_app'


54

In [30]:
print("Creating new count features: 'next_click'")
start = time.time()
train['click_time_s'] = (train['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
train['nextClick'] = (train.groupby(['ip', 'app', 'device', 'os']).click_time_s.shift(-1) - train.click_time_s).astype(np.float32)
gc.collect()

Creating new count features: 'next_click'


21

In [31]:
#Change Here When Change feature
predictors = ['ip', 'device', 'app', 'os', 'channel', 'hour', 'n_channels',
              'ip_count','ip_app_count', 'ip_app_os_count','day',
              'ip_day_channel_var','ip_app_os_var','ip_app_channel_var_day'
              ,'ip_app_channel_mean_hour','ip_dev','nip_day_test_hh',
              'ip_day_ch_hour_var','ip_app_day_hour_count','UsrNewness'
              ,'channel_count','os_count','user_count',
              'app_count','ip_app_channel_count_day','ip_tcount','nextClick','time_delta_user'
             ,'time_delta_ip_app']#,'time_delta_ip_app','time_delta_ip_cha','time_delta_ip_dev']

for i in range(0,naddfeat):
        predictors.append('X'+str(i))
categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour','day']

gc.collect()

0

In [32]:
# new_feature = 'nextClick'
# D=2**26
# train['category'] = (train['ip'].astype(str) + "_" + train['app'].astype(str) + "_" + train['device'].astype(str) \
#         + "_" + train['os'].astype(str)).apply(hash) % D
# click_buffer= np.full(D, 3000000000, dtype=np.uint32)
# train['epochtime']= train['click_time'].astype(np.int64) // 10 ** 9
# next_clicks= []
# for category, t in zip(reversed(train['category'].values), reversed(train['epochtime'].values)):
#     next_clicks.append(click_buffer[category]-t)
#     click_buffer[category]= t
# del(click_buffer)
# QQ= list(reversed(next_clicks))
# train.drop(['epochtime','category'], axis=1, inplace=True)
# train[new_feature] = pd.Series(QQ).astype('float32')
# predictors.append(new_feature)
# train[new_feature+'_shift'] = train[new_feature].shift(+1).values
# predictors.append(new_feature+'_shift')
# del QQ, next_clicks
# gc.collect()

In [33]:
print("predictors:", len(predictors), "train:", len(train.columns) - 2)

('predictors:', 38, 'train:', 39)


In [34]:
#Change Here When Change feature
print("Adjusting the data types of the new count features... ")
train['n_channels'] = train['n_channels'].astype('uint16')
train['ip_app_count'] = train['ip_app_count'].astype('uint16')
train['ip_app_os_count'] = train['ip_app_os_count'].astype('uint16')
train['ip_dev'] = train['ip_dev'].astype('uint32')
train['nip_day_test_hh'] = train['nip_day_test_hh'].astype('uint32')
train['ip_app_day_hour_count'] = train['ip_app_day_hour_count'].astype('uint16')
train['app_count'] = train['app_count'].astype('uint16')
train['channel_count'] = train['channel_count'].astype('uint16')
train['ip_app_channel_count_day'] = train['ip_app_channel_count_day'].astype('uint16')
train['ip_count'] = train['ip_count'].astype('uint16')
train['user_count'] = train['user_count'].astype('uint16')
train['os_count'] = train['os_count'].astype('uint16')
train['UsrNewness'] = train['UsrNewness'].astype('uint16')
#train['UsrappNewness'] = train['UsrappNewness'].astype('uint16')
train['ip_tcount'] = train['ip_tcount'].astype('uint16')

#train.info()

Adjusting the data types of the new count features... 


In [35]:
# #use Day 8 as train, day 9 as valid
# splitTar = 62945077
# test = train[len_train:]
# train_X = train[:splitTar]
# val_X = train[splitTar:]
# train_y = train[:splitTar].is_attributed
# val_y = train[splitTar:].is_attributed
# print('The size of the test set is ', len(test))
# print('The size of the validation set is ', len(val_X))
# print('The size of the train set is ', len(train_X))

In [55]:
#random split for both 
test = train[len_train:]
train_ = train[:len_train]
target = 'is_attributed'
target = train_[target]
train_X,val_X, train_y, val_y = train_test_split(train_,target,test_size = 0.1,random_state = 0) 
train_y = train_y.astype('uint8')
val_y = val_y.astype('uint8')
print('The size of the test set is ', len(test))
print('The size of the validation set is ', len(val_X))
print('The size of the train set is ', len(train_X))

#del train
del train_
gc.collect()

('The size of the test set is ', 18790469)
('The size of the validation set is ', 5301694)
('The size of the train set is ', 47715245)


35

In [56]:
print("Preparing the datasets for training...")

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.25,
    'num_leaves': 15,  
    'max_depth': 4,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
   # 'nthread': 8,
    'verbose': 0,
    'scale_pos_weight':100 
    }

dtrain = lgb.Dataset(train_X[predictors].values, label=train_y.values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
dvalid = lgb.Dataset(val_X[predictors].values, label=val_y.values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
                      
evals_results = {}

del train_X
del val_X
gc.collect()

Preparing the datasets for training...


47

In [57]:
print("Training the model...")

lgb_model = lgb.train(params, 
                 dtrain, 
                 valid_sets=[dtrain, dvalid], 
                 valid_names=['train','valid'], 
                 evals_result=evals_results, 
                 num_boost_round=1500,
                 early_stopping_rounds=50,
                 verbose_eval=True, 
                 feval=None)


Training the model...
[1]	train's auc: 0.933033	valid's auc: 0.933647
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.96234	valid's auc: 0.9629
[3]	train's auc: 0.962396	valid's auc: 0.962706
[4]	train's auc: 0.963959	valid's auc: 0.964348
[5]	train's auc: 0.966369	valid's auc: 0.966587
[6]	train's auc: 0.966874	valid's auc: 0.967167
[7]	train's auc: 0.967167	valid's auc: 0.967475
[8]	train's auc: 0.971023	valid's auc: 0.970686
[9]	train's auc: 0.972458	valid's auc: 0.972093
[10]	train's auc: 0.973787	valid's auc: 0.97353
[11]	train's auc: 0.974831	valid's auc: 0.974256
[12]	train's auc: 0.976052	valid's auc: 0.975074
[13]	train's auc: 0.976839	valid's auc: 0.975615
[14]	train's auc: 0.978075	valid's auc: 0.97657
[15]	train's auc: 0.978379	valid's auc: 0.976715
[16]	train's auc: 0.978761	valid's auc: 0.977225
[17]	train's auc: 0.978887	valid's auc: 0.977199
[18]	train's auc: 0.979371	valid's auc: 0.977713
[19]	train's auc: 0.979947	valid's auc: 0.978159

[167]	train's auc: 0.989968	valid's auc: 0.982133
[168]	train's auc: 0.989967	valid's auc: 0.982123
[169]	train's auc: 0.989972	valid's auc: 0.982103
[170]	train's auc: 0.990001	valid's auc: 0.982121
[171]	train's auc: 0.990014	valid's auc: 0.98211
[172]	train's auc: 0.990029	valid's auc: 0.982098
[173]	train's auc: 0.990054	valid's auc: 0.982117
[174]	train's auc: 0.990059	valid's auc: 0.982115
[175]	train's auc: 0.990092	valid's auc: 0.982123
[176]	train's auc: 0.990101	valid's auc: 0.982116
[177]	train's auc: 0.990111	valid's auc: 0.982117
[178]	train's auc: 0.990133	valid's auc: 0.982157
[179]	train's auc: 0.990141	valid's auc: 0.98214
[180]	train's auc: 0.99015	valid's auc: 0.98213
[181]	train's auc: 0.990173	valid's auc: 0.982179
[182]	train's auc: 0.990189	valid's auc: 0.982167
[183]	train's auc: 0.990197	valid's auc: 0.982163
[184]	train's auc: 0.9902	valid's auc: 0.982168
[185]	train's auc: 0.990212	valid's auc: 0.982168
[186]	train's auc: 0.990221	valid's auc: 0.98216
[187]	t

In [39]:
#	train's auc: 0.990168	valid's auc: 0.982148 'learning_rate': 0.2,'num_leaves': 9,'max_depth': 4, 
#   train's auc: 0.991313	valid's auc: 0.983414 'learning_rate': 0.1,'num_leaves': 12,'max_depth': 4,
#	train's auc: 0.991293	valid's auc: 0.983208 'learning_rate': 0.1,'num_leaves': 15,'max_depth': 5,


In [40]:
# print("REtraining the model...")

# dtrain = lgb.Dataset(train[predictors].values, 
#                      label=train.is_attributed.values,
#                      feature_name=predictors,
#                      categorical_feature=categorical
#                     )

# lgb_model_re = lgb.train(params, 
#                          train, 
#                          num_boost_round=1000, 
#                          feature_name=predictors, 
#                          categorical_feature=categorical
#                          )


In [41]:
# f, ax = plt.subplots(figsize=[7,10])
# lgb.plot_importance(lgb_model, ax=ax,)
# plt.title("Light GBM Feature Importance")
# plt.savefig('feature_import.png')

# # Feature names:
# print('Feature names:', lgb_model.feature_name())
# # Feature importances:
# print('Feature importances:', list(lgb_model.feature_importance()))

# feature_imp = pd.DataFrame(lgb_model.feature_name(),list(lgb_model.feature_importance()))


In [58]:
print("Preparing data for submission...")

submit = pd.read_csv('test.csv', dtype='int', usecols=['click_id'])

print("Predicting the submission data...")

#submit['is_attributed'] = 0
submit['is_attributed'] = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)
#submit['is_attributed'] = lgb_model_re.predict(test[predictors], num_iteration= -1)


print("Writing the submission data into a csv file...")
print(submit.shape)
submit.to_csv('submission_lgb_delta6.csv', index=False)

print("All done...")

Preparing data for submission...
Predicting the submission data...
Writing the submission data into a csv file...
(18790469, 2)
All done...


In [59]:
#random split for both 
test = train[len_train:]
train_ = train[:len_train]
target = 'is_attributed'
target = train_[target]
train_X,val_X, train_y, val_y = train_test_split(train_,target,test_size = 0.1,random_state = 0) 
train_y = train_y.astype('uint8')
val_y = val_y.astype('uint8')
print('The size of the test set is ', len(test))
print('The size of the validation set is ', len(val_X))
print('The size of the train set is ', len(train_X))

#del train
del train_
gc.collect()

('The size of the test set is ', 18790469)
('The size of the validation set is ', 5301694)
('The size of the train set is ', 47715245)


101

In [60]:
print("Preparing the datasets for training...")

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 15,  
    'max_depth': 5,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
   # 'nthread': 8,
    'verbose': 0,
    'scale_pos_weight':100 
    }

dtrain = lgb.Dataset(train_X[predictors].values, label=train_y.values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
dvalid = lgb.Dataset(val_X[predictors].values, label=val_y.values,
                      feature_name=predictors,
                      categorical_feature=categorical
                      )
                      
evals_results = {}

del train_X
del val_X
gc.collect()

Preparing the datasets for training...


47

In [61]:
print("Training the model...")

lgb_model = lgb.train(params, 
                 dtrain, 
                 valid_sets=[dtrain, dvalid], 
                 valid_names=['train','valid'], 
                 evals_result=evals_results, 
                 num_boost_round=1500,
                 early_stopping_rounds=50,
                 verbose_eval=True, 
                 feval=None)



Training the model...
[1]	train's auc: 0.938483	valid's auc: 0.939107
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.962666	valid's auc: 0.963276
[3]	train's auc: 0.963631	valid's auc: 0.964131
[4]	train's auc: 0.964542	valid's auc: 0.965074
[5]	train's auc: 0.965555	valid's auc: 0.966046
[6]	train's auc: 0.965891	valid's auc: 0.966416
[7]	train's auc: 0.965955	valid's auc: 0.966451
[8]	train's auc: 0.966242	valid's auc: 0.966705
[9]	train's auc: 0.968148	valid's auc: 0.968306
[10]	train's auc: 0.968504	valid's auc: 0.968606
[11]	train's auc: 0.969332	valid's auc: 0.969289
[12]	train's auc: 0.96992	valid's auc: 0.969949
[13]	train's auc: 0.970366	valid's auc: 0.970359
[14]	train's auc: 0.970685	valid's auc: 0.970792
[15]	train's auc: 0.970481	valid's auc: 0.970649
[16]	train's auc: 0.970764	valid's auc: 0.970989
[17]	train's auc: 0.970609	valid's auc: 0.970817
[18]	train's auc: 0.971712	valid's auc: 0.97187
[19]	train's auc: 0.972223	valid's auc: 0.972

[167]	train's auc: 0.989411	valid's auc: 0.982301
[168]	train's auc: 0.98942	valid's auc: 0.982295
[169]	train's auc: 0.98943	valid's auc: 0.982294
[170]	train's auc: 0.989455	valid's auc: 0.982315
[171]	train's auc: 0.989462	valid's auc: 0.982311
[172]	train's auc: 0.989471	valid's auc: 0.982312
[173]	train's auc: 0.989495	valid's auc: 0.982354
[174]	train's auc: 0.989502	valid's auc: 0.982351
[175]	train's auc: 0.98953	valid's auc: 0.982372
[176]	train's auc: 0.989537	valid's auc: 0.982373
[177]	train's auc: 0.989545	valid's auc: 0.982374
[178]	train's auc: 0.989568	valid's auc: 0.982413
[179]	train's auc: 0.989573	valid's auc: 0.982409
[180]	train's auc: 0.989589	valid's auc: 0.982404
[181]	train's auc: 0.989611	valid's auc: 0.982446
[182]	train's auc: 0.989618	valid's auc: 0.982442
[183]	train's auc: 0.989626	valid's auc: 0.982448
[184]	train's auc: 0.989634	valid's auc: 0.982447
[185]	train's auc: 0.989644	valid's auc: 0.982444
[186]	train's auc: 0.989648	valid's auc: 0.982447
[18

[332]	train's auc: 0.990904	valid's auc: 0.983159
[333]	train's auc: 0.990911	valid's auc: 0.983166
[334]	train's auc: 0.990917	valid's auc: 0.983169
[335]	train's auc: 0.990919	valid's auc: 0.983167
[336]	train's auc: 0.990922	valid's auc: 0.983162
[337]	train's auc: 0.990925	valid's auc: 0.983167
[338]	train's auc: 0.99093	valid's auc: 0.983161
[339]	train's auc: 0.990932	valid's auc: 0.983161
[340]	train's auc: 0.990936	valid's auc: 0.983157
[341]	train's auc: 0.990944	valid's auc: 0.983155
[342]	train's auc: 0.990951	valid's auc: 0.983158
[343]	train's auc: 0.990954	valid's auc: 0.983153
[344]	train's auc: 0.990956	valid's auc: 0.98315
[345]	train's auc: 0.990959	valid's auc: 0.983149
[346]	train's auc: 0.990961	valid's auc: 0.983147
[347]	train's auc: 0.990973	valid's auc: 0.983157
[348]	train's auc: 0.990976	valid's auc: 0.983152
[349]	train's auc: 0.990993	valid's auc: 0.983159
[350]	train's auc: 0.990996	valid's auc: 0.983159
[351]	train's auc: 0.990997	valid's auc: 0.983155
[3

In [62]:
print("Preparing data for submission...")

submit = pd.read_csv('test.csv', dtype='int', usecols=['click_id'])

print("Predicting the submission data...")

#submit['is_attributed'] = 0
submit['is_attributed'] = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)
#submit['is_attributed'] = lgb_model_re.predict(test[predictors], num_iteration= -1)


print("Writing the submission data into a csv file...")
print(submit.shape)
submit.to_csv('submission_lgb_delta7.csv', index=False)

print("All done...")

Preparing data for submission...
Predicting the submission data...
Writing the submission data into a csv file...
(18790469, 2)
All done...


In [63]:
test = train[len_train:]
train_ = train[:len_train]
target = 'is_attributed'
target = train_[target]

In [None]:
# Classifier
from sklearn.model_selection import StratifiedKFold

ITERATIONS = 1000
bayes_cv_tuner = BayesSearchCV(
    estimator = lgb.LGBMRegressor(
        objective='binary',
        metric='auc',
        n_jobs=1,
        verbose=0
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'num_leaves': (1, 100),      
        'max_depth': (0, 50),
        'min_child_samples': (0, 50),
        'max_bin': (100, 1000),
        'subsample': (0.01, 1.0, 'uniform'),
        'subsample_freq': (0, 10),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0, 10),
        'subsample_for_bin': (100000, 500000),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
        'n_estimators': (50, 100),
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,   
    verbose = 0,
    refit = True,
    random_state = 42
)
def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

# Fit the model
result = bayes_cv_tuner.fit(train_[predictors].values, target.values, callback=status_print)