In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  
from skopt import BayesSearchCV
import gc
import os
import psutil
import time



In [2]:
# Columns to read from train_after.csv (handed to pd.read_csv via `usecols`).
# This is every entry of the model's feature list plus the 'is_attributed'
# label column.
train_cols = [
    'ip',
    'app',
    'os',
    'device',
    'channel',
    'nextClick',
    'ip_dev',
    'X8',
    'hour',
    'is_attributed',
    'os_count',
    'app_count',
    'time_delta_ip_dev',
    'ip_app_day_hour_count',
    'time_delta_ip_app',
    'ip_count',
    'X0',
]


In [3]:
# Explicit per-column dtypes for pd.read_csv, chosen as narrow as possible to
# keep the loaded frame small.
dtypes = {
    # identifier-like integer columns
    'ip': 'uint32',
    'app': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'device': 'uint16',
    # label column; presumably a float type so that rows without a label can
    # hold NaN -- TODO confirm against how train_after.csv was produced
    'is_attributed': 'float16',
    'hour': 'uint8',
    # count-style columns
    'ip_count': 'uint16',
    'app_count': 'uint16',
    'os_count': 'uint16',
    'ip_app_day_hour_count': 'uint16',
    'ip_dev': 'uint32',
    'X0': 'uint16',
    'X8': 'uint16',
    # float32 columns
    'nextClick': 'float32',
    'time_delta_ip_app': 'float32',
    'time_delta_ip_dev': 'float32',
}

In [4]:
# Number of rows at the tail of train_after.csv that the split cell assigns to
# `test`; everything before them is used for training/validation.
n_test_rows = 18790469

# NOTE: the former `skip = range(1, 38941878)` was removed. It was never passed
# to read_csv (no `skiprows=`), and on a Python 2 kernel `range` materialises
# the whole list in memory for the lifetime of the session.

print("Loading Data")
train = pd.read_csv('train_after.csv', dtype=dtypes, usecols=train_cols, header=0)

# Row count of the leading part of the frame (total rows minus the tail rows).
len_train = len(train) - n_test_rows
print("len_train: ",len_train)

Loading Data
('len_train: ', 184903890)


In [5]:
# Print the loaded frame's row count, per-column dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203694359 entries, 0 to 203694358
Data columns (total 17 columns):
app                      uint16
channel                  uint16
device                   uint16
ip                       uint32
is_attributed            float16
os                       uint16
hour                     uint8
ip_count                 uint16
app_count                uint16
os_count                 uint16
ip_app_day_hour_count    uint16
ip_dev                   uint32
X0                       uint16
X8                       uint16
nextClick                float32
time_delta_ip_app        float32
time_delta_ip_dev        float32
dtypes: float16(1), float32(3), uint16(10), uint32(2), uint8(1)
memory usage: 8.2 GB


In [6]:
# Feature columns fed to the model. Update this list whenever the feature set
# changes (it mirrors the loaded columns, without the 'is_attributed' label).
predictors = [
    'ip',
    'app',
    'os',
    'device',
    'channel',
    'nextClick',
    'ip_dev',
    'X8',
    'hour',
    'os_count',
    'app_count',
    'time_delta_ip_dev',
    'ip_app_day_hour_count',
    'time_delta_ip_app',
    'ip_count',
    'X0',
]

# Members of `predictors` that are declared to LightGBM as categorical.
categorical = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'hour',
]

gc.collect()

17

In [7]:
#68941878(2017-11-08 00:00:00) [68941878:131886952]
#131886953(2017-11-09 00:00:00)[131886953:]
#62945076   2017-11-08 23:59:59
#62945077   2017-11-09 00:00:00

In [8]:
# #use Day 8 as train, day 9 as valid
# splitTar = 62945076
# test = train[len_train:]
# train_ = train[:len_train]
# train_X = train_[:splitTar]
# val_X = train_[splitTar:]
# train_y = train_[:splitTar].is_attributed
# val_y = train_[splitTar:].is_attributed
# train_y = train_y.astype('uint8')
# val_y = val_y.astype('uint8')
# print('The size of the test set is ', len(test))
# print('The size of the validation set is ', len(val_X))
# print('The size of the train set is ', len(train_X))

# del train
# del train_
# gc.collect()

In [9]:
# Random 75/25 train/validation split over the first `len_train` rows; the
# remaining rows become the `test` frame used for the submission.
#
# NOTE(review): `train_test_split` is imported at the top of the notebook from
# `sklearn.cross_validation`; that module is gone from current scikit-learn
# (the function lives in `sklearn.model_selection`) -- confirm the pinned
# scikit-learn version before re-running on a fresh environment.
label_col = 'is_attributed'

train_ = train[:len_train]
target = train_[label_col]
train_X, val_X, train_y, val_y = train_test_split(
    train_, target, test_size=0.25, random_state=0)

# The label column is loaded as float16; cast the split labels to integers.
train_y = train_y.astype('uint8')
val_y = val_y.astype('uint8')

# Take an independent copy of the test rows. A plain row slice can share the
# parent frame's underlying buffers, in which case `del train` below would not
# release the full frame's memory while `test` is alive. The copy is made
# after the split so it does not add to the split's own peak memory.
test = train[len_train:].copy()

print('The size of the test set is ', len(test))
print('The size of the validation set is ', len(val_X))
print('The size of the train set is ', len(train_X))

# Drop every remaining reference to the full frame (including the label
# Series, which is derived from it) so the collector can reclaim it.
del train
del train_
del target
gc.collect()

('The size of the test set is ', 18790469)
('The size of the validation set is ', 46225973)
('The size of the train set is ', 138677917)


13

In [None]:
print("Preparing the datasets for training...")

# LightGBM parameters: binary objective evaluated with AUC, row and column
# subsampling at 0.7, and positive examples up-weighted by 100.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=100,
    max_bin=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=0,
    subsample_for_bin=200000,
    min_split_gain=0,
    reg_alpha=0,
    reg_lambda=0,
    # nthread=8,
    verbose=0,
    scale_pos_weight=100,
)


def _build_dataset(frame, labels):
    """Wrap the predictor columns of `frame` and `labels` in an lgb.Dataset.

    The predictor columns are passed as a plain array, with the column names
    and the categorical subset supplied explicitly.
    """
    return lgb.Dataset(frame[predictors].values, label=labels.values,
                       feature_name=predictors,
                       categorical_feature=categorical)


dtrain = _build_dataset(train_X, train_y)
dvalid = _build_dataset(val_X, val_y)

# Filled in by lgb.train with the per-iteration metric history.
evals_results = {}

# The frames are no longer needed once the datasets have been built.
del train_X
del val_X
gc.collect()

Preparing the datasets for training...


37

In [None]:
print("Training the model...")

# Boost for at most 1500 rounds, stopping once the validation score has not
# improved for 50 consecutive rounds. Both the training and validation sets
# are evaluated every round and their scores recorded in `evals_results`.
lgb_model = lgb.train(
    params,
    dtrain,
    # iteration budget / stopping rule
    num_boost_round=1500,
    early_stopping_rounds=50,
    # evaluation sets and how they are labelled in the log
    valid_sets=[dtrain, dvalid],
    valid_names=['train', 'valid'],
    evals_result=evals_results,
    verbose_eval=True,
    # no custom evaluation function; only the metric from `params` is used
    feval=None,
)



Training the model...




[1]	train's auc: 0.944214	valid's auc: 0.943216
Training until validation scores don't improve for 50 rounds.
[2]	train's auc: 0.960265	valid's auc: 0.959424
[3]	train's auc: 0.969704	valid's auc: 0.968854
[4]	train's auc: 0.970087	valid's auc: 0.969137
[5]	train's auc: 0.971316	valid's auc: 0.97038
[6]	train's auc: 0.973147	valid's auc: 0.972281
[7]	train's auc: 0.973095	valid's auc: 0.972245
[8]	train's auc: 0.973794	valid's auc: 0.97295
[9]	train's auc: 0.973943	valid's auc: 0.973137
[10]	train's auc: 0.974665	valid's auc: 0.973866
[11]	train's auc: 0.97485	valid's auc: 0.974044
[12]	train's auc: 0.974682	valid's auc: 0.973873
[13]	train's auc: 0.974645	valid's auc: 0.973805
[14]	train's auc: 0.974937	valid's auc: 0.974113
[15]	train's auc: 0.975271	valid's auc: 0.974444
[16]	train's auc: 0.975315	valid's auc: 0.974486
[17]	train's auc: 0.97547	valid's auc: 0.974624
[18]	train's auc: 0.975586	valid's auc: 0.97473
[19]	train's auc: 0.97575	valid's auc: 0.974909
[20]	train's auc: 0.97

[167]	train's auc: 0.988848	valid's auc: 0.983429
[168]	train's auc: 0.988898	valid's auc: 0.983441
[169]	train's auc: 0.988922	valid's auc: 0.983449
[170]	train's auc: 0.988946	valid's auc: 0.983457
[171]	train's auc: 0.98897	valid's auc: 0.983458
[172]	train's auc: 0.988995	valid's auc: 0.983469
[173]	train's auc: 0.989034	valid's auc: 0.983472
[174]	train's auc: 0.989047	valid's auc: 0.983486
[175]	train's auc: 0.989065	valid's auc: 0.983492
[176]	train's auc: 0.989091	valid's auc: 0.983491
[177]	train's auc: 0.989113	valid's auc: 0.983527
[178]	train's auc: 0.989141	valid's auc: 0.983539
[179]	train's auc: 0.989168	valid's auc: 0.983538
[180]	train's auc: 0.989198	valid's auc: 0.983544
[181]	train's auc: 0.989233	valid's auc: 0.983558
[182]	train's auc: 0.989253	valid's auc: 0.983578
[183]	train's auc: 0.989269	valid's auc: 0.9836
[184]	train's auc: 0.989299	valid's auc: 0.983603
[185]	train's auc: 0.989314	valid's auc: 0.983621
[186]	train's auc: 0.989336	valid's auc: 0.983626
[18

In [None]:
#train's auc: 0.988606	valid's auc: 0.984418 0.1, 4, 12

In [None]:
# f, ax = plt.subplots(figsize=[15,10])
# lgb.plot_importance(lgb_model, ax=ax,)
# plt.title("Light GBM Feature Importance")
# plt.savefig('feature_import1.png')

# # Feature names:
# print('Feature names:', lgb_model.feature_name())
# # Feature importances:
# print('Feature importances:', list(lgb_model.feature_importance()))

# feature_imp = pd.DataFrame(lgb_model.feature_name(),list(lgb_model.feature_importance()))



In [None]:
print("Preparing data for submission...")

# Only the click_id column is needed from the original test file; the features
# come from the `test` frame split off earlier.
submit = pd.read_csv('test.csv', dtype='int', usecols=['click_id'])

# Predictions are attached to click_id purely by position, so the two inputs
# must have the same number of rows. Check this before the (long) prediction
# pass instead of failing on the column assignment afterwards.
if len(submit) != len(test):
    raise ValueError(
        "test.csv has %d rows but the held-out test frame has %d rows; "
        "predictions cannot be aligned with click_id"
        % (len(submit), len(test)))

print("Predicting the submission data...")

# Score with the iteration selected by early stopping.
submit['is_attributed'] = lgb_model.predict(test[predictors], num_iteration=lgb_model.best_iteration)


print("Writing the submission data into a csv file...")
print(submit.shape)
submit.to_csv('submission_lgb_x4.csv', index=False)

print("All done...")