In [4]:
import pandas as pd
import numpy as np
import gc
import time


In [97]:
# Columns to read from each file. The two lists differ only in their last entry:
# `is_attributed` (used below as the training label) vs `click_id` (used below
# as the submission key).
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']

# Compact unsigned dtypes keep memory usage down when loading tens of millions of rows.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

# Window of train.csv to load: skip the first TRAIN_SKIP_ROWS data rows (row 0 is
# the header and is kept), then read the next TRAIN_NROWS rows.
# NOTE(review): presumably this selects the tail of the file - confirm against
# the actual row count of train.csv.
TRAIN_SKIP_ROWS = 144903890
TRAIN_NROWS = 40000000

start_time = time.time()
train = pd.read_csv(
    'train.csv',
    usecols=train_cols,
    dtype=dtypes,
    skiprows=range(1, TRAIN_SKIP_ROWS + 1),
    nrows=TRAIN_NROWS,
)
print('finish loading train data(time interval):', time.time() - start_time)


finish loading train data(time interval): 92.00672888755798


In [98]:
# Read the whole test file with the same compact dtypes and report how long it took.
start_time = time.time()
test = pd.read_csv(
    'test.csv',
    dtype=dtypes,
    usecols=test_cols,
)
print('finish loading test data(time interval):', time.time() - start_time)

finish loading test data(time interval): 14.560522079467773


In [99]:
# Separate the label from the features: `pop` returns the column and removes it
# from `train` in place.
y = train.pop('is_attributed')
gc.collect()
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time
0,33924,15,1,19,111,2017-11-09 04:03:08
1,37383,3,1,13,280,2017-11-09 04:03:08
2,122294,15,1,10,245,2017-11-09 04:03:08
3,73258,9,1,25,145,2017-11-09 04:03:08
4,73347,15,1,13,430,2017-11-09 04:03:08


In [100]:
# Seed the submission frame with the test ids and, in the same step, remove the
# id column from the test features (`pop` drops it in place).
submission = pd.DataFrame({'click_id': test.pop('click_id').astype('uint32')})
gc.collect()
test.head()

Unnamed: 0,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00
1,119901,9,1,3,466,2017-11-10 04:00:00
2,72287,21,1,19,128,2017-11-10 04:00:00
3,78477,15,1,13,111,2017-11-10 04:00:00
4,123080,12,1,13,328,2017-11-10 04:00:00


In [101]:
# Remember where train ends so the stacked frame can be split again later.
train_num_rows = len(train)
# concat train and test dataset to do feature engineering.
# ignore_index=True gives the stacked frame one unique 0..N-1 row index instead
# of repeating each input frame's own labels.
combined_ds = pd.concat([train, test], ignore_index=True)
combined_ds.shape

(58790469, 6)

In [102]:
# Start the timer for the ip_count feature and build the per-ip click-count lookup.
start_time = time.time()
temp = (
    combined_ds['ip']
    .value_counts()
    .reset_index(name='ip_count')
)
temp.head()

Unnamed: 0,index,ip_count
0,5348,500889
1,5314,447234
2,73516,224817
3,73487,222996
4,53454,177851


In [103]:
# Name the lookup columns explicitly, then attach each row's ip frequency.
temp.columns = ['ip', 'ip_count']
combined_ds = combined_ds.merge(temp, on='ip', how='left')
# `start_time` was set in the cell that built `temp`, so this covers both steps.
print('finish merge ip_count on ip:', time.time() - start_time)

finish merge ip_count on ip: 20.545804023742676


In [104]:
# calculate frequency of device: number of rows (train + test) per device id
start_time = time.time()
temp = combined_ds['device'].value_counts().reset_index(name='device_count')
# Set both column names explicitly so the merge key is always called 'device'.
temp.columns = ['device', 'device_count']
combined_ds = combined_ds.merge(temp, on='device', how='left')
print('finish merge device_count on device:', time.time() - start_time)

finish merge device_count on device: 13.087085247039795


In [105]:
# Number of rows for every (ip, device) pair across train + test.
# `size()` counts rows per group; `channel` is loaded as uint16 and therefore has
# no missing values, so this equals a non-null count of `channel`. The result
# keeps its default integer index because the later merge uses columns only.
gb = (
    combined_ds.groupby(['ip', 'device'])
    .size()
    .reset_index(name='ip_device_count')
)

In [106]:
# Preview the first rows of the per-(ip, device) count table.
gb.head(20)

Unnamed: 0,ip,device,ip_device_count
0,0,0,2
1,1,1,23
2,2,1,5
3,3,1,48
4,3,2,16
5,4,1,6
6,5,1,25
7,6,0,4
8,6,1,699
9,6,2,6


In [107]:
# Attach the (ip, device) count to every row; a left join keeps all original rows.
combined_ds = pd.merge(combined_ds, gb, how='left', on=['ip', 'device'])

In [108]:
# Inspect the frame after the (ip, device) merge.
combined_ds.head(20)

Unnamed: 0,ip,app,device,os,channel,click_time,ip_count,device_count,ip_device_count
0,33924,15,1,19,111,2017-11-09 04:03:08,3447,54704151,2875
1,37383,3,1,13,280,2017-11-09 04:03:08,2258,54704151,2241
2,122294,15,1,10,245,2017-11-09 04:03:08,584,54704151,578
3,73258,9,1,25,145,2017-11-09 04:03:08,987,54704151,976
4,73347,15,1,13,430,2017-11-09 04:03:08,946,54704151,941
5,5676,32,1,19,376,2017-11-09 04:03:08,5626,54704151,5588
6,118367,3,1,8,115,2017-11-09 04:03:08,27835,54704151,22505
7,89212,32,1,19,376,2017-11-09 04:03:08,1139,54704151,1111
8,92746,11,1,47,122,2017-11-09 04:03:08,1247,54704151,1238
9,82971,2,1,13,237,2017-11-09 04:03:08,2518,54704151,2495


In [109]:
# Release the previous lookup table before building the next one.
del gb; gc.collect()
# Number of rows for every (ip, os) pair across train + test.
# `size()` counts rows per group; `channel` is loaded as uint16 and therefore has
# no missing values, so this equals a non-null count of `channel`. The result
# keeps its default integer index because the later merge uses columns only.
gb = (
    combined_ds.groupby(['ip', 'os'])
    .size()
    .reset_index(name='ip_os_count')
)

In [110]:
# Preview the first rows of the per-(ip, os) count table.
gb.head(20)

Unnamed: 0,ip,os,ip_os_count
0,0,0,2
1,1,2,4
2,1,4,2
3,1,7,1
4,1,27,1
5,1,30,2
6,1,32,1
7,1,37,1
8,1,48,6
9,1,52,5


In [111]:
# Attach the (ip, os) count to every row; a left join keeps all original rows.
combined_ds = pd.merge(combined_ds, gb, how='left', on=['ip', 'os'])

In [112]:
# Inspect the frame after the (ip, os) merge.
combined_ds.head(20)

Unnamed: 0,ip,app,device,os,channel,click_time,ip_count,device_count,ip_device_count,ip_os_count
0,33924,15,1,19,111,2017-11-09 04:03:08,3447,54704151,2875,804
1,37383,3,1,13,280,2017-11-09 04:03:08,2258,54704151,2241,548
2,122294,15,1,10,245,2017-11-09 04:03:08,584,54704151,578,18
3,73258,9,1,25,145,2017-11-09 04:03:08,987,54704151,976,26
4,73347,15,1,13,430,2017-11-09 04:03:08,946,54704151,941,199
5,5676,32,1,19,376,2017-11-09 04:03:08,5626,54704151,5588,1128
6,118367,3,1,8,115,2017-11-09 04:03:08,27835,54704151,22505,579
7,89212,32,1,19,376,2017-11-09 04:03:08,1139,54704151,1111,340
8,92746,11,1,47,122,2017-11-09 04:03:08,1247,54704151,1238,6
9,82971,2,1,13,237,2017-11-09 04:03:08,2518,54704151,2495,500


In [113]:
# Drop the lookup table and force a collection; as the last expression,
# gc.collect()'s return value is what the cell displays.
del gb; gc.collect()

405

In [114]:
# Number of rows for every (device, os) pair across train + test.
# `size()` counts rows per group; `channel` is loaded as uint16 and therefore has
# no missing values, so this equals a non-null count of `channel`. The result
# keeps its default integer index because the later merge uses columns only.
gb = (
    combined_ds.groupby(['device', 'os'])
    .size()
    .reset_index(name='device_os_count')
)

In [115]:
# Preview the first rows of the per-(device, os) count table.
gb.head(20)

Unnamed: 0,device,os,device_os_count
0,0,0,175068
1,0,21,50094
2,0,24,180443
3,0,29,37334
4,0,33,2871
5,0,38,79988
6,0,50,37951
7,0,57,39
8,0,59,5763
9,0,67,471


In [116]:
# Attach the (device, os) count to every row (left join keeps all rows), then
# show the result.
combined_ds = pd.merge(combined_ds, gb, how='left', on=['device', 'os'])
combined_ds.head(20)

Unnamed: 0,ip,app,device,os,channel,click_time,ip_count,device_count,ip_device_count,ip_os_count,device_os_count
0,33924,15,1,19,111,2017-11-09 04:03:08,3447,54704151,2875,804,13319385
1,37383,3,1,13,280,2017-11-09 04:03:08,2258,54704151,2241,548,11840091
2,122294,15,1,10,245,2017-11-09 04:03:08,584,54704151,578,18,1547498
3,73258,9,1,25,145,2017-11-09 04:03:08,987,54704151,976,26,1215741
4,73347,15,1,13,430,2017-11-09 04:03:08,946,54704151,941,199,11840091
5,5676,32,1,19,376,2017-11-09 04:03:08,5626,54704151,5588,1128,13319385
6,118367,3,1,8,115,2017-11-09 04:03:08,27835,54704151,22505,579,1587238
7,89212,32,1,19,376,2017-11-09 04:03:08,1139,54704151,1111,340,13319385
8,92746,11,1,47,122,2017-11-09 04:03:08,1247,54704151,1238,6,485482
9,82971,2,1,13,237,2017-11-09 04:03:08,2518,54704151,2495,500,11840091


In [117]:
# Drop the lookup table and force a collection; as the last expression,
# gc.collect()'s return value is what the cell displays.
del gb; gc.collect()

208

In [118]:
# Number of rows for every (ip, device, os) triple across train + test.
# `size()` counts rows per group; `channel` is loaded as uint16 and therefore has
# no missing values, so this equals a non-null count of `channel`. The result
# keeps its default integer index because the later merge uses columns only.
gb = (
    combined_ds.groupby(['ip', 'device', 'os'])
    .size()
    .reset_index(name='ip_device_os_count')
)

In [119]:
# Preview the first rows of the per-(ip, device, os) count table.
gb.head(20)

Unnamed: 0,ip,device,os,ip_device_os_count
0,0,0,0,2
1,1,1,2,4
2,1,1,4,2
3,1,1,7,1
4,1,1,27,1
5,1,1,30,2
6,1,1,32,1
7,1,1,37,1
8,1,1,48,6
9,1,1,52,5


In [120]:
# Attach the (ip, device, os) count to every row (left join keeps all rows),
# then show the result.
combined_ds = pd.merge(combined_ds, gb, how='left', on=['ip', 'device', 'os'])
combined_ds.head(20)

Unnamed: 0,ip,app,device,os,channel,click_time,ip_count,device_count,ip_device_count,ip_os_count,device_os_count,ip_device_os_count
0,33924,15,1,19,111,2017-11-09 04:03:08,3447,54704151,2875,804,13319385,748
1,37383,3,1,13,280,2017-11-09 04:03:08,2258,54704151,2241,548,11840091,545
2,122294,15,1,10,245,2017-11-09 04:03:08,584,54704151,578,18,1547498,18
3,73258,9,1,25,145,2017-11-09 04:03:08,987,54704151,976,26,1215741,26
4,73347,15,1,13,430,2017-11-09 04:03:08,946,54704151,941,199,11840091,199
5,5676,32,1,19,376,2017-11-09 04:03:08,5626,54704151,5588,1128,13319385,1125
6,118367,3,1,8,115,2017-11-09 04:03:08,27835,54704151,22505,579,1587238,527
7,89212,32,1,19,376,2017-11-09 04:03:08,1139,54704151,1111,340,13319385,340
8,92746,11,1,47,122,2017-11-09 04:03:08,1247,54704151,1238,6,485482,6
9,82971,2,1,13,237,2017-11-09 04:03:08,2518,54704151,2495,500,11840091,493


In [121]:
# Drop the lookup table and force a collection; as the last expression,
# gc.collect()'s return value is what the cell displays.
del gb; gc.collect()

314

In [122]:
# The raw ip / device / os ids are no longer needed now that their frequency
# features exist, so remove them from the frame in place.
combined_ds.drop(columns=['ip', 'device', 'os'], inplace=True)
gc.collect()
combined_ds.head()

Unnamed: 0,app,channel,click_time,ip_count,device_count,ip_device_count,ip_os_count,device_os_count,ip_device_os_count
0,15,111,2017-11-09 04:03:08,3447,54704151,2875,804,13319385,748
1,3,280,2017-11-09 04:03:08,2258,54704151,2241,548,11840091,545
2,15,245,2017-11-09 04:03:08,584,54704151,578,18,1547498,18
3,9,145,2017-11-09 04:03:08,987,54704151,976,26,1215741,26
4,15,430,2017-11-09 04:03:08,946,54704151,941,199,11840091,199


In [123]:
import pytz

# Parse click_time, label the parsed timestamps as UTC, then convert them to
# the Asia/Shanghai zone. The timer started here is read by the hour_count cell.
cst = pytz.timezone('Asia/Shanghai')
start_time = time.time()
combined_ds['click_time'] = (
    pd.to_datetime(combined_ds['click_time'])
    .dt.tz_localize(pytz.utc)
    .dt.tz_convert(cst)
)
combined_ds.head()

Unnamed: 0,app,channel,click_time,ip_count,device_count,ip_device_count,ip_os_count,device_os_count,ip_device_os_count
0,15,111,2017-11-09 12:03:08+08:00,3447,54704151,2875,804,13319385,748
1,3,280,2017-11-09 12:03:08+08:00,2258,54704151,2241,548,11840091,545
2,15,245,2017-11-09 12:03:08+08:00,584,54704151,578,18,1547498,18
3,9,145,2017-11-09 12:03:08+08:00,987,54704151,976,26,1215741,26
4,15,430,2017-11-09 12:03:08+08:00,946,54704151,941,199,11840091,199


In [124]:
# deal with time: extract the hour of each (timezone-converted) click
combined_ds['hour'] = combined_ds['click_time'].dt.hour
# Frequency of each hour across train + test, attached to every row.
temp = combined_ds['hour'].value_counts().reset_index(name='hour_count')
temp.columns = ['hour', 'hour_count']
combined_ds = combined_ds.merge(temp, on='hour', how='left')
# `start_time` was set in the timezone-conversion cell, so this covers both steps.
print('finish merge hour_count on hour:', time.time() - start_time)

finish merge hour_count on hour: 47.092660903930664


In [125]:
# The timestamp and the raw hour were only needed to derive hour_count; remove
# both in a single in-place drop.
combined_ds.drop(columns=['click_time', 'hour'], inplace=True)
gc.collect()
combined_ds.head()

Unnamed: 0,app,channel,ip_count,device_count,ip_device_count,ip_os_count,device_os_count,ip_device_os_count,hour_count
0,15,111,3447,54704151,2875,804,13319385,748,7181078
1,3,280,2258,54704151,2241,548,11840091,545,7181078
2,15,245,584,54704151,578,18,1547498,18,7181078
3,9,145,987,54704151,976,26,1215741,26,7181078
4,15,430,946,54704151,941,199,11840091,199,7181078


In [126]:
# Split the engineered frame back by position: the first train_num_rows rows are
# the training rows, everything after them is test.
train = combined_ds.iloc[:train_num_rows]
test = combined_ds.iloc[train_num_rows:]

In [127]:
# Class balance of the label: negatives per positive.
negatives = int((y == 0).sum())
positives = int((y == 1).sum())
scale_pos_weight = negatives / positives
print(negatives, positives, scale_pos_weight)

39899492 100508 396.9782703864369


In [128]:
# split train dataset into train (90%) and validation (10%)
from sklearn.model_selection import train_test_split
start_time = time.time()
# A fixed seed makes the split, and therefore the reported validation AUC,
# reproducible across kernel restarts.
X1, X2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=50)

In [129]:
import lightgbm as lgb

# LightGBM hyper-parameters for the binary click-attribution model.
lgb_params = {
    # --- task ---
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    # --- tree shape ---
    'num_leaves': 9,            # kept below 2^(max_depth)
    'max_depth': 5,             # -1 would mean no limit
    'min_child_samples': 100,   # minimum rows required in a leaf (min_data_in_leaf)
    'max_bin': 100,             # number of histogram bins per feature
    # --- row / column sampling ---
    'subsample': 0.9,           # fraction of rows sampled for each tree
    'subsample_freq': 1,        # resample rows every iteration; <= 0 disables
    'colsample_bytree': 0.7,    # fraction of columns sampled for each tree
    # --- split constraints ---
    'min_child_weight': 0,      # minimum sum of hessian required in a leaf
    'min_split_gain': 0,        # minimum gain required to make a split
    # --- runtime ---
    'nthread': 8,
    'verbose': 0,
    # --- class imbalance ---
    # NOTE(review): the class ratio computed earlier in this notebook
    # (`scale_pos_weight`, about 397 in the recorded output) is not used here;
    # confirm 99.7 is the intended weight.
    'scale_pos_weight': 99.7,
}

# Training matrix as plain numpy arrays (column names are not passed to LightGBM).
dtrain = lgb.Dataset(data=X1.values, label=y1.values)

In [130]:
# Hold-out matrix used for validation scoring, built the same way as dtrain.
dvalid = lgb.Dataset(data=X2.values, label=y2.values)

In [131]:
# Run settings. Only MAX_ROUNDS and EARLY_STOP are read by the call below; the
# other constants are kept in case other cells rely on them.
VALIDATE = False
RANDOM_STATE = 50
VALID_SIZE = 0.90
MAX_ROUNDS = 1000
EARLY_STOP = 50
OPT_ROUNDS = 650

# Filled by lgb.train with the per-round metric history of each validation set.
evals_results = {}

# Train for up to MAX_ROUNDS rounds, stopping once the validation metric has not
# improved for EARLY_STOP rounds; progress is logged every 50 rounds.
# NOTE(review): `evals_result`, `early_stopping_rounds` and `verbose_eval` are
# keyword arguments of older LightGBM releases - confirm the installed version
# still accepts them.
model = lgb.train(
    params=lgb_params,
    train_set=dtrain,
    num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain, dvalid],
    valid_names=['train', 'valid'],
    feval=None,
    early_stopping_rounds=EARLY_STOP,
    evals_result=evals_results,
    verbose_eval=50,
)

Training until validation scores don't improve for 50 rounds.
[50]	train's auc: 0.962217	valid's auc: 0.962307
[100]	train's auc: 0.966702	valid's auc: 0.96659
[150]	train's auc: 0.968903	valid's auc: 0.968563
[200]	train's auc: 0.97019	valid's auc: 0.969635
[250]	train's auc: 0.970959	valid's auc: 0.970246
[300]	train's auc: 0.971613	valid's auc: 0.970776
[350]	train's auc: 0.972143	valid's auc: 0.971153
[400]	train's auc: 0.972598	valid's auc: 0.971419
[450]	train's auc: 0.97301	valid's auc: 0.971645
[500]	train's auc: 0.973388	valid's auc: 0.97182
[550]	train's auc: 0.973747	valid's auc: 0.971975
[600]	train's auc: 0.974095	valid's auc: 0.972068
[650]	train's auc: 0.974356	valid's auc: 0.972171
[700]	train's auc: 0.974614	valid's auc: 0.972308
[750]	train's auc: 0.974889	valid's auc: 0.972368
[800]	train's auc: 0.975105	valid's auc: 0.972391
[850]	train's auc: 0.975317	valid's auc: 0.972462
[900]	train's auc: 0.975512	valid's auc: 0.972506
[950]	train's auc: 0.975711	valid's auc: 0.

In [132]:

# Score the engineered test rows and store the predictions next to the click ids
# collected earlier (assigned by position, one prediction per test row).
submission['is_attributed'] = model.predict(test)
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)