In [None]:
# import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb

path = './input/'

def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    
    return df

start_time = time.time()

train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=21000000)
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
test = pd.read_csv(path+"test.csv")

print('[{}] Finished to load data'.format(time.time() - start_time))

train = dataPreProcessTime(train)
test = dataPreProcessTime(test)

In [3]:
y = train['is_attributed']
train.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

[1389.23646688] Start XGBoost Training


In [4]:
params = {'eta': 0.1, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'silent': True}
          
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=99)

In [5]:
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 260, watchlist, maximize=True, verbose_eval=10)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

[0]	train-auc:0.90191	valid-auc:0.903179
[10]	train-auc:0.921572	valid-auc:0.922242
[20]	train-auc:0.926661	valid-auc:0.927246
[30]	train-auc:0.930237	valid-auc:0.930138
[40]	train-auc:0.941194	valid-auc:0.940388
[50]	train-auc:0.947484	valid-auc:0.946495
[60]	train-auc:0.950173	valid-auc:0.948943
[70]	train-auc:0.952684	valid-auc:0.951391
[80]	train-auc:0.953973	valid-auc:0.952252
[90]	train-auc:0.955847	valid-auc:0.954014
[100]	train-auc:0.957663	valid-auc:0.955597
[110]	train-auc:0.958585	valid-auc:0.956422
[120]	train-auc:0.959723	valid-auc:0.957494
[130]	train-auc:0.960633	valid-auc:0.958335
[140]	train-auc:0.961394	valid-auc:0.959047
[150]	train-auc:0.961617	valid-auc:0.959265
[160]	train-auc:0.962046	valid-auc:0.95984
[170]	train-auc:0.962494	valid-auc:0.960247
[180]	train-auc:0.962954	valid-auc:0.960698
[190]	train-auc:0.963392	valid-auc:0.961096
[200]	train-auc:0.96378	valid-auc:0.961476
[210]	train-auc:0.963947	valid-auc:0.961658
[220]	train-auc:0.964241	valid-auc:0.961901
[2

In [6]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)