In [5]:
import pandas as pd
#import dask.dataframe as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

path = '../input/'

def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    
    return df

start_time = time.time()

NROWS = 1e6

train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=NROWS)
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
test = pd.read_csv(path+"test.csv")

print('[{}] Finished to load data'.format(time.time() - start_time))

train = dataPreProcessTime(train)
test = dataPreProcessTime(test)

y = train['is_attributed']
train.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

params = {'eta': 0.05, 
          'max_depth': 5, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':2,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'scale_pos_weight': 20,
          'silent': True}
          
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=54)

watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 400, watchlist, maximize=True, 
                   early_stopping_rounds=40, verbose_eval=20)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

[132.26182913780212] Finished to load data
[274.7447941303253] Start XGBoost Training
[0]	train-auc:0.881724	valid-auc:0.87012
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[20]	train-auc:0.936175	valid-auc:0.932883
[40]	train-auc:0.939291	valid-auc:0.933959
[60]	train-auc:0.946336	valid-auc:0.938763
[80]	train-auc:0.949633	valid-auc:0.943438
[100]	train-auc:0.952994	valid-auc:0.947388
[120]	train-auc:0.95631	valid-auc:0.948889
[140]	train-auc:0.958698	valid-auc:0.949285
[160]	train-auc:0.961101	valid-auc:0.949642
[180]	train-auc:0.962573	valid-auc:0.950113
[200]	train-auc:0.963883	valid-auc:0.950128
[220]	train-auc:0.965228	valid-auc:0.950477
[240]	train-auc:0.966444	valid-auc:0.950303
[260]	train-auc:0.96729	valid-auc:0.949653
Stopping. Best iteration:
[232]	train-auc:0.966017	valid-auc:0.950478

[315.7638530731201] Finish XGBoost Training


In [None]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False, float_format='%.7f')
print('[{}] Finish writing output'.format(time.time() - start_time))