In [1]:

import pandas as pd
#import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. Try the current location first and
# fall back to the legacy module so old environments keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import gc
import xgboost as xgb
import operator




In [2]:
# Columns read from each CSV. The training file additionally supplies the
# label column 'is_attributed'; the test column list does not include it.
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
train_cols = test_cols + ['is_attributed']

# Compact unsigned dtypes handed to pd.read_csv to keep the frames small.
# 'click_id' is listed here although neither column list above requests it.
dtypes = {'ip': 'uint32'}
dtypes.update((col, 'uint16') for col in ('app', 'device', 'os', 'channel'))
dtypes['is_attributed'] = 'uint8'
dtypes['click_id'] = 'uint32'


In [3]:
# Row numbers 1 .. 181876951 of train.csv are skipped; row 0 is kept because
# it is the header line (header=0 below).
skip = range(1,181876952)

# Reader options shared by the train and test files.
_read_opts = dict(dtype=dtypes, header=0, parse_dates=["click_time"])

print("Loading Data")
train = pd.read_csv(
    'train.csv',
    skiprows=skip,
    usecols=train_cols,
    **_read_opts
)#.sample(1000)
test = pd.read_csv(
    'test.csv',
    usecols=test_cols,
    **_read_opts
)#.sample(1000)


Loading Data


In [5]:
# Remember how many rows belong to the training set so the combined frame can
# be split back into train / test later on.
len_train = len(train)
print('The initial size of the train set is', len_train)
print('Binding the training and test set together...')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to stack the two frames. Test rows have no
# 'is_attributed' value, so that column is NaN for them after the bind.
train = pd.concat([train, test])
# The test rows now live inside `train`; drop the separate frame to free memory.
# NOTE: this makes the cell non-idempotent - re-running it without reloading
# the data fails because `test` no longer exists.
del test
gc.collect()


('The initial size of the train set is', 3026939)
Binding the training and test set together...


KeyboardInterrupt: 

In [6]:
# Shape of the combined train + test frame produced by the bind step.
train.shape

(21817408, 7)

In [9]:

# Remove the timestamp column from the combined frame, then print a
# dtype / memory summary of what is left.
# NOTE(review): the frame is rebound in place, so running this cell a second
# time raises because 'click_time' is already gone.
train = train.drop(labels='click_time', axis='columns')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21817408 entries, 0 to 18790468
Data columns (total 6 columns):
app              uint16
channel          uint16
device           uint16
ip               uint32
is_attributed    float64
os               uint16
dtypes: float64(1), uint16(4), uint32(1)
memory usage: 582.6 MB


In [7]:
# Shape check of the combined frame.
# NOTE(review): the execution counter shows this cell ran before the cell that
# drops 'click_time', which is why the recorded output still reports 7 columns;
# on a top-to-bottom run it would presumably report 6 - confirm and re-run.
train.shape

(21817408, 7)

In [8]:
# NOTE: `test` was removed with `del test` right after it was appended to
# `train`, so this lookup raises NameError (see the recorded output) until
# `test` is rebuilt by the split cell further down.
test.shape

NameError: name 'test' is not defined

In [10]:
# Split the combined frame back into its training and test rows, then carve a
# random 10% validation set out of the training rows.
#
# The boundary is taken from `len_train` (recorded before the bind) instead of
# a hard-coded row count, so the split stays correct if the amount of training
# data loaded changes.
assert len_train < len(train), \
    'len_train does not look like the pre-bind training size; reload the data'
train_ = train[:len_train]
# `test` is a slice of the combined frame, so it still carries the
# 'is_attributed' column (NaN for every test row).
test = train[len_train:]
del train
target = train_['is_attributed']
train_ = train_.drop(['is_attributed'],axis = 1)
train_X,val_X, train_y, val_y = train_test_split(train_,target,test_size = 0.1,random_state = 0) 
# The label column became float64 when the unlabeled test rows were bound in;
# cast the labels back to a compact integer type.
train_y = train_y.astype('uint8')
val_y = val_y.astype('uint8')
print('The size of the test set is ', len(test))
print('The size of the validation set is ', len(val_X))
print('The size of the train set is ', len(train_X))

('The size of the test set is ', 18790469)
('The size of the validation set is ', 302694)
('The size of the train set is ', 2724245)


In [11]:
print("Preparing the datasets for training...")

# Booster settings passed to xgb.train in the training cell.
# NOTE(review): 'n_estimators' is kept as-is, but the training call passes the
# number of boosting rounds explicitly - confirm whether this key has any effect.
params = dict(
    # task / bookkeeping
    tree_method="approx",
    objective='binary:logistic',
    eval_metric='auc',
    random_state=99,
    silent=False,
    # column / row sampling
    colsample_bylevel=0.1,
    colsample_bytree=1.0,
    subsample=1.0,
    # tree growth
    learning_rate=0.140626707498132,
    max_delta_step=20,
    max_depth=6,
    min_child_weight=4,
    n_estimators=100,
    # penalties and class weighting
    gamma=5.103973694670875e-08,
    reg_alpha=1e-09,
    reg_lambda=1000.0,
    scale_pos_weight=499.99999999999994,
)


Preparing the datasets for training...


In [12]:
# Convert each pandas split to a DMatrix and release the pandas copy right
# away, so the frame and its DMatrix are not both held longer than necessary.
dtrain = xgb.DMatrix(train_X, label=train_y)
del train_X, train_y
gc.collect()

dvalid = xgb.DMatrix(val_X, label=val_y)
del val_X, val_y
gc.collect()

# Both sets are evaluated every round; the recorded log shows the last entry
# ('valid') is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    maximize=True,
    early_stopping_rounds=30,
    verbose_eval=10,
)
# The matrices are only needed for fitting; free them once the model exists.
del dtrain, dvalid
gc.collect()



[0]	train-auc:0.969046	valid-auc:0.960367
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 30 rounds.
[10]	train-auc:0.977365	valid-auc:0.96631
[20]	train-auc:0.980361	valid-auc:0.968944
[30]	train-auc:0.983671	valid-auc:0.97074
[40]	train-auc:0.986019	valid-auc:0.972343
[50]	train-auc:0.988119	valid-auc:0.973009
[60]	train-auc:0.9891	valid-auc:0.973238
[70]	train-auc:0.990185	valid-auc:0.973504
[80]	train-auc:0.990611	valid-auc:0.973406
[90]	train-auc:0.991215	valid-auc:0.973425
Stopping. Best iteration:
[66]	train-auc:0.989869	valid-auc:0.973649



21

In [17]:
def ceate_feature_map(features, fmap_path='xgb.fmap'):
    """Write a feature-map file listing ``features`` in iteration order.

    One line is written per feature, in the form ``<index>\\t<name>\\tq``,
    where ``index`` counts up from 0. Any existing file at ``fmap_path`` is
    overwritten; an empty ``features`` produces an empty file.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    fmap_path : str, optional
        Destination file. Defaults to ``'xgb.fmap'`` in the working directory,
        which is the path the original implementation always used.
    """
    # The context manager guarantees the handle is closed even if a write
    # fails part-way through.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [16]:
print("saveing feat_importance!")
# Feature names come from the training frame (label already removed); they are
# written to 'xgb.fmap' so the scores below are reported by name.
features = [x for x in train_.columns if x not in ['id','loss']]
ceate_feature_map(features)

# Per-feature scores from the trained booster, sorted ascending so the largest
# bar ends up at the top of the horizontal bar chart.
importance = model.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

# Normalise the scores so they sum to 1.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
# NOTE(review): the CSV is written to the parent directory while the image is
# written to the working directory - confirm this is intentional.
df.to_csv("../feat_importance.csv", index=False)

# Draw on one explicit figure. Previously plt.figure() opened an empty figure
# and df.plot(figsize=...) then created a second one for the actual chart.
fig, ax = plt.subplots(figsize=(6, 10))
df.plot(kind='barh', x='feature', y='fscore', legend=False, ax=ax)
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance')
fig.savefig("feat_importance.jpg",dpi = 300)

saveing feat_importance!


In [None]:
# Only the row identifiers are re-read from the test file; the features are
# already held in `test`.
sub = pd.read_csv('test.csv', dtype='int', usecols=['click_id'])

print("predicting...")
# `test` was sliced from the combined frame and still contains the all-NaN
# 'is_attributed' column, whereas the model was fitted on the columns of
# `train_` only. Select exactly those columns so the prediction matrix has the
# same features as the training matrix.
dtest = xgb.DMatrix(test[list(train_.columns)])
sub['is_attributed'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.to_csv('sub_xgb_delta.csv',index=False)

print("All done...")