# Modeling: XGBoost

In [1]:
import numpy as np
import pandas as pd

### Data load

In [2]:
from scipy import sparse

# Load the training feature matrix from its .npz file.
X_train = sparse.load_npz("data/X_train.npz")
# Bare name as the last expression so the notebook displays the matrix summary.
X_train

<95674x10425 sparse matrix of type '<class 'numpy.float64'>'
	with 1575631 stored elements in Compressed Sparse Row format>

In [3]:
# Load the test feature matrix from its .npz file.
X_test = sparse.load_npz("data/X_test.npz")
# Bare name as the last expression so the notebook displays the matrix summary.
X_test

<95674x10425 sparse matrix of type '<class 'numpy.float64'>'
	with 1584735 stored elements in Compressed Sparse Row format>

In [4]:
# Target vector: the "TripType" column of the training-label CSV, selected
# directly so `y_train` is only ever bound to the Series.
y_train = pd.read_csv("data/X_train_TripType.csv")["TripType"]

# Sanity check: print the label count, then preview the first five labels.
print(y_train.shape)
y_train.head()

(95674,)


0    999.0
1     30.0
2     26.0
3      8.0
4      8.0
Name: TripType, dtype: float64

### Model

In [5]:
import xgboost as xgb

In [17]:
# Multi-class XGBoost classifier producing per-class probabilities
# ('multi:softprob'), evaluated with multi-class log loss ('mlogloss').
#
# `n_estimators` (plural) is the keyword that sets the number of boosted
# trees. The singular spelling `n_estimator` is not a recognised parameter:
# it is stored as an extra keyword and has no effect on the tree count.
model = xgb.XGBClassifier(n_jobs=8,
                          max_depth=15,
                          objective='multi:softprob',
                          eval_metric='mlogloss',
                          tree_method='auto',
                          n_estimators=100)
# Display the configured estimator.
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mlogloss', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=15,
       min_child_weight=1, missing=None, n_estimator=100, n_estimators=100,
       n_jobs=8, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, tree_method='auto')

In [18]:
# Fit the classifier on the full training matrix and labels; the %time magic
# reports CPU and wall-clock time for this single statement.
%time model.fit(X_train, y_train)

CPU times: user 35min 51s, sys: 1min 57s, total: 37min 49s
Wall time: 8min 59s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mlogloss', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=15,
       min_child_weight=1, missing=None, n_estimator=100, n_estimators=100,
       n_jobs=8, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, tree_method='auto')

### Prediction

In [19]:
# Predict class labels for the same matrix the model was fitted on.
# NOTE: these are in-sample predictions, so the confusion matrix and
# classification report computed from them describe training fit, not
# performance on unseen data.
prediction = model.predict(X_train)

# Print the number of predictions, then display the predicted labels.
print(prediction.shape)
prediction

(95674,)


array([ 999.,    9.,   26., ...,   39.,   39.,    8.])

In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [21]:
# Confusion matrix of the training labels against the model's predictions
# for those same rows.
confusion_matrix(y_true=y_train, y_pred=prediction)

array([[3577,    0,    0, ...,    0,    0,   60],
       [   0,  343,    3, ...,    0,    0,    0],
       [   1,    0, 4466, ...,    0,    0,    1],
       ..., 
       [   0,    0,   11, ...,  741,    0,    0],
       [   0,    0,    0, ...,    0, 1172,    0],
       [  58,    2,   44, ...,    0,    0, 7887]])

In [22]:
# Display name for each class, passed to the report as `target_names`.
report_target_names = [
    'TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
    'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
    'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
    'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
    'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
    'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
    'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
    'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
    'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
    'TripType_999',
]

# Per-class precision / recall / F1 / support for the training-set predictions.
training_report = classification_report(
    y_train,
    prediction,
    target_names=report_target_names,
)
print(training_report)

              precision    recall  f1-score   support

  TripType_3       0.96      0.98      0.97      3643
  TripType_4       0.99      0.99      0.99       346
  TripType_5       0.94      0.97      0.96      4593
  TripType_6       0.98      0.99      0.98      1277
  TripType_7       0.95      0.95      0.95      5752
  TripType_8       0.94      0.95      0.94     12161
  TripType_9       0.90      0.94      0.92      9464
 TripType_12       1.00      0.98      0.99       269
 TripType_14       1.00      0.75      0.86         4
 TripType_15       0.97      0.95      0.96       978
 TripType_18       0.96      0.90      0.93       549
 TripType_19       0.96      0.93      0.94       375
 TripType_20       0.98      0.93      0.95       637
 TripType_21       0.98      0.97      0.98       641
 TripType_22       0.97      0.94      0.95       928
 TripType_23       0.99      0.99      0.99       139
 TripType_24       0.97      0.95      0.96      2609
 TripType_25       0.97    

In [23]:
# Per-class probability predictions for the test feature matrix; these are
# the values written to the submission file further down.
prediction_prob = model.predict_proba(X_test)

# Print the shape of the probability array, then display the array itself.
print(prediction_prob.shape)
prediction_prob

(95674, 38)


array([[  6.55403128e-05,   6.59813813e-05,   1.37799914e-04, ...,
          1.03790069e-03,   3.44249042e-04,   5.42673282e-03],
       [  1.41024444e-04,   1.41973505e-04,   3.04424902e-04, ...,
          6.69852970e-03,   3.33289674e-04,   5.37350075e-03],
       [  7.65838377e-06,   7.70992210e-06,   1.65319034e-05, ...,
          8.42517056e-06,   8.19213255e-06,   9.99270856e-01],
       ..., 
       [  1.06120344e-04,   1.06834501e-04,   2.61593203e-04, ...,
          1.16745628e-04,   1.13516478e-04,   5.85848000e-03],
       [  5.30246616e-05,   3.41149644e-05,   7.00295423e-05, ...,
          4.67323326e-03,   3.89588228e-03,   1.66024678e-04],
       [  9.91190882e-06,   9.97862298e-06,   2.13965177e-05, ...,
          2.03721225e-03,   1.03908917e-03,   2.43809467e-04]], dtype=float32)

### Submission

In [24]:
# Column headers for the submission file, one per TripType class.
submission_columns = ['TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
       'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
       'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
       'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
       'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
       'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
       'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
       'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
       'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
       'TripType_999']

# Wrap the probability array in a DataFrame labelled with those headers.
# NOTE(review): assumes the columns of `prediction_prob` are in this same
# class order - confirm against `model.classes_`.
X_submission = pd.DataFrame(prediction_prob, columns=submission_columns)

In [25]:
# Load the raw test CSV; it supplies the VisitNumber ids for the submission.
test = pd.read_csv("test.csv")

In [26]:
# Attach one visit id per prediction row (adds a column to X_submission in
# place). `unique()` keeps values in order of first appearance.
# NOTE(review): assumes the rows of X_test were built in that same visit
# order - confirm against the feature-construction step.
X_submission["VisitNumber"] = test["VisitNumber"].unique()

In [27]:
# Put 'VisitNumber' first in the column order. The membership check keeps
# this cell idempotent: an unconditional insert would add a second
# 'VisitNumber' entry every time the cell is re-run, and selecting with the
# duplicated name would then duplicate the column in the submission.
if 'VisitNumber' not in submission_columns:
    submission_columns.insert(0, 'VisitNumber')

# Reorder the frame so the id column comes first, followed by the class
# probabilities, then preview the first rows.
submission = X_submission[submission_columns]
submission.head()

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,6.6e-05,6.6e-05,0.000138,6.8e-05,0.00122,0.012816,0.007322,7.7e-05,6.5e-05,...,0.000138,0.0001,0.218302,0.007198,0.000177,0.011709,0.00258,0.001038,0.000344,0.005427
1,2,0.000141,0.000142,0.000304,0.000228,0.006282,0.020472,0.041815,0.000304,0.000141,...,0.000326,0.000205,0.007067,0.224432,0.000148,0.000774,0.003559,0.006699,0.000333,0.005374
2,3,8e-06,8e-06,1.7e-05,1e-05,0.00013,0.000176,2.2e-05,8e-06,8e-06,...,1.7e-05,1e-05,2.6e-05,1.8e-05,8e-06,9e-06,1.6e-05,8e-06,8e-06,0.999271
3,4,9.6e-05,9.7e-05,0.000208,0.000121,0.001743,0.031014,0.939507,0.000103,0.000366,...,0.000208,0.000128,0.000309,0.000214,9.9e-05,0.000121,0.000285,0.000106,0.000103,0.016927
4,6,8e-06,8e-06,1.7e-05,1e-05,0.00013,0.000176,2.2e-05,8e-06,8e-06,...,1.7e-05,1e-05,2.6e-05,1.8e-05,8e-06,9e-06,1.6e-05,8e-06,8e-06,0.999271


In [28]:
# Write the submission file; index=False leaves the DataFrame's row index
# out of the CSV.
submission.to_csv("submission01.csv", index=False)