 # Modeling : Random Forest

In [1]:
import numpy as np
import pandas as pd

### Data load

In [2]:
from scipy import sparse

X_train = sparse.load_npz("data/X_train.npz")
X_train

<95674x10425 sparse matrix of type '<class 'numpy.float64'>'
	with 1575631 stored elements in Compressed Sparse Row format>

In [3]:
X_test = sparse.load_npz("data/X_test.npz")
X_test

<95674x10425 sparse matrix of type '<class 'numpy.float64'>'
	with 1584735 stored elements in Compressed Sparse Row format>

In [4]:
y_train = pd.read_csv("data/X_train_TripType.csv")
y_train = y_train["TripType"]

print(y_train.shape)
y_train[:5]

(95674,)


0    999.0
1     30.0
2     26.0
3      8.0
4      8.0
Name: TripType, dtype: float64

### Model

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
model = RandomForestClassifier(n_estimators=1000,
                               max_depth=15,
                               n_jobs=8)

model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
%time model.fit(X_train, y_train)

CPU times: user 2min 45s, sys: 1.71 s, total: 2min 47s
Wall time: 23.7 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Prediction

In [8]:
prediction = model.predict(X_train)

print(prediction.shape)
prediction

(95674,)


array([ 999.,    9.,   39., ...,   39.,   39.,    8.])

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [10]:
confusion_matrix(y_train, prediction)

array([[3617,    0,    0, ...,    0,    0,    0],
       [   0,    0,    3, ...,    0,    0,    1],
       [   5,    0, 1651, ...,    0,    0,    5],
       ..., 
       [   0,    0,   10, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    0,    0],
       [ 557,    0,   15, ...,    0,    0, 6502]])

In [11]:
print(classification_report(y_train, prediction,
    target_names=['TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
       'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
       'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
       'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
       'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
       'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
       'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
       'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
       'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
       'TripType_999']))

              precision    recall  f1-score   support

  TripType_3       0.82      0.99      0.90      3643
  TripType_4       0.00      0.00      0.00       346
  TripType_5       0.96      0.36      0.52      4593
  TripType_6       1.00      0.19      0.32      1277
  TripType_7       0.83      0.36      0.50      5752
  TripType_8       0.32      0.98      0.48     12161
  TripType_9       0.45      0.64      0.53      9464
 TripType_12       0.00      0.00      0.00       269
 TripType_14       0.00      0.00      0.00         4
 TripType_15       0.00      0.00      0.00       978
 TripType_18       0.00      0.00      0.00       549
 TripType_19       0.00      0.00      0.00       375
 TripType_20       0.00      0.00      0.00       637
 TripType_21       0.00      0.00      0.00       641
 TripType_22       0.00      0.00      0.00       928
 TripType_23       0.00      0.00      0.00       139
 TripType_24       0.97      0.04      0.07      2609
 TripType_25       0.88    

  'precision', 'predicted', average, warn_for)


In [12]:
prediction_prob = model.predict_proba(X_test)

print(prediction_prob.shape)
prediction_prob

(95674, 38)


array([[ 0.02232184,  0.00300191,  0.03757175, ...,  0.00920257,
         0.01114327,  0.06456927],
       [ 0.02014978,  0.0053423 ,  0.036688  , ...,  0.01113851,
         0.01093335,  0.14090792],
       [ 0.02608427,  0.00331594,  0.03966432, ...,  0.00654742,
         0.00626999,  0.34233187],
       ..., 
       [ 0.0386432 ,  0.00413809,  0.04981843, ...,  0.00779054,
         0.00747025,  0.08679286],
       [ 0.01371984,  0.00211239,  0.02791373, ...,  0.01368215,
         0.03060567,  0.03714351],
       [ 0.02774615,  0.00344807,  0.04314654, ...,  0.01117938,
         0.01269539,  0.07198062]])

### Submission

In [13]:
submission_columns = ['TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
       'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
       'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
       'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
       'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
       'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
       'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
       'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
       'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
       'TripType_999']

X_submission = pd.DataFrame(prediction_prob, columns=submission_columns)

In [14]:
test = pd.read_csv("data/test.csv")

In [15]:
X_submission["VisitNumber"] = test["VisitNumber"].unique()

In [16]:
submission_columns.insert(0, 'VisitNumber')
submission = X_submission[submission_columns]
submission.head()

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.022322,0.003002,0.037572,0.01176,0.057468,0.137952,0.106322,0.002769,3.7e-05,...,0.025234,0.025065,0.086876,0.10903,0.039331,0.007743,0.020448,0.009203,0.011143,0.064569
1,2,0.02015,0.005342,0.036688,0.011118,0.061431,0.132144,0.085169,0.003167,3.2e-05,...,0.02477,0.022024,0.030843,0.114938,0.039612,0.005772,0.018421,0.011139,0.010933,0.140908
2,3,0.026084,0.003316,0.039664,0.011758,0.043925,0.111822,0.098436,0.001834,4.2e-05,...,0.022952,0.01197,0.014351,0.052822,0.012825,0.004551,0.014664,0.006547,0.00627,0.342332
3,4,0.035798,0.004374,0.053133,0.01564,0.05805,0.149271,0.148124,0.002384,0.000162,...,0.030198,0.015517,0.01873,0.069457,0.017207,0.006187,0.023009,0.009675,0.009067,0.090555
4,6,0.026084,0.003316,0.039664,0.011758,0.043925,0.111822,0.098436,0.001834,4.2e-05,...,0.022952,0.01197,0.014351,0.052822,0.012825,0.004551,0.014664,0.006547,0.00627,0.342332


In [17]:
#submission.to_csv("submission02.csv", index=False)