# Random Forest Model

#### Daniel Fay

#### Train Random Forest Model

In [1]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import tree
from sklearn.model_selection import GridSearchCV
% matplotlib inline

In [2]:
# Load Dataset
df = pd.read_csv('data/processed/bus_network_data.csv', index_col=0)
df.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,...,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH,totalInjuries,pavementScore,potholeCount,prev_trip_ratio,ntwk_delay_lag1hr
0,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,44,38.45,...,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,0.638492,2.0
1,B11,1.0,2016-01-01,1,1,Friday,PeakAM,9,52,53.9,...,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,1.05552,2.0
2,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,10,34.7,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,1.017161,4.0
3,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,23,41.05,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,0.979406,4.0
4,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,33,40.183333,...,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,0.591185,4.0


In [3]:
df.columns

Index([u'bus_line', u'direction', u'date', u'month', u'day', u'day_of_week',
       u'time_period', u'hour', u'minute', u'trip_time', u'avg_trip_time',
       u'std_trip_time', u'delay_time', u'delay', u'Conditions', u'Humidity',
       u'PrecipitationIn', u'TemperatureF', u'VisibilityMPH', u'Wind SpeedMPH',
       u'totalInjuries', u'pavementScore', u'potholeCount', u'prev_trip_ratio',
       u'ntwk_delay_lag1hr'],
      dtype='object')

In [4]:
# Select target variable and feature space
X = pd.get_dummies(df[[u'bus_line', u'direction', u'month', u'day_of_week',
       u'hour', u'minute', u'Conditions', u'Humidity', u'PrecipitationIn', 
       u'TemperatureF', u'VisibilityMPH', u'Wind SpeedMPH', u'totalInjuries', 
       u'pavementScore', u'potholeCount', u'prev_trip_ratio', u'ntwk_delay_lag1hr']])

Y = df['delay']

X_train,X_test,y_train,y_test=train_test_split(X, Y, test_size=0.3, random_state=123)

In [11]:
# Find best C parameter
param_grid = {'max_depth': np.linspace(16,20,10)}
rfc = RFC(n_estimators=20, n_jobs=4)
C_param = GridSearchCV(rfc, param_grid, cv=5)
C_param.fit(X_train, y_train)
C_param.best_params_

{'max_depth': 18.666666666666668}

In [12]:
# Cross validate using best C parameter
avg_acc = []
for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=i) #Use random_state to fix samples
    X_train_dummies = pd.get_dummies(X_train)
    X_test_dummies = pd.get_dummies(X_test)

    clf = RFC(n_estimators=20, max_depth=C_param.best_params_.values()[0], n_jobs=4) 
    clf.fit(X_train_dummies, Y_train)

    avg_acc.append(1.0*(clf.predict(X_test_dummies)==np.asarray(Y_test)).sum()/len(Y_test))

print ("Successfully (OS) predict {}% of Bus Delays".format(np.mean(avg_acc)*100))

Successfully (OS) predict 49.9712574943% of Bus Delays


In [15]:
df_dummies = X.copy()
df_dummies['delay'] = Y.copy()
delay_df = df_dummies.loc[df_dummies['delay'].isin(['delay','significantly delay'])]
Y_delay = delay_df['delay']
X_delay = delay_df.iloc[:,:-1]
1.0*(clf.predict(X_delay)==np.asarray(Y_delay)).sum()/len(Y_delay)

0.06248202687292379

#### Feature Selection

In [16]:
Feature_importance=pd.DataFrame([list(X_train.columns),list(clf.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]
Feature_importance.sort_values(by="importance",ascending=False).iloc[:10,:]

Unnamed: 0,variables,importance
2,hour,0.199246
11,prev_trip_ratio,0.147709
3,minute,0.113329
12,ntwk_delay_lag1hr,0.0734847
6,TemperatureF,0.0518624
4,Humidity,0.0507257
8,totalInjuries,0.0401108
1,month,0.0317406
0,direction,0.0283006
9,pavementScore,0.0224668


In [17]:
# Select target variable and feature space
X_red = pd.get_dummies(df[[u'bus_line', u'direction', u'month', u'day_of_week', u'hour', \
                           u'minute', u'prev_trip_ratio', u'ntwk_delay_lag1hr', 'TemperatureF']])

Y = df['delay']

X_train,X_test,y_train,y_test=train_test_split(X_red, Y, test_size=0.3, random_state=123)

In [20]:
# Find best C parameter
param_grid = {'max_depth': np.linspace(19,25,10)}
rfc = RFC()
C_param = GridSearchCV(rfc, param_grid, cv=5)
C_param.fit(X_train, y_train)
C_param.best_params_

{'max_depth': 19.666666666666668}

In [21]:
# Cross validate using best C parameter
avg_acc = []
for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=i) #Use random_state to fix samples
    X_train_dummies = pd.get_dummies(X_train)
    X_test_dummies = pd.get_dummies(X_test)

    clf = RFC(max_depth=C_param.best_params_.values()[0]) 
    clf.fit(X_train_dummies, Y_train)

    avg_acc.append(1.0*(clf.predict(X_test_dummies)==np.asarray(Y_test)).sum()/len(Y_test))

print ("Successfully (OS) predict {}% of Bus Delays".format(np.mean(avg_acc)*100))

Successfully (OS) predict 49.1524625962% of Bus Delays
