## Environment set up


In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score, roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder


### Data Set Up

In [None]:
# Load 'predictionQuery.csv' into a DataFrame and preview its first rows.
df = pd.read_csv('predictionQuery.csv')
df.head()

In [None]:
# Keep only the first 10,000 rows; everything below works on this subset.
df = df.head(10000)

### Exploratory Analysis

In [None]:
# For each candidate predictor, draw a bar chart of the mean DELAY per
# distinct value, with the overall mean DELAY as a horizontal reference line.
avgLate = df['DELAY'].mean()
attributes = ['ROUTE_NUMBER', 'Trip', 'TripTime', 'vehicleID', 'Current_Stop', 'Current_Stop_ID',
              'Current_Time', 'Current_Day', 'Current_Hour', 'Current_Minutes', 'Current_Seconds',
              'destination', 'Dir']
for pred in attributes:
    # Average only the DELAY column: averaging every column is wasted work
    # and raises on non-numeric columns in recent pandas versions.
    group = df.groupby(pred, as_index=False)['DELAY'].mean()
    group = group.sort_values(by=pred)

    # Draw on an explicitly created axes. Without `ax=`, pandas opens its own
    # default-sized figure and the large one created here stays empty.
    fig, ax = plt.subplots(figsize=(40, 20))
    group.plot.bar(x=pred, y='DELAY', ax=ax, legend=False)
    ax.axhline(y=avgLate, label='Average')
    ax.set_ylabel('Average DELAY')
    ax.set_title(pred)

### Label encoding


In [None]:
# Replace the destination values with integer codes and remember which
# original value each code stands for.
le = LabelEncoder()
destination_codes = le.fit_transform(df["destination"])
df["destination"] = destination_codes
Destination = list(le.classes_)

In [None]:
# Re-fit the shared encoder on the stop column (this overwrites the classes
# learned for "destination"), then swap the column for its integer codes.
le.fit(df["Current_Stop"])
df["Current_Stop"] = le.transform(df["Current_Stop"])
Current_Stop = list(le.classes_)
Current_Stop

In [None]:
# Checking for missing values: evaluates to True if any cell of df is null.
df.isnull().values.any()

In [None]:
# Count of null entries in each column.
df.isnull().sum()

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# List the column names.
df.columns

In [None]:
# Data set up as predictors and target
Delay_YesNo = df['DELAY']

# Everything removed from the predictor frame in one pass: the target itself
# followed by the columns that are not used as predictors.
# "Trip" is intentionally left in place.
columns_to_remove = [
    'DELAY',
    "destination",
    "Current_Stop",
    "Current_Time",
    "vehicleID",
    "lineID",
    "TripTime",
    "lon",
    "lat",
]
df = df.drop(columns=columns_to_remove)

print('Dimension reduced to:')
print(len(df.columns))
df.describe()

In [None]:
# Compute the pairwise correlation of the predictor columns once and show it
# as a matrix image.
corr = df.corr()
plt.matshow(corr)
plt.title('Correlation matrix for MULTI-COLLINEAR data')
print("Note: Darker cells indicate high collinearity")

In [None]:
# List every pair of distinct columns whose correlation magnitude exceeds 0.5.
corr = df.corr()
# Compare the absolute value: a strong negative correlation is just as
# collinear as a strong positive one.
rows, cols = np.where(corr.abs() > 0.5)
# Keeping only row < col reports each pair once and skips the diagonal
# (every column correlates perfectly with itself).
indices = [(corr.columns[r], corr.columns[c]) for r, c in zip(rows, cols) if r < c]
indices

In [None]:
# Show the dtype of each remaining predictor column.
df.dtypes

In [None]:
# List the remaining predictor column names.
df.columns

In [None]:
# Summary statistics of the target series.
Delay_YesNo.describe()

In [None]:
#import math

#for index, row in df.iterrows():
#    df.loc[index,'DELAY'] = math.floor(row['DELAY'] / 100)
#df.head()

In [None]:
# Summary statistics of the target series (same call as the earlier cell).
Delay_YesNo.describe()

## Model building 

#### train / test split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(df, Delay_YesNo, test_size=0.3, random_state=42)

In [None]:
# `+` concatenates the two shape tuples, so this prints a single tuple:
# (train rows, train columns, test rows, test columns).
print(X_train.shape + X_test.shape)

In [None]:
# Concatenated shape tuples of the two target splits: (train rows, test rows).
print(y_train.shape + y_test.shape)

#### Grid search

In [None]:
# Tune the random forest with an exhaustive grid search, scoring every
# parameter combination by 10-fold cross-validation on the training split.
startTimeGS = datetime.now()
# Seed the forest so the search and the parameters it selects are repeatable.
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [10, 25],
    # Additional candidates, currently disabled:
    # 'min_samples_split': [2, 4],
    # 'min_samples_leaf': [2, 4],
    # 'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}
grid_rf = GridSearchCV(rf, param_grid, cv=10)
grid_rf.fit(X_train, y_train)
bestModel = grid_rf.best_estimator_
bestParameters = grid_rf.best_params_
# `grid_rf.score` is a method, not a result. The mean cross-validated score
# of each parameter combination lives in `cv_results_`.
gridScores = grid_rf.cv_results_['mean_test_score']
# Convert the timedelta so the number printed really is in seconds.
elapsedGS = (datetime.now() - startTimeGS).total_seconds()
print('Random forest Grid Search with non-redundant variables took [', elapsedGS, '] seconds.')

In [None]:
# Show the estimator and parameter set selected by the grid search, then
# display whatever the grid-search cell stored in `gridScores`.
print(bestModel)
print(bestParameters)
gridScores

In [None]:
# Build a forest from the parameters chosen by the grid search, estimate its
# accuracy with 5-fold cross-validation on the training split, then fit it on
# the whole training split.
startTimeRF = datetime.now()

# Unpack every tuned parameter so this cell keeps working when more entries
# are added to the search grid; the seed keeps the fit repeatable.
model = RandomForestClassifier(**bestParameters, random_state=42)

# Cross-validate the forest itself so the reported score describes the model
# that is fitted below.
scores = cross_val_score(model, X_train, y_train, cv=5)
print('Mean cross validation score is: ' + str(np.mean(scores)))

model.fit(X_train, y_train)
# Convert the timedelta so the number printed really is in seconds.
elapsedRF = (datetime.now() - startTimeRF).total_seconds()
print('Random forest training and testing with non-redundant variables took [',
      elapsedRF, '] seconds.')

In [None]:
# Evaluate the fitted forest on the held-out split: confusion matrix,
# accuracy, recall for class 1, ROC curve and the area under it.
Y_rf_pred = model.predict(X_test)

labels = [0, 1]
# `labels` has to be passed by keyword in current scikit-learn releases.
cm = confusion_matrix(y_test, Y_rf_pred, labels=labels)

# Accuracy = correct predictions (the diagonal) over all predictions.
accuracy = 100 * np.trace(cm) / cm.sum()
# Recall = true positives over all actual positives (row 1 of the matrix);
# report NaN instead of dividing by zero when the split has no positives.
actual_positives = cm[1].sum()
recall = 100 * cm[1][1] / actual_positives if actual_positives else float('nan')
print('Accuracy: ' + str(np.round(accuracy, 2)) + '%')
print('Recall: ' + str(np.round(recall, 2)) + '%')
print('Confusion matrix:')
print(cm)

# Build the ROC from class-1 probabilities: hard 0/1 predictions give only a
# single operating point instead of a curve.
positive_column = list(model.classes_).index(1)
Y_rf_score = model.predict_proba(X_test)[:, positive_column]
fpr, tpr, _ = roc_curve(y_test, Y_rf_score, pos_label=1)
# The integrator takes (y, x): TPR is integrated over FPR. `trapz` was
# renamed `trapezoid` in NumPy 2.0, so use whichever this install provides.
trapezoid = getattr(np, 'trapezoid', None) or np.trapz
auc = trapezoid(tpr, fpr)
print('Area under the ROC curve: ' + str(auc))

fig = plt.figure(1)
plt.plot(fpr, tpr, color='green')
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.title('Receiver operating characteristic (ROC)')

fig = plt.figure(2)
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix for Random Forest classifier with original data')
fig.colorbar(cax)
# Pin one tick per class before labelling, rather than relying on the
# automatic tick locations lining up with the class indices.
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# R^2 of the forest's predictions on the training split. r2_score expects
# (true targets, predicted targets), not the feature matrix.
r2_score(y_train, model.predict(X_train))

In [None]:
# Mean squared error of the forest's predictions on the held-out split.
# mean_squared_error expects (true targets, predicted targets).
mean_squared_error(y_test, model.predict(X_test))