In [28]:
import numpy as np
import pandas as pd

In [29]:
# Hourly per-intersection observations. `joined` is kept as the untouched
# reference frame; `new` is the working frame reshaped for modelling below.
joined = pd.read_csv('Data/Montreal/Final/joined_by_hour.csv')
# Both frames come from the same file, so copy instead of parsing the CSV twice.
new = joined.copy()
# Collision records (one row per collision, with date/hour and lat/lon columns).
collisions = pd.read_csv('Data/Montreal/Final/collisions_cleaned.csv')

joined.head()

Unnamed: 0,Id_Intersection,Date,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,collision_nearby
0,1,2018-02-14,0,-73.575661,45.48265,2018,2,14,2,0,0,0,0,2.0,-10.4,24100.0,0
1,1,2018-02-14,1,-73.575661,45.48265,2018,2,14,2,0,0,0,0,7.0,-11.5,24100.0,0
2,1,2018-02-14,2,-73.575661,45.48265,2018,2,14,2,0,0,0,0,3.0,-12.1,24100.0,0
3,1,2018-02-14,3,-73.575661,45.48265,2018,2,14,2,0,0,0,0,9.0,-7.9,24100.0,0
4,1,2018-02-14,4,-73.575661,45.48265,2018,2,14,2,0,0,0,0,13.0,-6.5,24100.0,0


In [30]:
# Preview the first rows of the collision records
collisions.head()

Unnamed: 0,date,lat,lon,year,month,day,hour,weekday,num_vehicles,num_victims
0,2012-06-02,45.48772,-73.71603,2012,6,2,15,5,1,0
1,2012-06-28,45.51432,-73.68279,2012,6,28,8,3,2,0
2,2012-07-11,45.49221,-73.57647,2012,7,11,9,2,2,0
3,2012-01-02,45.48687,-73.87855,2012,1,2,14,0,2,0
4,2012-01-03,45.4917,-73.85536,2012,1,3,12,1,2,0


In [31]:
# Build the modelling frame from the untouched `joined` frame instead of
# mutating `new` in place, so re-running this cell always gives the same result
# (the in-place drop raised KeyError on a second run because the columns were
# already gone).
new = (
    joined
    # binarize collision_nearby: 1 when the count is positive, else 0
    .assign(is_collision=np.where(joined['collision_nearby'] > 0, 1, 0))
    # drop the identifier/date columns and the raw count now encoded in is_collision
    .drop(columns=['Id_Intersection', 'Date', 'collision_nearby'])
)

# check results
new.head()

Unnamed: 0,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,is_collision
0,0,-73.575661,45.48265,2018,2,14,2,0,0,0,0,2.0,-10.4,24100.0,0
1,1,-73.575661,45.48265,2018,2,14,2,0,0,0,0,7.0,-11.5,24100.0,0
2,2,-73.575661,45.48265,2018,2,14,2,0,0,0,0,3.0,-12.1,24100.0,0
3,3,-73.575661,45.48265,2018,2,14,2,0,0,0,0,9.0,-7.9,24100.0,0
4,4,-73.575661,45.48265,2018,2,14,2,0,0,0,0,13.0,-6.5,24100.0,0


In [32]:
# import the different sklearn packages we will need
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides any pandas/sklearn warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [33]:
# Put the rows in chronological order (year, then month, day and hour);
# the train/test split further down is taken without shuffling.
chronological_keys = ['Year', 'Month', 'Day', 'hour']
new.sort_values(by=chronological_keys, inplace=True)
new.head()

Unnamed: 0,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,is_collision
49522,0,-73.615006,45.627322,2012,1,19,3,0,0,0,0,7.0,-16.6,25000.0,0
49523,1,-73.615006,45.627322,2012,1,19,3,0,0,0,0,4.0,-15.2,25000.0,0
49524,2,-73.615006,45.627322,2012,1,19,3,0,0,0,0,6.0,-14.3,25000.0,0
49525,3,-73.615006,45.627322,2012,1,19,3,0,0,0,0,6.0,-18.0,25000.0,0
49526,4,-73.615006,45.627322,2012,1,19,3,0,0,0,0,7.0,-15.4,25000.0,0


In [34]:
# The label is the binary collision flag (the last column of `new`);
# every other column is used as a feature.
target_col = 'is_collision'
X = new.drop(columns=[target_col])
y = new[target_col]

# Unshuffled split: the first 80% of rows are used for training and the
# remaining 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99, shuffle=False
)

In [35]:
# Shallow decision tree (depth capped at 5). random_state pins the estimator's
# internal randomness so a fresh kernel run reproduces the same fitted tree.
dt = DecisionTreeClassifier(max_depth=5, random_state=99)
dt.fit(X_train, y_train);

# mean accuracy on the rows the tree was trained on
dt.score(X_train, y_train)

0.7079991703539823

In [36]:
# Mean accuracy on the held-out rows (the last 20% from the unshuffled split)
dt.score(X_test, y_test)

0.7242118362831859

So at this point, we have the trained model. What I need is:
1. The lat/lon values of the intersections which we use
2. The lat/lon values of the collisions which occurred nearby
3. The predicted collision probability for each intersection which we use

- `test_collisions` gives me the locations of the collisions
- `test_rows` gives me the intersection locations and, once scored by the model, their probabilities

In [37]:
# Every intersection row for 2018-02-14 at hour 8.
# .copy() makes this an independent frame, so the columns added to it in later
# cells are not chained assignments on a slice of `new`.
test_rows = new.loc[
    (new['Year'] == 2018)
    & (new['Month'] == 2)
    & (new['Day'] == 14)
    & (new['hour'] == 8),
    :,
].copy()
test_rows

Unnamed: 0,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,is_collision
8,8,-73.575661,45.48265,2018,2,14,2,466,146,294,347,4.0,-8.0,24100.0,1
5570,8,-73.587639,45.473984,2018,2,14,2,0,2,354,324,4.0,-8.0,24100.0,0
5866,8,-73.574439,45.483398,2018,2,14,2,68,79,387,308,4.0,-8.0,24100.0,1
11432,8,-73.562987,45.495417,2018,2,14,2,969,338,754,164,4.0,-8.0,24100.0,1
22711,8,-73.571767,45.485016,2018,2,14,2,56,151,368,297,4.0,-8.0,24100.0,1
51327,8,-73.563755,45.494298,2018,2,14,2,1,103,723,206,4.0,-8.0,24100.0,1
56709,8,-73.573591,45.481004,2018,2,14,2,514,241,142,0,4.0,-8.0,24100.0,0


In [38]:
# Select exactly the columns the model was fitted on, by name. The positional
# iloc[:, :-1] slice picked up extra columns once prob/pred had been appended,
# so re-running this cell fed the wrong features to the model.
X_test_rows = test_rows[X_train.columns]
y_test_rows = test_rows['is_collision']

# Column 1 of predict_proba holds the probability of the positive class.
probs = dt.predict_proba(X_test_rows)
ones = [round(p, 2) for p in probs[:, 1]]

# Threshold the unrounded probability: rounding first would turn e.g. 0.504
# into 0.50 and flip that prediction to 0. assign() returns a new frame, so
# nothing is written into a slice of `new`.
test_rows = test_rows.assign(
    prob=ones,
    pred=np.where(probs[:, 1] > 0.5, 1, 0),
)
test_rows

Unnamed: 0,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,is_collision,prob,pred
8,8,-73.575661,45.48265,2018,2,14,2,466,146,294,347,4.0,-8.0,24100.0,1,0.68,1
5570,8,-73.587639,45.473984,2018,2,14,2,0,2,354,324,4.0,-8.0,24100.0,0,0.32,0
5866,8,-73.574439,45.483398,2018,2,14,2,68,79,387,308,4.0,-8.0,24100.0,1,0.68,1
11432,8,-73.562987,45.495417,2018,2,14,2,969,338,754,164,4.0,-8.0,24100.0,1,0.68,1
22711,8,-73.571767,45.485016,2018,2,14,2,56,151,368,297,4.0,-8.0,24100.0,1,0.68,1
51327,8,-73.563755,45.494298,2018,2,14,2,1,103,723,206,4.0,-8.0,24100.0,1,0.68,1
56709,8,-73.573591,45.481004,2018,2,14,2,514,241,142,0,4.0,-8.0,24100.0,0,0.68,1


In [18]:
# Save the scored intersection rows (row index omitted).
# NOTE(review): written under '../Data/...' while the inputs were read from 'Data/...' — confirm the intended working directory.
test_rows.to_csv('../Data/Montreal/Final/predictions_intersections.csv', index=False)

In [39]:
# Collisions recorded on 2018-02-14 with hour strictly between 5 and 11.
# .copy() makes this an independent frame, so the columns added to it in later
# cells are not chained assignments on a slice of `collisions`.
test_collisions = collisions.loc[
    (collisions['year'] == 2018)
    & (collisions['month'] == 2)
    & (collisions['day'] == 14)
    & (collisions['hour'] > 5)
    & (collisions['hour'] < 11),
    :,
].copy()

test_collisions.head()

Unnamed: 0,date,lat,lon,year,month,day,hour,weekday,num_vehicles,num_victims
136061,2018-02-14,45.48618,-73.70376,2018,2,14,6,2,8,1
138887,2018-02-14,45.49123,-73.59608,2018,2,14,7,2,2,0
139284,2018-02-14,45.42822,-73.61017,2018,2,14,10,2,2,1
141407,2018-02-14,45.49781,-73.55969,2018,2,14,9,2,2,0
144311,2018-02-14,45.52685,-73.64946,2018,2,14,8,2,2,1


In [40]:
# (rows, columns) of the selected collisions
test_collisions.shape

(8, 10)

In [21]:
# Save the selected collisions (row index omitted).
# NOTE(review): written under '../Data/...' while the inputs were read from 'Data/...' — confirm the intended working directory.
test_collisions.to_csv('../Data/Montreal/Final/predictions_collisions.csv', index=False)

In [41]:
# Show every column when displaying wide frames. The fully qualified key is
# required: the bare 'max_columns' pattern matches more than one pandas option
# (e.g. 'styler.render.max_columns') and raises OptionError on recent pandas.
pd.set_option('display.max_columns', None)

In [46]:
# Mirror lon/lat under the capitalised names used by the intersection rows so
# both frames share the same coordinate columns. assign() returns a new frame
# instead of writing into a slice of `collisions`.
test_collisions = test_collisions.assign(
    Longitude=test_collisions['lon'],
    Latitude=test_collisions['lat'],
)
test_collisions.head()

Unnamed: 0,date,lat,lon,year,month,day,hour,weekday,num_vehicles,num_victims,Longitude,Latitude
136061,2018-02-14,45.48618,-73.70376,2018,2,14,6,2,8,1,-73.70376,45.48618
138887,2018-02-14,45.49123,-73.59608,2018,2,14,7,2,2,0,-73.59608,45.49123
139284,2018-02-14,45.42822,-73.61017,2018,2,14,10,2,2,1,-73.61017,45.42822
141407,2018-02-14,45.49781,-73.55969,2018,2,14,9,2,2,0,-73.55969,45.49781
144311,2018-02-14,45.52685,-73.64946,2018,2,14,8,2,2,1,-73.64946,45.52685


In [50]:
# Mark the origin of each row: 0 for collision records, 1 for intersections.
# assign() builds new frames rather than setting a column on a slice.
test_collisions = test_collisions.assign(is_intersection=0)
test_rows = test_rows.assign(is_intersection=1)

In [57]:
# colour_scheme codes:
#   1 = collision record
#   2 = intersection with pred == 1
#   3 = intersection with any other pred value
# assign() builds new frames rather than setting a column on a slice.
test_collisions = test_collisions.assign(colour_scheme=1)
test_rows = test_rows.assign(
    colour_scheme=np.where(test_rows['pred'] == 1, 2, 3)
)

In [58]:
# Stack the intersection rows on top of the collision rows. Columns that exist
# in only one of the two frames are filled with NaN for the other frame's rows.
frames_to_stack = [test_rows, test_collisions]
test = pd.concat(frames_to_stack)

In [59]:
# Display the combined intersection + collision frame
test

Unnamed: 0,hour,Longitude,Latitude,Year,Month,Day,weekday,NB,SB,EB,WB,wind_speed,temperature,visibility,is_collision,prob,pred,is_intersection,colour_scheme,date,lat,lon,year,month,day,num_vehicles,num_victims
8,8,-73.575661,45.48265,2018.0,2.0,14.0,2,466.0,146.0,294.0,347.0,4.0,-8.0,24100.0,1.0,0.68,1.0,1,2,,,,,,,,
5570,8,-73.587639,45.473984,2018.0,2.0,14.0,2,0.0,2.0,354.0,324.0,4.0,-8.0,24100.0,0.0,0.32,0.0,1,3,,,,,,,,
5866,8,-73.574439,45.483398,2018.0,2.0,14.0,2,68.0,79.0,387.0,308.0,4.0,-8.0,24100.0,1.0,0.68,1.0,1,2,,,,,,,,
11432,8,-73.562987,45.495417,2018.0,2.0,14.0,2,969.0,338.0,754.0,164.0,4.0,-8.0,24100.0,1.0,0.68,1.0,1,2,,,,,,,,
22711,8,-73.571767,45.485016,2018.0,2.0,14.0,2,56.0,151.0,368.0,297.0,4.0,-8.0,24100.0,1.0,0.68,1.0,1,2,,,,,,,,
51327,8,-73.563755,45.494298,2018.0,2.0,14.0,2,1.0,103.0,723.0,206.0,4.0,-8.0,24100.0,1.0,0.68,1.0,1,2,,,,,,,,
56709,8,-73.573591,45.481004,2018.0,2.0,14.0,2,514.0,241.0,142.0,0.0,4.0,-8.0,24100.0,0.0,0.68,1.0,1,2,,,,,,,,
136061,6,-73.70376,45.48618,,,,2,,,,,,,,,,,0,1,2018-02-14,45.48618,-73.70376,2018.0,2.0,14.0,8.0,1.0
138887,7,-73.59608,45.49123,,,,2,,,,,,,,,,,0,1,2018-02-14,45.49123,-73.59608,2018.0,2.0,14.0,2.0,0.0
139284,10,-73.61017,45.42822,,,,2,,,,,,,,,,,0,1,2018-02-14,45.42822,-73.61017,2018.0,2.0,14.0,2.0,1.0


In [60]:
# Save the combined frame (row index omitted).
# NOTE(review): written under '../Data/...' while the inputs were read from 'Data/...' — confirm the intended working directory.
test.to_csv('../Data/Montreal/Final/predictions.csv', index=False)

In [None]:
# NOTE(review): unexecuted leftover cell — the combined frame displayed earlier has no 'L' column, so this lookup would raise KeyError; complete the column name or remove the cell.
test['L']