# Early Models

This notebook will primarily focus on looking at a collection of simpler models using the combined collisions/traffic flow dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

##### 0. Importing and cleaning the data

In [67]:
## Read in the data with traffic counter and
## bike feature information matched
data_raw = pd.read_csv('./DataFrames/combined_collisions_v3.csv')

In [3]:
## Type mismatch columns from NaN values vs. lists
## No longer an error, can ignore
#data.columns.values[[77,78,95,96]]

In [4]:
## Features to ignore when dropping NaN values
## because only the traffic counter data should
## be required to match accident data
bike_features = ['aadf_FEATURE_ID', 'aadf_SVDATE', 'aadf_CLT_CARR', 
                     'aadf_CLT_SEGREG', 'aadf_CLT_STEPP', 'aadf_CLT_PARSEG',
                     'aadf_CLT_SHARED', 'aadf_CLT_MANDAT', 'aadf_CLT_ADVIS',
                     'aadf_CLT_PRIORI', 'aadf_CLT_CONTRA', 'aadf_CLT_BIDIRE',
                     'aadf_CLT_CBYPAS', 'aadf_CLT_BBYPAS', 'aadf_CLT_PARKR',
                     'aadf_CLT_WATERR', 'aadf_CLT_PTIME', 'aadf_CLT_ACCESS',
                     'aadf_CLT_COLOUR', 'aadf_BOROUGH']

In [68]:
## Dropping NaN values and dummy columns
## and restricting the data to Greater London
data = data_raw.dropna(subset=[column for column in data_raw.columns if column not in bike_features]).copy()
data.drop(columns='Unnamed: 0',inplace=True)
data = data.loc[data.in_london == True]
## This should be redundant, but running
## to make sure anyway
data = data.loc[data.match == True]

## Some extra cleaning of additional columns
def is_str(x):
    Nx = len(x)
    I = np.ones(Nx).astype(bool)
    for ii in range(Nx):
        if type(x[ii]) != str:
            I[ii] = False
    return I

data = data.loc[is_str(data.Time.values),:]
data.aadf_Count_point_id = data.aadf_Count_point_id.values.astype(int)

For now, we will ignore any additional time columns and neighborhood work from Greg because it gets transformed when summing and the mode is taken anyway, making them virtually useless columns.

In [69]:
Inolane = np.ones(data.shape[0]).astype(bool)
for feature in bike_features:
    Inolane &= (data[feature].values == False)
    
data['bikelane'] = ~Inolane

data.sample(5)

Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,aadf_CLT_BBYPAS,aadf_CLT_PARKR,aadf_CLT_WATERR,aadf_CLT_PTIME,aadf_CLT_ACCESS,aadf_CLT_COLOUR,aadf_BOROUGH,distance_to_cp,match,bikelane
165147,201401XH30511,508860.0,181840.0,-0.432223,51.524939,1,3,2,1,2014-08-22,...,False,False,False,False,,"[['NONE', 'GREEN', 'GREEN']]","[['Hillingdon', 'Hillingdon', 'Hillingdon']]",0.754667,True,True
144365,201301VW40169,524400.0,169350.0,-0.21272,51.409481,1,3,2,1,2013-05-20,...,False,False,False,False,,"[['NONE', 'NONE', 'NONE']]","[['Merton', 'Merton', 'Merton']]",0.312611,True,True
202730,201601WW50530,527740.0,175760.0,-0.162412,51.466347,1,3,2,1,2016-05-07,...,False,False,False,False,,[[]],[[]],0.159057,True,True
86315,201001PY20051,541670.0,169330.0,0.03543,51.405253,1,3,2,1,2010-01-02,...,False,False,False,False,,[[]],[[]],1.504347,True,True
1126,200501HT20410,535000.0,181280.0,-0.055847,51.514268,1,3,2,1,2005-06-06,...,False,False,False,True,,"[['NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NON...","[['Tower Hamlets', 'Tower Hamlets', 'Tower Ham...",0.061003,True,True


In [70]:
data.bikelane.value_counts()

True    37044
Name: bikelane, dtype: int64

In [59]:
data[bike_features].any()

aadf_FEATURE_ID     True
aadf_SVDATE         True
aadf_CLT_CARR       True
aadf_CLT_SEGREG     True
aadf_CLT_STEPP      True
aadf_CLT_PARSEG     True
aadf_CLT_SHARED     True
aadf_CLT_MANDAT     True
aadf_CLT_ADVIS      True
aadf_CLT_PRIORI     True
aadf_CLT_CONTRA     True
aadf_CLT_BIDIRE     True
aadf_CLT_CBYPAS     True
aadf_CLT_BBYPAS     True
aadf_CLT_PARKR      True
aadf_CLT_WATERR     True
aadf_CLT_PTIME      True
aadf_CLT_ACCESS    False
aadf_CLT_COLOUR     True
aadf_BOROUGH        True
dtype: bool

In [53]:
## Identify potential regression columns
reg_features = ['Longitude', 'Latitude', 'Day_of_Week',
                    'Time', 'Road_Type', 'Speed_limit',
                    'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
                    'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
                    'aadf_Year', 'aadf_Pedal_cycles', 'aadf_All_motor_vehicles',
                    'aadf_CLT_CARR', 'aadf_CLT_SEGREG', 'aadf_CLT_STEPP', 
                    'aadf_CLT_PARSEG', 'aadf_CLT_SHARED', 'aadf_CLT_MANDAT',
                    'aadf_CLT_ADVIS', 'aadf_CLT_PRIORI', 'aadf_CLT_CONTRA', 
                    'aadf_CLT_BIDIRE', 'aadf_CLT_CBYPAS', 'aadf_CLT_BBYPAS', 
                    'aadf_CLT_PARKR', 'aadf_CLT_WATERR', 'aadf_CLT_PTIME', 
                    'aadf_CLT_ACCESS', 'distance_to_cp']

class_col = 'Accident_Severity'

In [46]:
## import StandardScaler
from sklearn.preprocessing import StandardScaler

In [None]:
## Make a scaler object
scaler = StandardScaler()

## fit the scaler
scaler.fit(X)

In [None]:
#Time column has some nans in it, must drop those entries

def is_str(x):
    Nx = len(x)
    I = np.ones(Nx).astype(bool)
    for ii in range(Nx):
        if type(x[ii]) != str:
            I[ii] = False
    return I

#make hour, year and month columns
#hour will be rounded 

def get_hour(T):
    Nt = len(T)
    H = np.zeros(Nt).astype(int)
    for ii in range(Nt):
        s = T[ii].split(':')
        h = int(s[0])
        m = int(s[1])
        if m > 30:
            h += 1
        H[ii] = h % 24
    return H

def get_ymd(D):
    Nd = len(D)
    ymd = np.zeros((Nd,3)).astype(int)
    for ii in range(Nd):
        s = D[ii].split('-')
        for nn in range(3):
            ymd[ii,nn] = int(s[nn])
    return ymd

def reformat_aadf_year(Year):
    Ny = len(Year)
    y = np.zeros(Ny).astype(int)
    for ii in range(Ny):
        y[ii] = int(Year[ii].split('-')[0])
    return y

In [None]:
df = df.loc[is_str(df.Time.values),:]
df.aadf_Count_point_id = df.aadf_Count_point_id.values.astype(int)

df[['y','m','d']] = get_ymd(df.Date.values)
df['h'] = get_hour(df.Time.values)
#df['aadf_Year'] = reformat_aadf_year(df.aadf_Year.values)

##### 1. Logistic regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [51]:
features = ['Longitude', 'Latitude', 'Speed_limit', 'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions', 'Special_Conditions_at_Site', 'Urban_or_Rural_Area', 'aadf_All_motor_vehicles']

In [52]:
log_reg = LogisticRegression(penalty='none', multi_class='multinomial', max_iter=100000)

In [67]:
log_reg.fit(data[features], data.Accident_Severity)
data[features].sample(5)

Unnamed: 0,Longitude,Latitude,Speed_limit,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Urban_or_Rural_Area,aadf_All_motor_vehicles
235864,-0.315662,51.511418,30.0,1,1,1,0,1,18734.0
220750,-0.077957,51.509382,20.0,4,1,1,0,1,29645.0
186106,-0.413677,51.519797,30.0,1,1,1,0,1,29720.0
101785,-0.126706,51.50499,30.0,1,1,1,0,1,22970.0
186063,-0.450573,51.511388,30.0,1,1,1,0,1,14766.0


In [68]:
log_reg.predict(data[features])

array([3, 3, 3, ..., 3, 3, 3])

In [69]:
np.unique(log_reg.predict(data[features]))

array([3])

In [70]:
data.Accident_Severity.values

array([3, 3, 3, ..., 3, 2, 2])

In [41]:
from sklearn.metrics import accuracy_score

In [71]:
accuracy_score(data.Accident_Severity, log_reg.predict(data[features]))

0.8683187560738581

##### 2. GridSearchCV

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [32]:
max_depths = range(1, 11)
n_trees = [100, 500]

In [57]:
grid_cv = GridSearchCV(RandomForestClassifier(), # first put the model object here
                          param_grid = {'max_depth':max_depths, # place the grid values for max_depth and
                                        'n_estimators':n_trees}, # and n_estimators here
                          scoring = "accuracy", # put the metric we are trying to optimize here as a string, "accuracy"
                          cv = 10)

In [58]:
grid_cv.fit(data[features], data.Accident_Severity)

In [59]:
score_df = pd.DataFrame({'feature':features,
                            'importance_score': grid_cv.best_estimator_.feature_importances_})

score_df.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
1,Latitude,0.242688
0,Longitude,0.232319
8,aadf_All_motor_vehicles,0.216169
2,Speed_limit,0.085156
6,Special_Conditions_at_Site,0.06423
3,Light_Conditions,0.053382
4,Weather_Conditions,0.036874
5,Road_Surface_Conditions,0.035159
7,Urban_or_Rural_Area,0.034022


In [60]:
grid_cv.best_params_

{'max_depth': 7, 'n_estimators': 100}

In [61]:
grid_cv.best_score_

0.8683457574406193

In [63]:
grid_cv.best_estimator_.predict(data[features])

array([3, 3, 3, ..., 3, 3, 3])

In [65]:
np.unique(grid_cv.best_estimator_.predict(data[features]))

array([2, 3])