## Logistic Regression to Predict High or Low Traffic Accident Severity

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the accident records into a DataFrame.
# NOTE(review): absolute Windows path; the notebook only runs where this file exists.
accidents_csv = 'C:/data/Accidents.csv'
data = pd.read_csv(accidents_csv)

In [33]:
# Class balance of the binary target: roughly twice as many low-severity (0)
# accidents as high-severity (1) ones, so the classes are imbalanced.
data['SeverityLoHi'].value_counts()

0    1837057
1     880347
Name: SeverityLoHi, dtype: int64

In [41]:
# Columns excluded from modelling. Everything not listed here (including the
# SeverityLoHi target) is carried over into `subset`.
unused_columns = [
    'Weather_Timestamp',
    'Weather_Condition',
    'Weather_Group',
    'ID',
    'Source',
    'Severity',
    'Start_Time',
    'End_Time',
    'Start_Lat',
    'Start_Lng',
    'Description',
    'Wind_Direction',
    'Street',
    'Side',
    'City',
    'County',
    'State',
    'Zipcode',
    'Zip2',
    'Country',
    'Timezone',
    'Airport_Code',
    'Duration',
    'Month',
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]
subset = data.drop(columns=unused_columns)

In [44]:
# One-hot encode the categorical weekday and hour columns: each is replaced by
# one indicator column per distinct value (weekday_Fri, ..., hour_0, ...).
categorical_columns = ['weekday', 'hour']
subset = pd.get_dummies(subset, columns=categorical_columns)

In [45]:
# Display the column labels after encoding to confirm the final feature set
# (the SeverityLoHi target is still present at this point).
subset.columns

Index(['Distance(mi)', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Duration(m)',
       'Obscured', 'Heavy_Precip', 'Light_Precip', 'Windy', 'Cloudy', 'Clear',
       'SeverityLoHi', 'weekday_Fri', 'weekday_Mon', 'weekday_Sat',
       'weekday_Sun', 'weekday_Thu', 'weekday_Tue', 'weekday_Wed', 'hour_0',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

In [56]:
# Two-stage workflow: standardize every feature, then fit a logistic
# regression with scikit-learn's default settings.
steps = [
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression()),
]

In [59]:
# Chain the scaler and the classifier so both are fitted on the training data
# only and applied together at prediction time.
pipeline = Pipeline(steps=steps)

In [60]:
# Separate the frame into NumPy arrays: X holds every column except the
# target, y holds the binary SeverityLoHi labels.
X = subset.drop(columns='SeverityLoHi').values
y = subset['SeverityLoHi'].values

In [61]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [62]:
# Fit the scaler and the logistic regression on the training split.
# Pipeline.fit returns the pipeline itself, so `logreg_scaled` is just another
# name for the (now fitted) `pipeline` object, not a separate model.
logreg_scaled = pipeline.fit(X_train, y_train)

In [63]:
# Predict a class label for every held-out test row; the pipeline applies the
# fitted scaler before the classifier.
y_pred = pipeline.predict(X_test)

In [67]:
# Fraction of test rows classified correctly. For context, the class counts
# shown earlier (1,837,057 vs 880,347) mean that always predicting class 0
# would already score about 67.6% on the full dataset, so accuracy should be
# read against that baseline.
accuracy_score(y_test, y_pred)

0.6943701803742909