<center><h1>Driver Accident Risk Model</h1></center>
<br>
    
---
   
<b>Objective</b>: Develop a model able to predict whether a driver is going to have a severe or fatal accident given biographical information and vehicle characteristics.

The probability output by the predictive model can be used as a "Driver Score" that summarizes the overall riskiness of a driver.
    
<b>Data</b>: https://data.gov.uk/dataset/cb7ae6f0-4be6-4935-9277-47e5ce24a11f/road-safety-data
    
---

## Libraries

In [None]:
import matplotlib as mp
import numpy as np
import pandas as pd
import sklearn.ensemble as se
import sklearn.impute as si
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.model_selection as ms
import xgboost as xg


# Jupyter Plots
mp.rcParams['figure.dpi']= 600
%matplotlib inline

## 4. Data Loading

In [None]:
# Read the three road-safety tables (vehicles, casualties, accidents) in one pass each;
# low_memory=False disables pandas' chunked parsing of the files.
vehicles, casualties, accidents = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'Data/UK Accidents/Vehicles.csv',
        'Data/UK Accidents/Casualties.csv',
        'Data/UK Accidents/Accidents.csv',
    )
)

## 5. Exploratory Analysis

### Vehicles

In [None]:
# Preview the first rows of the vehicles table
vehicles.head()

In [None]:
# Summary statistics for the vehicles table
vehicles.describe()

### Casualties

In [None]:
# Preview the first rows of the casualties table
casualties.head()

In [None]:
# Summary statistics for the casualties table
casualties.describe()

In [None]:
# Set the title on the Axes returned by pandas rather than through `mp.pyplot`:
# `import matplotlib as mp` does not load the pyplot submodule, so `mp.pyplot`
# only resolves if something else has already imported matplotlib.pyplot.
# Bin edges at -0.5, 0.5, ..., 3.5 centre one bar on each integer code 0-3.
ax = casualties.Casualty_Severity.hist(bins=np.arange(4.5)-0.5)
ax.set_title('Histogram of Casualty Severity');

### Accidents

In [None]:
# Preview the first rows of the accidents table
accidents.head()

In [None]:
# Summary statistics for the accidents table
accidents.describe()

### Parameters

In [None]:
# Features known before an accident happens; these are the model inputs.
PREDICTIVE_FEATURES = ['Vehicle_Type','Towing_and_Articulation','Was_Vehicle_Left_Hand_Drive?', 
                       'Sex_of_Driver', 'Age_of_Driver', 'Engine_Capacity_(CC)', 'Propulsion_Code',
                       'Age_of_Vehicle', 'Driver_IMD_Decile', 'Driver_Home_Area_Type', 'Vehicle_IMD_Decile']

# Features that describe the accident itself; listed for reference, not used as model inputs.
DESCRIPTIVE_FEATURES = ['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude', 
                        'Latitude', 'Police_Force',
                        'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 
                        'Day_of_Week', 'Time', 'Local_Authority_(District)', 
                        'Local_Authority_(Highway)', '1st_Road_Class', '1st_Road_Number', 
                        'Road_Type', 'Speed_limit', 'Junction_Detail', 
                        'Junction_Control', '2nd_Road_Class', '2nd_Road_Number', 
                        'Pedestrian_Crossing-Human_Control', 'Carriageway_Hazards', 
                        'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions', 'Weather_Conditions', 
                        'Road_Surface_Conditions', 'Special_Conditions_at_Site', 'Urban_or_Rural_Area', 
                        'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location',
                        'Vehicle_Manoeuvre','Vehicle_Location-Restricted_Lane', 'Junction_Location',
                        'Skidding_and_Overturning', 'Hit_Object_in_Carriageway','Vehicle_Leaving_Carriageway', 
                        'Hit_Object_off_Carriageway','1st_Point_of_Impact','Journey_Purpose_of_Driver',
                        'Pedestrian_Location', 'Pedestrian_Movement', 'Pedestrian_Road_Maintenance_Worker', 'Casualty_Type',
                        'Casualty_Home_Area_Type', 'Casualty_IMD_Decile', 'Casualty_Severity']

# Subsets of PREDICTIVE_FEATURES that need encoding. Names must match the column
# names exactly: the encoding loops skip any name that is not a column.
CATEGORICAL_FEATURES = ['Vehicle_Type', 'Towing_and_Articulation', 'Sex_of_Driver', 
                        'Propulsion_Code', 'Driver_Home_Area_Type']
# The trailing '?' is part of the column name (see PREDICTIVE_FEATURES).
BINARY_FEATURES = ['Was_Vehicle_Left_Hand_Drive?']

## 6. Data Aggregation

In [None]:
# One row per (accident, vehicle): keep the worst outcome among that vehicle's
# casualties. Lower codes mean higher severity, hence the minimum.
casualties_vehicles = (
    casualties
    .groupby(['Accident_Index', 'Vehicle_Reference'], as_index=False)['Casualty_Severity']
    .min()
)
casualties_vehicles

In [None]:
# Attach the per-vehicle worst severity to every vehicle record; the left join
# keeps vehicles without any casualty (their severity is NaN at this point).
data = pd.merge(
    vehicles,
    casualties_vehicles,
    on=['Accident_Index', 'Vehicle_Reference'],
    how='left',
)
data

## 7. Data Cleaning

### Aggregate Unknown and Missing

In [None]:
# Fold code 3 into -1 so both are handled as one value downstream
data.loc[data['Sex_of_Driver'] == 3, 'Sex_of_Driver'] = -1

### Remove NAs

In [None]:
# Vehicles with no matching casualty record have NaN severity after the left merge; encode them as 0.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# that chained in-place form operates on a temporary under pandas copy-on-write
# and would leave `data` unchanged.
data['Casualty_Severity'] = data['Casualty_Severity'].fillna(0)

In [None]:
# Missing values are coded as -1, which would shift each feature's distribution.
# Turn them into NaN so that they can be imputed later.
data = data.replace(to_replace=-1, value=np.nan)

### Target Variable Encoding

In [None]:
def severity(x):
    """Encode a casualty-severity code as the binary model target.

    Parameters
    ----------
    x : number
        Severity code. Lower codes mean higher severity; 0 marks a vehicle
        with no casualty.

    Returns
    -------
    int
        1 for codes 1 and 2 (the two most severe levels), 0 otherwise.
    """
    # Match the positive codes explicitly so that anything unexpected
    # (NaN, unknown codes) is not silently labelled as a severe accident.
    return 1 if x in (1.0, 2.0) else 0

In [None]:
# Collapse the severity codes into the binary target, element by element
data['Casualty_Severity'] = data['Casualty_Severity'].map(severity)

### Ex-Ante Features

In [None]:
# Restrict the dataset to ex-ante features plus the target.
# Build a new list instead of appending to PREDICTIVE_FEATURES: `columns = PREDICTIVE_FEATURES`
# only aliases the constant, so appending would mutate it and every re-run of this cell
# would add another copy of the target column.
columns = PREDICTIVE_FEATURES + ['Casualty_Severity']

data = data[columns]

## 8. Feature Engineering

### One-Hot Encoding for Categorical Variables

In [None]:
# Replace each categorical column that is present with its indicator columns,
# named "<column>_<value>" and appended at the end of the frame.
for c in CATEGORICAL_FEATURES:
    if c not in data.columns:
        continue
    one_hot = pd.get_dummies(data[c], prefix=c)
    data = data.drop(columns=c).join(one_hot)

### Binary Variables Encoding

In [None]:
# Convert codes 1 and 2 to 0 and 1.
# The result is deliberately not cast to bool: astype(bool) turns NaN into True,
# which would label every missing value as a positive. Keeping NaN lets the
# imputation step fill those entries instead.
for b in BINARY_FEATURES:
    if b in data.columns:
        data[b] = data[b].map({1: 0, 2: 1})

## 9. Split Training, Validation and Test

In [None]:
# Separate the features from the binary target
X = data.drop(columns=['Casualty_Severity'])
y = data['Casualty_Severity']

# A fixed seed makes the split (and every score computed from it) reproducible;
# stratifying on the target keeps the class proportions the same in both splits.
train_X, test_X, train_y, test_y = ms.train_test_split(X, y, stratify=y, random_state=42)

## 10. Imputation

In [None]:
# Fit the KNN imputer on the training split only, then use it to fill the
# missing values of both splits.
lnr = si.KNNImputer().fit(train_X)

train_X, test_X = lnr.transform(train_X), lnr.transform(test_X)

## 11. Modelling

### Logistic Regression Model

In [None]:
# Logistic Regression Model
model = lm.LogisticRegression()

# Hyperparameters
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'], 
    # The default 'lbfgs' solver does not support the 'l1' penalty, so every
    # l1 candidate would fail to fit; 'liblinear' handles both penalties.
    'solver': ['liblinear'],
    'max_iter': [1000],
    'class_weight': ['balanced']
}

# Grid Search
# Select by ROC AUC, the metric reported in the evaluation section, rather than
# the default accuracy score.
clf = ms.GridSearchCV(model, params, scoring='roc_auc')

# Model Fit
logit = clf.fit(train_X, train_y)

### Random Forests

In [None]:
# Random Forest Model
# Fixed seed so that the fitted forests are reproducible across runs.
model = se.RandomForestClassifier(random_state=42)

# Hyperparameters
params = {
 'max_depth': [10],
 'n_estimators': [100, 1000],
 'class_weight': [{0: 1, 1: 4}]
}

# Grid Search
# Select by ROC AUC, the metric reported in the evaluation section, rather than
# the default accuracy score.
clf = ms.GridSearchCV(model, params, scoring='roc_auc')

# Model Fit
rf = clf.fit(train_X, train_y)

### XGBoost

In [None]:
# Gradient Boosting Model
model = xg.XGBClassifier()

# Hyperparameters
params = {
        'eta': [0.001, 0.01],
        'max_depth': [5, 10],
        'n_estimators': [50, 100, 1000],
        'scale_pos_weight': [1, 14]
        }

# Grid Search
# Select by ROC AUC, the metric reported in the evaluation section, rather than
# the default accuracy score.
grid = ms.GridSearchCV(model, params, scoring='roc_auc')

# Model Fit
xgb = grid.fit(train_X, train_y)

## 12. Performance Evaluation

### Logistic Regression

In [None]:
# Positive-class probabilities from the tuned logistic regression on each split,
# scored with ROC AUC.
predictions_train, predictions_test = (
    logit.predict_proba(features)[:, 1] for features in (train_X, test_X)
)
print('In Sample AUC:', sm.roc_auc_score(train_y, predictions_train))
print('Out of Sample AUC: ', sm.roc_auc_score(test_y, predictions_test))

### Random Forests

In [None]:
# Positive-class probabilities from the tuned random forest on each split,
# scored with ROC AUC.
predictions_train, predictions_test = (
    rf.predict_proba(features)[:, 1] for features in (train_X, test_X)
)
print('In Sample AUC:', sm.roc_auc_score(train_y, predictions_train))
print('Out of Sample AUC: ', sm.roc_auc_score(test_y, predictions_test))

### XGBoost

In [None]:
# Positive-class probabilities from the tuned XGBoost model on each split,
# scored with ROC AUC.
predictions_train, predictions_test = (
    xgb.predict_proba(features)[:, 1] for features in (train_X, test_X)
)
print('In Sample AUC:', sm.roc_auc_score(train_y, predictions_train))
print('Out of Sample AUC: ', sm.roc_auc_score(test_y, predictions_test))