In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from functions import transform_raw_data, clean_df, impute_fit_df, impute_transform_df
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC

In [2]:
# Build a lookup {categorical column name -> list of matching column names in the
# imputed/encoded frame}. The preprocessing step receives this lookup so that every
# X dataset produced during cross-validation ends up with the same columns.
df = transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv")

# object-dtype columns, minus the three names below, plus 'speed_limit'
excluded_from_encoding = ['time','engine_capacity_cc','casualty_severity']
categorical_columns = [
    col
    for col in df.select_dtypes(include='object').columns
    if col not in excluded_from_encoding
] + ['speed_limit']

# run the full clean -> fit -> transform chain once on all rows; from here on `df`
# holds the output of impute_transform_df
df = clean_df(df)
categorical_freqs, vars_to_groupby, continuous_medians_grouped, continuous_medians, scaler = impute_fit_df(df)
df = impute_transform_df(
    df,
    categorical_freqs,
    vars_to_groupby,
    continuous_medians_grouped,
    continuous_medians,
    scaler,
    df.columns,
)

# NOTE(review): matching is by substring (`name in encoded`), so a categorical name
# that is contained in an unrelated column name would also be picked up - confirm
# against the naming scheme used by impute_transform_df.
matches_by_column = {
    name: [encoded for encoded in df.columns if name in encoded]
    for name in categorical_columns
}

# keep only the categorical columns that matched at least one column
encoded_cols_dict = {
    name: matches
    for name, matches in matches_by_column.items()
    if matches
}

In [3]:
# `df` was overwritten with the output of impute_transform_df while building
# encoded_cols_dict; read the CSV again and keep only the cleaned frame so the
# pipeline's own preprocessing step is the one that fits and applies imputation.
df = clean_df(transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv"))

In [4]:
# custom Transformer class for applying preprocessing in pipeline
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping ``impute_fit_df`` and ``impute_transform_df``.

    ``fit`` stores the five values returned by ``impute_fit_df`` for the data it
    is given; ``transform`` forwards those stored values, together with the
    encoded-column lookup, to ``impute_transform_df``.

    Parameters
    ----------
    encoded_cols_dict : dict or None, default=None
        Lookup passed as the last argument of ``impute_transform_df``. When left
        as ``None`` the notebook-level variable ``encoded_cols_dict`` is looked up
        at transform time, so ``CustomPreprocessor()`` keeps working unchanged.
        Passing the lookup explicitly removes the dependency on that global.
    """

    def __init__(self, encoded_cols_dict=None):
        # constructor argument is stored untouched so get_params()/clone() can
        # round-trip it
        self.encoded_cols_dict = encoded_cols_dict

        # values produced by impute_fit_df; all None until fit() has been called
        self.categorical_freqs = None
        self.continuous_medians_grouped = None
        self.continuous_medians = None
        self.vars_to_groupby = None
        self.scaler = None

    # fit is only called on the train data
    def fit(self, X, y=None):
        """Compute the imputation values from ``X`` and store them on the instance.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        (
            self.categorical_freqs,
            self.vars_to_groupby,
            self.continuous_medians_grouped,
            self.continuous_medians,
            self.scaler,
        ) = impute_fit_df(X)
        return self

    # transform is called on both the train and test data
    def transform(self, X, y=None):
        """Apply ``impute_transform_df`` to ``X`` using the values stored by ``fit``.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns whatever ``impute_transform_df`` returns.
        """
        column_lookup = self.encoded_cols_dict
        if column_lookup is None:
            # fall back to the notebook-level lookup (raises NameError if the
            # cell defining it has not been run)
            column_lookup = encoded_cols_dict

        return impute_transform_df(
            X,
            self.categorical_freqs,
            self.vars_to_groupby,
            self.continuous_medians_grouped,
            self.continuous_medians,
            self.scaler,
            X.columns,
            column_lookup,
        )

In [10]:
# 10 consecutive folds; no shuffle argument is passed, so KFold's default is used.
# NOTE(review): the last recorded fold score (0.662) sits well below the other nine -
# check whether row order matters and whether shuffle=True with a fixed random_state
# (or a stratified splitter) would be a fairer evaluation.
cv = KFold(n_splits=10)

# preprocessing and classifier are chained so both are fitted together by the pipeline
pipeline = Pipeline(steps=[
    ('custom_preprocessor', CustomPreprocessor()),
    ('classifier', SVC()),
])

In [11]:
# columns selected from df as model inputs, one name per line
features = [
    'age_of_casualty',
    'engine_capacity_cc',
    'age_of_driver',
    'vehicle_type',
    'junction_detail',
    'towing_and_articulation',
    'vehicle_manoeuvre',
    'casualty_imd_decile',
    'pedestrian_crossing_physical_facilities',
    'driver_imd_decile',
    'day_of_week',
    'time_period',
    'season',
    'first_road_class',
    'road_type',
    'speed_limit',
    'junction_control',
    'second_road_class',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'propulsion_code',
]

# column selected from df as the label
target = 'casualty_severity'

In [12]:
# baseline accuracy -> share of rows belonging to the most common category,
# i.e. the score obtained by always predicting that category
class_counts = df["casualty_severity"].value_counts()
class_counts.max() / len(df)

0.7833664577281761

In [13]:
# Cross-validate the whole pipeline: for every split in `cv` the pipeline
# (preprocessing step included) is fitted on the training part and scored on
# the held-out part.
scores = cross_val_score(
    estimator=pipeline,
    X=df[features],
    y=df[target],
    cv=cv,
)

In [14]:
# show the array returned by cross_val_score (one score per split of `cv`)
scores

array([0.82900763, 0.79663609, 0.75840979, 0.78134557, 0.81498471,
       0.77522936, 0.8088685 , 0.79510703, 0.81039755, 0.66207951])