In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from functions import transform_raw_data, clean_transform_df
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC

In [21]:
# Build a lookup {categorical column name: [matching one-hot encoded column names]}.
# It is used to make sure every X dataset in the CV has the same columns.
df = transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv")

# Categorical columns = object-dtype columns minus the excluded ones, plus 'speed_limit'.
excluded_columns = ['time', 'engine_capacity_cc', 'casualty_severity']
object_columns = df.select_dtypes(include='object').columns
categorical_columns = [name for name in object_columns if name not in excluded_columns]
categorical_columns.append('speed_limit')

# Run the preprocessing once on the full frame to discover the encoded column names.
df = clean_transform_df(df, df.columns)

# An encoded column is attributed to a categorical column when its name contains
# the categorical column's name (plain substring match).
# NOTE(review): a substring match could also pick up unrelated columns whose names
# happen to contain `name` — confirm against the naming used by clean_transform_df.
matches_by_column = {
    name: [encoded for encoded in df.columns if name in encoded]
    for name in categorical_columns
}

# Keep only the categorical columns that produced at least one encoded column.
encoded_cols_dict = {
    name: matched for name, matched in matches_by_column.items() if matched
}

In [22]:
# Reload the raw data: the previous cell rebound `df` to the output of
# clean_transform_df, whereas the pipeline below applies that preprocessing itself.
df = transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv")

In [23]:
# custom Transformer class for applying preprocessing in pipeline
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that wraps a DataFrame preprocessing function.

    Parameters
    ----------
    func : callable, default=clean_transform_df
        Preprocessing function. It is called as ``func(X, X.columns, encoded_cols)``
        and its return value is passed on unchanged.
    encoded_cols : dict or None, default=None
        Mapping of categorical column name -> list of encoded column names that is
        forwarded to ``func`` as its third argument. When ``None``, the notebook-level
        ``encoded_cols_dict`` is looked up at transform time, which keeps
        ``CustomPreprocessor()`` behaving exactly as before.
    """

    def __init__(self, func=clean_transform_df, encoded_cols=None):
        # scikit-learn convention: store constructor arguments verbatim so that
        # get_params()/clone() can rebuild the transformer for each CV fold.
        self.func = func
        self.encoded_cols = encoded_cols

    def fit(self, X, y=None):
        """Do nothing and return ``self``; no fitting is needed."""
        return self

    def transform(self, X, y=None):
        """Apply ``self.func`` to ``X`` and return its result.

        ``y`` is accepted for signature compatibility and is not used.
        """
        encoded_cols = self.encoded_cols
        if encoded_cols is None:
            # Backward-compatible fallback to the module-level mapping, resolved
            # at call time (not at construction time).
            encoded_cols = encoded_cols_dict
        # apply the preprocessing function to the data
        return self.func(X, X.columns, encoded_cols)

In [24]:
# Two-stage model: custom preprocessing followed by a support vector classifier
# with default hyper-parameters.
pipeline_steps = [
    ('custom_preprocessor', CustomPreprocessor()),
    ('classifier', SVC()),
]
pipeline = Pipeline(steps=pipeline_steps)

# 10 contiguous folds.
# NOTE(review): KFold is used without shuffling or stratification, so the folds
# follow the row order of the data — confirm this is intended.
cv = KFold(n_splits=10)

In [25]:
# Columns passed to the model as predictors (the order is preserved when selecting df[features]).
features = [
    'age_of_casualty',
    'engine_capacity_cc',
    'age_of_driver',
    'vehicle_type',
    'junction_detail',
    'towing_and_articulation',
    'vehicle_manoeuvre',
    'casualty_imd_decile',
    'pedestrian_crossing_physical_facilities',
    'driver_imd_decile',
    'day_of_week',
    'time_period',
    'season',
    'first_road_class',
    'road_type',
    'speed_limit',
    'junction_control',
    'second_road_class',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'propulsion_code',
]

# Column the classifier is trained to predict.
target = 'casualty_severity'

In [26]:
# baseline accuracy -> share of rows belonging to the most common severity category
# (i.e. the accuracy of always predicting that category)
majority_count = df["casualty_severity"].value_counts().max()
majority_count / len(df)

0.7833664577281761

In [27]:
# Cross-validate the full pipeline (preprocessing + classifier) on the selected
# feature columns, using the splitter defined in `cv`.
X_all = df[features]
y_all = df[target]
scores = cross_val_score(pipeline, X_all, y_all, cv=cv)

In [28]:
# one score per CV fold, as returned by cross_val_score
scores

array([0.83053435, 0.79663609, 0.75840979, 0.78287462, 0.81498471,
       0.77522936, 0.8088685 , 0.79510703, 0.8088685 , 0.66207951])