In [16]:
from functions import transform_raw_data, clean_df, impute_fit_df, impute_transform_df

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import CategoricalNB
#from xgboost import XGBClassifier
import mlflow

In [2]:
# creating dictionary of categorical column names : corresponding one-hot encoded columns names
# this is used to make sure every X dataset in the CV has the same columns
df = transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv")
categorical_columns = [col for col in df.select_dtypes(include='object').columns if col not in ['time','engine_capacity_cc','casualty_severity']] + ['speed_limit']
df = clean_df(df)
categorical_freqs, vars_to_groupby, continuous_medians_grouped, continuous_medians, scaler = impute_fit_df(df)
df = impute_transform_df(df, categorical_freqs, vars_to_groupby, continuous_medians_grouped, continuous_medians, scaler, df.columns)

encoded_cols_dict = {}

for col in categorical_columns:
    encoded_cols = [encoded_col for encoded_col in df.columns if col in encoded_col]
    if len(encoded_cols) > 0:
        encoded_cols_dict[col] = encoded_cols

In [3]:
df = transform_raw_data(path_to_csv="stats19CycleCollisions2022.csv")
df = clean_df(df)

In [4]:
# custom Transformer class for applying preprocessing in pipeline
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_freqs = None
        self.continuous_medians_grouped = None
        self.continuous_medians = None
        self.vars_to_groupby = None
        self.scaler = None

    # fit is only called on the train data
    def fit(self, X, y=None):
        # calculate impute values
        self.categorical_freqs, self.vars_to_groupby, self.continuous_medians_grouped, self.continuous_medians, self.scaler = impute_fit_df(X)
        return self

    # transform is called on the both the train and test data
    def transform(self, X, y=None):
        # apply the preprocessing function to the data
        return impute_transform_df(X, self.categorical_freqs, self.vars_to_groupby, self.continuous_medians_grouped, self.continuous_medians, self.scaler, X.columns, encoded_cols_dict)

In [5]:
features = ['age_of_casualty','engine_capacity_cc','age_of_driver',
            'vehicle_type','junction_detail','towing_and_articulation','vehicle_manoeuvre',
            'casualty_imd_decile','pedestrian_crossing_physical_facilities','driver_imd_decile',
            'day_of_week','time_period','season',
            'first_road_class','road_type','speed_limit','junction_control','second_road_class',
            'light_conditions','weather_conditions','road_surface_conditions','propulsion_code']
target = 'casualty_severity'

In [6]:
# baseline accuracy -> dummy classifier predicting the most frequent class
pipeline = Pipeline([
    ('custom_preprocessor', CustomPreprocessor()),
    ('classifier', DummyClassifier(strategy="most_frequent"))
])
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2)
pipeline.fit(X_train, y_train)
y_pred_dummy = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_dummy)
precision = precision_score(y_test, y_pred_dummy, average='macro')
recall = recall_score(y_test, y_pred_dummy, average='macro')
f1 = f1_score(y_test, y_pred_dummy, average='macro')

print(accuracy, precision, recall, f1)

0.7761650114591291 0.25872167048637634 0.3333333333333333 0.291326164874552


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# # create a new MLflow experiment
# mlflow.set_experiment("cycle collisions")
# with mlflow.start_run():
#     pipeline = Pipeline([
#         ('custom_preprocessor', CustomPreprocessor()),
#         ('classifier', SVC())
#     ])
#     cv = StratifiedKFold(n_splits=5)
#     scores = cross_val_score(pipeline, X=df[features], y=df[target], cv=cv, scoring=accuracy_score)

#     mlflow.log_params(pipeline.get_params())
#     mlflow.log_metric("accuracy", accuracy)

### 1. initial experimentation - testing different features

In [17]:
def run_experiment(features):
    pipeline = Pipeline([
        ('custom_preprocessor', CustomPreprocessor()),
        ('classifier', SVC())
    ])
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipeline, X=df[features], y=df[target], cv=cv, scoring=make_scorer(accuracy_score))
    return scores, scores.mean()

In [9]:
# all features
features = ['age_of_casualty','engine_capacity_cc','age_of_driver',
            'vehicle_type','junction_detail','towing_and_articulation','vehicle_manoeuvre',
            'casualty_imd_decile','pedestrian_crossing_physical_facilities','driver_imd_decile',
            'day_of_week','time_period','season',
            'first_road_class','road_type','speed_limit','junction_control','second_road_class',
            'light_conditions','weather_conditions','road_surface_conditions','propulsion_code']
run_experiment(features)

(array([0.78151261, 0.78363914, 0.78363914, 0.78363914, 0.78287462]),
 0.783060930794336)

In [10]:
# selecting subset of features
features = ['towing_and_articulation','pedestrian_crossing_physical_facilities',
            'road_type','speed_limit',
            'first_road_class','second_road_class']
run_experiment(features)

(array([0.78304049, 0.78363914, 0.78363914, 0.78440367, 0.78287462]),
 0.7835194127692777)