## Model Training

In [1]:
## Import required packages

## Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Modelling
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

## Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

## Import the csv data as dataframe

In [2]:
df = pd.read_csv("data/rta_cleaned.csv")
df.head(3)

Unnamed: 0,day_of_week,driver_age,driver_sex,educational_level,vehicle_driver_relation,driving_experience,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause,accident_severity
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward,Slight Injury
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,Slight Injury
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,No defect,...,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left,Serious Injury


In [3]:
df.drop(columns=['accident_area'], axis=1)

Unnamed: 0,day_of_week,driver_age,driver_sex,educational_level,vehicle_driver_relation,driving_experience,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause,accident_severity
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward,Slight Injury
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,Slight Injury
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,No defect,...,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left,Serious Injury
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Unknown,No defect,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,5-10yrs,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,Slight Injury
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12311,Wednesday,31-50,Male,Junior high school,Employee,2-5yr,Lorry (11?40Q),Owner,Unknown,No defect,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,Slight Injury
12312,Sunday,Unknown,Male,Elementary school,Employee,5-10yr,Automobile,Owner,Unknown,No defect,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,Slight Injury
12313,Sunday,Over 51,Male,Junior high school,Employee,5-10yr,Bajaj,Owner,2-5yrs,No defect,...,Other,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Serious Injury
12314,Sunday,18-30,Female,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,2-5yrs,No defect,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,Driving under the influence of drugs,Slight Injury


In [4]:
df.isnull().sum()

day_of_week                0
driver_age                 0
driver_sex                 0
educational_level          0
vehicle_driver_relation    0
driving_experience         0
vehicle_type               0
vehicle_owner              0
service_year               0
vehicle_defect             0
accident_area              0
lanes                      0
road_allignment            0
junction_type              0
surface_type               0
road_surface_conditions    0
light_condition            0
weather_condition          0
collision_type             0
vehicles_involved          0
casualties                 0
vehicle_movement           0
casualty_class             0
casualty_sex               0
casualty_age               0
casualty_severity          0
casualty_work              0
casualty_fitness           0
pedestrian_movement        0
accident_cause             0
accident_severity          0
dtype: int64

In [5]:
num_features = df.select_dtypes(exclude="object").columns
num_features

Index(['vehicles_involved', 'casualties'], dtype='object')

In [6]:
cat_features = df.select_dtypes(include="object").columns
cat_features

Index(['day_of_week', 'driver_age', 'driver_sex', 'educational_level',
       'vehicle_driver_relation', 'driving_experience', 'vehicle_type',
       'vehicle_owner', 'service_year', 'vehicle_defect', 'accident_area',
       'lanes', 'road_allignment', 'junction_type', 'surface_type',
       'road_surface_conditions', 'light_condition', 'weather_condition',
       'collision_type', 'vehicle_movement', 'casualty_class', 'casualty_sex',
       'casualty_age', 'casualty_severity', 'casualty_work',
       'casualty_fitness', 'pedestrian_movement', 'accident_cause',
       'accident_severity'],
      dtype='object')

In [7]:
X = df.drop(columns=['accident_severity', 'accident_area'], axis=1)
X

Unnamed: 0,day_of_week,driver_age,driver_sex,educational_level,vehicle_driver_relation,driving_experience,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,casualties,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,No defect,...,2,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Unknown,No defect,...,2,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,5-10yrs,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12311,Wednesday,31-50,Male,Junior high school,Employee,2-5yr,Lorry (11?40Q),Owner,Unknown,No defect,...,1,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing
12312,Sunday,Unknown,Male,Elementary school,Employee,5-10yr,Automobile,Owner,Unknown,No defect,...,1,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing
12313,Sunday,Over 51,Male,Junior high school,Employee,5-10yr,Bajaj,Owner,2-5yrs,No defect,...,1,Other,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the right
12314,Sunday,18-30,Female,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,2-5yrs,No defect,...,1,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,Driving under the influence of drugs


In [8]:
y = df['accident_severity']
y

0         Slight Injury
1         Slight Injury
2        Serious Injury
3         Slight Injury
4         Slight Injury
              ...      
12311     Slight Injury
12312     Slight Injury
12313    Serious Injury
12314     Slight Injury
12315     Slight Injury
Name: accident_severity, Length: 12316, dtype: object

## Encoding & scaling

In [9]:
X.head()

Unnamed: 0,day_of_week,driver_age,driver_sex,educational_level,vehicle_driver_relation,driving_experience,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,casualties,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,No defect,...,2,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Unknown,No defect,...,2,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,5-10yrs,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
le = LabelEncoder()
le.fit(y_train)

In [12]:
le.classes_

array(['Fatal injury', 'Serious Injury', 'Slight Injury'], dtype=object)

In [13]:
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [14]:
y_train

array([2, 2, 2, ..., 2, 2, 2])

In [15]:
y_test

array([2, 2, 2, ..., 2, 2, 2])

In [16]:
num_features = X.select_dtypes(exclude="object").columns 
cat_features = X.select_dtypes(include="object").columns

In [17]:
num_features

Index(['vehicles_involved', 'casualties'], dtype='object')

In [18]:
cat_features

Index(['day_of_week', 'driver_age', 'driver_sex', 'educational_level',
       'vehicle_driver_relation', 'driving_experience', 'vehicle_type',
       'vehicle_owner', 'service_year', 'vehicle_defect', 'lanes',
       'road_allignment', 'junction_type', 'surface_type',
       'road_surface_conditions', 'light_condition', 'weather_condition',
       'collision_type', 'vehicle_movement', 'casualty_class', 'casualty_sex',
       'casualty_age', 'casualty_severity', 'casualty_work',
       'casualty_fitness', 'pedestrian_movement', 'accident_cause'],
      dtype='object')

In [19]:
from sklearn.impute import SimpleImputer

In [20]:
transformer = ColumnTransformer(transformers=[
    ("OrdinalEncoder",OrdinalEncoder(), cat_features),
    ("StandardScaler", StandardScaler(), num_features),
], remainder="drop")

In [21]:
X_train = transformer.fit_transform(X_train)

In [22]:
X_test = transformer.transform(X_test)

In [23]:
X_train.shape

(8621, 29)

In [24]:
X_test.shape

(3695, 29)

## Create an evaluation function to give all metrics after model Training

In [25]:
def evaluate_model(true, predicted):
    score_f1 = f1_score(true, predicted, average="weighted")
    accuracy = accuracy_score(true, predicted)

    return score_f1, accuracy

In [26]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def fit_classification_models(X, y, test_size=0.2, random_state=42):
    """
    Fit classification models using all the available algorithms in scikit-learn.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        The input data.

    y : array-like of shape (n_samples,)
        The target variable.

    test_size : float, default=0.2
        The proportion of the dataset to include in the test split.

    random_state : int, default=42
        Controls the randomness of the dataset splitting.

    Returns:
    --------
    results : dict
        A dictionary containing the performance metrics for each classification algorithm.
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Create pipelines for all the classifiers
    pipelines = {
        'Logistic Regression': Pipeline([('scaler', StandardScaler()), ('classifier', LogisticRegression())]),
        'Decision Tree': Pipeline([('scaler', StandardScaler()), ('classifier', DecisionTreeClassifier())]),
        'Random Forest': Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())]),
        'K-Nearest Neighbors': Pipeline([('scaler', StandardScaler()), ('classifier', KNeighborsClassifier())]),
        'Gaussian Naive Bayes': Pipeline([('scaler', StandardScaler()), ('classifier', GaussianNB())]),
        'Support Vector Machine': Pipeline([('scaler', StandardScaler()), ('classifier', SVC())]),
        'XGBoost': Pipeline([('scaler', StandardScaler()), ('classifier', XGBClassifier())]),
        'ExtraTreesClassifier': Pipeline([('scaler', StandardScaler()), ('classifier', ExtraTreesClassifier())]),
        'CatBoostClassifier': Pipeline([('scaler', StandardScaler()), ('classifier', CatBoostClassifier())]),
        'AdaBoostClassifier': Pipeline([('scaler', StandardScaler()), ('classifier', AdaBoostClassifier())]),
    }

    # Fit and evaluate all the classifiers
    results = {}
    model_list = []
    acc_list = []
    f1_list = []
    
    for classifier_name, pipeline in pipelines.items():
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        results[classifier_name] = {'accuracy': accuracy, 'f1-score': f1}

        model_list.append(results)

    return model_list


In [27]:
fit_classification_models(X_train, y_train)

Learning rate set to 0.087316
0:	learn: 1.0005097	total: 151ms	remaining: 2m 31s
1:	learn: 0.9221380	total: 158ms	remaining: 1m 18s
2:	learn: 0.8588826	total: 165ms	remaining: 54.8s
3:	learn: 0.8076284	total: 172ms	remaining: 42.9s
4:	learn: 0.7641831	total: 180ms	remaining: 35.7s
5:	learn: 0.7267345	total: 187ms	remaining: 30.9s
6:	learn: 0.6924501	total: 193ms	remaining: 27.4s
7:	learn: 0.6649635	total: 200ms	remaining: 24.8s
8:	learn: 0.6397327	total: 207ms	remaining: 22.8s
9:	learn: 0.6190085	total: 214ms	remaining: 21.2s
10:	learn: 0.5994859	total: 221ms	remaining: 19.9s
11:	learn: 0.5834767	total: 228ms	remaining: 18.8s
12:	learn: 0.5692314	total: 232ms	remaining: 17.6s
13:	learn: 0.5561694	total: 240ms	remaining: 16.9s
14:	learn: 0.5443144	total: 246ms	remaining: 16.2s
15:	learn: 0.5335399	total: 253ms	remaining: 15.6s
16:	learn: 0.5249151	total: 260ms	remaining: 15s
17:	learn: 0.5166079	total: 267ms	remaining: 14.6s
18:	learn: 0.5090643	total: 273ms	remaining: 14.1s
19:	learn: 

[{'Logistic Regression': {'accuracy': 0.855072463768116,
   'f1-score': 0.7882699275362319},
  'Decision Tree': {'accuracy': 0.7460869565217392,
   'f1-score': 0.7594549942272767},
  'Random Forest': {'accuracy': 0.855072463768116,
   'f1-score': 0.7907012244129078},
  'K-Nearest Neighbors': {'accuracy': 0.8336231884057971,
   'f1-score': 0.7842218662155915},
  'Gaussian Naive Bayes': {'accuracy': 0.06260869565217392,
   'f1-score': 0.09158364648951271},
  'Support Vector Machine': {'accuracy': 0.855072463768116,
   'f1-score': 0.7882699275362319},
  'XGBoost': {'accuracy': 0.8533333333333334, 'f1-score': 0.8056399168170112},
  'ExtraTreesClassifier': {'accuracy': 0.8579710144927536,
   'f1-score': 0.7962081744197514},
  'CatBoostClassifier': {'accuracy': 0.8556521739130435,
   'f1-score': 0.8012663937974301},
  'AdaBoostClassifier': {'accuracy': 0.8533333333333334,
   'f1-score': 0.7874048587217183}},
 {'Logistic Regression': {'accuracy': 0.855072463768116,
   'f1-score': 0.7882699275

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
# Define a list of algorithms
algorithms = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(),
              SVC(), KNeighborsClassifier(), GaussianNB(), GradientBoostingClassifier(),
              XGBClassifier(), ExtraTreesClassifier()]

# Fit models on the training data
results = []
for algo in algorithms:
    algo.fit(X_train, y_train)

    # Predict on the test data
    y_pred = algo.predict(X_test)

    # Evaluate performance of model
    f1 = f1_score(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)
    result = {'Algorithm': type(algo).__name__, 'F1-score': f1, 'accuracy': acc}
    results.append(result)

# Convert results to a DataFrame and sort by F1-score
results_df = pd.DataFrame(results).sort_values(by='F1-score', ascending=False)
print(results_df)

                    Algorithm  F1-score  accuracy
7               XGBClassifier  0.794708  0.845196
6  GradientBoostingClassifier  0.778334  0.842490
8        ExtraTreesClassifier  0.772787  0.841949
2      RandomForestClassifier  0.771927  0.842219
4        KNeighborsClassifier  0.770740  0.825440
0          LogisticRegression  0.768177  0.840866
3                         SVC  0.768177  0.840866
1      DecisionTreeClassifier  0.744279  0.734777
5                  GaussianNB  0.315403  0.212179


## Results

In [30]:
results_df

Unnamed: 0,Algorithm,F1-score,accuracy
7,XGBClassifier,0.794708,0.845196
6,GradientBoostingClassifier,0.778334,0.84249
8,ExtraTreesClassifier,0.772787,0.841949
2,RandomForestClassifier,0.771927,0.842219
4,KNeighborsClassifier,0.77074,0.82544
0,LogisticRegression,0.768177,0.840866
3,SVC,0.768177,0.840866
1,DecisionTreeClassifier,0.744279,0.734777
5,GaussianNB,0.315403,0.212179


XGBClassifier is performing the best, we will choose this for hyperparameter tuning

In [31]:
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

In [32]:
from sklearn.metrics import (accuracy_score, 
                            classification_report,
                            recall_score, precision_score, f1_score,
                            confusion_matrix)

In [33]:
from sklearn.model_selection import GridSearchCV, KFold

In [34]:
def model_performance(model, y_test, y_hat) : 
    conf_matrix = confusion_matrix(y_test, y_hat)
    trace1 = go.Heatmap(z = conf_matrix  ,x = ["0 (pred)","1 (pred)", "2 (pred)"],
                        y = ["0 (true)","1 (true)", "2 (true)"],xgap = 2, ygap = 2, 
                        colorscale = 'Viridis', showscale  = False)

    #Show metrics
    Accuracy  =  accuracy_score(y_test, y_hat)
    Precision =  precision_score(y_test, y_pred, average= 'weighted')
    Recall    =  recall_score(y_test, y_pred, average= 'weighted')
    F1_score  =  f1_score(y_test, y_pred, average= 'weighted')

    show_metrics = pd.DataFrame(data=[[Accuracy , Precision, Recall, F1_score]])
    show_metrics = show_metrics.T

    colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue']
    trace2 = go.Bar(x = (show_metrics[0].values), 
                   y = ['Accuracy', 'Precision', 'Recall', 'F1_score'], text = np.round_(show_metrics[0].values,4),
                    textposition = 'auto',
                   orientation = 'h', opacity = 0.8,marker=dict(
            color=colors,
            line=dict(color='#000000',width=1.5)))
 
     
    #plots
    model = model
    
    #Subplots
    fig = tls.make_subplots(rows=2, cols=1, print_grid=False, 
                          subplot_titles=('Confusion Matrix',
                                        'Metrics',
                                        ))
    
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,2,1)
    
    fig['layout'].update(showlegend = False, title = '<b>Model performance report</b><br>'+str(model),
                        autosize = True, height = 800,width = 800,
                        plot_bgcolor = 'rgba(240,240,240, 0.95)',
                        paper_bgcolor = 'rgba(240,240,240, 0.95)',
                        # margin = dict(b = 100)
                        )
    fig.layout.titlefont.size = 14
    
    py.iplot(fig)

In [35]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [36]:
y_pred

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [37]:
model_performance(xgb,y_test, y_pred)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [43]:
# Define the hyperparameters to be tuned
hyperparameters = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_params_)


{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [44]:
# Define the XGBClassifier model with the selected hyperparameters
xgb = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Fit the XGBClassifier model to the training data
xgb.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = xgb.predict(X_test)

# Compute the F1 score on the testing data
f1 = f1_score(y_test, y_pred, average="weighted")
acc = accuracy_score(y_test, y_pred)

# Print the F1 score
print('F1 score:', f1)
print("Accuracy Score:", acc)


F1 score: 0.7834485869680957
Accuracy Score: 0.8443843031123139


In [45]:
import os

In [49]:
%pwd

'd:\\github\\Projects\\TMLC\\road_accident_severity\\notebook'

In [50]:
os.chdir("../Model/")

In [51]:
# Save the trained model to a pickle file
import pickle

with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)



In [53]:
## Load the model

# Load the model from the file
with open('../Model/xgb_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [54]:
model.predict(X_test)

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)