In [1]:
import pandas as pd

# Load the datasets
vehicles_df = pd.read_csv("Vehicles.csv")
casualties_df = pd.read_csv("Casualties.csv")
accidents_df = pd.read_csv("Accidents.csv")

# Display first few rows
print(vehicles_df.head())
print(casualties_df.head())
print(accidents_df.head())


  Accident_Index  Vehicle_Reference  Vehicle_Type  Towing_and_Articulation  \
0  200501BS00001                  1             9                        0   
1  200501BS00002                  1            11                        0   
2  200501BS00003                  1            11                        0   
3  200501BS00003                  2             9                        0   
4  200501BS00004                  1             9                        0   

   Vehicle_Manoeuvre  Vehicle_Location-Restricted_Lane  Junction_Location  \
0                 18                                 0                  0   
1                  4                                 0                  3   
2                 17                                 0                  0   
3                  2                                 0                  0   
4                 18                                 0                  0   

   Skidding_and_Overturning  Hit_Object_in_Carriageway  \
0         

In [3]:
df = accidents_df.merge(vehicles_df, on='Accident_Index').merge(casualties_df, on='Accident_Index')


In [4]:
print(accidents_df.columns)  # Check available columns
print(casualties_df.columns)
print(vehicles_df.columns)

Index(['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
       'Longitude', 'Latitude', 'Police_Force', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       'Time', 'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location'],
      dtype='object')
Index(['Accident_Index', 'Vehicle_Reference', 'Casualty_Reference',
       'Casualty_Class', 'Sex_of_Casualty', 'Age_of_Casualty',
       'Age_Band_of_Casualty', 'Casualty_Severity', '

In [5]:
features = [
    'Did_Police_Officer_Attend', 'age_of_driver', 'vehicle_type', 'age_of_vehicle',
    'engine_cc', 'day', 'weather', 'roadsc', 'light', 'gender', 'speedl'
]
target = 'Accident_Severity'  # Adjust based on your dataset (like accident severity level)


In [6]:
print(df.columns)

Index(['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
       'Longitude', 'Latitude', 'Police_Force', 'Accident_Severity',
       'Number_of_Vehicles', 'Number_of_Casualties', 'Date', 'Day_of_Week',
       'Time', 'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
       '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'LSOA_of_Accident_Location', 'Vehicle_Reference_x', 'Vehicle_Type',
       'Towing_and_Articulation', 'Vehicle_Manoeuvre',
       'Vehicle_Location-Restricted_Lane', 'Junction_Location',
       'Skidding_and_Overturning', 'Hit_Object_in

In [7]:
features = [
    "Did_Police_Officer_Attend_Scene_of_Accident",  # ✅ Correct column name
    "Age_of_Driver",  # ✅ Correct case
    "Vehicle_Type",
    "Age_of_Vehicle",
    "Engine_Capacity_(CC)",  # ✅ Correct column name
    "Day_of_Week",  # ✅ Matches dataset
    "Weather_Conditions",  # ✅ Matches dataset
    "Road_Surface_Conditions",  # ✅ Corrected 'roadsc'
    "Light_Conditions",  # ✅ Corrected 'light'
    "Sex_of_Driver",  # ✅ Corrected 'gender'
    "Speed_limit"  # ✅ Corrected 'speedl'
]


In [8]:
target = 'accident_severity' 

In [9]:
X = df[features]


In [10]:
y = df["Accident_Severity"]

In [11]:
  # Drop missing values


In [12]:
# Fill missing values with median for numerical columns
X = X.fillna(X.median())

# Fill categorical columns with mode (most common value)
X = X.apply(lambda col: col.fillna(col.mode()[0]))


In [13]:
# Convert categorical variables into numerical labels
X = X.apply(lambda col: col.astype('category').cat.codes)


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [16]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.96


In [17]:
import joblib
joblib.dump(model, "road_accident_model.sav")


['road_accident_model.sav']

In [18]:
model = joblib.load("road_accident_model.sav")


In [49]:
import numpy as np
example_input = np.array([[1, 25, 3, 5, 1500, 3, 2, 1, 2, 1, 50]])  # Replace with actual values
predicted_severity = model.predict(example_input)
print(f"Predicted Accident Severity: {predicted_severity[0]}")

Predicted Accident Severity: 3




In [51]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# Assuming you have your training data as X_train and y_train
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'road_accident_model.sav')
print("Model re-trained and saved successfully!")

Model re-trained and saved successfully!


In [3]:
pip uninstall scikit-learn

^C
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install scikit-learn==1.6.1


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.6.1
  Obtaining dependency information for scikit-learn==1.6.1 from https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn==1.6.1)
  Obtaining dependency information for threadpoolctl>=3.1.0 from https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.4/11.1 MB 7.4 MB/s eta 0:00:02
   -- -------------------------

In [19]:
pip show scikit-learn

Name: scikit-learnNote: you may need to restart the kernel to use updated packages.

Version: 1.6.1
Summary: A set of python modules for machine learning and data mining
Home-page: 
Author: 
Author-email: 
License: BSD 3-Clause License

 Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission