In [15]:
# Setup & Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [16]:
# Read the raw RTA dataset CSV into the working DataFrame
raw_csv_path = '../data/raw/RTA Dataset.csv'
df = pd.read_csv(raw_csv_path)

In [17]:
# Preview the first five rows of the current frame
df.head(n=5)

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [27]:
# Initial Checks
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
# Per-column count of missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age_band_of_driver       12316 non-null  int32
 1   Sex_of_driver            12316 non-null  int32
 2   Educational_level        12316 non-null  int32
 3   Vehicle_driver_relation  12316 non-null  int32
 4   Driving_experience       12316 non-null  int32
 5   Lanes_or_Medians         12316 non-null  int32
 6   Types_of_Junction        12316 non-null  int32
 7   Road_surface_type        12316 non-null  int32
 8   Light_conditions         12316 non-null  int32
 9   Weather_conditions       12316 non-null  int32
 10  Type_of_collision        12316 non-null  int32
 11  Vehicle_movement         12316 non-null  int32
 12  Pedestrian_movement      12316 non-null  int32
 13  Cause_of_accident        12316 non-null  int32
 14  Accident_severity        12316 non-null  int32
dtypes:

In [28]:
# Drop unnecessary and high-null columns
cols_to_drop = [
    'ID',
    'Date',
    'Time',
    'Service_year_of_vehicle',
    'Defect_of_vehicle',
    'Work_of_casuality',
    'Day_of_week',
    'Number_of_vehicles_involved',
    'Number_of_casualties',
]
# errors='ignore' skips any listed name that is not present in the frame,
# so only the columns that actually exist are removed.
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

In [29]:
# Handle Missing Values

# Categorical (object) columns: fill gaps with the most frequent value.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; indexing it would raise, so such a column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Numeric columns: fill gaps with the column median.
num_cols = df.select_dtypes(include='number').columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [30]:
# Normalise the text columns for consistency: trim surrounding whitespace,
# then convert to lower case.
for col in cat_cols:
    trimmed = df[col].str.strip()
    df[col] = trimmed.str.lower()

In [32]:
# Select Features for Modeling
# Fourteen predictor columns followed by the target, 'Accident_severity'.
selected_cols = [
    'Age_band_of_driver', 'Sex_of_driver', 'Educational_level', 'Vehicle_driver_relation',
    'Driving_experience', 'Lanes_or_Medians', 'Types_of_Junction', 'Road_surface_type',
    'Light_conditions', 'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
    'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity'
]

# Take an explicit copy: column selection yields a frame derived from the
# original, and assigning into its columns afterwards can trigger pandas'
# SettingWithCopyWarning. .copy() makes the subset an independent frame.
df = df[selected_cols].copy()

In [23]:
# Outlier Detection & Handling (on numeric columns)
numeric_cols = df.select_dtypes(include='number').columns.tolist()

# Clip each numeric column to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in numeric_cols:
    # Never modify the target: once 'Accident_severity' has been label-encoded
    # it is numeric, and clipping it would silently rewrite class labels.
    if col == 'Accident_severity':
        continue
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # Option: clip outliers (values outside the fences are pulled to the fence)
    df[col] = df[col].clip(lower, upper)

In [34]:
# Label-encode every remaining text (object) column. Each fitted encoder is
# stored under its column name so the integer codes can be mapped back later.
label_encoders = {}
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    le = LabelEncoder()
    encoded = le.fit_transform(df[col])
    df[col] = encoded
    label_encoders[col] = le

# Show which integer codes the target column now contains
print("Target values:", df['Accident_severity'].unique())

Target values: [2 1 0]


In [25]:
# Preview the first five rows of the current frame
df.head(n=5)

Unnamed: 0,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Lanes_or_Medians,Types_of_Junction,Road_surface_type,Light_conditions,Weather_conditions,Type_of_collision,Vehicle_movement,Pedestrian_movement,Cause_of_accident,Accident_severity
0,0,1,0,0,0,3,1,0,3,2,3,2,5,9,2
1,1,1,4,0,3,5,1,0,3,2,8,2,5,16,2
2,0,1,4,0,0,2,1,0,3,2,2,2,5,0,1
3,0,1,4,0,2,2,7,2,0,2,8,2,5,1,2
4,0,1,4,0,1,2,7,0,0,2,8,2,5,16,2


In [35]:
# Write the processed frame to disk; index=False omits the row index column
cleaned_csv_path = "../data/processed/RTA_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'RTA_cleaned.csv'")

Cleaned dataset saved as 'RTA_cleaned.csv'
