### Limpieza / EDA

In [555]:
%pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.python.org/simple
Note: you may need to restart the kernel to use updated packages.


#### 🔎 Exploratory Data Analysis

In [556]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder


In [557]:
# Relative path to the raw accidents CSV (resolved against the kernel's working directory).
raw_data_path = '../data/raw/road_traffic_accidents_dataset.csv'

In [558]:
# Load the raw dataset into a DataFrame using pandas' default CSV parsing options.
raw_data = pd.read_csv(raw_data_path)

In [559]:
# Build a ydata-profiling report object for the raw frame.
profile = ProfileReport(raw_data)

In [560]:
# Render the report inline.
# NOTE(review): the progress bars in this cell's output suggest the report is
# computed here, on display, rather than when the object is created — confirm.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



#### 🧹 Data Cleaning

Gestión de valores NULOS

In [561]:
# Count missing values per column and list the most affected columns first.
raw_data.isnull().sum().sort_values(ascending=False)

Defect_of_vehicle              4427
Service_year_of_vehicle        3928
Work_of_casuality              3198
Fitness_of_casuality           2635
Type_of_vehicle                 950
Types_of_Junction               887
Driving_experience              829
Educational_level               741
Vehicle_driver_relation         579
Owner_of_vehicle                482
Lanes_or_Medians                385
Vehicle_movement                308
Area_accident_occured           239
Road_surface_type               172
Type_of_collision               155
Road_allignment                 142
Casualty_class                    0
Pedestrian_movement               0
Cause_of_accident                 0
Casualty_severity                 0
Age_band_of_casualty              0
Sex_of_casualty                   0
Time                              0
Road_surface_conditions           0
Number_of_casualties              0
Number_of_vehicles_involved       0
Weather_conditions                0
Light_conditions            

In [562]:
# Work on a copy so that `raw_data` itself is never modified.
temp_data = raw_data.copy()

# Object-dtype columns are the ones imputed below.
categorical_columns = temp_data.select_dtypes(include=['object']).columns

# Fill each missing entry with the first mode of its own column.
fill_values = {name: temp_data[name].mode()[0] for name in categorical_columns}
temp_data = temp_data.fillna(value=fill_values)


In [563]:
# Re-count missing values per column to confirm the imputation left no gaps.
temp_data.isnull().sum().sort_values(ascending=False)

Time                           0
Day_of_week                    0
Cause_of_accident              0
Pedestrian_movement            0
Fitness_of_casuality           0
Work_of_casuality              0
Casualty_severity              0
Age_band_of_casualty           0
Sex_of_casualty                0
Casualty_class                 0
Vehicle_movement               0
Number_of_casualties           0
Number_of_vehicles_involved    0
Type_of_collision              0
Weather_conditions             0
Light_conditions               0
Road_surface_conditions        0
Road_surface_type              0
Types_of_Junction              0
Road_allignment                0
Lanes_or_Medians               0
Area_accident_occured          0
Defect_of_vehicle              0
Service_year_of_vehicle        0
Owner_of_vehicle               0
Type_of_vehicle                0
Driving_experience             0
Vehicle_driver_relation        0
Educational_level              0
Sex_of_driver                  0
Age_band_o

In [564]:
# Alternative strategy, currently disabled: drop every row that contains a
# missing value instead of imputing it.
# temp_data = raw_data.dropna()

In [565]:
# Normalise the column labels to lower case.
temp_data.columns = temp_data.columns.map(str.lower)

In [566]:
# Reduce the "HH:MM:SS" time strings to the hour of day.
# Plain column assignment replaces the column outright, so `time` ends up with an
# integer dtype. `.loc[:, 'time'] = ...` instead writes into the existing
# object-dtype column on pandas >= 2.0, leaving the hours stored as Python objects.
temp_data['time'] = pd.to_datetime(temp_data['time'], format='%H:%M:%S').dt.hour


In [567]:
# Preview the frame after imputation, column lowercasing and hour extraction
# (pandas truncates the rendered rows and columns).
temp_data

Unnamed: 0,time,day_of_week,age_band_of_driver,sex_of_driver,educational_level,vehicle_driver_relation,driving_experience,type_of_vehicle,owner_of_vehicle,service_year_of_vehicle,...,vehicle_movement,casualty_class,sex_of_casualty,age_band_of_casualty,casualty_severity,work_of_casuality,fitness_of_casuality,pedestrian_movement,cause_of_accident,accident_severity
0,17,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward,Slight Injury
1,17,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,Slight Injury
2,17,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,...,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Unknown,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,5-10yrs,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,Slight Injury
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12311,16,Wednesday,31-50,Male,Junior high school,Employee,2-5yr,Lorry (11?40Q),Owner,Unknown,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,Slight Injury
12312,18,Sunday,Unknown,Male,Elementary school,Employee,5-10yr,Automobile,Owner,Unknown,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,Slight Injury
12313,13,Sunday,Over 51,Male,Junior high school,Employee,5-10yr,Bajaj,Owner,2-5yrs,...,Other,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Serious Injury
12314,13,Sunday,18-30,Female,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,2-5yrs,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,Driving under the influence of drugs,Slight Injury


In [568]:
# Columns that are label-encoded to integer codes (only the target here).
columns_to_encode = ['accident_severity']
for column in columns_to_encode:
    # One fresh encoder per column. `label_encoder` stays bound to the last
    # fitted encoder so that later cells can call `inverse_transform` on it.
    label_encoder = LabelEncoder()
    # Plain assignment replaces the column, so the codes get an integer dtype
    # instead of being written into the existing object-dtype column.
    temp_data[column] = label_encoder.fit_transform(temp_data[column])


In [569]:
# Preview the frame again: `accident_severity` now holds the integer codes
# produced by the label encoder.
temp_data

Unnamed: 0,time,day_of_week,age_band_of_driver,sex_of_driver,educational_level,vehicle_driver_relation,driving_experience,type_of_vehicle,owner_of_vehicle,service_year_of_vehicle,...,vehicle_movement,casualty_class,sex_of_casualty,age_band_of_casualty,casualty_severity,work_of_casuality,fitness_of_casuality,pedestrian_movement,cause_of_accident,accident_severity
0,17,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Moving Backward,2
1,17,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,2
2,17,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,Unknown,...,Going straight,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the left,1
3,1,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,Unknown,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,2
4,1,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,5-10yrs,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Overtaking,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12311,16,Wednesday,31-50,Male,Junior high school,Employee,2-5yr,Lorry (11?40Q),Owner,Unknown,...,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,2
12312,18,Sunday,Unknown,Male,Elementary school,Employee,5-10yr,Automobile,Owner,Unknown,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,No distancing,2
12313,13,Sunday,Over 51,Male,Junior high school,Employee,5-10yr,Bajaj,Owner,2-5yrs,...,Other,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,1
12314,13,Sunday,18-30,Female,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,2-5yrs,...,Other,na,na,na,na,Driver,Normal,Not a Pedestrian,Driving under the influence of drugs,2


In [570]:
# One-hot encode every column except the label-encoded ones; indicators are stored as 0/1 ints.
excluded_columns = set(columns_to_encode)
feature_columns = [name for name in temp_data.columns if name not in excluded_columns]
temp_data = pd.get_dummies(temp_data, columns=feature_columns, dtype=int)

In [571]:
# Preview the one-hot encoded frame: every column except the target has been
# expanded into 0/1 indicator columns.
temp_data

Unnamed: 0,accident_severity,time_0,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,...,cause_of_accident_No distancing,cause_of_accident_No priority to pedestrian,cause_of_accident_No priority to vehicle,cause_of_accident_Other,cause_of_accident_Overloading,cause_of_accident_Overspeed,cause_of_accident_Overtaking,cause_of_accident_Overturning,cause_of_accident_Turnover,cause_of_accident_Unknown
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12311,2,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12312,2,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12313,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12314,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [572]:
# The indicator names embed the original category values, so lower-case the labels again.
temp_data.columns = temp_data.columns.map(str.lower)

In [573]:
# Number of top-ranked entries to keep; the ranking includes the target itself.
columns_with_high_corr = 10
# Absolute correlation of every column with the target, strongest first.
target_correlations = temp_data.corr()['accident_severity']
corr_matrix = target_correlations.abs().sort_values(ascending=False)
print(corr_matrix.iloc[:columns_with_high_corr])


accident_severity                          1.000000
number_of_vehicles_involved_1              0.147213
number_of_vehicles_involved_2              0.095491
number_of_casualties_4                     0.077415
types_of_junction_crossing                 0.049772
time_3                                     0.048485
age_band_of_driver_unknown                 0.045116
age_band_of_driver_under 18                0.044051
light_conditions_darkness - no lighting    0.043421
types_of_junction_no junction              0.035200
Name: accident_severity, dtype: float64


In [574]:
# Every column outside the top-ranked entries of the correlation ranking is marked for removal.
kept_columns = set(corr_matrix.index[:columns_with_high_corr])
drop_columns = [name for name in temp_data.columns if name not in kept_columns]


In [575]:
# Remove the columns collected in `drop_columns`, keeping only the top-ranked ones.
temp_data = temp_data.drop(columns=drop_columns)

In [576]:
# Construct the output file path
# Processed files live in their own directory, created if it does not exist yet.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Derive the output file name from the raw file name: "<name>_proc<ext>".
base_name = os.path.basename(raw_data_path)
name, ext = os.path.splitext(base_name)
output_file_name = f"{name}_proc{ext}"
output_path = os.path.join(processed_dir, output_file_name)

# Write the frame without its index, then report where it was saved.
temp_data.to_csv(output_path, index=False)
print(f"\n✅ The processed DataFrame has been saved to {output_path}")


✅ The processed DataFrame has been saved to ../data/processed/road_traffic_accidents_dataset_proc.csv


In [577]:
# Reload the processed CSV from disk; the cells below inspect this reloaded copy.
proc_data_path = '../data/processed/road_traffic_accidents_dataset_proc.csv'

proc_data = pd.read_csv(proc_data_path)

In [578]:
# Show the first rows of the reloaded frame.
proc_data.head()

Unnamed: 0,accident_severity,time_3,age_band_of_driver_under 18,age_band_of_driver_unknown,types_of_junction_crossing,types_of_junction_no junction,light_conditions_darkness - no lighting,number_of_vehicles_involved_1,number_of_vehicles_involved_2,number_of_casualties_4
0,2,0,0,0,0,1,0,0,1,0
1,2,0,0,0,0,1,0,0,1,0
2,1,0,0,0,0,1,0,0,1,0
3,2,0,0,0,0,0,0,0,1,0
4,2,0,0,0,0,0,0,0,1,0


In [579]:
# Column dtypes, non-null counts and memory footprint of the reloaded frame.
proc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 10 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   accident_severity                        12316 non-null  int64
 1   time_3                                   12316 non-null  int64
 2   age_band_of_driver_under 18              12316 non-null  int64
 3   age_band_of_driver_unknown               12316 non-null  int64
 4   types_of_junction_crossing               12316 non-null  int64
 5   types_of_junction_no junction            12316 non-null  int64
 6   light_conditions_darkness - no lighting  12316 non-null  int64
 7   number_of_vehicles_involved_1            12316 non-null  int64
 8   number_of_vehicles_involved_2            12316 non-null  int64
 9   number_of_casualties_4                   12316 non-null  int64
dtypes: int64(10)
memory usage: 962.3 KB


In [580]:
# Summary statistics for every column; `include='all'` does not restrict the summary to numeric columns.
proc_data.describe(include='all')

Unnamed: 0,accident_severity,time_3,age_band_of_driver_under 18,age_band_of_driver_unknown,types_of_junction_crossing,types_of_junction_no junction,light_conditions_darkness - no lighting,number_of_vehicles_involved_1,number_of_vehicles_involved_2,number_of_casualties_4
count,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0
mean,1.832819,0.00682,0.066986,0.12569,0.176762,0.311546,0.015589,0.162066,0.677168,0.031991
std,0.406082,0.082307,0.250008,0.331513,0.381483,0.463144,0.123886,0.368526,0.467578,0.175983
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [581]:
# Missing values per column in the reloaded frame.
proc_data.isnull().sum()


accident_severity                          0
time_3                                     0
age_band_of_driver_under 18                0
age_band_of_driver_unknown                 0
types_of_junction_crossing                 0
types_of_junction_no junction              0
light_conditions_darkness - no lighting    0
number_of_vehicles_involved_1              0
number_of_vehicles_involved_2              0
number_of_casualties_4                     0
dtype: int64

In [582]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): a large count is plausible here because only a handful of
# 0/1 indicator columns plus the target remain after the column selection.
proc_data.duplicated().sum()


12155

In [583]:
# Frequency of each encoded severity class, most frequent first.
injuries = proc_data.value_counts(subset=['accident_severity'])

print(injuries)

accident_severity
2                    10415
1                     1743
0                      158
Name: count, dtype: int64


In [584]:
# Get the value counts of the encoded values
injuries = proc_data['accident_severity'].value_counts()

# Map the encoded values back to the original labels
injuries.index = label_encoder.inverse_transform(injuries.index)

# Print the result with original labels
print(injuries)

Slight Injury     10415
Serious Injury     1743
Fatal injury        158
Name: count, dtype: int64
