### Limpieza / EDA

In [None]:
# Install the packages listed in ../requirements.txt through IPython's %pip magic.
%pip install -r ../requirements.txt

#### 🔎 Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder


In [None]:
# Relative path of the raw CSV that the rest of the notebook works from.
raw_data_path = '../data/raw/road_traffic_accidents_dataset.csv'

In [None]:
# Load the raw CSV into a DataFrame.
raw_data = pd.read_csv(raw_data_path)

In [None]:
# Build a ydata-profiling report over the untouched raw frame.
profile = ProfileReport(raw_data)

In [None]:
# Evaluated as the cell's last expression so the notebook displays the report.
profile

#### 🧹 Data Cleaning

Gestión de valores NULOS

In [None]:
raw_data.isnull().sum().sort_values(ascending=False)

In [None]:
temp_data = raw_data.copy()

categorical_columns = temp_data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    mode_value = temp_data[col].mode()[0]
    
    temp_data[col] = temp_data[col].fillna(mode_value)


In [None]:
temp_data.isnull().sum().sort_values(ascending=False)

In [None]:
# temp_data = raw_data.dropna()

In [None]:
temp_data.columns = temp_data.columns.str.lower()

In [None]:
temp_data.loc[:, 'time'] = pd.to_datetime(temp_data['time'], format='%H:%M:%S').dt.hour


In [None]:
temp_data

In [None]:
columns_to_encode = ['accident_severity']
label_encoder = LabelEncoder()
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    temp_data.loc[:, column] = label_encoder.fit_transform(temp_data[column])


In [None]:
temp_data

In [None]:
temp_data = pd.get_dummies(temp_data, columns=[col for col in temp_data.columns if col not in columns_to_encode], dtype=int)

In [None]:
temp_data

In [None]:
temp_data.columns = temp_data.columns.str.lower()

In [None]:
data = temp_data.copy()

correlation_matrix = data.corr().abs()

mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

upper_triangle = correlation_matrix.where(mask)

correlation_threshold = 0.7
columns_to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > correlation_threshold)]

print(f"Columns to drop: {columns_to_drop} -> {len(columns_to_drop)}")
for col in columns_to_drop:
    print(col)

In [None]:
temp_data = temp_data.drop(columns=columns_to_drop)

In [None]:
# Make sure the folder for processed data exists before writing into it.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Reuse the raw file's name, inserting "_proc" ahead of its extension.
base_name = os.path.basename(raw_data_path)
name, ext = os.path.splitext(base_name)
output_file_name = name + "_proc" + ext
output_path = os.path.join(processed_dir, output_file_name)

# Write the frame without its index and report where it went.
temp_data.to_csv(output_path, index=False)
print(f"\n✅ The processed DataFrame has been saved to {output_path}")

In [None]:
# Relative path of the processed CSV, read back from disk for inspection.
proc_data_path = '../data/processed/road_traffic_accidents_dataset_proc.csv'

proc_data = pd.read_csv(proc_data_path)

In [None]:
proc_data

In [None]:
# Column dtypes and non-null counts.
proc_data.info()

In [None]:
# Summary statistics for every column, not only the numeric ones.
proc_data.describe(include='all')

In [None]:
# Missing values per column in the processed file.
proc_data.isnull().sum()


In [None]:
# Number of rows that repeat an earlier row.
proc_data.duplicated().sum()


In [None]:
injuries = proc_data[['accident_severity']].value_counts()

print(injuries)

In [None]:
# Get the value counts of the encoded values
injuries = proc_data['accident_severity'].value_counts()

# Map the encoded values back to the original labels
injuries.index = label_encoder.inverse_transform(injuries.index)

# Print the result with original labels
print(injuries)