### Limpieza / EDA

#### 🔎 Exploratory Data Analysis

In [33]:
!pip install ydata_profiling





[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
import pandas as pd
import os
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder


In [35]:
raw_data_path='../data/raw/road_traffic_accidents_dataset.csv'

In [36]:
raw_data = pd.read_csv(raw_data_path)

In [37]:
profile = ProfileReport(raw_data)

In [38]:
# profile

#### 🧹 Data Cleaning

1. **Remove Duplicates**
   - **Objective**: Eliminate duplicate rows in the dataset to avoid redundancy.
   - **Action**: Identify and remove duplicate rows based on all or selected columns.
   - **Example**: `df.drop_duplicates()`.

2. **Handle Missing Values**
   - **Objective**: Address missing or incomplete data to ensure data quality.
   - **Actions**:
     - **Imputation**: Replace missing values with a statistical measure such as mean, median, or mode.
     - **Removal**: Remove rows or columns with missing values if they are insignificant.
     - **Flagging**: Create a separate flag column to indicate missing values.
   - **Example**: `df.fillna()` for imputation or `df.dropna()` to remove missing values.

3. **Correct Data Types**
   - **Objective**: Ensure that each column in the dataset has the correct data type.
   - **Actions**:
     - **Conversion**: Convert columns to appropriate data types (e.g., integer, float, datetime).
     - **Verification**: Check and verify data types after conversion.
   - **Example**: Using pandas, you can use `df.astype()` to change data types.

4. **Remove or Address Outliers**
   - **Objective**: Identify and handle data points that deviate significantly from other observations.
   - **Actions**:
     - **Detection**: Use statistical methods or visualization to identify outliers.
     - **Handling**: Decide whether to remove outliers, adjust their values, or analyze their impact.
   - **Example**: You can use methods like Z-scores or IQR to detect outliers.

In [39]:
# processing_path='../src/data_processing.py'
# !python3 {processing_path} {raw_data_path}


In [40]:
temp_data=raw_data.dropna()

In [41]:
temp_data=temp_data.rename(columns=lambda x: x.lower())

In [42]:
drop_columns=[]
# drop_columns=['Time', 'Day_of_week']
temp_data=temp_data.drop(columns=drop_columns)

In [43]:
temp_data['time']=pd.to_datetime(temp_data['time'])

  temp_data['time']=pd.to_datetime(temp_data['time'])


In [44]:
column_encoder = 'accident_severity'
label_encoder = LabelEncoder()
temp_data[column_encoder] = label_encoder.fit_transform(temp_data[column_encoder])

In [45]:
temp_data = pd.get_dummies(temp_data, columns=[col for col in temp_data.columns if col != column_encoder], dtype=int)
temp_data=pd.get_dummies(temp_data, dtype=int)

In [46]:
temp_data=temp_data.rename(columns=lambda x: x.lower())

In [47]:
# Construct the output file path
base_name = os.path.basename(raw_data_path)
name, ext = os.path.splitext(base_name)
output_file_name = f"{name}_proc{ext}"

# Define the processed directory path
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
output_path = os.path.join(processed_dir, output_file_name)

temp_data.to_csv(output_path, index=False)
print(f"\n✅ The processed DataFrame has been saved to {output_path}")


✅ The processed DataFrame has been saved to ../data/processed\road_traffic_accidents_dataset_proc.csv


In [48]:
proc_data_path='../data/processed/road_traffic_accidents_dataset_proc.csv'

proc_data=pd.read_csv(proc_data_path)

In [49]:
proc_data.head()

Unnamed: 0,accident_severity,time_2024-08-07 00:01:00,time_2024-08-07 00:04:00,time_2024-08-07 00:06:00,time_2024-08-07 00:10:00,time_2024-08-07 00:11:00,time_2024-08-07 00:17:00,time_2024-08-07 00:20:00,time_2024-08-07 00:21:00,time_2024-08-07 00:25:00,...,cause_of_accident_no distancing,cause_of_accident_no priority to pedestrian,cause_of_accident_no priority to vehicle,cause_of_accident_other,cause_of_accident_overloading,cause_of_accident_overspeed,cause_of_accident_overtaking,cause_of_accident_overturning,cause_of_accident_turnover,cause_of_accident_unknown
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [50]:
proc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Columns: 955 entries, accident_severity to cause_of_accident_unknown
dtypes: int64(955)
memory usage: 21.0 MB


In [51]:
proc_data.describe(include='all')

Unnamed: 0,accident_severity,time_2024-08-07 00:01:00,time_2024-08-07 00:04:00,time_2024-08-07 00:06:00,time_2024-08-07 00:10:00,time_2024-08-07 00:11:00,time_2024-08-07 00:17:00,time_2024-08-07 00:20:00,time_2024-08-07 00:21:00,time_2024-08-07 00:25:00,...,cause_of_accident_no distancing,cause_of_accident_no priority to pedestrian,cause_of_accident_no priority to vehicle,cause_of_accident_other,cause_of_accident_overloading,cause_of_accident_overspeed,cause_of_accident_overtaking,cause_of_accident_overturning,cause_of_accident_turnover,cause_of_accident_unknown
count,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,...,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0,2889.0
mean,1.842852,0.000346,0.000346,0.000346,0.003115,0.000346,0.001038,0.000346,0.000346,0.003115,...,0.193493,0.055382,0.095881,0.033922,0.0045,0.006923,0.033576,0.014884,0.009692,0.001385
std,0.392385,0.018605,0.018605,0.018605,0.055737,0.018605,0.032213,0.018605,0.018605,0.055737,...,0.395104,0.228765,0.294479,0.181059,0.066941,0.082929,0.180165,0.12111,0.097986,0.03719
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
proc_data.isnull().sum()


accident_severity                0
time_2024-08-07 00:01:00         0
time_2024-08-07 00:04:00         0
time_2024-08-07 00:06:00         0
time_2024-08-07 00:10:00         0
                                ..
cause_of_accident_overspeed      0
cause_of_accident_overtaking     0
cause_of_accident_overturning    0
cause_of_accident_turnover       0
cause_of_accident_unknown        0
Length: 955, dtype: int64

In [53]:

proc_data.duplicated().sum()


0

In [56]:
injuries = proc_data[['accident_severity']].value_counts()

print(injuries)

accident_severity
2                    2466
1                     392
0                      31
Name: count, dtype: int64
