### Data Cleaning / EDA

In [None]:
pip install -r ../requirements.txt

In [2]:
pip list

Package                          Version
-------------------------------- --------------
annotated-types                  0.7.0
antlr4-python3-runtime           4.13.1
anyio                            4.4.0
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
arrow                            1.3.0
asttokens                        2.4.1
attrs                            23.2.0
backoff                          2.2.1
beautifulsoup4                   4.12.3
bleach                           6.1.0
branca                           0.7.2
certifi                          2024.2.2
cffi                             1.16.0
charset-normalizer               3.3.2
circuit-knitting-toolbox         0.7.2
click                            8.1.7
click-plugins                    1.1.1
cligj                            0.7.2
colorama                         0.4.6
comm                             0.2.2
contourpy                        1.2.1
cryptography                     42.0.8
cycl


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# pip uninstall numpy


In [4]:
# pip install numpy scipy statsmodels


In [5]:
# pip install --force-reinstall ydata-profiling


#### 🔎 Exploratory Data Analysis

In [6]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches 
%matplotlib inline

import seaborn as sns
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Location of the raw accidents CSV, relative to this notebook.
raw_data_path = '../data/raw/road_traffic_accidents_dataset.csv'

In [None]:
# Load the raw CSV into a DataFrame.
raw_data = pd.read_csv(filepath_or_buffer=raw_data_path)

In [None]:
# Build a ydata-profiling ProfileReport from the raw DataFrame.
profile = ProfileReport(raw_data)

In [None]:
# profile

#### 🧹 Data Cleaning

1. **Remove Duplicates**
   - **Objective**: Eliminate duplicate rows in the dataset to avoid redundancy.
   - **Action**: Identify and remove duplicate rows based on all or selected columns.
   - **Example**: `df.drop_duplicates()`.

2. **Handle Missing Values**
   - **Objective**: Address missing or incomplete data to ensure data quality.
   - **Actions**:
     - **Imputation**: Replace missing values with a statistical measure such as mean, median, or mode.
     - **Removal**: Remove rows or columns with missing values if they are insignificant.
     - **Flagging**: Create a separate flag column to indicate missing values.
   - **Example**: `df.fillna()` for imputation or `df.dropna()` to remove missing values.

3. **Correct Data Types**
   - **Objective**: Ensure that each column in the dataset has the correct data type.
   - **Actions**:
     - **Conversion**: Convert columns to appropriate data types (e.g., integer, float, datetime).
     - **Verification**: Check and verify data types after conversion.
   - **Example**: Using pandas, you can use `df.astype()` to change data types.

4. **Remove or Address Outliers**
   - **Objective**: Identify and handle data points that deviate significantly from other observations.
   - **Actions**:
     - **Detection**: Use statistical methods or visualization to identify outliers.
     - **Handling**: Decide whether to remove outliers, adjust their values, or analyze their impact.
   - **Example**: You can use methods like Z-scores or IQR to detect outliers.

In [None]:
# processing_path='../src/data_processing.py'
# !python3 {processing_path} {raw_data_path}


In [None]:
# Keep only the rows that have no missing value in any column.
temp_data = raw_data.dropna(axis=0, how='any')

In [None]:
# Normalise every column name to lower case.
temp_data = temp_data.rename(columns=str.lower)

In [None]:
# Columns to discard before encoding; the list is currently empty, so the drop below removes nothing.
drop_columns=[]
# NOTE: column names were lower-cased in the previous cell, so the entries below would have to be
# 'time' and 'day_of_week' to match; the following cell also still reads the 'time' column.
# drop_columns=['Time', 'Day_of_week']
temp_data=temp_data.drop(columns=drop_columns)

In [None]:
# Parse the 'time' column into pandas datetimes.
# NOTE(review): if the raw values are time-only strings, pandas presumably fills in the current
# date, which would make the one-hot column names created later depend on the run date — confirm.
temp_data = temp_data.assign(time=pd.to_datetime(temp_data['time']))

In [None]:
# Replace the target column's labels with integer codes; the fitted encoder is kept
# in `label_encoder` so the codes can be mapped back to the original labels.
column_encoder = 'accident_severity'
label_encoder = LabelEncoder().fit(temp_data[column_encoder])
temp_data[column_encoder] = label_encoder.transform(temp_data[column_encoder])

In [None]:
# One-hot encode every feature column as 0/1 integers, leaving the already
# label-encoded target column untouched.
# A second, argument-less pd.get_dummies() pass used to follow this call; it was a
# no-op because no object/category columns remain afterwards, so it has been removed.
feature_columns = [col for col in temp_data.columns if col != column_encoder]
temp_data = pd.get_dummies(temp_data, columns=feature_columns, dtype=int)

In [None]:
# Lower-case the column names again: the one-hot columns embed the original cell values.
temp_data = temp_data.rename(columns=str.lower)

In [None]:
# Construct the output file path
# Derive "<raw file name>_proc<extension>" from the raw file's path.
base_name = os.path.basename(raw_data_path)
name, ext = os.path.splitext(base_name)
output_file_name = name + "_proc" + ext

# Resolve the destination inside the processed-data directory, creating it if needed.
processed_dir = '../data/processed'
output_path = os.path.join(processed_dir, output_file_name)
os.makedirs(processed_dir, exist_ok=True)

# Persist the processed frame without the index column and report where it went.
temp_data.to_csv(output_path, index=False)
print("\n✅ The processed DataFrame has been saved to " + output_path)

In [None]:
# Reload the processed CSV from disk for inspection.
proc_data_path = '../data/processed/road_traffic_accidents_dataset_proc.csv'
proc_data = pd.read_csv(filepath_or_buffer=proc_data_path)

In [None]:
# Preview the first five rows of the processed frame.
proc_data.head(n=5)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the processed frame.
proc_data.info()

In [None]:
# Summary statistics for every column, regardless of dtype.
proc_data.describe(include='all')

In [None]:
# Number of missing values per column.
proc_data.isna().sum()


In [None]:

# Number of rows that repeat an earlier row exactly.
proc_data.duplicated(keep='first').sum()


In [None]:
# Count how many rows fall into each encoded accident-severity class.
injuries = proc_data.value_counts(subset=['accident_severity'])
print(injuries)