In [9]:
import pandas as pd
from pathlib import Path

In [10]:
ROOT = Path.cwd().parent
path_raw = ROOT / "data" / "raw" / "heart.csv"
df = pd.read_csv(path_raw)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Data quality note

We observe no missing values reported by `df.isnull().sum()`, but the `Cholesterol` column contains many zeros which are likely placeholders for missing measurements rather than true zeros.
We will analyze the extent of zeros and consider options: mark as NA, create a missing indicator, impute, or remove rows â€” documenting the choice and rationale.

In [11]:
df["Cholesterol"].value_counts()
zeros = (df["Cholesterol"] == 0).sum()

In [12]:
# Shape of records with Cholesterol == 0
df[df["Cholesterol"] == 0].shape

(172, 12)

In [13]:
# If removing rows with Cholesterol == 0 (not recommended without clinical justification), use .copy() to avoid SettingWithCopyWarning
df_clean = df[df["Cholesterol"] != 0].copy()
print("original shape:", df.shape)
print("cleaned shape:", df_clean.shape)

original shape: (918, 12)
cleaned shape: (746, 12)


### Alternatives to dropping rows with `Cholesterol == 0`
- Mark zeros as NA and document: `df.loc[df.Cholesterol==0, 'Cholesterol'] = np.nan`.
- Create a missing indicator column: `Cholesterol_missing` to preserve information about missingness.
- Impute by global median or subgroup median (e.g., by `sex` or age bins) and justify the choice clinically/statistically.
- Inspect whether zeros concentrate in subgroups which would introduce bias if removed.

In [14]:
# Target distribution with counts and proportions
df['HeartDisease'].value_counts(dropna=False).rename('count').to_frame()
df['HeartDisease'].value_counts(normalize=True).rename('proportion').to_frame()

Unnamed: 0_level_0,proportion
HeartDisease,Unnamed: 1_level_1
1,0.553377
0,0.446623


In [15]:
# Save processed dataset (choose df_clean or df_alt depending on chosen strategy)
path_proc = ROOT / 'data' / 'processed' / 'heart_clean.csv'
# Here we save df_clean as an example (if rows were dropped). Change to df_alt if imputing/marking missing instead.
df_clean.to_csv(path_proc, index=False)
print(f'Processed file saved to: {path_proc}')

Processed file saved to: c:\FRANK MODELO\GITHUB\HEART-FAILURE-PROJECT\data\processed\heart_clean.csv
