# Deta Preparaton
-----------------------
- Data Cleaning
- One-Hot Encoding
- Drop Culomns

## 1.0 Imports

In [26]:
import os
import pandas as pd

## 2.0 Load and Check Data

In [27]:
df = pd.read_csv('../data/raw/heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [28]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


### 2.1 Chechk missing values

In [29]:
null_counts = df.isnull().sum()

print("Missing values by culomns:")
print(null_counts[null_counts > 0].sort_values(ascending=False))


null_percent = (df.isnull().sum() / len(df)) * 100
print("\nPercentage rate:")
print(null_percent[null_percent > 0].sort_values(ascending=False))

Missing values by culomns:
ca          611
thal        486
slope       309
fbs          90
oldpeak      62
trestbps     59
exang        55
thalch       55
chol         30
restecg       2
dtype: int64

Percentage rate:
ca          66.413043
thal        52.826087
slope       33.586957
fbs          9.782609
oldpeak      6.739130
trestbps     6.413043
exang        5.978261
thalch       5.978261
chol         3.260870
restecg      0.217391
dtype: float64


### 2.2 Replacing continuous variables with mediansÂ¶

In [30]:
continuous_cols = ['oldpeak', 'trestbps', 'thalch', 'chol']
for col in continuous_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

### 2.3 Replacing categorical variables with the most common value

In [31]:
categorical_to_fill = ['fbs', 'exang', 'restecg', 'slope']
for col in categorical_to_fill:
    cols_found = [c for c in df.columns if c.startswith(col)]
    for c in cols_found:
        df[c] = df[c].fillna(0)

### 2.4 Create missing culomn for ca and thal 

In [32]:
df['missing_ca'] = df['ca'].isnull().astype(int)
df['ca'] = df['ca'].fillna(0)
df['missing_thal'] = df['thal'].isnull().astype(int)

## 3.0 Drop not important culomns

In [33]:
df = df.drop(["id", "dataset"], axis=1)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,missing_ca,missing_thal
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,0,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,0,0
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,0,0
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,0,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,0,0


## 4.0 Label Encoding and One-Hot Encoding

### 4.1 Label Encoding

In [34]:
df['fbs'] = df['fbs'].astype(bool).astype(int)
df['exang'] = df['exang'].astype(bool).astype(int)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,missing_ca,missing_thal
0,63,Male,typical angina,145.0,233.0,1,lv hypertrophy,150.0,0,2.3,downsloping,0.0,fixed defect,0,0,0
1,67,Male,asymptomatic,160.0,286.0,0,lv hypertrophy,108.0,1,1.5,flat,3.0,normal,2,0,0
2,67,Male,asymptomatic,120.0,229.0,0,lv hypertrophy,129.0,1,2.6,flat,2.0,reversable defect,1,0,0
3,37,Male,non-anginal,130.0,250.0,0,normal,187.0,0,3.5,downsloping,0.0,normal,0,0,0
4,41,Female,atypical angina,130.0,204.0,0,lv hypertrophy,172.0,0,1.4,upsloping,0.0,normal,0,0,0


### 4.2 One-Hot Encoding

In [35]:
categorical_cols = ['cp', 'sex', 'restecg', 'slope', 'thal']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False, dtype=int)

pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,age,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,missing_ca,missing_thal,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina,sex_Female,sex_Male,restecg_0,restecg_lv hypertrophy,restecg_normal,restecg_st-t abnormality,slope_0,slope_downsloping,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect
0,63,145.0,233.0,1,150.0,0,2.3,0.0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0
1,67,160.0,286.0,0,108.0,1,1.5,3.0,2,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0
2,67,120.0,229.0,0,129.0,1,2.6,2.0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1
3,37,130.0,250.0,0,187.0,0,3.5,0.0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
4,41,130.0,204.0,0,172.0,0,1.4,0.0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0


### 4.3 Order culomns

In [36]:
info_cols = ['age', 'sex_Female', 'sex_Male']
vitals_cols = ['trestbps', 'chol', 'thalch', 'fbs', 'exang', 'oldpeak']
cp_cols = ['cp_typical angina', 'cp_atypical angina', 'cp_non-anginal', 'cp_asymptomatic']
restecg_cols = ['restecg_normal', 'restecg_st-t abnormality', 'restecg_lv hypertrophy']
slope_cols = ['slope_upsloping', 'slope_flat', 'slope_downsloping']

ca_cols = ['missing_ca', 'ca'] 
thal_cols = ['missing_thal', 'thal_normal', 'thal_fixed defect', 'thal_reversable defect']


ordered_columns = (
    info_cols + 
    vitals_cols + 
    cp_cols + 
    restecg_cols + 
    slope_cols + 
    ca_cols + 
    thal_cols + 
    ['num']
)


existing_columns = [col for col in ordered_columns if col in df.columns]


df = df[existing_columns]

df.head()

Unnamed: 0,age,sex_Female,sex_Male,trestbps,chol,thalch,fbs,exang,oldpeak,cp_typical angina,cp_atypical angina,cp_non-anginal,cp_asymptomatic,restecg_normal,restecg_st-t abnormality,restecg_lv hypertrophy,slope_upsloping,slope_flat,slope_downsloping,missing_ca,ca,missing_thal,thal_normal,thal_fixed defect,thal_reversable defect,num
0,63,0,1,145.0,233.0,150.0,1,0,2.3,1,0,0,0,0,0,1,0,0,1,0,0.0,0,0,1,0,0
1,67,0,1,160.0,286.0,108.0,0,1,1.5,0,0,0,1,0,0,1,0,1,0,0,3.0,0,1,0,0,2
2,67,0,1,120.0,229.0,129.0,0,1,2.6,0,0,0,1,0,0,1,0,1,0,0,2.0,0,0,0,1,1
3,37,0,1,130.0,250.0,187.0,0,0,3.5,0,0,1,0,1,0,0,0,0,1,0,0.0,0,1,0,0,0
4,41,1,0,130.0,204.0,172.0,0,0,1.4,0,1,0,0,0,0,1,1,0,0,0,0.0,0,1,0,0,0


## 5.0 Save To CSV File

In [37]:
file_path = '../data/processed/heart_disease_uci_encoded.csv'

directory = os.path.dirname(file_path)

if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Directory created: {directory}")

df.to_csv(file_path, index=False)
print(f"File saved succesfully: {file_path}")

File saved succesfully: ../data/processed/heart_disease_uci_encoded.csv
