# 1. Setup project

In [17]:
import pandas as pd
import numpy  as np

# 2. Read data

In [3]:
# Load the processed Cleveland heart-disease data directly from the UCI repository.
# header=None makes pandas treat the first line as data, so the column labels
# are supplied explicitly; every '?' entry is parsed as NaN.
heart_disease = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
    sep=',',
    header=None,
    names=[
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
    ],
    na_values='?',
)

In [4]:
# Show the loaded frame as the cell output (the notebook renders a truncated
# preview: the first and last rows with an ellipsis row in between).
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


# 3. Data preparation 

## 3.1 Handling missing values

In [5]:
# Count the NaN entries in every column of the dataset
heart_disease.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [21]:
# Calculate the missing value percentage for each column.
def print_missing_values(dataset):
    """Print the column name, number, and percentage of missing values."""
    for i in range(heart_disease.shape[1]):
        name = heart_disease.columns[i]
        misses = heart_disease[name].isnull().sum()
        perc   = misses / heart_disease.shape[0] * 100
        print(f"> [{name}], Missing values: {misses} ({perc:.2f}%)")

# NOTE(review): the recorded output of this cell reports 0 missing values for
# `ca` and `thal`, while the isnull().sum() cell above reports 4 and 2, and the
# execution counts are not sequential. Presumably this cell was re-run after
# the in-place imputation further down; confirm with Restart Kernel -> Run All.
print_missing_values(heart_disease)

> [age], Missing values: 0 (0.00%)
> [sex], Missing values: 0 (0.00%)
> [cp], Missing values: 0 (0.00%)
> [trestbps], Missing values: 0 (0.00%)
> [chol], Missing values: 0 (0.00%)
> [fbs], Missing values: 0 (0.00%)
> [restecg], Missing values: 0 (0.00%)
> [thalach], Missing values: 0 (0.00%)
> [exang], Missing values: 0 (0.00%)
> [oldpeak], Missing values: 0 (0.00%)
> [slope], Missing values: 0 (0.00%)
> [ca], Missing values: 0 (0.00%)
> [thal], Missing values: 0 (0.00%)
> [num], Missing values: 0 (0.00%)


In [22]:
# Fill the gaps in the two columns that have missing entries with each
# column's median, then re-print the per-column report to confirm none remain.
from sklearn.impute import SimpleImputer

cols_with_gaps = ['ca', 'thal']
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
heart_disease[cols_with_gaps] = imputer.fit_transform(heart_disease[cols_with_gaps].values)
print_missing_values(heart_disease)

> [age], Missing values: 0 (0.00%)
> [sex], Missing values: 0 (0.00%)
> [cp], Missing values: 0 (0.00%)
> [trestbps], Missing values: 0 (0.00%)
> [chol], Missing values: 0 (0.00%)
> [fbs], Missing values: 0 (0.00%)
> [restecg], Missing values: 0 (0.00%)
> [thalach], Missing values: 0 (0.00%)
> [exang], Missing values: 0 (0.00%)
> [oldpeak], Missing values: 0 (0.00%)
> [slope], Missing values: 0 (0.00%)
> [ca], Missing values: 0 (0.00%)
> [thal], Missing values: 0 (0.00%)
> [num], Missing values: 0 (0.00%)
