In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Read the raw heart-disease table from a CSV file given by a relative path.
df = pd.read_csv('../data/heart_disease_uci.csv')

In [4]:
# Row/column count of the raw table.
df.shape

(920, 16)

In [5]:
# Preview the first five records (head() defaults to n=5).
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [9]:
# Distinct values found in the `thal` column.
df['thal'].unique()

array(['fixed defect', 'normal', 'reversable defect', nan], dtype=object)

In [18]:
# Tally how often each value of the `num` column occurs.
# Keys end up in order of first appearance in the column.
dct = {}
for label in df['num'].tolist():
    dct[label] = dct[label] + 1 if label in dct else 1

In [19]:
# Show the per-value counts of `num` held in `dct`.
dct

{0: 411, 2: 109, 1: 265, 3: 107, 4: 28}

In [20]:
# List the column labels of the raw frame.
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [21]:
# Distinct values taken by the `num` column.
df['num'].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [24]:
# Count missing entries in the `id` column.
df['id'].isna().sum()

0

In [25]:
# Report every column that has at least one missing value, together with its NaN count.
missing_counts = df.isna().sum()
for col, n_missing in missing_counts.items():
    if n_missing > 0:
        print(f'{col}, {n_missing}')

trestbps, 59
chol, 30
fbs, 90
restecg, 2
thalch, 55
exang, 55
oldpeak, 62
slope, 309
ca, 611
thal, 486


In [27]:
# Inspect the `ca` column.
df['ca']

0      0.0
1      3.0
2      2.0
3      0.0
4      0.0
      ... 
915    NaN
916    NaN
917    NaN
918    NaN
919    NaN
Name: ca, Length: 920, dtype: float64

In [28]:
# Re-display the column labels of the raw frame.
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [30]:
# Preview what remains when every column containing a NaN is discarded.
# The result is only displayed, not assigned, so `df` itself is unchanged.
df.dropna(axis='columns')

Unnamed: 0,id,age,sex,dataset,cp,num
0,1,63,Male,Cleveland,typical angina,0
1,2,67,Male,Cleveland,asymptomatic,2
2,3,67,Male,Cleveland,asymptomatic,1
3,4,37,Male,Cleveland,non-anginal,0
4,5,41,Female,Cleveland,atypical angina,0
...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,1
916,917,62,Male,VA Long Beach,typical angina,0
917,918,55,Male,VA Long Beach,asymptomatic,2
918,919,58,Male,VA Long Beach,asymptomatic,0


In [31]:
# Re-display the per-value counts of `num`.
dct

{0: 411, 2: 109, 1: 265, 3: 107, 4: 28}

In [35]:
# Total number of tallied entries across all `num` values.
sum(dct.values())

920

In [36]:
# Fraction of records whose `num` value is 0.
# Computed from the tally in `dct` instead of hand-copied counts, so it stays correct if the data changes.
dct[0] / sum(dct.values())

0.4467391304347826

In [37]:
# Per-column dtypes of the raw frame.
df.dtypes

id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object

In [39]:
# Complete-case subset: discard every row that has a missing value in any column.
df_filtered = df.dropna(axis='index', how='any')

In [42]:
# Preview the first five rows of the complete-case subset (head() defaults to n=5).
df_filtered.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [45]:
# Feature matrix: every column except the leading `id` column and the trailing `num` target.
# Selecting by name instead of position keeps this correct even if the CSV column order changes.
X = df_filtered.drop(columns=['id', 'num'])

In [46]:
# Target vector: the `num` column, selected by name rather than by its position
# so a change in column order cannot silently swap in a different column.
y = df_filtered['num']

In [47]:
# Split the feature names by dtype: object-typed columns are slated for one-hot
# encoding, every other column for scaling. Column order is preserved in both lists.
columns_to_one_hot = [name for name, dtype in X.dtypes.items() if dtype == np.dtype('O')]
columns_to_scale = [name for name, dtype in X.dtypes.items() if dtype != np.dtype('O')]

In [48]:
# Names of the object-typed feature columns selected for one-hot encoding.
columns_to_one_hot

['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

In [49]:
# Names of the non-object feature columns selected for scaling.
columns_to_scale

['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

In [50]:
# one hot encoding
# one hot encoding of the object-typed columns, returned as a dense array
one_hot_encoder = OneHotEncoder(sparse_output=False)
df1 = one_hot_encoder.fit_transform(X[columns_to_one_hot])

# standard scaling of the remaining columns
scaler = StandardScaler()
df2 = scaler.fit_transform(X[columns_to_scale])

In [51]:
# Collapse the `num` target into a boolean label: True for any value above 0, False otherwise.
binary_y = y.gt(0)

In [53]:
# Assemble one 2-D array: the one-hot block, then the scaled block, then the
# boolean label as the last column.
final_dataset = np.column_stack((df1, df2, binary_y.to_numpy()))

In [54]:
# Shape check of the assembled array (rows, columns).
final_dataset.shape

(299, 29)

In [56]:
# Saving is currently disabled (commented out). Uncomment the line below to write
# `final_dataset` to ../data/heart_dataset.npy.
# np.save("../data/heart_dataset.npy", final_dataset)