In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Load the UCI heart-disease data and take a first look at it: sample rows,
# column dtypes, column names, summary statistics, shape and per-column
# missing-value counts. The resulting information is shown in the output below.
df = pd.read_csv("heart_disease_uci.csv")

display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.columns)
display(df.describe())
display(df.shape)
display(df.isnull().sum())

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


None

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


(920, 16)

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [3]:
# The 'ca' column is missing in 611 rows. Filling it with the most frequent
# value would add 611 zeros and upset the balance of the dataset, so it is
# imputed with the k-nearest-neighbours algorithm instead.
knn_imputer = KNNImputer(n_neighbors=5)
cols_for_knn = ['num', 'age', 'ca']
subset = df[cols_for_knn]

# NOTE(review): 'num' is named as the target column in a later cell; using it
# as a neighbour feature lets label information leak into 'ca' — confirm this
# is intended before training a model on the result.

# fit_transform returns a bare array, so the original row index is re-attached.
# Without it the assignment below aligns on a fresh 0..n-1 index and would
# silently produce NaN whenever df does not carry a default RangeIndex.
imputed_subset = pd.DataFrame(
    knn_imputer.fit_transform(subset),
    columns=cols_for_knn,
    index=subset.index,
)

df['ca'] = imputed_subset['ca']

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd

# Fill the remaining missing values with SimpleImputer: the median for numeric
# columns and the most frequent value for categorical (object) columns.
# 'ca' is excluded here because it was already imputed with KNN.
original_columns = df.columns.tolist()

numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
if 'ca' in numeric_cols:
    numeric_cols.remove('ca')
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

transformer = ColumnTransformer(
    transformers=[
        ("numeric", SimpleImputer(strategy="median"), numeric_cols),
        ("categorical", SimpleImputer(strategy="most_frequent"), categorical_cols),
    ]
)

transformed_data = transformer.fit_transform(df)
# The transformer output is ordered numeric-first, then categorical. The
# original row index is re-attached so that the 'ca' assignment below aligns
# row-for-row even if df does not carry a default RangeIndex.
imputed_df = pd.DataFrame(
    transformed_data,
    columns=numeric_cols + categorical_cols,
    index=df.index,
)

# After imputation, cast every column back to the dtype it had in df.
for col in numeric_cols + categorical_cols:
    imputed_df[col] = imputed_df[col].astype(df[col].dtype)

imputed_df['ca'] = df['ca']

# Restore the original column order.
df = imputed_df.reindex(columns=original_columns)

# KNN imputation leaves fractional values such as 0.4 in 'ca'; round them to
# whole numbers while keeping the column as float64.
df['ca'] = df['ca'].round().astype("float64")

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print(), which would only add a stray "None" line.
df.info()
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  920 non-null    float64
 6   chol      920 non-null    float64
 7   fbs       920 non-null    object 
 8   restecg   920 non-null    object 
 9   thalch    920 non-null    float64
 10  exang     920 non-null    object 
 11  oldpeak   920 non-null    float64
 12  slope     920 non-null    object 
 13  ca        920 non-null    float64
 14  thal      920 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,460.5,53.51087,131.995652,199.908696,137.692391,0.853261,0.645652,0.995652
std,265.725422,9.424685,18.4513,109.040171,25.145235,1.058049,0.789266,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,177.75,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,267.0,156.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [5]:
# Convert the categorical features to numbers and remove the 'id' column.
# "sex" and "slope" are integer-coded with LabelEncoder; the remaining
# categorical columns are expanded into 0/1 indicator columns with
# pd.get_dummies; "fbs" and "exang" are cast to integers.
label_cols = ["sex", "slope"]
label_encoder = LabelEncoder()
for label_col in label_cols:
    encoded_values = label_encoder.fit_transform(df[label_col])
    df[label_col] = encoded_values

onehot_cols = ["cp", "restecg", "thal", "ca", "dataset"]
df = (
    pd.get_dummies(df, columns=onehot_cols, drop_first=False, dtype=int)
    .astype({"fbs": int, "exang": int})
    .drop(columns="id")
)

display(df)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,slope,num,...,thal_normal,thal_reversable defect,ca_0.0,ca_1.0,ca_2.0,ca_3.0,dataset_Cleveland,dataset_Hungary,dataset_Switzerland,dataset_VA Long Beach
0,63,1,145.0,233.0,1,150.0,0,2.3,0,0,...,0,0,1,0,0,0,1,0,0,0
1,67,1,160.0,286.0,0,108.0,1,1.5,1,2,...,1,0,0,0,0,1,1,0,0,0
2,67,1,120.0,229.0,0,129.0,1,2.6,1,1,...,0,1,0,0,1,0,1,0,0,0
3,37,1,130.0,250.0,0,187.0,0,3.5,0,0,...,1,0,1,0,0,0,1,0,0,0
4,41,0,130.0,204.0,0,172.0,0,1.4,2,0,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,127.0,333.0,1,154.0,0,0.0,1,1,...,1,0,0,1,0,0,0,0,0,1
916,62,1,130.0,139.0,0,140.0,0,0.5,1,0,...,1,0,0,1,0,0,0,0,0,1
917,55,1,122.0,223.0,1,100.0,0,0.0,1,2,...,0,0,1,0,0,0,0,0,0,1
918,58,1,130.0,385.0,1,140.0,0,0.5,1,0,...,1,0,1,0,0,0,0,0,0,1


In [6]:
# Standardise the continuous features with StandardScaler. The target column
# 'num' and the binary / integer-coded / one-hot columns are left unchanged.
target_column = 'num'

# The columns are listed explicitly instead of being picked with
# select_dtypes(include=['int64', 'float64']). That selection depends on the
# platform's default integer width: where the encoded and indicator columns
# come out as int32 they are skipped (matching the recorded output, in which
# only these five columns are scaled), but where they come out as int64 every
# binary and one-hot column would be standardised as well.
numeric_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
assert target_column not in numeric_cols, "the target must not be scaled"

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# NOTE(review): the scaler is fitted on every row. If a train/test split
# follows, fit it on the training portion only to avoid leaking test-set
# statistics — confirm against the rest of the notebook.
display(df)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,slope,num,...,thal_normal,thal_reversable defect,ca_0.0,ca_1.0,ca_2.0,ca_3.0,dataset_Cleveland,dataset_Hungary,dataset_Switzerland,dataset_VA Long Beach
0,1.007386,1,0.705176,0.303643,1,0.489727,0,1.368109,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1.432034,1,1.518569,0.789967,0,-1.181478,1,0.611589,1,2,...,1,0,0,0,0,1,1,0,0,0
2,1.432034,1,-0.650479,0.266939,0,-0.345875,1,1.651804,1,1,...,0,1,0,0,1,0,1,0,0,0
3,-1.752828,1,-0.108217,0.459634,0,1.961979,0,2.502889,0,0,...,1,0,1,0,0,0,1,0,0,0
4,-1.328180,0,-0.108217,0.037541,0,1.365120,0,0.517024,2,0,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.051927,0,-0.270895,1.221235,1,0.648889,0,-0.806886,1,1,...,1,0,0,1,0,0,0,0,0,1
916,0.901224,1,-0.108217,-0.558893,0,0.091821,0,-0.334061,1,0,...,1,0,0,1,0,0,0,0,0,1
917,0.158089,1,-0.542026,0.211884,1,-1.499803,0,-0.806886,1,2,...,0,0,1,0,0,0,0,0,0,1
918,0.476575,1,-0.108217,1.698383,1,0.091821,0,-0.334061,1,0,...,1,0,1,0,0,0,0,0,0,1
