In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


In [3]:
df = pd.read_csv('heart_disease_uci.csv')

In [4]:
df.head()
df.info()
df.isnull().sum()
df['num'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

In [5]:
#why we drop it because it has 40 to 50% missing values 
df = df.drop(columns=["id", "ca", "thal"])

In [6]:
df["num"] = df["num"].apply(lambda x: 0 if x == 0 else 1)

In [7]:
df["num"].value_counts()

num
1    509
0    411
Name: count, dtype: int64

In [8]:
#For numerical:
nums_cols = ["trestbps", "chol", "thalch", "oldpeak"]
for cols in nums_cols:
    df[cols] = df[cols].fillna(df[cols].median())

In [10]:
#For categorical:
cat_cols = ["fbs", "restecg", "exang", "slope"]

for col in cat_cols:
    df[col].fillna(df[col].mode()[0])


In [12]:
df.info()
df.isnull().sum()
df["num"].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    object 
 2   dataset   920 non-null    object 
 3   cp        920 non-null    object 
 4   trestbps  920 non-null    float64
 5   chol      920 non-null    float64
 6   fbs       920 non-null    bool   
 7   restecg   920 non-null    object 
 8   thalch    920 non-null    float64
 9   exang     920 non-null    bool   
 10  oldpeak   920 non-null    float64
 11  slope     920 non-null    object 
 12  num       920 non-null    int64  
dtypes: bool(2), float64(4), int64(2), object(5)
memory usage: 81.0+ KB


num
1    509
0    411
Name: count, dtype: int64

# Numerical (need scaling):
age
trestbps
chol
thalch
oldpeak

# Categorical (need encoding):
sex
dataset
cp
restecg
slope

# Boolean (already numeric):
fbs (True/False)
exang (True/False)

In [None]:
df = pd.get_dummies(df, drop_first=True)
#Why we use drop_first = Ture To avoid: Dummy Variable Trap (multicollinearity) We keep N-1 columns instead of N.


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       920 non-null    int64  
 1   trestbps                  920 non-null    float64
 2   chol                      920 non-null    float64
 3   fbs                       920 non-null    bool   
 4   thalch                    920 non-null    float64
 5   exang                     920 non-null    bool   
 6   oldpeak                   920 non-null    float64
 7   num                       920 non-null    int64  
 8   sex_Male                  920 non-null    bool   
 9   dataset_Hungary           920 non-null    bool   
 10  dataset_Switzerland       920 non-null    bool   
 11  dataset_VA Long Beach     920 non-null    bool   
 12  cp_atypical angina        920 non-null    bool   
 13  cp_non-anginal            920 non-null    bool   
 14  cp_typical

In [17]:
X = df.drop("num", axis=1)
y = df["num"]

In [18]:
X.shape, y.shape

((920, 18), (920,))

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y,test_size=0.2,random_state=42,stratify=y)

#Stratify = to keep same disease ratio in train and test.


In [20]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

Logistic Accuracy: 0.8261
SVM Accuracy: 0.8641
KNN Accuracy: 0.8533
Naive Bayes Accuracy: 0.8533
Decision Tree Accuracy: 0.7880


[WinError 2] The system cannot find the file specified
  File "c:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [22]:
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print("SVM:")
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM:
[[66 16]
 [ 9 93]]
              precision    recall  f1-score   support

           0       0.88      0.80      0.84        82
           1       0.85      0.91      0.88       102

    accuracy                           0.86       184
   macro avg       0.87      0.86      0.86       184
weighted avg       0.87      0.86      0.86       184



In [23]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

y_pred_knn = knn_model.predict(X_test)

print("KNN:")
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


KNN:
[[65 17]
 [10 92]]
              precision    recall  f1-score   support

           0       0.87      0.79      0.83        82
           1       0.84      0.90      0.87       102

    accuracy                           0.85       184
   macro avg       0.86      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



In [24]:
y_probs_svm = svm_model.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_auc_score

auc_svm = roc_auc_score(y_test, y_probs_svm)
print("SVM ROC-AUC:", auc_svm)

SVM ROC-AUC: 0.9226446676231468


In [25]:
y_probs_knn = knn_model.predict_proba(X_test)[:, 1]
auc_knn = roc_auc_score(y_test, y_probs_knn)
print("KNN ROC-AUC:", auc_knn)

KNN ROC-AUC: 0.8989717838354854


In [26]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

y_probs = svm_model.predict_proba(X_test)[:, 1]

best_t = 0
best_recall = 0

for t in np.arange(0.1, 0.9, 0.01):
    y_pred_temp = (y_probs >= t).astype(int)
    report = classification_report(y_test, y_pred_temp, output_dict=True)
    
    recall = report["1"]["recall"]  # disease class
    
    if recall > best_recall:
        best_recall = recall
        best_t = t

print("Best Threshold:", best_t)
print("Best Recall:", best_recall)


Best Threshold: 0.1
Best Recall: 0.9901960784313726


In [27]:
y_pred_final = (y_probs >= best_t).astype(int)

print("Final Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))
print(classification_report(y_test, y_pred_final))

Final Confusion Matrix:
[[ 31  51]
 [  1 101]]
              precision    recall  f1-score   support

           0       0.97      0.38      0.54        82
           1       0.66      0.99      0.80       102

    accuracy                           0.72       184
   macro avg       0.82      0.68      0.67       184
weighted avg       0.80      0.72      0.68       184

