<a href="https://colab.research.google.com/github/debojit11070/deep-learning/blob/main/Heart_disease_prediction_with_svm%2C_knn%2C_nn%2C_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

In [16]:
df = pd.read_csv("/content/drive/MyDrive/datasets/heart.csv")

In [19]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
df.shape

(303, 14)

In [20]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [21]:
# Split the dataset into features and labels
X = df.drop("target", axis=1)
y = df["target"]

#We will be using neural network, kfold, knn for this project and compare the accuracy scores

In [22]:
# preparing data for k fold cross validation
X_data = (X - X.mean()) / X.std()

# define the neural network model

In [48]:
def nn_model():
    model = tf.keras.Sequential([
        layers.Dense(1024, activation="relu", input_shape=(X_data.shape[1],)),
        layers.Dropout(0.2),
        layers.Dense(512, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [58]:
# Initialize the K-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
nn_cv_scores = []
knn_cv_scores = []
cat_cv_scores = []

In [50]:
for train_index, val_index in kf.split(X_data):
  X_train_fold, X_val_fold = X_data.iloc[train_index], X_data.iloc[val_index]
  y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

In [51]:
# neural network model
model_nn = nn_model()

In [52]:
    # Convert pandas DataFrames to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_fold.values, y_train_fold.values))
train_dataset = train_dataset.shuffle(buffer_size=len(X_train_fold)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_fold.values, y_val_fold.values))
val_dataset = val_dataset.batch(32)

In [55]:
model_nn.fit(train_dataset, epochs=60, verbose=0)
nn_accuracy = model_nn.evaluate(val_dataset, verbose=0)
nn_cv_scores.append(nn_accuracy)

In [56]:
nn_accuracy

[1.7136130332946777, 0.800000011920929]

In [59]:
# KNN
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train_fold, y_train_fold)
knn_accuracy = model_knn.score(X_val_fold, y_val_fold)
knn_cv_scores.append(knn_accuracy)

In [60]:
knn_accuracy

0.8166666666666667

In [63]:
for train_index, val_index in kf.split(X_data):
    X_train_fold, X_val_fold = X_data.iloc[train_index], X_data.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

CatBoostError: ignored

# svm

In [66]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [67]:
# Prepare the data for SVM
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X[cat_features] = X[cat_features].astype(str)
X[cat_features] = X[cat_features].fillna("NaN")

In [68]:
# Convert categorical features to numerical using one-hot encoding
X_encoded = pd.get_dummies(X, columns=cat_features)

In [69]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [70]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear')

# Train the SVM model on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)


In [71]:
# Calculate the accuracy of the SVM model
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy:.4f}")

SVM Accuracy: 0.8689


# catboost

In [78]:

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [79]:
# Prepare the data for CatBoost
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X[cat_features] = X[cat_features].astype(str)
X[cat_features] = X[cat_features].fillna("NaN")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier with parameter adjustments
model_cat = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3,
    border_count=32,
    cat_features=cat_features,
    verbose=0
)

In [80]:
# Train the CatBoost model on the training data
model_cat.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50)

# Make predictions on the test data
y_pred = model_cat.predict(X_test)

# Calculate the accuracy of the CatBoost model
accuracy = accuracy_score(y_test, y_pred)
print(f"CatBoost Accuracy: {accuracy:.4f}")

# Feature importance analysis
feature_importance = model_cat.get_feature_importance(prettified=True)
print("\nFeature Importance:")
print(feature_importance)

CatBoost Accuracy: 0.8689

Feature Importance:
   Feature Id  Importances
0          ca    23.919634
1        thal    13.262609
2          cp    12.130957
3       slope     9.553472
4         sex     6.874026
5       exang     6.626839
6     oldpeak     5.343564
7     restecg     4.794006
8         age     4.751172
9     thalach     4.238326
10       chol     4.172203
11   trestbps     2.888162
12        fbs     1.445031
