In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
file_path = '.venv/Dataset/heart.csv'
heart_data = pd.read_csv(file_path)

# Check for missing values
if heart_data.isnull().sum().any():
    print("Handling missing values...")
    heart_data.fillna(heart_data.mean(), inplace=True)

# Normalize numerical columns
scaler = MinMaxScaler()
numerical_columns = heart_data.select_dtypes(include=['int64', 'float64']).columns
heart_data[numerical_columns] = scaler.fit_transform(heart_data[numerical_columns])

# Split data into features and target
X = heart_data.drop(columns=['output'])
y = heart_data['output']


In [4]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Select top k features using mutual information
k = 8
selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print(f"Selected Features: {list(selected_features)}")


Selected Features: ['cp', 'chol', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall']


In [5]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout

# Reshape data for CNN
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define CNN model
model = Sequential([
    Conv1D(filters=32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_data=(X_test_reshaped, y_test))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5814 - loss: 0.6833 - val_accuracy: 0.8197 - val_loss: 0.6315
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7910 - loss: 0.6323 - val_accuracy: 0.8033 - val_loss: 0.5765
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7653 - loss: 0.5921 - val_accuracy: 0.8033 - val_loss: 0.5246
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7791 - loss: 0.5426 - val_accuracy: 0.8197 - val_loss: 0.4779
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7538 - loss: 0.5148 - val_accuracy: 0.8361 - val_loss: 0.4430
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8006 - loss: 0.4708 - val_accuracy: 0.8525 - val_loss: 0.4266
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x156f565d0>

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions from Random Forest
rf_predictions = rf_model.predict(X_test)

# Predictions from CNN
cnn_preds = model.predict(X_test_reshaped).flatten()

# Ensemble predictions (simple averaging)
ensemble_preds = (cnn_preds > 0.5).astype(int) + rf_predictions
ensemble_preds = (ensemble_preds > 1).astype(int)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


In [9]:
import shap

# Explain Random Forest predictions using SHAP
explainer = shap.Explainer(rf_model, X_train)
shap_values = explainer(X_test)

# Visualize feature importance
shap.summary_plot(shap_values, X_test, feature_names=selected_features)


ImportError: Numba needs NumPy 2.0 or less. Got NumPy 2.1.

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluate the model
metrics = {
    "Accuracy": accuracy_score(y_test, ensemble_preds),
    "Precision": precision_score(y_test, ensemble_preds),
    "Recall": recall_score(y_test, ensemble_preds),
    "F1 Score": f1_score(y_test, ensemble_preds),
    "AUC-ROC": roc_auc_score(y_test, cnn_preds)
}

print("Model Performance Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")


Model Performance Metrics:
Accuracy: 0.8525
Precision: 0.8710
Recall: 0.8438
F1 Score: 0.8571
AUC-ROC: 0.8998
