In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from the CSV file at a path relative to the notebook's working directory.
data = pd.read_csv('dataset/heart.csv')
# `head` has to be called: printing the bare attribute shows the bound-method
# repr instead of the first rows.
print(data.head())
print(data.columns)

In [None]:
# List the distinct values of every column whose dtype compares equal to 'object'.
for column, column_dtype in data.dtypes.items():
    if column_dtype == 'object':
        print(column, data[column].unique())

In [None]:
# Summary statistics as returned by DataFrame.describe(); kept in `stats`
# and printed as plain text.
stats = data.describe()
print(stats)

In [None]:
# One-hot encode the listed columns; every other column is passed through unchanged.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# 'HeartDisease' is the target; all remaining encoded columns are features.
y = data_encoded['HeartDisease']
X = data_encoded.drop(columns='HeartDisease')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Histogram of the 'Age' column with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Age'], kde=True, ax=ax)
ax.set_title('Age Distribution', fontsize=16)
ax.set_xlabel('Age', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.show()

In [None]:
# Pairwise correlations between all columns of the one-hot encoded frame,
# drawn as an annotated heatmap (values shown with two decimals).
correlation_matrix = data_encoded.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()

Random Forest Model

In [None]:
# Train a 100-tree random forest on the training split (fit() returns the
# estimator itself, so the fitted model is bound directly).
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the model on the held-out split.
y_pred = random_forest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Dict form of the report; it is not read again in the cells visible here.
report = classification_report(y_test, y_pred, output_dict=True)

# Text form of the same report, printed under its heading.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Show the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_true: Ground-truth labels, forwarded to ``confusion_matrix``.
        y_pred: Predicted labels, forwarded to ``confusion_matrix``.
        model_name: Text inserted into the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",  # cells hold integer counts
        cmap="Blues",
        cbar=False,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

# Confusion matrix for the random forest predictions on the test split.
plot_confusion_matrix(y_true=y_test, y_pred=y_pred, model_name='Random Forest')

In [None]:
# Importances reported by the fitted random forest, one per feature column of X.
feature_importance = random_forest_model.feature_importances_
feature_names = X.columns

# Ascending order of importance; bars are drawn at positions 0..n-1 in that order.
sorted_idx = np.argsort(feature_importance)
bar_positions = list(range(len(sorted_idx)))
bar_labels = [feature_names[i] for i in sorted_idx]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(bar_labels)
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Feature Importance')
plt.show()

KNN Model

In [None]:
# Train a 5-nearest-neighbours classifier on the training split (fit() returns
# the estimator itself, so the fitted model is bound directly).
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the model on the held-out split.
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Model Accuracy:", accuracy_knn*100)

# Per-class precision / recall / F1, printed under its heading.
print(
    "\nClassification Report for KNN Model:",
    classification_report(y_test, y_pred_knn),
    sep="\n",
)


In [None]:
# Confusion matrix for the KNN predictions on the test split.
plot_confusion_matrix(y_true=y_test, y_pred=y_pred_knn, model_name='KNN')

In [None]:
# Logistic regression trained on standardized features.
# The 'sag' solver is only guaranteed to converge quickly when all features
# are on a similar scale, so the features are standardized first. The scaler
# is fitted on the training split only and then applied to both splits, so no
# test-set statistics leak into training.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# 'sag' shuffles the data, so random_state pins the result across reruns;
# max_iter is raised above the default to leave headroom for convergence.
logistic_regression_model = LogisticRegression(solver='sag', max_iter=1000, random_state=42)
logistic_regression_model.fit(X_train_scaled, y_train)
y_pred_logistic = logistic_regression_model.predict(X_test_scaled)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
# Printed as a percentage, matching the Random Forest and KNN accuracy output.
print("Logistic Regression Model Accuracy:", accuracy_logistic*100)

print("\nClassification Report for Logistic Regression Model:")
print(classification_report(y_test, y_pred_logistic))


In [None]:
# Confusion matrix for the logistic regression predictions on the test split.
plot_confusion_matrix(y_true=y_test, y_pred=y_pred_logistic, model_name='Logistic Regression')

In [None]:
# Pair each feature column of X with its coefficient from the fitted model
# (first row of coef_).
feature_names = X.columns
feature_coefficients = logistic_regression_model.coef_[0]

# Build the table in one chain: add the coefficient magnitude, then order the
# rows from largest to smallest magnitude.
coefficients_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': feature_coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Horizontal bars show the signed coefficient of each feature.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(coefficients_df['Feature'], coefficients_df['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.set_title('Feature Coefficients of Logistic Regression Model')
plt.show()
