<a href="https://colab.research.google.com/github/coder0SHUBHAM/Heart-Disease-Prediction/blob/main/heart_disease_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Heart Disease Prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load the raw heart-disease CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — adjust
# it when running elsewhere.
df = pd.read_csv("/content/HeartDiseaseTrain-Test.csv")  # raw dataset

print("Initial Dataset Info")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing Values")
# Per-column count of missing (NaN) entries.
print(df.isnull().sum())



# ---- Data cleaning -------------------------------------------------------
# Normalise the column names first (lower case, spaces -> underscores) so that
# the column lists computed below refer to names that actually exist in `df`.
# Computing them before the rename would make later `df[col]` lookups raise
# KeyError for any column whose name contains capitals or spaces.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Fill gaps in numeric columns with the column median.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# `df[col]`: that chained in-place form emits a FutureWarning in recent pandas
# and no longer modifies `df` once Copy-on-Write is active.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill gaps in text columns with the most frequent value, then normalise the
# text (trim surrounding whitespace, lower-case) so equal categories compare
# equal before they are encoded.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
    df[col] = df[col].str.strip().str.lower()

# Replace each text category with an integer code.
# The single encoder is re-fitted for every column, which overwrites its
# learned classes each time; the classes are therefore recorded per column so
# the code -> category mapping (code i <-> category_mappings[col][i]) is not
# lost for all but the last column.
label_encoder = LabelEncoder()
category_mappings = {}
for col in cat_cols:
    df[col] = label_encoder.fit_transform(df[col])
    category_mappings[col] = label_encoder.classes_.tolist()

print("\nCleaned Dataset Preview")
print(df.head())
# info() prints its own report and returns None, so it is not wrapped in print().
df.info()


# Persist the cleaned table (without the index column) for reuse.
df.to_csv("clean_heart_dataset.csv", index=False)
print("Clean dataset saved as clean_heart_dataset.csv")


# Bar chart showing how many rows carry each value of the `target` column.
ax = sns.countplot(data=df, x='target')
ax.set_title("Heart Disease Distribution")
plt.show()


# Separate the label column from the predictor columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ------------------------------
# Feature scaling
# ------------------------------
# The scaler is fitted on the training split only and then applied, with the
# same fitted parameters, to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



# Three classifiers trained on the same scaled training split:
#   lr  - logistic regression (max_iter=1000)
#   knn - k-nearest neighbours (n_neighbors=5)
#   rf  - random forest (n_estimators=100, seeded with random_state=42)
lr = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit every model on the training data.
for classifier in (lr, knn, rf):
    classifier.fit(X_train, y_train)

# Predict the held-out rows with each fitted model.
lr_pred, knn_pred, rf_pred = [
    classifier.predict(X_test) for classifier in (lr, knn, rf)
]


def evaluate_model(name, y_test, y_pred):
    """Print accuracy, confusion matrix and classification report.

    Args:
        name: Label printed in front of the accuracy value.
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n{name} Accuracy:", accuracy_score(y_test, y_pred))
    # The remaining reports share one shape: a heading line, then the metric.
    for heading, metric in (
        ("Confusion Matrix:", confusion_matrix),
        ("Classification Report:", classification_report),
    ):
        print(heading)
        print(metric(y_test, y_pred))

# Report the metrics of each trained model against the same test labels.
for model_name, predictions in (
    ("Logistic Regression", lr_pred),
    ("KNN", knn_pred),
    ("Random Forest", rf_pred),
):
    evaluate_model(model_name, y_test, predictions)


# Persist the fitted random forest together with the fitted scaler, so the
# same scaling can be applied to new inputs before calling the saved model.
joblib.dump(rf, "model.pkl")
joblib.dump(scaler, "scaler.pkl")

# The check mark was previously stored as mojibake ("âœ…": the UTF-8 bytes of
# the emoji decoded as cp1252); restored to the intended character.
print("Model & Scaler Saved Successfully ✅")
