# In [None]:
# --------------------------------------------
# Task 3: Heart Disease Prediction
# Objective:
# Build a model to predict whether a person is at risk of heart disease
# based on their health data from the UCI Heart Disease dataset.
# --------------------------------------------

# 📦 Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

# --------------------------------------------
# 📂 Step 2: Load Dataset
# --------------------------------------------
# Read the raw CSV into a DataFrame, then report its size and preview the
# first rows (`display` is provided by the notebook environment).
DATA_PATH = "data/heart.csv"
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
display(df.head(5))

# --------------------------------------------
# 🧹 Step 3: Data Cleaning
# --------------------------------------------
# Drop exact duplicate rows first, then any row that still contains a missing
# value, so every later step works on complete records only.
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Collapse the target 'num' to binary (0 = no disease, 1 = disease).
# The `> 0` threshold is applied to every numeric target, not only when more
# than two distinct values are present: a target left with e.g. {0, 3} after
# the row drops above would otherwise reach the models un-binarised.
# For a target that is already 0/1 this is a no-op. A non-numeric target with
# at most two values is left alone here and label-encoded below.
if pd.api.types.is_numeric_dtype(df['num']) or df['num'].nunique() > 2:
    df['num'] = (df['num'] > 0).astype(int)

print("\nDataset Info:")
df.info()

print("\nSummary Statistics:")
display(df.describe())

# Encode categorical variables (if any): every object-dtype column is replaced
# in place by integer codes. The encoder is re-fitted for each column, so
# codes are independent between columns.
labelencoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = labelencoder.fit_transform(df[col])

# --------------------------------------------
# 📊 Step 4: Exploratory Data Analysis (EDA)
# --------------------------------------------

# Correlation heatmap of all pairwise column correlations, annotated with
# the coefficient values.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Histograms: one panel per column.
df.hist(figsize=(15, 10), bins=20, color='skyblue', edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# Boxplots for outlier detection: one panel per column.
# The grid keeps 4 columns and at least 4 rows (the original 4x4 layout), but
# grows extra rows when the frame has more than 16 columns; a fixed 4x4 grid
# makes plt.subplot raise ValueError on the 17th panel.
n_grid_cols = 4
n_grid_rows = max(4, -(-len(df.columns) // n_grid_cols))  # ceiling division
# 2.5 inches per row reproduces the original 15x10 figure for a 4-row grid.
plt.figure(figsize=(15, 2.5 * n_grid_rows))
for i, col in enumerate(df.columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(y=df[col], color='lightgreen')
    plt.title(col)
plt.tight_layout()
plt.show()

# --------------------------------------------
# 🔄 Step 5: Prepare Data for Modeling
# --------------------------------------------
# Separate the predictors from the binary target column 'num'.
X = df.drop('num', axis=1)
y = df['num']

# One-hot encode if needed (drop_first avoids a redundant indicator column
# for each encoded feature).
X = pd.get_dummies(X, drop_first=True)

# Train-test split (80/20, fixed seed for reproducibility).
# The split is stratified on the target so train and test keep the same class
# ratio; with a test set this small an unstratified draw can skew the ratio
# and distort accuracy/AUC. Stratification needs at least two rows per class,
# so fall back to a plain random split when that does not hold.
min_class_count = y.value_counts().min()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if min_class_count >= 2 else None,
)

# --------------------------------------------
# 🤖 Step 6: Train Models
# --------------------------------------------

# Fit both classifiers on the training split. `fit` returns the estimator
# itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out split, one array per model.
y_pred_log = log_reg.predict(X_test)
y_pred_tree = tree.predict(X_test)

# --------------------------------------------
# 📈 Step 7: Model Evaluation
# --------------------------------------------
def evaluate_model(name, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for a model.

    Args:
        name: Label printed in the "Model:" header line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values to score against ``y_true``.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\nModel: {name}")
    # (heading, metric function) pairs, evaluated and printed in this order.
    report_sections = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for heading, metric in report_sections:
        print(heading, metric(y_true, y_pred))

# Text reports for both fitted models on the held-out split.
for model_name, predictions in (
    ("Logistic Regression", y_pred_log),
    ("Decision Tree", y_pred_tree),
):
    evaluate_model(model_name, y_test, predictions)

# ROC curve for Logistic Regression, built from the predicted probability of
# the positive class (second column of predict_proba).
y_pred_prob_log = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_log)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f"Logistic Regression (AUC = {roc_auc:.2f})")
roc_ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()

# Feature importance from Decision Tree, labelled by feature name and sorted
# from most to least important.
importances = pd.Series(
    tree.feature_importances_, index=X.columns
).sort_values(ascending=False)

imp_fig, imp_ax = plt.subplots(figsize=(8, 5))
importances.plot(kind='bar', color='orange', edgecolor='black', ax=imp_ax)
imp_ax.set_title("Feature Importance (Decision Tree)")
plt.show()

# --------------------------------------------
# 📝 Step 8: Final Insights
# --------------------------------------------
# The summary is derived from this run's metrics instead of being hard-coded:
# fixed text such as "reasonable accuracy" / "good discriminatory power" would
# be printed even when the models perform poorly.
acc_log = accuracy_score(y_test, y_pred_log)
acc_tree = accuracy_score(y_test, y_pred_tree)

# Rule-of-thumb wording for the AUC. The cut-offs are a reporting convention
# chosen here, not something derived from the data.
if roc_auc >= 0.8:
    auc_verdict = "good"
elif roc_auc >= 0.7:
    auc_verdict = "fair"
else:
    auc_verdict = "weak"

print("\nInsights:")
print(
    f"1. Test accuracy: Logistic Regression = {acc_log:.3f}, "
    f"Decision Tree = {acc_tree:.3f}; "
    "Logistic Regression is the more interpretable of the two."
)
print(
    f"2. ROC AUC of {roc_auc:.2f} suggests the Logistic Regression model "
    f"has {auc_verdict} discriminatory power."
)
# Top five features by Decision Tree importance (importances is sorted descending).
print("3. Key features influencing prediction (Decision Tree):", list(importances.index[:5]))
print("4. Further optimization (hyperparameter tuning) could improve accuracy.")
