# Part C: Early Disease Detection (Heart Disease)

Classification task to predict **heart disease** presence (`disease` = 1/0). Includes EDA, preprocessing, Logistic Regression / Decision Tree / Random Forest, and evaluation with **Accuracy**, **Precision**, **Recall**, **F1**, **ROC-AUC**, and **Confusion Matrix**.


# Setup

This notebook imports the libraries used throughout the analysis.  
If any of them are missing in your environment, uncomment and run the install cell below.


In [None]:

# If needed, uncomment to install dependencies in your environment
# %pip install pandas numpy scikit-learn matplotlib seaborn joblib openpyxl


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay)
import joblib

# Widen pandas' rendering limits so wide tables are shown without truncation.
pd.options.display.max_columns = 100
pd.options.display.width = 120


In [None]:

def read_data_flex(path_or_url: str):
    """Load a CSV or Excel file from a local path or URL into a DataFrame.

    Google Sheets browser links (``.../spreadsheets/d/<key>/edit...``) are
    rewritten to the sheet's CSV export URL. If the link carries a ``gid``
    (the id of a specific tab), it is forwarded so that tab is exported
    instead of silently falling back to the first one.

    Args:
        path_or_url: Local path or URL. Surrounding whitespace is ignored.
            Paths ending in ``.xlsx``/``.xls`` are read with
            ``pd.read_excel``; everything else with ``pd.read_csv``.

    Returns:
        The loaded ``pd.DataFrame``.

    Raises:
        RuntimeError: If reading fails for any reason. The original
            exception is chained as ``__cause__``.
    """
    import re  # local import keeps this helper self-contained

    p = str(path_or_url).strip()
    # Try to convert a Google Sheets view/edit link to its CSV export URL.
    if "docs.google.com/spreadsheets" in p and "/edit" in p and "export?format=csv" not in p:
        key_match = re.search(r"/d/([^/?#]+)", p)
        # Without a recognisable key the URL is left untouched; the read below
        # then reports the failure instead of an IndexError escaping from here.
        if key_match:
            export_url = (
                f"https://docs.google.com/spreadsheets/d/{key_match.group(1)}/export?format=csv"
            )
            gid_match = re.search(r"[?&#]gid=(\d+)", p)
            if gid_match:
                export_url += f"&gid={gid_match.group(1)}"
            p = export_url
    try:
        if p.lower().endswith((".xlsx", ".xls")):
            df = pd.read_excel(p)
        else:
            df = pd.read_csv(p)
    except Exception as e:
        # Chain the cause so the underlying traceback is preserved.
        raise RuntimeError(f"Failed to load data from '{path_or_url}'. Error: {e}") from e
    return df

def quick_info(df: pd.DataFrame, name: str = "Data"):
    """Print and display a quick overview of ``df``.

    Shows, in order: the shape (labelled with ``name``), the first rows,
    summary statistics for all columns (one row per column), the number of
    missing values per column, and each column's dtype.

    Relies on ``display`` being available, as it is inside a notebook.
    """
    print(f"\n{name} shape: {df.shape}\n")

    preview = df.head()
    display(preview)

    # Transposed so each original column becomes a row of statistics.
    summary = df.describe(include='all').T
    display(summary)

    print("\nMissing values per column:\n")
    missing_counts = df.isna().sum()
    display(missing_counts.to_frame('missing'))

    print("\nDtypes:\n")
    dtype_table = df.dtypes.to_frame('dtype')
    display(dtype_table)

def numeric_categorical_columns(df: pd.DataFrame, exclude_target: str = None):
    """Split the column names of ``df`` into numeric and categorical lists.

    Numeric columns are those selected by ``np.number``; categorical columns
    are those with ``object``, ``category`` or ``bool`` dtype. Columns of any
    other dtype appear in neither list.

    Args:
        df: Frame whose columns are classified.
        exclude_target: Optional column name to leave out of the result
            (typically the label column). Ignored if falsy or not present.

    Returns:
        A ``(numeric_names, categorical_names)`` tuple of lists.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    if exclude_target:
        for names in (numeric, categorical):
            if exclude_target in names:
                names.remove(exclude_target)

    return numeric, categorical


## 1) Load Data

In [None]:

# === User input ===
# Source of the dataset: either a Google Sheets link or a local CSV/Excel path.
# The value below is an "/edit" sharing link, which read_data_flex rewrites
# to the sheet's CSV export URL before reading it.
DATA_PATH_OR_URL = "https://docs.google.com/spreadsheets/d/10k1aWfHrvoXJrHxMz6F2nfK5-UlkCmQ8EjhUt9Casso/edit?usp=sharing"

# Load the raw data, then show its shape, first rows, summary statistics,
# missing-value counts and dtypes.
df = read_data_flex(DATA_PATH_OR_URL)
quick_info(df, "Raw Data")


## 2) Target & EDA

- Expected target column: **`disease`** (1 = has heart disease, 0 = no disease)
- Basic distributions and class balance.

In [None]:

TARGET = "disease"  # change if needed
# Raise explicitly instead of asserting: assertions are stripped under `python -O`.
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found. Please update TARGET.")

# Ensure a binary 0/1 integer target.
if not pd.api.types.is_numeric_dtype(df[TARGET]):
    # Text-like targets (object, string or category dtype) are normalised and
    # mapped to 0/1. Series.map yields NaN for labels missing from the mapping,
    # so those are reported by name instead of failing later in astype(int).
    label_map = {'yes': 1, 'no': 0, '1': 1, '0': 0,
                 'true': 1, 'false': 0, '1.0': 1, '0.0': 0}
    labels = df[TARGET].astype(str).str.strip().str.lower()
    encoded = labels.map(label_map)
    if encoded.isna().any():
        unknown = sorted(labels[encoded.isna()].unique())
        raise ValueError(
            f"Target column '{TARGET}' contains values that cannot be mapped to 0/1: {unknown}"
        )
    df[TARGET] = encoded
elif df[TARGET].isna().any():
    raise ValueError(
        f"Target column '{TARGET}' contains missing values; drop or impute them first."
    )

# Anything other than 0/1 would be silently truncated by astype(int) or break
# the binary metrics used later, so fail early with a clear message.
if not set(df[TARGET].unique()) <= {0, 1}:
    raise ValueError(
        f"Target column '{TARGET}' must be binary (0/1); found {sorted(df[TARGET].unique())}"
    )
df[TARGET] = df[TARGET].astype(int)

print("Class balance:")
display(df[TARGET].value_counts(normalize=True).to_frame('proportion'))

# Histograms of every numeric column (the encoded target included).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    df[num_cols].hist(bins=30, figsize=(14, 10))
    plt.tight_layout()
    plt.show()


## 3) Train/Test Split & Preprocessing

In [None]:

# Separate the label from the predictors.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for testing, keeping the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

num_cols, cat_cols = numeric_categorical_columns(X)

# Numeric features: fill gaps with the median, then standardise.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own pipeline.
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


## 4) Train Models, Tune & Evaluate

In [None]:

# Candidate classifiers and the hyper-parameter grid searched for each one.
# Grid keys use the "model__" prefix because the classifier is the pipeline
# step named "model".
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42)
}

param_grid = {
    "LogisticRegression": {"model__C": [0.1, 1.0, 10.0]},
    "DecisionTree": {"model__max_depth": [3, 5, 10, None]},
    "RandomForest": {"model__n_estimators": [100, 300], "model__max_depth": [None, 5, 10]}
}

eval_rows = []
best_pipes = {}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocess),
                          ("model", model)])
    # 5-fold cross-validated search on the training split only, optimising F1.
    grid = GridSearchCV(pipe, param_grid=param_grid[name], cv=5, scoring="f1", n_jobs=-1)
    grid.fit(X_train, y_train)
    best_pipe = grid.best_estimator_
    best_pipes[name] = best_pipe

    y_pred = best_pipe.predict(X_test)
    y_prob = best_pipe.predict_proba(X_test)[:, 1] if hasattr(best_pipe, "predict_proba") else None

    # zero_division=0 on all three ratio metrics keeps them consistent and
    # avoids warnings when a model predicts no positives.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

    eval_rows.append([name, acc, prec, rec, f1, roc, grid.best_score_, grid.best_params_])

eval_df = pd.DataFrame(
    eval_rows,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1", "ROC_AUC", "CV_F1", "BestParams"],
)

# Rank and select on the cross-validated F1 (computed from training data only).
# Picking the winner by its test-set score would make the reported test metrics
# optimistically biased, because the test set would then also be used for
# model selection.
ranking = eval_df.sort_values("CV_F1", ascending=False)
display(ranking)

best_name = ranking.iloc[0]["Model"]
best_model = best_pipes[best_name]

# Confusion Matrix
y_pred_best = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best)
ConfusionMatrixDisplay(cm).plot()
plt.title(f"Confusion Matrix - {best_name}")
plt.show()

# ROC
if hasattr(best_model, "predict_proba"):
    RocCurveDisplay.from_estimator(best_model, X_test, y_test)
    plt.title(f"ROC Curve - {best_name}")
    plt.show()

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(best_model, "partC_best_model.joblib")
print(f"Saved best Part C model as 'partC_best_model.joblib' ({best_name}).")


## 5) Feature Importance / Coefficients

In [None]:

# Show top features for interpretability
final_model = best_model.named_steps["model"]
pre = best_model.named_steps["preprocess"]

# Retrieve feature names after preprocessing.
# Prefer asking the fitted preprocessor itself: a hand-built list can disagree
# with what the model actually received (for example when the imputer drops a
# numeric column that was entirely missing during fitting).
try:
    # ColumnTransformer prefixes each name with "<transformer>__" (here "num__"
    # or "cat__"); strip it so the labels match the original column names.
    feat_names = [str(n).split("__", 1)[-1] for n in pre.get_feature_names_out()]
except AttributeError:
    # Fallback for scikit-learn versions where a pipeline step does not
    # provide get_feature_names_out: rebuild the names from the column lists.
    num_cols, cat_cols = numeric_categorical_columns(df.drop(columns=[TARGET]))
    feat_names = list(num_cols)
    if cat_cols:
        oh = pre.named_transformers_["cat"].named_steps["onehot"]
        feat_names.extend(oh.get_feature_names_out(cat_cols).tolist())


def _aligned_names(names, n_weights):
    """Return ``names`` if it has ``n_weights`` entries, else generic placeholders."""
    if len(names) == n_weights:
        return names
    print(f"Warning: {len(names)} feature names for {n_weights} model weights; "
          "using generic feature names.")
    return [f"feature_{i}" for i in range(n_weights)]


if hasattr(final_model, "coef_"):
    # Linear model: show the most positive and the most negative coefficients.
    weights = final_model.coef_.ravel()
    coefs = pd.Series(weights, index=_aligned_names(feat_names, len(weights)))
    display(coefs.sort_values(ascending=False).head(15).to_frame("coef"))
    display(coefs.sort_values().head(15).to_frame("coef"))
elif hasattr(final_model, "feature_importances_"):
    # Tree-based model: show the largest importances.
    weights = final_model.feature_importances_
    importances = pd.Series(weights, index=_aligned_names(feat_names, len(weights)))
    display(importances.sort_values(ascending=False).head(20).to_frame("importance"))
else:
    print("Model does not expose coefficients/feature_importances.")


## 6) Predict on New Data (Optional)

In [None]:

# Optional: Predict on new data with same schema as training features
# NEW_DATA_PATH = "path/to/heart_new_data.csv"
# new_df = read_data_flex(NEW_DATA_PATH)
# preds = best_model.predict(new_df)
# out = new_df.copy()
# out["disease_pred"] = preds
# out.to_csv("partC_predictions.csv", index=False)
# print("Saved predictions to 'partC_predictions.csv'")


## 7) Conclusions

- Choose the model with the highest F1 / ROC-AUC.
- Use the feature importances or coefficients from Section 5 to discuss the strongest correlates of disease.