# Week I: Setup + EDA

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np


## 2. Import Data

In [None]:
# Load the raw heart-attack dataset from a path relative to the working directory.
DATASET_PATH = "./dataset/heart_attack_prediction_dataset.csv"
df = pd.read_csv(DATASET_PATH)

## 3. Data Quality Checks

3.1 Missing Values

In [None]:
# Build the target vector `y` and feature matrix `X` from the loaded `df`,
# report missing values, and replace the textual 'Blood Pressure' column
# with two integer columns.

# --- Target (y) and features (X) ---
TARGET_COLUMN = 'Heart Attack Risk'
COLUMNS_TO_DROP = ['Patient ID']  # columns excluded from the feature set

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN, *COLUMNS_TO_DROP])

# Show the dimensions and a sample of the features to confirm the split
print("--- X (Features) Shape ---", X.shape, sep="\n")
print("\n--- y (Target) Shape ---", y.shape, sep="\n")
print("\n--- X (Features) Head (Sample) ---", X.head(), sep="\n")

# --- Missing values ---
# Only columns with at least one null are listed; an empty Series means
# the feature set has no missing values.
print("--- Missing Values Check (X) ---")
missing_values = X.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# --- Blood pressure ---
# Split each reading on '/' into two parts (first -> systolic, second ->
# diastolic), cast both to int, then drop the original text column.
BP_COLUMNS = ['Systolic BP', 'Diastolic BP']
X[BP_COLUMNS] = X['Blood Pressure'].str.split('/', expand=True)
X = X.astype(dict.fromkeys(BP_COLUMNS, int)).drop(columns='Blood Pressure')

# Confirm the new columns and the resulting dtypes
print("\n--- X Head After Blood Pressure Split ---")
print(X[BP_COLUMNS].head())
print("\n--- X Info After Blood Pressure Split ---")
X.info()

3.2 Duplicates

3.3 Data Types Consistency

3.4 Outliers (domain checks, IQR/z-score)

## 4. Exploratory Data Analysis (EDA)

4.1 Univariate Distributions (histograms, KDE, boxplots)

4.2 Bivariate Relationships (scatter, groupby stats)

4.3 Target Variable (`y`) Inspection

4.4 Class Balance (counts, %)

## 5. Feature Engineering

5.1 Domain Features

5.2 Aggregations / Ratios / Interactions

5.3 Temporal/Recency Features (if applicable)

5.4 Feature Documentation (what, why, how)

## 6. Imbalance Handling (Preliminary)

6.1 Strategy Rationale (SMOTE vs. Class Weights vs. Thresholding)

6.2 Chosen Approach & Justification

6.3 Sanity Checks (no leakage, applied only to train)

---
# Week II–III: Data Preprocessing + Model Development

## 1. Preprocessing

1.1 Train/Validation/Test Split (with stratification if needed)

In [None]:
# Feature engineering: a categorical BMI band and a composite risk ratio.

# --- 1. BMI_Category ---
# Half-open bands [lower, upper):
#   Underweight: BMI < 18.5
#   Normal:      18.5 <= BMI < 25.0
#   Overweight:  25.0 <= BMI < 30.0
#   Obese:       BMI >= 30.0
bins = [0, 18.5, 25.0, 30.0, np.inf]
labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
X['BMI_Category'] = pd.cut(X['BMI'], bins=bins, labels=labels, right=False, include_lowest=True)

# --- 2. Risk_Index ---
# (Cholesterol + Systolic BP) / Exercise Hours Per Week, with a tiny epsilon
# added to the denominator so a value of zero hours cannot divide by zero.
epsilon = 1e-6
risk_numerator = X['Cholesterol'] + X['Systolic BP']
risk_denominator = X['Exercise Hours Per Week'] + epsilon
X['Risk_Index'] = risk_numerator / risk_denominator

# Preview the inputs next to the derived features
preview_columns = ['BMI', 'BMI_Category', 'Systolic BP', 'Exercise Hours Per Week', 'Cholesterol', 'Risk_Index']
print("--- X Head with new features ---", X[preview_columns].head(), sep="\n")
print("\n--- Value Counts for BMI_Category ---", X['BMI_Category'].value_counts(), sep="\n")

In [None]:


# One-hot encode every categorical feature of X.

# 1. Identify categorical columns (includes 'object' and 'category' dtypes)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Record the shape BEFORE encoding: X is rebound to the encoded frame below,
# so reading X.shape afterwards would only ever show the encoded shape.
shape_before_encoding = X.shape

# 2. Perform one-hot encoding
# drop_first=True removes the first category level of each feature to
# prevent perfect multicollinearity between the indicator columns.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Update X to the encoded DataFrame
X = X_encoded

# Display the shapes and a sample of the new columns
print("--- X Shape Before Encoding ---")
print(shape_before_encoding)
print("\n--- X_encoded Shape After Encoding (Notice the increase in columns) ---")
print(X.shape)
print("\n--- X_encoded Head (Sample of encoded columns) ---")
# Print a few newly created columns for confirmation.
# NOTE(review): some of these suffixes may match nothing, because drop_first
# removes one level per feature -- confirm against the actual column names.
sample_suffixes = ('_Female', '_Healthy', '_Asia')
print(X[[col for col in X.columns if any(suffix in col for suffix in sample_suffixes)]].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation analysis and distribution plots for the encoded feature set.

# Re-attach the target variable y so feature/target correlations can be
# computed from a single frame (pd.concat builds a new DataFrame).
df_corr = pd.concat([X, y.rename('Heart Attack Risk')], axis=1)

# --- 1. Correlation with Target ---
# Signed correlations, sorted from most positive to most negative.
correlation_with_target = df_corr.corr()['Heart Attack Risk'].sort_values(ascending=False)

# Select the 10 features most strongly correlated with the target (excluding
# the target itself). Ranking uses the ABSOLUTE correlation so that strong
# negative relationships are not pushed to the bottom of the list.
feature_correlations = correlation_with_target.drop('Heart Attack Risk')
top_features_for_heatmap = (
    feature_correlations.abs().sort_values(ascending=False).head(10).index.tolist()
)

print("--- Top Features Correlated with Heart Attack Risk ---")
# Printed values keep their sign; only the ranking is by magnitude.
print(correlation_with_target[top_features_for_heatmap])

# --- 2. Distribution of Key Numerical Features ---
numerical_cols = ['Age', 'Cholesterol', 'Systolic BP', 'Heart Rate', 'Risk_Index']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.histplot(df_corr[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}', fontsize=12)

# The 2x3 grid has more slots than plotted features; hide the unused axes so
# the saved figure does not contain an empty frame.
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.set_visible(False)

# The distribution figure is written to disk and closed (not displayed inline).
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close()

# --- 3. Correlation Heatmap ---
# Pairwise correlations among the selected features and the target.
df_heatmap = df_corr[top_features_for_heatmap + ['Heart Attack Risk']].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(df_heatmap, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Heatmap of Top Features and Target')
plt.show()

1.2 Categorical Encoding (label / one-hot)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first, then standardize.
# The scaler must learn its mean/std from the training rows only; fitting it
# on the full X before splitting would leak test-set statistics into the
# training features. X itself is left unscaled -- use X_train / X_test.

# 1. Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # Crucial for maintaining class distribution
)

# Work on explicit copies so the column assignments below modify independent
# frames rather than slices derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# 2. Identify numerical columns
# NOTE(review): whether the one-hot indicator columns are selected here
# depends on the dtype produced by the encoding step -- confirm.
numerical_cols = X.select_dtypes(include=np.number).columns

# 3. Fit the scaler on the training set only, then apply the same
#    transformation to the test set
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# 4. Display the head of the scaled training columns
print("--- X_train Head after Standardization (Showing key numerical features) ---")
print(X_train[['Age', 'Cholesterol', 'Systolic BP', 'Heart Rate', 'Risk_Index']].head())
print("\n--- Mean and Std Dev of Scaled Columns (should be close to 0 and 1) ---")
# Check the mean and std dev for a few key training columns to confirm scaling
print(X_train[['Age', 'Cholesterol', 'Systolic BP']].agg(['mean', 'std']).round(2))

# 5. Display the shapes and class distribution
print("--- Data Splitting Results ---")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("\n--- Target Class Distribution in y_train ---")
# Check the distribution in the training set
print(y_train.value_counts(normalize=True).round(4))

In [None]:
# Assuming X_train, y_train are available from Step 8.
from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample the training split with SMOTE; the test split is not touched.
# A fixed random_state keeps the resampling reproducible.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report how the training target changed in size
print("--- After SMOTE Resampling ---")
print(f"Original training set shape (y_train): {y_train.shape}")
print(f"Resampled training set shape (y_train_res): {y_train_res.shape}")

# Report the class distribution after resampling
print("\n--- Target Class Distribution in y_train_res ---")
print(f"New counts: {Counter(y_train_res)}")
class_shares = y_train_res.value_counts(normalize=True)
print(f"Class 0 percentage: {class_shares.loc[0]:.2%}")
print(f"Class 1 percentage: {class_shares.loc[1]:.2%}")

1.3 Numeric Scaling (StandardScaler / MinMaxScaler)

1.4 Building Pipeline (ColumnTransformer + Pipeline)

## 2. Baseline Models

2.1 Logistic Regression (default settings)

2.2 Decision Tree (default settings)

2.3 Random Forest (default settings)

2.4 Fit → Predict → Evaluate (val set)

## 3. Boosting Experiments

3.1 XGBoost

3.2 LightGBM

## 4. Experiment Tracking (MLflow)

4.1 MLflow Setup (tracking URI, experiment name)

4.2 Log Params, Metrics, Artifacts

## 5. Evaluation & Diagnostics

## 6. Model Comparison

## 7. Hyperparameter Tuning

7.1 GridSearchCV / RandomizedSearchCV (CV strategy)

7.2 Best Params & CV Scores

7.3 Refit on Train+Val

## 8. Final Model Selection

8.1 Compare Tuned Models (val/test metrics)

8.2 Final Choice & Rationale

## 9. Save Preprocessing Pipeline + Model

---
# Week IV: Model Tuning + Deployment

## 1. Streamlit App

This step must be completed in a separate `app.py` file.