In [20]:
# ============================================================================
# MACHINE LEARNING ASSIGNMENT 2 - HEART DISEASE CLASSIFICATION
# ============================================================================
# Student: [Your Name]
# Course: M.Tech (AIML/DSE) - Machine Learning
# Dataset: Heart Disease UCI
# Models: 6 Classification Models
# ============================================================================

# ============================================================================
# SECTION 1: IMPORT LIBRARIES
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# XGBoost
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

print("‚úÖ All libraries imported successfully!")

# ============================================================================
# CREATE REQUIRED FOLDERS
# ============================================================================

# Directories that the save steps further down write into.
# exist_ok=True means an already-existing directory is not an error.
for output_dir in ('model', 'data'):
    os.makedirs(output_dir, exist_ok=True)

print("‚úÖ Folders created: model/, data/")

# ============================================================================
# SECTION 2: LOAD HEART DISEASE DATASET
# ============================================================================

print("\n" + "="*70)
print("üì• LOADING HEART DISEASE DATASET")
print("="*70)

# heart.csv is resolved relative to the current working directory.
# Only the read sits inside the try body; the success message is printed
# from the else branch.
try:
    df = pd.read_csv("heart.csv")
except FileNotFoundError:
    # Point the user at the download location, then re-raise so execution
    # stops here instead of continuing without a DataFrame.
    print("‚ùå heart.csv not found!")
    print("Please download from: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset")
    raise
else:
    print("‚úÖ Dataset loaded successfully from: heart.csv")

# ============================================================================
# DATASET VERIFICATION
# ============================================================================

print("\n" + "="*70)
print("üìã DATASET VERIFICATION - ASSIGNMENT REQUIREMENTS")
print("="*70)

# One column is the target, so the feature count is the column count minus one.
instances_count, total_columns = df.shape
features_count = total_columns - 1

print(f"Dataset Shape: {df.shape}")
print(f"Rows (Instances): {instances_count}")
print(f"Columns (Features + Target): {total_columns}")
print(f"Features (excluding target): {features_count}")

# Thresholds checked below: at least 12 features and at least 500 instances.
features_ok = features_count >= 12
instances_ok = instances_count >= 500

print(f"\nüìä REQUIREMENT CHECK:")
print(f"   Minimum Features Required: 12")
print(f"   Your Dataset Features: {features_count}")
print(f"   Status: {'‚úÖ PASS' if features_ok else '‚ùå FAIL'}")

print(f"\n   Minimum Instances Required: 500")
print(f"   Your Dataset Instances: {instances_count}")
print(f"   Status: {'‚úÖ PASS' if instances_ok else '‚ùå FAIL'}")

# This is informational only: a failing check prints a warning and the
# script carries on.
if not (features_ok and instances_ok):
    print("\n‚ö†Ô∏è WARNING: Dataset does not meet requirements!")
else:
    print("\nüéâ DATASET MEETS ALL ASSIGNMENT REQUIREMENTS!")

print("="*70)

# ============================================================================
# DATASET OVERVIEW
# ============================================================================

print("\nüìä FIRST 5 ROWS:")
print(df.head())

print("\nüìã COLUMN NAMES:")
print(df.columns.tolist())

print("\nüìà DATA TYPES:")
print(df.dtypes)

print("\n‚ùì MISSING VALUES:")
# Per-column null counts, then the grand total across all columns.
missing = df.isna().sum()
total_missing = missing.sum()

if total_missing:
    # List only the columns that actually contain nulls.
    print(f"‚ö†Ô∏è Found {total_missing} missing values:")
    print(missing[missing > 0])
else:
    print("‚úÖ No missing values found - Dataset is clean!")

print("\nüéØ TARGET VARIABLE DISTRIBUTION:")
target_column = df['target']
print(target_column.value_counts().sort_index())
print("\nPercentage:")
print(target_column.value_counts(normalize=True).round(4) * 100)

print("\n‚úÖ SECTION 2 COMPLETE - DATASET LOADED & VERIFIED!")
print("="*70)

# ============================================================================
# SECTION 3: DATA PREPROCESSING & TRAIN-TEST SPLIT
# ============================================================================

print("\n" + "="*70)
print("‚öôÔ∏è DATA PREPROCESSING")
print("="*70)

# ============================================================================
# STEP 1: SEPARATE FEATURES AND TARGET
# ============================================================================

print("\n1Ô∏è‚É£ FEATURE-TARGET SEPARATION:")
print("-"*70)

# y is the 'target' column; X is every other column of df.
y = df['target']
X = df.drop(columns='target')

print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")

# ============================================================================
# STEP 2: TRAIN-TEST SPLIT (BEFORE SCALING!)
# ============================================================================

print("\n2Ô∏è‚É£ TRAIN-TEST SPLIT (BEFORE SCALING):")
print("-"*70)
print("‚ö†Ô∏è CRITICAL: Splitting BEFORE scaling to prevent data leakage!")

# 80/20 split with a fixed seed; stratify=y keeps the class proportions of y
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n‚úÖ Split completed:")
# Labels are left-padded to a common width so the sample counts line up.
for split_label, split_features in (("Training set:", X_train), ("Test set:", X_test)):
    split_rows = split_features.shape[0]
    print(f"   {split_label:<13} {split_rows} samples ({split_rows/len(df)*100:.1f}%)")

# Verify class balance maintained
print(f"\nüìä Class distribution check:")
for split_label, split_target in (("Original:", y), ("Training:", y_train), ("Test:", y_test)):
    print(f"   {split_label:<11}{split_target.value_counts().to_dict()}")

# ============================================================================
# STEP 3: FEATURE SCALING (AFTER SPLIT!)
# ============================================================================

print("\n3Ô∏è‚É£ FEATURE SCALING (AFTER SPLIT):")
print("-"*70)
print("‚úÖ Scaling AFTER split - No data leakage!")

scaler = StandardScaler()

# The scaler is fitted on the training partition only. The array result is
# wrapped straight back into a DataFrame that keeps the original column
# names and row index.
print("\n   a) Fitting scaler on TRAINING data only...")
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X.columns,
    index=X_train.index,
)
print("      ‚úÖ Scaler learned mean & std from training data")

# The test partition is only transformed, using the already-fitted scaler.
print("\n   b) Transforming TEST data using training statistics...")
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)
print("      ‚úÖ Test data transformed (no refitting)")

# Sanity check: average of the per-column means / standard deviations.
train_mean_of_means = X_train_scaled.mean().mean()
train_mean_of_stds = X_train_scaled.std().mean()
print(f"\nüìä Scaling Verification:")
print(f"   Training data mean: {train_mean_of_means:.6f} (should be ~0)")
print(f"   Training data std:  {train_mean_of_stds:.6f} (should be ~1)")

print("\n‚úÖ Feature scaling completed successfully!")

# ============================================================================
# STEP 4: SAVE PREPROCESSING OBJECTS FOR STREAMLIT
# ============================================================================

print("\n4Ô∏è‚É£ SAVING FOR STREAMLIT APP:")
print("-"*70)

# Persist the fitted scaler so the same transformation can be reapplied later.
joblib.dump(scaler, 'model/scaler.pkl')
print("‚úÖ Scaler saved: model/scaler.pkl")

# Persist the feature column names in their original order.
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'model/feature_names.pkl')
print("‚úÖ Feature names saved: model/feature_names.pkl")

# First 100 rows of the SCALED test features with their labels appended as a
# trailing 'target' column.
# NOTE(review): these rows are already standardized. If the Streamlit app
# applies model/scaler.pkl to uploaded files, feeding this CSV back in would
# scale it twice - confirm against the app.
test_sample = X_test_scaled.head(100).assign(target=y_test.head(100).values)
test_sample.to_csv('data/test_data.csv', index=False)
print("‚úÖ Test sample saved: data/test_data.csv")

# ============================================================================
# STEP 5: INITIALIZE RESULTS STORAGE
# ============================================================================

print("\n5Ô∏è‚É£ INITIALIZING RESULTS STORAGE:")
print("-"*70)

# Column-oriented accumulator for the comparison table: one empty list per
# column, filled in by the training loop.
results = {
    column: []
    for column in ('Model', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC')
}

# Per-model artefacts, each keyed by model name.
trained_models = dict()
model_predictions = dict()
model_confusion_matrices = dict()
model_classification_reports = dict()

print("‚úÖ Results storage initialized")

print("\n" + "="*70)
print("‚úÖ PREPROCESSING COMPLETE - NO DATA LEAKAGE!")
print("="*70)
print("\nüöÄ Ready for model training!")

# ============================================================================
# SECTION 4: TRAIN ALL 6 CLASSIFICATION MODELS
# ============================================================================

print("\n" + "="*70)
print("ü§ñ TRAINING 6 CLASSIFICATION MODELS")
print("="*70)

# ============================================================================
# DEFINE ALL MODELS (WITH PROPER REGULARIZATION TO AVOID OVERFITTING)
# ============================================================================

# Unfitted estimators keyed by display name. The key is what the training
# loop prints and what the save step turns into the pickle filename.
# random_state is pinned wherever the estimator accepts one so runs are
# reproducible.
models_to_train = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=5,          # Shallow trees to limit overfitting
        min_samples_split=10, # Regularization: minimum samples to split a node
        min_samples_leaf=5    # Regularization: minimum samples per leaf
    ),
    # use_label_encoder is deliberately not passed: the flag is deprecated in
    # XGBoost and current releases ignore it, emitting an "unused parameter"
    # warning on every fit.
    'XGBoost': XGBClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=3,          # Shallow trees to limit overfitting
        learning_rate=0.1,    # Shrinkage applied to each boosting step
        eval_metric='logloss'
    )
}

# ============================================================================
# TRAINING FUNCTION
# ============================================================================

def train_and_evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Progress, the six metrics, the confusion matrix and the classification
    report are printed as they are produced.

    Args:
        model_name: Display name used in the printed output.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            It is fitted in place.
        X_train: Training features passed to ``fit``.
        X_test: Held-out features used for prediction.
        y_train: Training labels passed to ``fit``.
        y_test: Held-out labels the predictions are scored against.

    Returns:
        dict with the keys ``model`` (the fitted estimator), ``metrics``
        (dict of Accuracy, AUC, Precision, Recall, F1, MCC), ``predictions``,
        ``probabilities`` (column 1 of ``predict_proba``),
        ``confusion_matrix`` and ``classification_report`` (text report).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"üîÑ Training: {model_name}")
    print(f"{banner}")

    model.fit(X_train, y_train)
    print(f"‚úÖ Model trained successfully")

    # Hard labels feed the threshold-based metrics; the second probability
    # column feeds AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_pred_proba),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    # (printed label, key in `metrics`); labels are padded to one width so
    # the values line up in a column.
    print(f"\nüìä Evaluation Metrics:")
    metric_rows = (
        ('Accuracy', 'Accuracy'),
        ('AUC', 'AUC'),
        ('Precision', 'Precision'),
        ('Recall', 'Recall'),
        ('F1 Score', 'F1'),
        ('MCC', 'MCC'),
    )
    for shown_label, metric_key in metric_rows:
        print(f"   {shown_label + ':':<11}{metrics[metric_key]:.4f}")

    # Flatten the 2x2 matrix row by row: [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print(f"\nüî¢ Confusion Matrix:")
    print(f"   TN: {tn:3d}  |  FP: {fp:3d}")
    print(f"   FN: {fn:3d}  |  TP: {tp:3d}")

    report = classification_report(y_test, y_pred, target_names=['No Disease', 'Disease'])
    print(f"\nüìã Classification Report:")
    print(report)

    print(f"\n‚úÖ {model_name} completed!")

    return dict(
        model=model,
        metrics=metrics,
        predictions=y_pred,
        probabilities=y_pred_proba,
        confusion_matrix=cm,
        classification_report=report,
    )


‚úÖ All libraries imported successfully!
‚úÖ Folders created: model/, data/

üì• LOADING HEART DISEASE DATASET
‚úÖ Dataset loaded successfully from: heart.csv

üìã DATASET VERIFICATION - ASSIGNMENT REQUIREMENTS
Dataset Shape: (1025, 14)
Rows (Instances): 1025
Columns (Features + Target): 14
Features (excluding target): 13

üìä REQUIREMENT CHECK:
   Minimum Features Required: 12
   Your Dataset Features: 13
   Status: ‚úÖ PASS

   Minimum Instances Required: 500
   Your Dataset Instances: 1025
   Status: ‚úÖ PASS

üéâ DATASET MEETS ALL ASSIGNMENT REQUIREMENTS!

üìä FIRST 5 ROWS:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   6

In [21]:
# ============================================================================
# TRAIN ALL MODELS
# ============================================================================

print("\nüöÄ Starting training pipeline for all 6 models...")
print(f"Training set size: {X_train_scaled.shape[0]} samples")
print(f"Test set size: {X_test_scaled.shape[0]} samples")

model_results = {}

# `results` is created outside this cell and only appended to below. Empty
# its lists first so that re-running this cell does not add a second copy of
# every model row to the comparison table. The lists are cleared in place so
# the `results` object itself stays the same.
for accumulated_values in results.values():
    accumulated_values.clear()

for idx, (model_name, model) in enumerate(models_to_train.items(), 1):
    print(f"\n{'#'*70}")
    print(f"MODEL {idx}/6: {model_name.upper()}")
    print(f"{'#'*70}")

    # Fit on the scaled training split and evaluate on the scaled test split.
    result = train_and_evaluate_model(
        model_name=model_name,
        model=model,
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test
    )

    # Full result bundle, keyed by model name.
    model_results[model_name] = result

    # One row of the comparison table, each metric rounded to 4 decimals.
    results['Model'].append(model_name)
    for metric_name in ('Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'):
        results[metric_name].append(round(result['metrics'][metric_name], 4))

    # Per-model artefacts. These dicts are keyed by model name, so a re-run
    # overwrites the previous entry rather than duplicating it.
    trained_models[model_name] = result['model']
    model_predictions[model_name] = result['predictions']
    model_confusion_matrices[model_name] = result['confusion_matrix']
    model_classification_reports[model_name] = result['classification_report']

print("\n" + "="*70)
print("‚úÖ ALL 6 MODELS TRAINED SUCCESSFULLY!")
print("="*70)

# ============================================================================
# CREATE RESULTS COMPARISON TABLE (REQUIRED FOR README)
# ============================================================================

print("\n" + "="*70)
print("üìä MODEL COMPARISON TABLE (ALL 6 MODELS)")
print("="*70)

# One row per model, one column per key of `results`.
results_df = pd.DataFrame(results)
comparison_text = results_df.to_string(index=False)
print(f"\n{comparison_text}")

# Write the table to disk without the index column.
results_df.to_csv('model/model_comparison.csv', index=False)
print("\n‚úÖ Comparison table saved: model/model_comparison.csv")

# ============================================================================
# IDENTIFY BEST PERFORMING MODELS
# ============================================================================

print("\n" + "="*70)
print("üèÜ BEST PERFORMING MODELS BY METRIC")
print("="*70)

metric_columns = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']

# For each metric, report the row holding the highest value. idxmax returns
# the first such row when several models tie.
for metric in metric_columns:
    best_idx = results_df[metric].idxmax()
    best_model = results_df.at[best_idx, 'Model']
    best_score = results_df.at[best_idx, metric]
    print(f"{metric:12} : {best_model:25} ({best_score:.4f})")

# Find overall best model (by average)
print("\n" + "="*70)
print("ü•á OVERALL BEST MODEL (by average performance)")
print("="*70)

# Unweighted row-wise mean of the six metrics, stored as a new 'Average'
# column on results_df itself.
results_df['Average'] = results_df[metric_columns].mean(axis=1)
best_overall_idx = results_df['Average'].idxmax()
best_overall_model = results_df.at[best_overall_idx, 'Model']
best_overall_score = results_df.at[best_overall_idx, 'Average']

print(f"Best Model: {best_overall_model}")
print(f"Average Score: {best_overall_score:.4f}")


print("\n" + "="*70)
print("üíæ SAVING REQUIRED FILES FOR STREAMLIT APP")
print("="*70)

# 1. Save all 6 models (REQUIRED)
# The filename is the lower-cased model name with spaces and hyphens turned
# into underscores, e.g. 'K-Nearest Neighbors' -> 'k_nearest_neighbors.pkl'.
separators_to_underscore = str.maketrans(' -', '__')
for model_name, model_obj in trained_models.items():
    filename = f"{model_name.lower().translate(separators_to_underscore)}.pkl"
    filepath = os.path.join('model', filename)
    joblib.dump(model_obj, filepath)
    print(f"‚úÖ Saved: {filepath}")

# 2. Save scaler (CRITICAL - needed to scale uploaded data in Streamlit)
joblib.dump(scaler, 'model/scaler.pkl')
print("‚úÖ Saved: model/scaler.pkl")

# 3. Save feature names (HELPFUL - for validation)
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'model/feature_names.pkl')
print("‚úÖ Saved: model/feature_names.pkl")

# 4. Save comparison table (REQUIRED - for README.md)
# NOTE: by this point results_df carries the 'Average' column added in the
# best-model step of this cell, so this overwrite of the CSV includes it.
results_df.to_csv('model/model_comparison.csv', index=False)
print("‚úÖ Saved: model/model_comparison.csv")

# 5. Save sample test data (OPTIONAL - for demo)
# First 100 rows of the unscaled X_test with their labels appended as a
# trailing 'target' column.
test_sample = X_test.head(100).assign(target=y_test.head(100).values)
test_sample.to_csv('data/test_sample.csv', index=False)
print("‚úÖ Saved: data/test_sample.csv (for demo purposes)")

print("\n‚úÖ All required files saved!")




üöÄ Starting training pipeline for all 6 models...
Training set size: 820 samples
Test set size: 205 samples

######################################################################
MODEL 1/6: LOGISTIC REGRESSION
######################################################################

üîÑ Training: Logistic Regression
‚úÖ Model trained successfully

üìä Evaluation Metrics:
   Accuracy:  0.8098
   AUC:       0.9298
   Precision: 0.7619
   Recall:    0.9143
   F1 Score:  0.8312
   MCC:       0.6309

üî¢ Confusion Matrix:
   TN:  70  |  FP:  30
   FN:   9  |  TP:  96

üìã Classification Report:
              precision    recall  f1-score   support

  No Disease       0.89      0.70      0.78       100
     Disease       0.76      0.91      0.83       105

    accuracy                           0.81       205
   macro avg       0.82      0.81      0.81       205
weighted avg       0.82      0.81      0.81       205


‚úÖ Logistic Regression completed!

#################################