In [None]:
# Final recap cell: what the project delivers, how to run it, and what to do next.
# All content lives in plain data structures; the loops below only handle layout.
_rule = "=" * 70

# Completed work grouped by pipeline area; each list is rendered as a small tree.
_completed = {
    "DATA MANAGEMENT": [
        "Dataset download and loading",
        "Data cleaning and preprocessing",
        "Train-test split with stratification",
        "Feature scaling and normalization",
    ],
    "MODEL DEVELOPMENT": [
        "Logistic Regression classifier",
        "Random Forest classifier",
        "Hyperparameter tuning (GridSearchCV)",
        "Cross-validation evaluation",
        "Comprehensive metrics calculation",
    ],
    "EXPERIMENT TRACKING": [
        "MLflow integration",
        "Parameter and metric logging",
        "Model artifact storage",
        "Experiment comparison",
    ],
    "MODEL SERVING": [
        "FastAPI application",
        "Single & batch prediction endpoints",
        "Input validation (Pydantic)",
        "Interactive API documentation",
    ],
    "TESTING & CI/CD": [
        "Unit tests (preprocessing, models, API)",
        "Code quality checks (flake8, black)",
        "Test coverage reporting",
        "GitHub Actions CI/CD pipeline",
    ],
    "CONTAINERIZATION & DEPLOYMENT": [
        "Docker container with multi-stage builds",
        "Docker Compose for local development",
        "Kubernetes deployment manifests",
        "Horizontal Pod Autoscaling",
        "Service mesh and ingress configuration",
    ],
    "MONITORING & LOGGING": [
        "Structured JSON logging",
        "Prometheus metrics collection",
        "Grafana dashboards",
        "Health checks and alerting",
    ],
}

# Quick-start recipes: (numbered heading, shell commands to show under it).
_quick_start = [
    ("1. LOCAL DEVELOPMENT:", [
        "pip install -r requirements.txt",
        "python scripts/train_model.py",
        "python -m uvicorn src.api.app:app --port 8000",
    ]),
    ("2. DOCKER DEPLOYMENT:", [
        "docker build -f docker/Dockerfile -t heart-disease-api:latest .",
        "docker-compose -f docker/docker-compose.yml up -d",
    ]),
    ("3. KUBERNETES DEPLOYMENT:", [
        "kubectl apply -f k8s/deployment.yaml",
        "kubectl apply -f monitoring/prometheus-grafana.yaml",
    ]),
    ("4. TESTING:", [
        "pytest tests/ -v --cov=src",
    ]),
    ("5. MONITORING:", [
        "mlflow ui --backend-store-uri file:mlruns",
        "kubectl port-forward svc/grafana 3000:3000",
    ]),
]

_documentation = [
    "README.md - Project overview and usage",
    "INSTALLATION.md - Detailed setup instructions",
    "DEPLOYMENT.md - Deployment guide",
    "Makefile - Automation commands",
]

_next_steps = [
    "Push code to GitHub repository",
    "Configure GitHub Actions secrets if needed",
    "Deploy to cloud platform (GKE, EKS, AKS)",
    "Monitor via Grafana dashboard",
    "Set up automated retraining pipeline",
]

print(f"\n{_rule}\nHEART DISEASE PREDICTION - MLOPS PIPELINE SUMMARY\n{_rule}")

print("\n✓ COMPLETED COMPONENTS:")
for _area, _items in _completed.items():
    print(f"\n  {_area}")
    # Every entry but the last gets a branch glyph; the last one closes the tree.
    for _item in _items[:-1]:
        print(f"  ├─ {_item}")
    print(f"  └─ {_items[-1]}")

print(f"\n{_rule}\nQUICK START INSTRUCTIONS\n{_rule}")
for _heading, _commands in _quick_start:
    print(f"\n{_heading}")
    for _command in _commands:
        print(f"   {_command}")

print(f"\n{_rule}\nDOCUMENTATION FILES\n{_rule}")
for _doc in _documentation[:-1]:
    print(f"  ├─ {_doc}")
print(f"  └─ {_documentation[-1]}")

print(f"\n{_rule}\n✓ MLOps PIPELINE SETUP COMPLETE!\n{_rule}")

print("\nNext steps:")
for _number, _step in enumerate(_next_steps, start=1):
    print(f"{_number}. {_step}")

## 14. Project Summary & Next Steps

Complete MLOps pipeline overview and deployment instructions.

In [None]:
# Step 13 overview: describes the logging / Prometheus / Grafana setup.
# Each entry is (heading, lines printed beneath it with a two-space indent).
_rule = "=" * 60
_monitoring_sections = [
    ("Logging features:", [
        "- Structured JSON logging",
        "- Request-response logging",
        "- Prediction logging with confidence scores",
        "- Model evaluation metrics logging",
        "- Error and exception logging",
        "- Log file: logs/api.log",
    ]),
    ("Prometheus metrics:", [
        "- http_requests_total: Total HTTP requests",
        "- http_request_duration_seconds: Request latency",
        "- predictions_total: Total predictions made",
        "- prediction_time_seconds: Prediction latency",
        "- active_requests: Currently active requests",
        "- model_accuracy/precision/recall: Model metrics",
    ]),
    ("Grafana dashboards:", [
        "- API performance dashboard",
        "- Model metrics dashboard",
        "- Request rate and latency dashboard",
        "- Error rate monitoring",
    ]),
    ("Deploy monitoring stack:", [
        "kubectl apply -f monitoring/prometheus-grafana.yaml",
    ]),
    ("Access monitoring services:", [
        "- Prometheus: http://localhost:9090",
        "- Grafana: http://localhost:3000 (admin/admin)",
    ]),
    ("Monitoring setup features:", [
        "- Real-time metric collection",
        "- Alerting rules (configurable)",
        "- Custom dashboards",
        "- Performance trending",
        "- Service health monitoring",
    ]),
]

print(f"\n{_rule}\nSTEP 13: MONITORING & LOGGING\n{_rule}")
for _heading, _lines in _monitoring_sections:
    print(f"\n{_heading}")
    for _line in _lines:
        print(f"  {_line}")

## 13. Monitoring & Logging Setup

Prometheus metrics collection and Grafana dashboards for production monitoring.

In [None]:
# Step 12 overview: lists the Kubernetes manifests and the commands to use them.
_rule = "=" * 60

# Manifest file -> bullet points describing what it contains.
_manifests = [
    ("deployment.yaml", [
        "Deployment with 3 replicas",
        "Rolling update strategy",
        "Resource limits and requests",
        "Health checks (liveness & readiness)",
        "HorizontalPodAutoscaler (2-10 replicas)",
        "Service (LoadBalancer)",
    ]),
    ("ingress.yaml", [
        "Nginx ingress controller",
        "TLS support",
        "Host-based routing",
    ]),
    ("configmap.yaml", [
        "Application configuration",
        "Secrets management",
    ]),
]

# Remaining sections: (heading, lines printed beneath it with a two-space indent).
_k8s_sections = [
    ("Deploy to Kubernetes:", [
        "kubectl apply -f k8s/deployment.yaml",
    ]),
    ("Verify deployment:", [
        "kubectl get pods -n heart-disease-prediction",
        "kubectl get svc -n heart-disease-prediction",
    ]),
    ("Access services:", [
        "kubectl port-forward svc/heart-disease-api 8000:80 -n heart-disease-prediction",
    ]),
    ("Autoscaling triggers:", [
        "- CPU utilization > 70%",
        "- Memory utilization > 80%",
    ]),
    ("Production features:", [
        "- Load balancing across replicas",
        "- Automatic pod restart on failure",
        "- Graceful shutdown handling",
        "- Resource-aware scheduling",
    ]),
]

print(f"\n{_rule}\nSTEP 12: KUBERNETES DEPLOYMENT\n{_rule}")

print("\nKubernetes manifests:")
for _position, (_manifest, _details) in enumerate(_manifests, start=1):
    # A blank line separates consecutive manifests (none before the first).
    if _position > 1:
        print()
    print(f"  {_position}. {_manifest}")
    for _detail in _details:
        print(f"     - {_detail}")

for _heading, _lines in _k8s_sections:
    print(f"\n{_heading}")
    for _line in _lines:
        print(f"  {_line}")

## 12. Kubernetes Deployment

Production-ready Kubernetes configuration with load balancing, autoscaling, and monitoring.

In [None]:
# Step 11 overview: Dockerfile summary plus build/run commands.
# The whole report is assembled as a list of lines and emitted with one print.
_rule = "=" * 60
_docker_sections = {
    "Dockerfile details:": [
        "- Base image: python:3.9-slim",
        "- Installs dependencies from requirements.txt",
        "- Exposes port 8000",
        "- Health check configured",
        "- Runs FastAPI application",
    ],
    "Build Docker image:": [
        "docker build -f docker/Dockerfile -t heart-disease-api:latest .",
    ],
    "Run Docker container:": [
        "docker run -p 8000:8000 heart-disease-api:latest",
    ],
    "Using Docker Compose:": [
        "docker-compose -f docker/docker-compose.yml up -d",
        "Services: API + MLflow server",
    ],
    "Services available:": [
        "- API: http://localhost:8000",
        "- API Docs: http://localhost:8000/docs",
        "- MLflow: http://localhost:5000",
    ],
    "Container features:": [
        "- Multi-stage build for smaller image size",
        "- Health checks for monitoring",
        "- Volume mounts for models and logs",
        "- Non-root user execution (security)",
    ],
}

_report = [f"\n{_rule}", "STEP 11: DOCKER CONTAINERIZATION", _rule]
for _heading, _lines in _docker_sections.items():
    # The leading newline reproduces the blank line in front of every heading.
    _report.append(f"\n{_heading}")
    _report.extend(f"  {_line}" for _line in _lines)
print("\n".join(_report))

## 11. Docker Containerization

Build and deploy containerized model-serving API.

In [None]:
# Step 10 overview: describes the GitHub Actions workflow stages and outputs.
_rule = "=" * 60

# Workflow stage -> tasks carried out in that stage.
_stages = [
    ("Lint and Test", [
        "Python dependency caching",
        "Code linting (flake8)",
        "Code formatting check (black)",
        "Unit tests with coverage",
        "Coverage report upload",
    ]),
    ("Docker Build and Test", [
        "Build Docker image",
        "Run container health check",
    ]),
    ("Model Training", [
        "Run automated model training",
        "Archive model artifacts",
        "Archive MLflow runs",
    ]),
    ("Workflow Summary", [
        "Generate completion report",
    ]),
]

_triggers = [
    "Push to main/develop branches",
    "Pull requests to main/develop branches",
]

_artifacts = [
    "Test results and coverage reports",
    "Model artifacts",
    "MLflow experiment runs",
]

print(f"\n{_rule}\nSTEP 10: CI/CD PIPELINE CONFIGURATION\n{_rule}")

print("\nWorkflow file: .github/workflows/mlops_pipeline.yml")

print("\nPipeline stages:")
for _position, (_stage, _tasks) in enumerate(_stages, start=1):
    # Stages are separated by a blank line (none before the first).
    if _position > 1:
        print()
    print(f"  {_position}. {_stage}")
    for _task in _tasks:
        print(f"     - {_task}")

for _heading, _bullets in (
    ("Automatically triggered on:", _triggers),
    ("Artifacts generated:", _artifacts),
):
    print(f"\n{_heading}")
    for _bullet in _bullets:
        print(f"  - {_bullet}")

## 10. CI/CD Pipeline Configuration

GitHub Actions workflow for automated linting, testing, Docker image builds, and model training.

In [None]:
# Step 9 overview: where the tests live, how to run them, and what they cover.
_rule = "=" * 60
_testing_lines = (
    f"\n{_rule}",
    "STEP 9: UNIT TESTING OVERVIEW",
    _rule,
    "\nTest files created:",
    "  - tests/test_preprocessing.py - Data preprocessing tests",
    "  - tests/test_models.py - Model training tests",
    "  - tests/test_api.py - API endpoint tests",
    "  - tests/conftest.py - Test fixtures and configuration",
    "\nTo run tests:",
    "  pytest tests/ -v",
    "  pytest tests/ -v --cov=src",
    "\nTest coverage includes:",
    "  - Data cleaning and preprocessing",
    "  - Missing value handling",
    "  - Model training and prediction",
    "  - Model evaluation metrics",
    "  - Cross-validation",
    "  - API endpoint validation",
    "  - Input validation",
)

# One print call; sep="\n" places each entry on its own line.
print(*_testing_lines, sep="\n")

## 9. Unit Testing

Comprehensive testing of data processing, models, and API.

In [None]:
_rule = "=" * 60
print(f"\n{_rule}\nSTEP 8: API DEVELOPMENT FOR MODEL SERVING\n{_rule}")

# A single patient record with the 13 input fields, in un-preprocessed form
# (the pipeline applies its own preprocessor before predicting).
sample_patient_data = dict(
    age=63,
    sex=1,
    cp=3,
    trestbps=145,
    chol=233,
    fbs=1,
    restecg=0,
    thalach=150,
    exang=0,
    oldpeak=2.3,
    slope=0,
    ca=0,
    thal=1,
)

# One-row frame: the pipeline takes tabular input.
sample_df = pd.DataFrame([sample_patient_data])
print("\nSample input data:")
print(sample_df)

# Hard label and per-class probabilities for that single row.
prediction = pipeline.predict(sample_df)[0]
probability = pipeline.predict_proba(sample_df)[0]

_verdict = 'Disease Present' if prediction == 1 else 'No Disease'
print(f"\nModel Prediction: {_verdict}")
print(f"Confidence scores: No Disease: {probability[0]:.4f}, Disease: {probability[1]:.4f}")
print(f"Prediction confidence: {max(probability):.4f}")

## 8. Model Serving - API Development

Smoke-test the packaged prediction pipeline on a sample patient record. The FastAPI application (`src.api.app`) serves model inference through its /predict endpoint.

In [None]:
_rule = "=" * 60
print(f"\n{_rule}\nSTEP 7: MODEL PACKAGING & SERIALIZATION\n{_rule}")

# Persist both tuned estimators through the trainer's save helper.
for _estimator, _target_path in (
    (lr_best, '../models/artifacts/logistic_regression_model.pkl'),
    (rf_best, '../models/artifacts/random_forest_model.pkl'),
):
    trainer.save_model(_estimator, _target_path)

# Create and save a prediction wrapper
class PredictionPipeline:
    """Pair a saved preprocessor with a saved model so raw rows can be scored in one call."""

    def __init__(self, preprocessor_path, model_path):
        """Load both artifacts from disk with joblib.

        Args:
            preprocessor_path: Path of the serialized preprocessor.
            model_path: Path of the serialized model.
        """
        self.preprocessor = joblib.load(preprocessor_path)
        self.model = joblib.load(model_path)

    def _preprocess(self, X_raw):
        """Run the raw input through the loaded preprocessor's ``transform``."""
        return self.preprocessor.transform(X_raw)

    def predict(self, X_raw):
        """Make predictions on raw data.

        Returns whatever the wrapped model's ``predict`` returns for the
        preprocessed input.
        """
        return self.model.predict(self._preprocess(X_raw))

    def predict_proba(self, X_raw):
        """Make probability predictions on raw data.

        Returns whatever the wrapped model's ``predict_proba`` returns for the
        preprocessed input.
        """
        return self.model.predict_proba(self._preprocess(X_raw))

# Test the pipeline: load the persisted preprocessor and Random Forest model
# back from disk through the wrapper.
pipeline = PredictionPipeline(
    '../models/artifacts/preprocessor.pkl',
    '../models/artifacts/random_forest_model.pkl'
)

# X_test was cut from X_preprocessed, i.e. it is ALREADY preprocessed. The
# pipeline preprocesses its input itself, so feeding it X_test would apply the
# preprocessor twice. Recover the matching raw rows instead: splitting the raw
# features with the same test_size / random_state / stratify settings as the
# main train-test split selects the same rows.
_, X_test_raw, _, _ = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Make predictions on the first five raw test rows
sample_predictions = pipeline.predict(X_test_raw[:5])
sample_proba = pipeline.predict_proba(X_test_raw[:5])

print("\nSample Predictions (first 5 test samples):")
print(f"Predictions: {sample_predictions}")
print(f"Probabilities:\n{sample_proba}")

# Save pipeline wrapper.
# NOTE(review): PredictionPipeline is defined in this notebook, so the pickle
# refers to it as `__main__.PredictionPipeline`. Loading the file from another
# process only works if that process defines the class under the same name;
# consider moving the class into an importable module under src/.
joblib.dump(pipeline, '../models/artifacts/prediction_pipeline.pkl')
print("\nPrediction pipeline saved!")

print("\nModel artifacts saved to: ../models/artifacts/")
print("  - logistic_regression_model.pkl")
print("  - random_forest_model.pkl")
print("  - preprocessor.pkl")
print("  - prediction_pipeline.pkl")

## 7. Model Packaging & Serialization

Save models and create reproducible pipelines with preprocessing.

In [None]:
print("\n" + "=" * 60)
print("STEP 6: EXPERIMENT TRACKING WITH MLFLOW")
print("=" * 60)

# Use a local, file-based tracking store one directory above the working directory.
mlflow.set_tracking_uri(uri='file:../mlruns')

# Create (or reuse) the experiment that groups all runs of this notebook
experiment_name = "heart_disease_prediction"
mlflow.set_experiment(experiment_name)

print(f"MLflow experiment: {experiment_name}")
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

# Metrics recorded for every run: (name shown in MLflow, key in the metrics dict).
LOGGED_METRICS = (
    ('accuracy', 'accuracy'),
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1_score', 'f1'),
    ('roc_auc', 'roc_auc'),
)


def log_experiment_run(display_name, run_name, model, params, metrics, confusion_matrix_png):
    """Record one trained model as an MLflow run.

    Args:
        display_name: Human-readable model name used in the progress messages.
        run_name: Name given to the MLflow run.
        model: Fitted scikit-learn estimator, logged under the "model" artifact path.
        params: Dict of hyperparameters to log.
        metrics: Evaluation dict containing at least the keys listed in
            ``LOGGED_METRICS``.
        confusion_matrix_png: Path of an existing confusion-matrix image to
            attach to the run.
    """
    print(f"\n--- Logging {display_name} Experiment ---")
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(params)
        mlflow.log_metrics({
            mlflow_key: metrics[source_key]
            for mlflow_key, source_key in LOGGED_METRICS
        })
        mlflow.sklearn.log_model(model, artifact_path="model")
        mlflow.log_artifact(confusion_matrix_png)
        print(f"{display_name} run logged!")


# Fixed hyperparameters (max_iter, random_state) are read from the fitted
# estimators rather than repeated as literals, so the logged values cannot
# drift from what was actually passed to GridSearchCV.
log_experiment_run(
    "Logistic Regression",
    "LogisticRegression_v1",
    lr_best,
    params={
        'model_type': 'LogisticRegression',
        'C': lr_grid.best_params_['C'],
        'solver': lr_grid.best_params_['solver'],
        'max_iter': lr_best.max_iter,
        'random_state': lr_best.random_state,
    },
    metrics=lr_metrics,
    confusion_matrix_png='../screenshots/05_confusion_matrix_lr.png',
)

log_experiment_run(
    "Random Forest",
    "RandomForest_v1",
    rf_best,
    params={
        'model_type': 'RandomForest',
        'n_estimators': rf_grid.best_params_['n_estimators'],
        'max_depth': rf_grid.best_params_['max_depth'],
        'random_state': rf_best.random_state,
    },
    metrics=rf_metrics,
    confusion_matrix_png='../screenshots/06_confusion_matrix_rf.png',
)

print("\nMLflow runs logged successfully!")
print("To view experiments, run: mlflow ui --backend-store-uri file:../mlruns")

## 6. Experiment Tracking with MLflow

Log all experiments, parameters, metrics, and artifacts to MLflow.

In [None]:
# Plot confusion matrices (the PNGs written here are attached to the MLflow runs)
print("\nGenerating confusion matrix plots...")
trainer.plot_confusion_matrix(lr_metrics['confusion_matrix'],
                             "Logistic Regression",
                             '../screenshots/05_confusion_matrix_lr.png')
trainer.plot_confusion_matrix(rf_metrics['confusion_matrix'],
                             "Random Forest",
                             '../screenshots/06_confusion_matrix_rf.png')

# Plot ROC curves
from sklearn.metrics import roc_curve, auc


def plot_roc_panel(ax, model, X_eval, y_eval, title, color):
    """Draw one model's ROC curve on ``ax`` and return its AUC.

    Args:
        ax: Matplotlib axes to draw on.
        model: Fitted classifier exposing ``predict_proba``.
        X_eval: Feature matrix to score.
        y_eval: True labels matching ``X_eval``.
        title: Model name used in the panel title.
        color: Line colour of the ROC curve.

    Returns:
        Area under the plotted ROC curve.
    """
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = model.predict_proba(X_eval)[:, 1]
    fpr, tpr, _ = roc_curve(y_eval, positive_scores)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, color=color, lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    # Diagonal = performance of a random classifier, for reference.
    ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Classifier')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'{title} ROC Curve')
    ax.legend()
    ax.grid(alpha=0.3)
    return roc_auc


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

roc_auc_lr = plot_roc_panel(axes[0], lr_best, X_test, y_test, 'Logistic Regression', 'blue')
roc_auc_rf = plot_roc_panel(axes[1], rf_best, X_test, y_test, 'Random Forest', 'green')

# Work on the figure object explicitly so the save cannot pick up some other
# "current figure" left behind by an earlier plotting call.
fig.tight_layout()
fig.savefig('../screenshots/07_roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("ROC curves saved!")

In [None]:
# Side-by-side comparison of the two tuned models on the held-out test set
from models.train import compare_models

# Display label -> fitted estimator
models_dict = dict([
    ('Logistic Regression', lr_best),
    ('Random Forest', rf_best),
])
comparison_df = compare_models(models_dict, X_test, y_test)

print("\n--- Model Comparison ---", comparison_df.to_string(index=False), sep="\n")

# Keep a CSV copy of the table alongside the processed data
comparison_df.to_csv('../data/processed/model_comparison.csv', index=False)
print("\nComparison saved to model_comparison.csv")

In [None]:
# 5-fold cross-validation of both tuned models on the training split
print("\n--- Cross-Validation Results ---")

cv_lr = trainer.cross_validate_model(lr_best, X_train, y_train, cv_folds=5)
cv_rf = trainer.cross_validate_model(rf_best, X_train, y_train, cv_folds=5)

# Scores are read from the 'test_<metric>' entries of each result.
_cv_metrics = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
for _model_label, _cv_result in (
    ("Logistic Regression", cv_lr),
    ("Random Forest", cv_rf),
):
    print(f"\n{_model_label} CV Scores:")
    for metric in _cv_metrics:
        scores = _cv_result[f'test_{metric}']
        print(f"  {metric}: {scores.mean():.4f} (+/- {scores.std():.4f})")

In [None]:
_rule = "=" * 60
print(f"\n{_rule}\nSTEP 5: MODEL EVALUATION & COMPARISON\n{_rule}")

# The grid searches keep their best refitted estimators; evaluate those on the test set.
lr_best = lr_grid.best_estimator_
rf_best = rf_grid.best_estimator_

lr_metrics = trainer.evaluate_model(lr_best, X_test, y_test, "Logistic Regression")
rf_metrics = trainer.evaluate_model(rf_best, X_test, y_test, "Random Forest")

# These entries are skipped because they are not printed with the numeric format below.
_skipped_keys = ('confusion_matrix', 'classification_report', 'model_name')
for _model_label, _model_metrics in (
    ("Logistic Regression", lr_metrics),
    ("Random Forest", rf_metrics),
):
    print(f"\n--- {_model_label} Metrics ---")
    for key, value in _model_metrics.items():
        if key in _skipped_keys:
            continue
        print(f"{key}: {value:.4f}")

## 5. Model Evaluation & Comparison

Evaluate models using cross-validation and compare performance metrics.

In [None]:
# Grid search (5-fold CV, ROC-AUC scoring, all CPU cores) for both model families
print("\n--- Hyperparameter Tuning ---")

# Logistic Regression: regularisation strength x solver
print("\nTuning Logistic Regression...")
lr_params = dict(C=[0.1, 1.0, 10.0], solver=['lbfgs', 'liblinear'])
_lr_base = LogisticRegression(random_state=42, max_iter=1000)
lr_grid = GridSearchCV(
    estimator=_lr_base,
    param_grid=lr_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
lr_grid.fit(X_train, y_train)
print(
    f"Best LR params: {lr_grid.best_params_}",
    f"Best LR score: {lr_grid.best_score_:.4f}",
    sep="\n",
)

# Random Forest: number of trees x maximum depth
print("\nTuning Random Forest...")
rf_params = dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 15])
_rf_base = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    estimator=_rf_base,
    param_grid=rf_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
print(
    f"Best RF params: {rf_grid.best_params_}",
    f"Best RF score: {rf_grid.best_score_:.4f}",
    sep="\n",
)

In [None]:
_rule = "=" * 60
print(f"\n{_rule}\nSTEP 4: MODEL DEVELOPMENT & TRAINING\n{_rule}")

from models.train import ModelTrainer

# One trainer instance is shared by the training, evaluation and saving cells.
trainer = ModelTrainer(random_state=42)

# Baseline Logistic Regression with fixed settings
print("\n--- Training Logistic Regression ---")
_lr_settings = {'C': 1.0, 'solver': 'lbfgs'}
lr_model = trainer.train_logistic_regression(X_train, y_train, **_lr_settings)
print("Logistic Regression trained!")

# Baseline Random Forest with fixed settings
print("\n--- Training Random Forest ---")
_rf_settings = {'n_estimators': 100, 'max_depth': 10}
rf_model = trainer.train_random_forest(X_train, y_train, **_rf_settings)
print("Random Forest trained!")

## 4. Model Development & Training

Build and train Logistic Regression and Random Forest classifiers with hyperparameter tuning.

In [None]:
# Hold out 20% of the rows as a test set. Passing stratify=y splits in a
# stratified fashion using the labels; random_state fixes the shuffle.
_split_parts = train_test_split(
    X_preprocessed,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = _split_parts

print(f"\nTrain set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Class counts per partition, to confirm the stratification visually
_train_counts = pd.Series(y_train).value_counts()
_test_counts = pd.Series(y_test).value_counts()
print(f"Train target distribution:\n{_train_counts}")
print(f"Test target distribution:\n{_test_counts}")

In [None]:
print("\n" + "=" * 60)
print("STEP 3: DATA PREPROCESSING & FEATURE ENGINEERING")
print("=" * 60)

from data.preprocessing import DataPreprocessor, split_features_target

# Separate features and target
X, y = split_features_target(df_clean, target_col='target')

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution: \n{y.value_counts()}")

# Fit the preprocessor on the TRAINING rows only. Fitting on every row would
# let statistics of the future test rows leak into the features the models are
# trained on and make the test metrics optimistic.
# The settings below must mirror the train-test split cell (test_size=0.2,
# random_state=42, stratify=y) so that both cells select the same rows.
X_fit, _, y_fit, _ = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

preprocessor = DataPreprocessor()
preprocessor.fit_transform(X_fit, y_fit)  # called for the fit; the output is discarded

# Transform all rows with the train-fitted preprocessor; row order is kept so
# the later split of X_preprocessed lines up with y.
# NOTE(review): relies on DataPreprocessor exposing transform() (the saved
# preprocessor is used that way by PredictionPipeline) - confirm.
X_preprocessed = preprocessor.transform(X)

print(f"\nPreprocessed features shape: {X_preprocessed.shape}")
print(f"Feature names: {preprocessor.feature_names}")

# Save preprocessor so the prediction pipeline can reload it later
preprocessor.save('../models/artifacts/preprocessor.pkl')
print("Preprocessor saved!")

## 3. Data Preprocessing & Feature Engineering

Handle missing values, encode categorical features, and scale numerical features.

In [None]:
# 4. Box plots for outlier detection
# One panel per feature (the target column is excluded).
boxplot_cols = [col for col in df_clean.columns if col != 'target']

# Size the grid from the number of features instead of assuming it fits 4x4;
# 3.5 inches per row reproduces the original 16x14 figure for four rows.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(boxplot_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(16, 3.5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

# Pair panels with features directly so the target's position in the frame
# cannot leave a hole in the middle of the grid.
for ax, col in zip(axes, boxplot_cols):
    ax.boxplot(df_clean[col])
    ax.set_title(f'Box Plot: {col}', fontsize=10)
    ax.set_ylabel(col)
    ax.grid(axis='y', alpha=0.3)

# Hide the panels that received no feature instead of showing empty axes
for ax in axes[len(boxplot_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../screenshots/04_outlier_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()

print("Outlier box plots saved!")

In [None]:
# 3. Class Balance Analysis
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# value_counts() orders classes by frequency, not by label. Re-index by class
# value so every count stays attached to the right label even when class 1 is
# the majority (the target was binarised to 0/1 in the EDA step).
class_labels = {0: 'No Disease', 1: 'Disease Present'}
class_order = list(class_labels)
label_names = [class_labels[value] for value in class_order]

# Count plot
target_counts = df_clean['target'].value_counts().reindex(class_order, fill_value=0)
axes[0].bar(label_names, target_counts.values, color=['green', 'red'], alpha=0.7, edgecolor='black')
axes[0].set_ylabel('Count')
axes[0].set_title('Class Distribution (Count)')
axes[0].grid(axis='y', alpha=0.3)

# Percentage plot
target_pct = (
    df_clean['target'].value_counts(normalize=True).reindex(class_order, fill_value=0) * 100
)
axes[1].pie(target_pct.values, labels=label_names, autopct='%1.1f%%',
           colors=['green', 'red'], startangle=90)
axes[1].set_title('Class Distribution (%)')

plt.tight_layout()
plt.savefig('../screenshots/03_class_balance.png', dpi=300, bbox_inches='tight')
plt.show()

print("Class balance analysis saved!")

In [None]:
# 2. Correlation Heatmap
# Pairwise correlations between all columns of the cleaned frame
correlation_matrix = df_clean.corr()

# Draw on an explicit figure/axes pair rather than the implicit current figure
_heatmap_fig, _heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    ax=_heatmap_ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
_heatmap_ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
_heatmap_fig.tight_layout()
_heatmap_fig.savefig('../screenshots/02_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("Correlation heatmap saved!")

In [None]:
# 1. Feature Distributions (Histograms)
# One panel per feature (the target column is excluded).
histogram_cols = [col for col in df_clean.columns if col != 'target']

# Size the grid from the number of features instead of assuming it fits 4x4;
# 3.5 inches per row reproduces the original 16x14 figure for four rows.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(histogram_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(16, 3.5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

# Pair panels with features directly so the target's position in the frame
# cannot leave a hole in the middle of the grid.
for ax, col in zip(axes, histogram_cols):
    ax.hist(df_clean[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=10)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the panels that received no feature instead of showing empty axes
for ax in axes[len(histogram_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../screenshots/01_feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("Feature distributions saved!")

In [None]:
print("\n" + "=" * 60)
print("STEP 2: EXPLORATORY DATA ANALYSIS (EDA)")
print("=" * 60)

# Clean the data first: drop rows with missing values and convert the target
# to binary (any value > 0 becomes 1).
# Both steps are chained with assign() so df_clean is a brand-new frame; the
# previous "dropna() then set a column" form is the pattern pandas reports as
# SettingWithCopyWarning (hidden here by the global warning filter).
df_clean = df.dropna().assign(
    target=lambda frame: (frame['target'] > 0).astype(int)
)
print(f"Shape after removing missing values: {df_clean.shape}")

print("\nTarget Distribution:")
print(df_clean['target'].value_counts())
print("\nTarget Distribution (%):")
print(df_clean['target'].value_counts(normalize=True) * 100)

## 2. Exploratory Data Analysis (EDA)

Professional visualizations including feature distributions, correlations, and class balance.

In [None]:
# Dataset overview: shape, column dtypes, missing-value counts and summary statistics.
_rule = "=" * 60
_overview_lines = (
    f"\n{_rule}",
    "DATASET OVERVIEW",
    _rule,
    f"Dataset Shape: {df.shape}",
    f"\nData Types:\n{df.dtypes}",
    f"\nMissing Values:\n{df.isnull().sum()}",
    f"\nBasic Statistics:\n{df.describe()}",
)

# One print call; sep="\n" places each entry on its own line.
print(*_overview_lines, sep="\n")

In [None]:
from data.download_data import download_heart_disease_data, load_and_prepare_data

# Download dataset
print("=" * 60)
print("STEP 1: DATA ACQUISITION")
print("=" * 60)

csv_path = download_heart_disease_data(output_dir='../data/raw')

# Stop here with a clear message when no CSV path came back. Otherwise `df`
# would never be defined and the failure would only surface later as a
# NameError in an unrelated cell.
if not csv_path:
    raise RuntimeError(
        "Dataset download failed: download_heart_disease_data() returned no CSV path "
        "(output_dir='../data/raw')."
    )

df = load_and_prepare_data(csv_path)
print("\nDataset successfully loaded!")

## 1. Data Acquisition & Dataset Exploration

Load the Heart Disease UCI Dataset and explore its structure.

In [None]:
# Import Required Libraries
import os
import sys
import warnings

sys.path.insert(0, '../src')

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that can point
# at real problems - consider narrowing the filter.
warnings.simplefilter('ignore')

# Plot appearance: apply the matplotlib style first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("All libraries imported successfully!")

# Heart Disease Prediction - MLOps End-to-End Pipeline
## Comprehensive ML Model Development with Experiment Tracking and Deployment

This notebook covers:
1. Data Acquisition & Dataset Exploration
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing & Feature Engineering
4. Model Development & Training
5. Model Evaluation & Comparison
6. Experiment Tracking with MLflow
7. Model Packaging & Serialization
8. Model Serving - API Development
9. Unit Testing
10. CI/CD Pipeline Configuration
11. Docker Containerization
12. Kubernetes Deployment
13. Monitoring & Logging Setup
14. Project Summary & Next Steps