# Heart Disease Prediction System Tutorial

This notebook demonstrates how to use the Heart Disease Prediction MLP system. It covers:

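1. Data loading and exploration
2. Using pre-trained models, with a fallback that trains and evaluates a simple scikit-learn model
3. Making predictions through the REST API, one patient at a time and in batches
4. Environment-specific configuration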

> **Note:** This notebook requires specific dependencies. Please ensure you've installed all requirements with `pip install -r requirements.txt`.

In [None]:
# Import essential libraries
import os
import sys
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import requests

# Use seaborn's white-grid look for every figure drawn in this notebook
sns.set(style='whitegrid')

# Optional-dependency probe: tabulate is only used for nicer table printing,
# so its absence is reported rather than treated as an error.
try:
    import tabulate
except ImportError:
    print("Warning: Tabulate dependency is missing. Install with 'pip install tabulate'")
else:
    print("Tabulate dependency is available!")

# The project root is taken to be the parent of the current working directory.
# It is put first on sys.path so that project-local imports can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"Current working directory: {os.getcwd()}")
print(f"Project root directory: {PROJECT_ROOT}")

## 1. Data Loading

We first try to load the processed heart disease dataset. If it is not available, we generate synthetic data for demonstration purposes.

In [None]:
# Names of the 13 input features, used whenever the metadata file cannot supply them
DEFAULT_FEATURE_NAMES = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                         'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


def make_synthetic_heart_data(n_samples=300, seed=42):
    """
    Generate a synthetic, loosely realistic heart disease dataset.

    Args:
        n_samples (int): Number of rows to generate
        seed (int): Value passed to np.random.seed so the data is reproducible

    Returns:
        tuple: (X, y) where X is a float array of shape (n_samples, 13) whose columns
        follow DEFAULT_FEATURE_NAMES, and y is a 0/1 integer array of length n_samples
    """
    np.random.seed(seed)

    # Generate random but somewhat realistic features
    X = np.zeros((n_samples, len(DEFAULT_FEATURE_NAMES)))
    X[:, 0] = np.random.normal(55, 10, n_samples)  # age
    X[:, 1] = np.random.binomial(1, 0.7, n_samples)  # sex
    X[:, 2] = np.random.randint(0, 4, n_samples)  # cp
    X[:, 3] = np.random.normal(130, 15, n_samples)  # trestbps
    X[:, 4] = np.random.normal(220, 40, n_samples)  # chol
    X[:, 5] = np.random.binomial(1, 0.2, n_samples)  # fbs
    X[:, 6] = np.random.randint(0, 3, n_samples)  # restecg
    X[:, 7] = np.random.normal(150, 20, n_samples)  # thalach
    X[:, 8] = np.random.binomial(1, 0.3, n_samples)  # exang
    X[:, 9] = np.random.exponential(1, n_samples)  # oldpeak
    X[:, 10] = np.random.randint(0, 3, n_samples)  # slope
    X[:, 11] = np.random.randint(0, 4, n_samples)  # ca
    X[:, 12] = np.random.choice([3, 6, 7], n_samples)  # thal

    # Accumulate a risk score from a few simple rules to create correlations
    risk = np.zeros(n_samples)
    risk[X[:, 0] > 60] += 0.2  # older age increases risk
    risk[X[:, 1] == 1] += 0.2  # male increases risk
    risk[X[:, 2] > 1] += 0.3  # chest pain type > 1 increases risk
    risk[X[:, 4] > 240] += 0.2  # high cholesterol increases risk
    risk[X[:, 7] < 140] += 0.2  # low max heart rate increases risk
    risk[X[:, 8] == 1] += 0.3  # exercise induced angina increases risk
    risk[X[:, 9] > 1.5] += 0.3  # high ST depression increases risk
    # More colored vessels increase risk. Rows with ca == 0 contribute 0, so no mask
    # is needed; adding a full-length array to a masked subset would raise a
    # shape-mismatch ValueError.
    risk += 0.2 * X[:, 11]

    # Convert to binary target (with some randomness)
    y = (risk + np.random.normal(0, 0.1, n_samples) > 0.5).astype(int)
    return X, y


# Try to load processed data; fall back to synthetic data when it is unavailable
try:
    # First try to load processed data splits
    data_path = os.path.join(PROJECT_ROOT, 'data/processed/processed_data.npz')
    print(f"Attempting to load data from: {data_path}")

    # The context manager closes the archive once the arrays have been read
    with np.load(data_path) as data:
        print(f"Available arrays in the data file: {list(data.keys())}")

        # Load training data
        X_train = data['X_train']
        y_train = data['y_train']

    # If available, load feature names from metadata
    try:
        metadata_path = os.path.join(PROJECT_ROOT, 'data/processed/processing_metadata.txt')
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        feature_names = metadata.get('feature_names', [])
    except (FileNotFoundError, json.JSONDecodeError):
        # Default feature names if metadata not available
        feature_names = list(DEFAULT_FEATURE_NAMES)

    # Create DataFrame for exploring training data
    if len(feature_names) == X_train.shape[1]:
        df_train = pd.DataFrame(X_train, columns=feature_names)
        df_train['target'] = y_train
        print(f"Successfully loaded processed data with {df_train.shape[0]} samples and {df_train.shape[1]} features (including target)")
    else:
        print(f"Warning: Feature names count ({len(feature_names)}) doesn't match data columns ({X_train.shape[1]})")
        df_train = pd.DataFrame(X_train)
        df_train['target'] = y_train
        print(f"Created DataFrame without feature names, shape: {df_train.shape}")

except FileNotFoundError:
    print("Processed data not found. Creating synthetic data for demonstration.")
    # Create synthetic data
    n_samples = 300
    feature_names = list(DEFAULT_FEATURE_NAMES)
    X, y = make_synthetic_heart_data(n_samples, seed=42)

    # Create train/test split
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create DataFrames
    df_train = pd.DataFrame(X_train, columns=feature_names)
    df_train['target'] = y_train

    print(f"Created synthetic dataset with {df_train.shape[0]} training samples and {df_train.shape[1]} features (including target)")

except Exception as e:
    print(f"Error loading data: {str(e)}")
    print("Creating minimal synthetic dataset as fallback")

    # Create very simple synthetic data as fallback
    np.random.seed(42)
    n_samples = 100
    feature_names = list(DEFAULT_FEATURE_NAMES)
    X_train = np.random.rand(n_samples, len(feature_names))
    y_train = np.random.randint(0, 2, size=n_samples)

    # Create DataFrame
    df_train = pd.DataFrame(X_train, columns=feature_names)
    df_train['target'] = y_train
    print(f"Created minimal synthetic dataset with {df_train.shape[0]} samples")

## 2. Data Exploration

Let's explore the dataset to understand its characteristics.

In [None]:
# Preview the top of the training frame; the bare last expression gets rich display
print("First 5 rows of the dataset:")
df_train.head(n=5)

In [None]:
# Per-column summary statistics plus a missing-value count, printed as a grid.
# Any failure on the tabulate path falls back to displaying describe() directly.
print("\nBasic statistics of the dataset:")
try:
    from tabulate import tabulate
    stats = df_train.describe().transpose()
    stats['missing'] = df_train.isna().sum()
    grid_text = tabulate(stats, headers='keys', tablefmt='grid')
    print(grid_text)
except Exception as e:
    print(f"Could not use tabulate due to: {str(e)}")
    display(df_train.describe())

In [None]:
# Bar chart of how many rows fall into each target class
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='target', data=df_train)
ax.set_title('Distribution of Heart Disease')
ax.set_xlabel('Heart Disease Present')
ax.set_ylabel('Count')
plt.xticks([0, 1], ['No (0)', 'Yes (1)'])

# Write each bar's height just above the bar, centred horizontally
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_height}', (bar_centre, bar_height), ha='center', va='bottom')

plt.show()

In [None]:
# Plot correlation matrix (lower triangle only)
plt.figure(figsize=(12, 10))
correlation_matrix = df_train.corr()
# Explicit boolean mask: True hides a cell. Covering the upper triangle and the
# diagonal removes the redundant half of the symmetric matrix. Building the mask
# from ones (not from the correlation values themselves) guarantees that a
# correlation of exactly 0 in the upper triangle is hidden as well.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, mask=mask)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

## 3. Using Pre-trained Models

The Heart Disease Prediction system comes with pre-trained models. Let's load and use them.

In [None]:
# Time the whole import-and-load sequence
start_time = time.time()

try:
    # Project-local import of the predictor class
    from src.models.predict_model import HeartDiseasePredictor

    # Build the predictor, pointing it at the "models" directory
    model_predictor = HeartDiseasePredictor(model_dir="models")

    # Report which artefacts the predictor says it has
    print("Available models:")
    for label, flag_name in (
        ("Scikit-learn MLP model", "has_sklearn_model"),
        ("Keras MLP model", "has_keras_model"),
        ("Ensemble model", "has_ensemble_model"),
    ):
        answer = 'Yes' if getattr(model_predictor, flag_name) else 'No'
        print(f"  - {label}: {answer}")
    preprocessor_answer = 'Yes' if model_predictor.preprocessor is not None else 'No'
    print(f"  - Preprocessor available: {preprocessor_answer}")

    end_time = time.time()
    print(f"\nModel loading time: {end_time - start_time:.2f} seconds")

    # With neither an sklearn nor a Keras model, point the reader at the training script
    if not (model_predictor.has_sklearn_model or model_predictor.has_keras_model):
        print("\nNo pre-trained models found. You can train new models using the scripts/train_models.sh script.")

except ImportError as e:
    print(f"\nError importing modules: {str(e)}")
    print("Make sure you have the project installed properly and Python path is set correctly.")

except Exception as e:
    print(f"\nError loading models: {str(e)}")
    print("Will continue with directly using scikit-learn instead.")

### Making Predictions with Pre-trained Models

Let's take a sample from our data and make predictions using the loaded models.

In [None]:
try:
    # Take one row of the training frame and split its label from its features
    sample_patient = df_train.iloc[10].copy()
    sample_target = sample_patient.pop('target')
    sample_dict = sample_patient.to_dict()

    print("Sample patient data:")
    for feature, value in sample_dict.items():
        if isinstance(value, float):
            print(f"  {feature}: {value:.2f}")
        else:
            print(f"  {feature}: {value}")
    print(f"Actual target: {sample_target}")

    # Only attempt a prediction when an earlier cell managed to create the predictor
    if 'model_predictor' in locals():
        # Time a single predict() call
        start_time = time.time()
        prediction_result = model_predictor.predict(
            sample_dict,
            return_probabilities=True,
            return_interpretation=True
        )
        end_time = time.time()
        print(f"\nPrediction time: {end_time - start_time:.4f} seconds")

        print("\nPrediction Results:")

        if "model_used" in prediction_result:
            print(f"Model used: {prediction_result['model_used']}")

        # Each model reports under "<prefix>_predictions" / "<prefix>_probabilities";
        # print whichever of those keys are present, in a fixed order.
        for key_prefix, label in (
            ("sklearn", "Scikit-learn"),
            ("keras", "Keras"),
            ("ensemble", "Ensemble"),
        ):
            predictions_key = f"{key_prefix}_predictions"
            probabilities_key = f"{key_prefix}_probabilities"
            if predictions_key in prediction_result:
                print(f"{label} prediction: {prediction_result[predictions_key][0]}")
                if probabilities_key in prediction_result:
                    print(f"{label} probability: {prediction_result[probabilities_key][0]:.4f}")

        if "interpretation" in prediction_result:
            print(f"\nInterpretation:\n{prediction_result['interpretation']}")

except Exception as e:
    print(f"Error making prediction: {str(e)}")
    print("Will demonstrate with a simpler model in the next cell")

### Simple Model Training (Fallback)

If the pre-trained models aren't available, let's train a simple model using scikit-learn.

In [None]:
# Only run this cell if we couldn't load the pre-trained models.
# Note: this cell rebinds X, y, X_train, X_test, y_train and y_test, splitting
# df_train again into its own train/test subsets.
if 'model_predictor' not in locals() or not (model_predictor.has_sklearn_model or model_predictor.has_keras_model):
    print("Training a simple scikit-learn model...")

    try:
        from sklearn.neural_network import MLPClassifier
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline
        from sklearn.model_selection import train_test_split

        # Prepare data
        X = df_train.drop('target', axis=1)
        y = df_train['target']

        # Create a train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create a pipeline with preprocessing and model
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('mlp', MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=1000, random_state=42))
        ])

        # Train the model, timing only the fit itself so that the reported
        # training time does not include scoring and prediction below
        start_time = time.time()
        pipeline.fit(X_train, y_train)
        end_time = time.time()
        training_time = end_time - start_time

        # Evaluate on test set
        score = pipeline.score(X_test, y_test)
        y_pred = pipeline.predict(X_test)
        y_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, 'predict_proba') else None

        # Print results
        print(f"Model training completed in {training_time:.2f} seconds")
        print(f"Accuracy on test set: {score:.4f}")

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        # Plot ROC curve if probabilities are available
        if y_proba is not None:
            plt.figure(figsize=(8, 6))
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc='lower right')
            plt.show()

        # Make a simple prediction with our trained model.
        # The sample is kept as a one-row DataFrame (double brackets) so that it
        # carries the same column names the pipeline was fitted with; passing a
        # bare ndarray here makes scikit-learn warn about missing feature names.
        sample_patient = X_test.iloc[0]
        sample_features = X_test.iloc[[0]]

        prediction_time_start = time.time()
        prediction = pipeline.predict(sample_features)[0]
        probability = pipeline.predict_proba(sample_features)[0, 1] if hasattr(pipeline, 'predict_proba') else None
        prediction_time_end = time.time()

        print(f"\nSample Prediction:")
        print(f"Prediction time: {prediction_time_end - prediction_time_start:.4f} seconds")
        print(f"Predicted class: {prediction}")
        if probability is not None:
            print(f"Probability: {probability:.4f}")
        print(f"Actual class: {y_test.iloc[0]}")

    except Exception as e:
        print(f"Error training model: {str(e)}")

## 4. Using the API

The Heart Disease Prediction system provides a REST API for making predictions. Let's demonstrate how to use it.

In [None]:
def call_prediction_api(patient_data, api_url="http://localhost:8000/predict", timeout=5):
    """
    Call the prediction API with the given patient data.

    Failures are reported with print() rather than raised, so the notebook keeps
    running when the API server is not available.

    Args:
        patient_data (dict): Dictionary containing patient features
        api_url (str): URL of the prediction API endpoint
        timeout (float): Seconds to wait for the server before giving up
            (defaults to the previously hard-coded 5 seconds)

    Returns:
        dict or None: Parsed JSON body when the server answers with status 200,
        otherwise None (non-200 status, connection error, or any other error)
    """
    try:
        # Start timing
        start_time = time.time()

        # Make the API request
        response = requests.post(api_url, json=patient_data, timeout=timeout)

        # Calculate elapsed time
        elapsed_time = time.time() - start_time

        # Check if request was successful
        if response.status_code == 200:
            result = response.json()
            print(f"API request successful in {elapsed_time:.4f} seconds")
            return result
        else:
            print(f"API request failed with status code {response.status_code} in {elapsed_time:.4f} seconds")
            print(f"Error: {response.text}")
            return None

    except requests.exceptions.ConnectionError:
        print("\nConnection error: Could not connect to the API.")
        print("Make sure the API server is running using 'python run_api.py' or 'bash scripts/run_api.sh'")
        return None

    except Exception as e:
        # Covers timeouts, invalid JSON bodies and anything else unexpected
        print(f"\nError calling API: {str(e)}")
        return None

In [None]:
# Prepare a sample patient for API request
sample_api_patient = {
    "age": 61,
    "sex": 1,
    "cp": 3,
    "trestbps": 140,
    "chol": 240,
    "fbs": 1,
    "restecg": 1,
    "thalach": 150,
    "exang": 1,
    "oldpeak": 2.4,
    "slope": 2,
    "ca": 1,
    "thal": 3
}

# Display the patient data
print("Sample patient data for API:")
for k, v in sample_api_patient.items():
    print(f"  {k}: {v}")

# Call the API
print("\nCalling prediction API...")
try:
    # call_prediction_api reports connection problems itself and returns None
    result = call_prediction_api(sample_api_patient)

    # Display the API response
    if result:
        print("\nAPI Response:")
        for k, v in result.items():
            print(f"  {k}: {v}")

        # Interpret the result; .get() yields None for any field the server omitted
        prediction = result.get('prediction')
        probability = result.get('probability')
        risk_level = result.get('risk_level')
        model_used = result.get('model_used')

        # Only apply numeric formatting to a real number; formatting None with
        # ':.2f' would raise TypeError and abort the rest of the summary
        if isinstance(probability, (int, float)):
            probability_text = f"{probability:.2f}"
        else:
            probability_text = "n/a"

        print(f"\nInterpretation:")
        print(f"The model predicts {'positive' if prediction == 1 else 'negative'} for heart disease")
        print(f"Probability: {probability_text}")
        print(f"Risk level: {risk_level}")
        print(f"Model used: {model_used}")
except Exception as e:
    print(f"Error processing API result: {str(e)}")
    print("Note: You need to start the API server to use this feature.")

## 5. Batch Processing Example

Let's demonstrate how to use the batch processing API for making predictions on multiple patients.

In [None]:
def call_batch_prediction_api(patients_data, api_url="http://localhost:8000/predict/batch", timeout=10):
    """
    Call the batch prediction API with multiple patient data.

    Failures are reported with print() rather than raised, so the notebook keeps
    running when the API server is not available.

    Args:
        patients_data (list): List of dictionaries containing patient features
        api_url (str): URL of the batch prediction API endpoint
        timeout (float): Seconds to wait for the server before giving up
            (defaults to the previously hard-coded 10 seconds)

    Returns:
        dict or None: Parsed JSON body when the server answers with status 200,
        otherwise None (non-200 status, connection error, or any other error)
    """
    try:
        # Start timing
        start_time = time.time()

        # Make the API request
        response = requests.post(api_url, json=patients_data, timeout=timeout)

        # Calculate elapsed time
        elapsed_time = time.time() - start_time

        # Check if request was successful
        if response.status_code == 200:
            result = response.json()
            print(f"Batch API request successful in {elapsed_time:.4f} seconds")
            return result
        else:
            print(f"Batch API request failed with status code {response.status_code} in {elapsed_time:.4f} seconds")
            print(f"Error: {response.text}")
            return None

    except requests.exceptions.ConnectionError:
        print("\nConnection error: Could not connect to the API.")
        print("Make sure the API server is running using 'python run_api.py' or 'bash scripts/run_api.sh'")
        return None

    except Exception as e:
        # Covers timeouts, invalid JSON bodies and anything else unexpected
        print(f"\nError calling batch API: {str(e)}")
        return None

In [None]:
# Create a batch of sample patients
def generate_patient_batch(n_patients=5, seed=42):
    """
    Generate a batch of sample patients.

    Values are drawn from a private random generator, so calling this function
    does not reset or advance NumPy's global random state. With the default
    seed the batch is the same on every call.

    Args:
        n_patients (int): Number of patients to generate
        seed (int): Seed for the private random generator

    Returns:
        list: List of patient dictionaries
    """
    # RandomState(seed) yields the same stream as np.random.seed(seed) followed by
    # the module-level np.random.* calls, without touching the global generator
    rng = np.random.RandomState(seed)
    batch = []

    for _ in range(n_patients):
        # The order of draws determines the values, so keep the fields in this order
        patient = {
            "age": int(rng.normal(55, 10)),
            "sex": int(rng.binomial(1, 0.7)),
            "cp": int(rng.randint(0, 4)),
            "trestbps": int(rng.normal(130, 15)),
            "chol": int(rng.normal(220, 40)),
            "fbs": int(rng.binomial(1, 0.2)),
            "restecg": int(rng.randint(0, 3)),
            "thalach": int(rng.normal(150, 20)),
            "exang": int(rng.binomial(1, 0.3)),
            "oldpeak": float(np.round(rng.exponential(1), 1)),
            "slope": int(rng.randint(0, 3)),
            "ca": int(rng.randint(0, 4)),
            "thal": int(rng.choice([3, 6, 7]))
        }
        batch.append(patient)

    return batch

# Generate 10 sample patients
patient_batch = generate_patient_batch(10)

# Display the first patient in the batch
print(f"Generated a batch of {len(patient_batch)} patients for batch processing")
print("\nFirst patient in batch:")
for k, v in patient_batch[0].items():
    print(f"  {k}: {v}")

# Call the batch API
print("\nCalling batch prediction API...")
try:
    # call_batch_prediction_api reports connection problems itself and returns None
    batch_result = call_batch_prediction_api(patient_batch)

    # Display the batch API response summary
    if batch_result:
        predictions = batch_result.get('predictions', [])
        performance_metrics = batch_result.get('performance_metrics', {})

        print(f"\nReceived {len(predictions)} predictions from batch API")

        # Show performance metrics if available
        if performance_metrics:
            print("\nPerformance Metrics:")
            for k, v in performance_metrics.items():
                print(f"  {k}: {v}")

        if predictions:
            # Create a summary table of predictions, limited to the expected
            # columns that the response actually contains (selecting a missing
            # column would raise KeyError)
            print("\nPrediction Summary:")
            summary_df = pd.DataFrame(predictions)
            summary_columns = ['prediction', 'probability', 'risk_level', 'model_used']
            available_columns = [c for c in summary_columns if c in summary_df.columns]
            print(summary_df[available_columns].head(10))

            # Plot distribution of predictions when risk levels were returned
            if 'risk_level' in summary_df.columns:
                plt.figure(figsize=(10, 6))
                summary_df['risk_level'].value_counts().plot(kind='bar')
                plt.title('Distribution of Risk Levels in Batch Predictions')
                plt.xlabel('Risk Level')
                plt.ylabel('Count')
                plt.tight_layout()
                plt.show()
        else:
            print("\nThe batch API response contained no predictions to summarize.")
except Exception as e:
    print(f"Error processing batch API result: {str(e)}")
    print("Note: You need to start the API server to use this feature.")

## 6. Environment-Specific Configuration

The Heart Disease Prediction system supports environment-specific configuration. Let's explore how this works.

In [None]:
try:
    from src.utils import load_config

    # Load the configuration
    config = load_config()

    # List the top-level sections
    print("Configuration Sections:")
    for section in config.keys():
        print(f"  - {section}")

    # Print the scalar settings of the API and model sections, skipping nested dicts
    for section_key, heading in (
        ('api', "\nAPI Configuration:"),
        ('models', "\nModel Configuration:"),
    ):
        if section_key in config:
            print(heading)
            for key, value in config[section_key].items():
                if not isinstance(value, dict):
                    print(f"  {key}: {value}")
except ImportError as e:
    print(f"Error importing config module: {str(e)}")
    print("Will show example config structure instead")

    # Illustrative configuration, assembled section by section in display order
    caching_settings = {
        "enabled": True,
        "max_size": 1000,
        "ttl": 3600
    }
    api_settings = {
        "host": "localhost",
        "port": 8000,
        "batch_size": 50,
        "max_workers": 4,
        "caching": caching_settings
    }
    model_settings = {
        "model_dir": "models",
        "default_model": "ensemble"
    }
    data_settings = {
        "processed_dir": "data/processed",
        "raw_dir": "data/raw"
    }
    example_config = {
        "api": api_settings,
        "models": model_settings,
        "data": data_settings
    }

    print("\nExample Configuration Structure:")
    print(json.dumps(example_config, indent=2))
except Exception as e:
    print(f"Error loading configuration: {str(e)}")

## 7. Summary and Next Steps

In this tutorial, we've covered:
1. Loading and exploring heart disease prediction data
2. Using pre-trained MLP models
3. Making predictions using the API
4. Batch processing capabilities
5. Environment-specific configuration

Next steps you might want to explore:
- Experiment with different model hyperparameters
- Try feature engineering techniques
- Explore model interpretability in more depth
- Implement a custom web interface for the API

For more details, see the project documentation in the `/docs` directory.