# Tabula-8B VA Cause of Death Prediction on Google Colab

This notebook runs the Tabula-8B model on Google Colab with GPU acceleration for fast inference.

**Requirements:**
- Google Colab with GPU runtime (T4 or better)
- PHMRC dataset file
- ~20GB free space for model


## Step 1: Setup GPU Runtime

**IMPORTANT**: Before running this notebook:
1. Go to `Runtime` → `Change runtime type`
2. Select `GPU` as Hardware accelerator (T4 or better)
3. Click `Save`

In [None]:
# Check GPU availability
import torch
import subprocess
import sys

if torch.cuda.is_available():
    print(f"✅ GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    !nvidia-smi
else:
    print("❌ No GPU detected! Please enable GPU runtime:")
    print("   Runtime → Change runtime type → GPU")

## Step 2: Clone Repository and Install Dependencies

In [None]:
# Clone the repository
!git clone https://github.com/cliu238/tabula-8b-va-prediction.git
%cd tabula-8b-va-prediction

# Install required packages
!pip install -q transformers torch accelerate datasets pandas numpy scikit-learn tqdm python-dotenv pillow

## Step 3: Upload PHMRC Data

Upload your PHMRC CSV file when prompted. The file should be named:
`IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv`

In [None]:
from google.colab import files
import os

# Create data directory
os.makedirs('data/raw/PHMRC', exist_ok=True)

print("Please upload the PHMRC adult dataset CSV file:")
uploaded = files.upload()

# Move uploaded file to correct location
for filename in uploaded.keys():
    if 'ADULT' in filename:
        !mv "{filename}" data/raw/PHMRC/
        print(f"✅ Moved {filename} to data/raw/PHMRC/")

# Verify file exists
!ls -la data/raw/PHMRC/

## Step 4: Preprocess PHMRC Data

This step cleans and prepares the raw PHMRC data for Tabula-8B model input.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Create the preprocessor
from src.data.preprocessor import PHMRCPreprocessor

print("Preprocessing PHMRC data...")
print("=" * 60)

# Read the raw PHMRC export through the project's preprocessor
preprocessor = PHMRCPreprocessor()
raw_data_path = 'data/raw/PHMRC/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv'
df_raw = preprocessor.load_data(raw_data_path)

# Report the raw table's dimensions
print(f"\nRaw data shape: {df_raw.shape}")
for label, axis in (("Columns", 1), ("Records", 0)):
    print(f"{label}: {df_raw.shape[axis]}")

# Run the preprocessing, keeping the target column for later evaluation
df_processed = preprocessor.preprocess(df_raw, keep_target=True)

print(f"\nProcessed data shape: {df_processed.shape}")
# One column is the target, so it is excluded from the feature count
feature_count = df_processed.shape[1] - 1
print(f"Features after preprocessing: {feature_count}")

# Summarize the target distribution when the target column is present
if 'cause_of_death' in df_processed.columns:
    target = df_processed['cause_of_death']
    print(f"\nUnique causes of death: {target.nunique()}")
    print("\nTop 10 causes:")
    print(target.value_counts().head(10))

# Persist the preprocessed table for the prediction script
Path('data/preprocessed').mkdir(parents=True, exist_ok=True)
processed_path = 'data/preprocessed/phmrc_processed.csv'
df_processed.to_csv(processed_path, index=False)
print(f"\n✅ Preprocessed data saved to {processed_path}")

## Step 5: Create GPU-Optimized Prediction Script with Proper CSV Formatting

In [ ]:
%%writefile run_colab.py
#!/usr/bin/env python
"""
Colab-optimized script for Tabula-8B VA prediction with GPU support
Uses proper CSV formatting for Tabula-8B model
"""

import sys
import torch
import pandas as pd
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# Add src to path
sys.path.insert(0, str(Path.cwd()))

from src.data.preprocessor import PHMRCPreprocessor
from src.data.tabula_formatter import TabulaVAFormatter

def load_model_gpu():
    """Load the Tabula-8B model and tokenizer for half-precision inference.

    The weights are requested in float16 with ``device_map="auto"``, which
    leaves device placement to the library. The tokenizer is prepared for
    batched generation: EOS is reused as the pad token when none is defined,
    and padding is applied on the left so generated tokens directly follow
    each prompt.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("Loading Tabula-8B model on GPU...")

    model_name = "mlfoundations/tabula-8b"

    # NOTE(review): trust_remote_code=True permits code shipped with the model
    # repository to execute locally - confirm this is still required.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only models must be left-padded when prompts are batched.
    tokenizer.padding_side = "left"

    # Querying the CUDA device name raises when no GPU is present, so only
    # report it when CUDA is actually available.
    if torch.cuda.is_available():
        print(f"✅ Model loaded on {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ Model loaded without a CUDA device; inference will be very slow")
    return model, tokenizer

def _build_va_prompt(row):
    """Render one record as the instruction prompt sent to the model.

    The ``cause_of_death`` column is dropped when present so the true label
    never appears in the prompt.
    """
    row_df = pd.DataFrame([row]).drop(columns=['cause_of_death'], errors='ignore')
    return (
        "Task: Predict cause of death from verbal autopsy data.\n\n"
        "Data (CSV format):\n"
        f"{row_df.to_csv(index=False)}\n"
        "Predict the cause of death. Common causes include: "
        "TB, AIDS/HIV, Malaria, Cardiovascular disease, Respiratory infections, "
        "Road traffic accidents, Suicide, Homicide, Maternal conditions.\n\n"
        "Cause of death:"
    )


def _clean_prediction(generated):
    """Reduce raw generated text to a single cause label ("Unknown" if empty)."""
    # Keep only the first line of the continuation
    cause = generated.strip().split('\n')[0].strip()
    # Drop any "label:" prefix the model may have echoed
    if ':' in cause:
        cause = cause.split(':')[-1].strip()
    # Remove any quotes or extra formatting
    cause = cause.replace('"', '').replace("'", '').strip()
    return cause if cause else "Unknown"


def predict_batch_gpu(model, tokenizer, df_batch, formatter, batch_size=4):
    """Generate one cause-of-death prediction per row of ``df_batch``.

    Rows are turned into CSV-style prompts, tokenized in groups of
    ``batch_size`` and passed to ``model.generate``. Only the newly generated
    tokens are decoded and cleaned into a label.

    Args:
        model: Causal language model exposing ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer matching ``model``; it must have a pad token.
        df_batch: DataFrame of records to predict. A ``cause_of_death``
            column, if present, is excluded from the prompts.
        formatter: Unused; accepted for interface compatibility.
        batch_size: Number of prompts per ``generate`` call.

    Returns:
        list[str]: One label per row, in row order ("Unknown" when the model
        produced nothing usable).
    """
    if len(df_batch) == 0:
        return []

    predictions = []
    device = next(model.parameters()).device

    # Decoder-only models continue from the last position of every sequence,
    # so batched prompts must be padded on the left; with right padding the
    # shorter prompts would be continued after their pad tokens. The caller's
    # setting is restored afterwards.
    previous_side = getattr(tokenizer, "padding_side", None)
    tokenizer.padding_side = "left"
    try:
        for start in tqdm(range(0, len(df_batch), batch_size), desc="Predicting"):
            batch_df = df_batch.iloc[start:start + batch_size]
            prompts = [_build_va_prompt(row) for _, row in batch_df.iterrows()]

            # Tokenize batch
            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048  # Increased for CSV data
            ).to(device)

            # Greedy decoding: this is a classification-style task, and
            # unseeded sampling would make repeated runs disagree.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=20,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id
                )

            # Every row shares the same padded prompt length, so the new
            # tokens start at that offset for the whole batch.
            prompt_len = inputs['input_ids'].shape[1]
            for output in outputs:
                generated = tokenizer.decode(
                    output[prompt_len:],
                    skip_special_tokens=True
                )
                predictions.append(_clean_prediction(generated))
    finally:
        if previous_side is not None:
            tokenizer.padding_side = previous_side

    return predictions

def main(sample_size=100):
    """Run the end-to-end prediction pipeline and save a results CSV.

    Loads the preprocessed PHMRC table (building and caching it from the raw
    CSV when the cache file is missing), draws a fixed-seed random sample,
    loads the model, generates predictions, writes
    ``predictions_gpu_<sample_size>.csv`` and prints a lenient accuracy figure.

    Args:
        sample_size: Maximum number of records to sample; capped at the number
            of available records.

    Returns:
        pandas.DataFrame: The sampled records with an added
        ``predicted_cause`` column.
    """

    # Load preprocessed data
    print("\n" + "="*60)
    print("Loading preprocessed data...")
    print("="*60)

    # Reuse the cached preprocessed file when present, otherwise build it
    processed_path = 'data/preprocessed/phmrc_processed.csv'
    if not Path(processed_path).exists():
        print("Preprocessed data not found. Running preprocessing...")
        preprocessor = PHMRCPreprocessor()
        df_raw = preprocessor.load_data('data/raw/PHMRC/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv')
        df_processed = preprocessor.preprocess(df_raw, keep_target=True)
        Path('data/preprocessed').mkdir(parents=True, exist_ok=True)
        df_processed.to_csv(processed_path, index=False)
    else:
        df_processed = pd.read_csv(processed_path)

    print(f"Loaded {len(df_processed)} records with {df_processed.shape[1]} columns")

    # Sample data for testing (fixed seed so the same rows are drawn each run)
    sample_df = df_processed.sample(n=min(sample_size, len(df_processed)), random_state=42)
    print(f"\nUsing {len(sample_df)} samples for prediction")

    # Initialize formatter
    formatter = TabulaVAFormatter()

    # Load model
    print("\n" + "="*60)
    print("Loading Tabula-8B model...")
    print("="*60)
    model, tokenizer = load_model_gpu()

    # Run predictions
    print("\n" + "="*60)
    print("Running predictions on GPU...")
    print("="*60)
    predictions = predict_batch_gpu(model, tokenizer, sample_df, formatter, batch_size=4)

    # Save results
    results_df = sample_df.copy()
    results_df['predicted_cause'] = predictions

    output_path = f'predictions_gpu_{sample_size}.csv'

    # Key columns first, then a few symptom columns. Only columns that actually
    # exist are written, so a missing optional column cannot raise KeyError
    # and discard the predictions after the expensive generation step.
    preferred_columns = [
        'age', 'gender', 'cause_of_death', 'predicted_cause',
        'fever', 'cough', 'difficulty_breathing', 'chest_pain',
    ]
    output_columns = [col for col in preferred_columns if col in results_df.columns]

    results_df[output_columns].to_csv(output_path, index=False)

    # Calculate accuracy (skipped for an empty sample to avoid dividing by zero)
    if 'cause_of_death' in results_df.columns and len(predictions) > 0:
        # Normalize for comparison; missing labels become empty strings so the
        # string operations below never see NaN.
        true_causes = results_df['cause_of_death'].fillna('').astype(str).str.lower().str.strip()
        pred_causes = results_df['predicted_cause'].fillna('').astype(str).str.lower().str.strip()

        # Lenient match: equal, or one label contained in the other. Empty
        # labels never count, since '' is a substring of every string.
        correct = sum(
            1 for true, pred in zip(true_causes, pred_causes)
            if true and pred and (true == pred or true in pred or pred in true)
        )
        accuracy = correct / len(predictions)
        print(f"\n✅ Accuracy: {accuracy:.2%} ({correct}/{len(predictions)})")

        # Show some examples
        print("\nSample predictions:")
        for i in range(min(5, len(results_df))):
            print(f"  True: {results_df.iloc[i]['cause_of_death']}")
            print(f"  Pred: {results_df.iloc[i]['predicted_cause']}")
            print()

    print(f"✅ Results saved to {output_path}")
    return results_df

if __name__ == "__main__":
    # Optional first command-line argument: number of records to sample
    # (defaults to 100 when omitted).
    cli_args = sys.argv[1:]
    sample_size = int(cli_args[0]) if cli_args else 100
    results = main(sample_size)

## Step 6: Download Model and Run Predictions

In [ ]:
# Sanity check: confirm the project's preprocessor module can be imported
# (this only tests the import; it does not run any preprocessing)
!python -c "from src.data.preprocessor import PHMRCPreprocessor; print('Preprocessor imported successfully')"

# Run predictions with GPU acceleration and CSV formatting
# This will download the model on first run (~16GB)
!python run_colab.py 50  # Process 50 samples

## Step 7: Analyze Results

In [ ]:
# Load and analyze results
import pandas as pd
import numpy as np

# Load prediction results
results_df = pd.read_csv('predictions_gpu_50.csv')

print("="*60)
print("PREDICTION RESULTS ANALYSIS")
print("="*60)

# Display sample predictions
print("\nSample Predictions (first 10):")
print("-"*50)
for i in range(min(10, len(results_df))):
    row = results_df.iloc[i]
    print(f"Record {i+1}:")
    # Age and gender are context only; fall back to a placeholder instead of
    # raising KeyError when the results file does not contain them.
    print(f"  Age: {row.get('age', 'N/A')}, Gender: {row.get('gender', 'N/A')}")
    print(f"  Actual: {row['cause_of_death']}")
    print(f"  Predicted: {row['predicted_cause']}")
    print()

# Calculate accuracy metrics
def normalize_cause(cause):
    """Return a canonical form of a cause label for comparison.

    Missing values map to "unknown"; anything else is converted to a string,
    lower-cased, stripped of surrounding whitespace, and has underscores
    replaced by spaces.
    """
    if not pd.isna(cause):
        text = str(cause).lower().strip()
        return text.replace('_', ' ')
    return "unknown"

# Add normalized label columns used by the comparisons below
for source_col, normalized_col in (
    ('cause_of_death', 'actual_normalized'),
    ('predicted_cause', 'predicted_normalized'),
):
    results_df[normalized_col] = results_df[source_col].apply(normalize_cause)

# Exact match accuracy
is_exact_match = results_df['actual_normalized'] == results_df['predicted_normalized']
exact_matches = is_exact_match.sum()
accuracy = exact_matches / len(results_df)

print("=" * 60)
print("ACCURACY METRICS")
print("=" * 60)
print(f"Total samples: {len(results_df)}")
print(f"Exact matches: {exact_matches}")
print(f"Accuracy: {accuracy:.2%}")

# Frequency of each actual and predicted cause (not a confusion matrix)
from collections import Counter

print("\n" + "=" * 60)
print("TOP CAUSES DISTRIBUTION")
print("=" * 60)

actual_counts = Counter(results_df['actual_normalized'])
predicted_counts = Counter(results_df['predicted_normalized'])

for heading, cause_counts in (
    ("\nTop 10 Actual Causes:", actual_counts),
    ("\nTop 10 Predicted Causes:", predicted_counts),
):
    print(heading)
    for cause, count in cause_counts.most_common(10):
        print(f"  {cause}: {count} ({count/len(results_df)*100:.1f}%)")

## Step 8: Download Results

In [ ]:
# Download the results file
from google.colab import files

# File written by the 50-sample prediction run
results_file = 'predictions_gpu_50.csv'

print("Downloading prediction results...")
files.download(results_file)
print("✅ Results downloaded successfully!")

## Step 9: Run Full Dataset (Optional)

⚠️ **Warning**: Processing the full dataset (7841 records) will take ~15-30 minutes even on GPU

In [None]:
# Run on full dataset (optional - takes 15-30 minutes)
# Uncomment the line below to process all records

# !python run_colab.py 7841  # Process full dataset

# Alternative: Process in smaller batches
# !python run_colab.py 500   # Process 500 samples
# !python run_colab.py 1000  # Process 1000 samples