### Clinical Triage Model Pipeline: V2 Documentation
Here I summarize the significant configuration and code improvements made to the Clinical Triage Prediction Model pipeline.

Training Data Size:
   - old value: 5000
   - new value: 232400 (N_TRAIN_ROWS; validation and test are each capped at 23240 rows via N_TEST_ROWS)
   - improvement rationale:
      - Massive increase in data used for fine-tuning. This is critical for improving model generalization and predictive power in a clinical setting.

Max Token Length:
   - old value: 512
   - new value: 256
   - improvement rationale:
      - Reduction in input length. This dramatically lowers the model's VRAM requirement per sample, speeding up training and reducing the risk of memory bottlenecks.

Batch Size (BATCH_SIZE):
   - old value: 2
   - new value: 16
   - improvement rationale:
      - Increased per-device batch size for faster training throughput.

Gradient Accumulation (GRAD_ACCUM_STEPS):
   - old value: Implicitly 1
   - new value: 2
   - improvement rationale:
      - Introduced to compensate for potential batch size limits: gradients are accumulated over 2 steps before each optimizer update, resulting in an effective batch size of 32 per device ($16 \times 2$). This helps stabilize training while conserving memory.

Epochs (NUM_EPOCHS):
   - old value: 3
   - new value: 10
   - improvement rationale:
      - Allows the model more time to learn complex patterns in the larger dataset.

Early Stopping Patience (PATIENCE):
   - old value: 1
   - new value: 3
   - improvement rationale:
      - Prevents premature stopping: training halts only after the validation loss has failed to improve for three consecutive evaluations (one evaluation per epoch).
   
Model Output Cleanup:
   - old value: None
   - new value: Added shutil.rmtree(MODEL_OUTPUT_DIR)
   - improvement rationale:
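      - Ensures a clean slate for each run by deleting old checkpoints in MODEL_OUTPUT_DIR before training, which frees up disk space. Previously processed data is removed separately: preprocess() deletes PROCESSED_DATA_DIR before saving the newly tokenized dataset.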



In [5]:
import pandas as pd
import numpy as np
import json
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
from typing import List, Dict, Any, Tuple

# Hugging Face and Transformers
# Import the Hugging Face stack; if any of it is missing, attempt a one-off
# pip install and stop so the run can be restarted with the packages present.
try:
    from datasets import Dataset, DatasetDict, load_from_disk
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback,
        pipeline
    )
except ImportError:
    print("ImportError: Hugging Face libraries not found.")
    print("Installing missing libraries...")
    # NOTE(review): the exit status of os.system is not checked, so the
    # "Libraries installed!" message below prints even if pip failed.
    os.system("pip install torch transformers datasets scikit-learn pandas numpy matplotlib seaborn accelerate")
    print("Libraries installed! Please restart the kernel if you see import errors.")
    # Stop here: the names imported in the try block are not bound in this
    # process, so nothing below could run anyway.
    exit()

# SKLearn
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix
)

# CONFIGURATION

In [8]:
# Data files: CSV splits read with pd.read_csv by get_label_maps() and preprocess().
TRAIN_FILE = "data/bigdata_models/train.csv"
VAL_FILE = "data/bigdata_models/val.csv"
TEST_FILE = "data/bigdata_models/test.csv"

# Train/Test data size: row caps passed as `nrows` to pd.read_csv.
N_TRAIN_ROWS = 232400  # cap for TRAIN_FILE
N_TEST_ROWS = 23240   # cap applied to both VAL_FILE and TEST_FILE

# Preprocessing: CSV column whose values become the class labels.
TARGET_COLUMN = 'discharge_disposition'

# Model: ClinicalBERT
# MODEL_NAME is the identifier handed to from_pretrained() for both the
# tokenizer and the sequence-classification model.
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
# Tokenizer max_length; inputs are truncated and padded to this length.
MAX_TOKEN_LENGTH = 256
# Directory where preprocess() saves the tokenized DatasetDict (deleted and
# rewritten on every preprocess() call).
PROCESSED_DATA_DIR = "data/processed_data/tpaul/processed_triage_data"
# JSON file where get_label_maps() writes label2id / id2label / num_labels.
LABEL_INFO_FILE = "data/info/tpaul/label_info.json"

# Training Output
# Trainer output_dir and final model/tokenizer location; finetune() removes
# this directory before training starts.
MODEL_OUTPUT_DIR = "models/tpaul/triage_classifier_model"
# Base name passed to shutil.make_archive in main(); the archive written is
# '<ZIP_NAME>.zip'.
ZIP_NAME = "triage_classifier_model"  # Name of the zip file to create

# Hyperparameters (consumed by TrainingArguments / EarlyStoppingCallback in finetune()).
BATCH_SIZE = 16          # per-device batch size for both training and evaluation
GRAD_ACCUM_STEPS = 2     # gradient_accumulation_steps
LEARNING_RATE = 2e-5     # learning_rate
NUM_EPOCHS = 10          # num_train_epochs (upper bound; early stopping may end sooner)
PATIENCE = 3             # early_stopping_patience

# Evaluation: path where evaluate_and_infer() saves the confusion-matrix plot.
CONFUSION_MATRIX_FILE = "evaluation/tpaul/confusion_matrix.png"

# PREPROCESSING

In [9]:
def get_label_maps() -> Dict[str, Any]:
    """Build label <-> id mappings from the target column of all three splits.

    Reads only TARGET_COLUMN from TRAIN_FILE, VAL_FILE and TEST_FILE (capped
    at N_TRAIN_ROWS / N_TEST_ROWS rows), normalizes the values (missing ->
    'Unknown', cast to str, surrounding whitespace stripped), and assigns ids
    in sorted label order. The result is also written to LABEL_INFO_FILE.

    Returns:
        A dict with keys 'label2id' (str -> int), 'id2label' (int -> str) and
        'num_labels'. Note that in the JSON file the id2label keys are
        strings, because JSON object keys cannot be integers.

    Raises:
        FileNotFoundError: if any of the three CSV files is missing. The file
            that could not be read is named in the printed hint.
    """
    print("Creating complete label mappings...")

    sources = (
        (TRAIN_FILE, N_TRAIN_ROWS),
        (VAL_FILE, N_TEST_ROWS),
        (TEST_FILE, N_TEST_ROWS),
    )
    frames = []
    for path, row_cap in sources:
        try:
            frames.append(pd.read_csv(path, usecols=[TARGET_COLUMN], nrows=row_cap))
        except FileNotFoundError as e:
            # Name the file that actually failed, not always the training file.
            print(f"\nCRITICAL ERROR: {e}")
            print(f"Please check that '{path}' exists.")
            raise

    # Same normalization that preprocess() applies per label, so every label
    # seen during tokenization has an entry in the map.
    labels = (
        pd.concat(frames)[TARGET_COLUMN]
        .fillna('Unknown')
        .astype(str)
        .str.strip()
    )

    # np.sort returns a sorted copy and accepts whatever array type unique()
    # yields, rather than relying on an in-place ndarray .sort().
    unique_labels = np.sort(labels.unique())

    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    num_labels = len(unique_labels)

    print(f"Found {num_labels} unique labels: {unique_labels}")

    label_info = {'label2id': label2id, 'id2label': id2label, 'num_labels': num_labels}

    # `or '.'` covers a LABEL_INFO_FILE with no directory component.
    os.makedirs(os.path.dirname(LABEL_INFO_FILE) or '.', exist_ok=True)
    with open(LABEL_INFO_FILE, 'w', encoding='utf-8') as f:
        json.dump(label_info, f)

    return label_info

def preprocess(label2id: Dict[str, int]):
    """Tokenize the train/validation/test CSVs and save them to disk.

    Each split is read with pandas (row-capped by N_TRAIN_ROWS / N_TEST_ROWS),
    converted to a Hugging Face Dataset, tokenized with the MODEL_NAME
    tokenizer, and stored together as a DatasetDict under PROCESSED_DATA_DIR
    (any existing directory there is deleted first).

    Args:
        label2id: mapping from cleaned label string to integer class id.
            Labels not present in the mapping fall back to the id of
            'Unknown', or 0 if 'Unknown' is not in the mapping either.

    Returns:
        None. If a CSV file is missing, an error is printed and the function
        returns early without writing anything.
    """
    print("\n--- STARTING PREPROCESSING ---")

    shown_train_rows = N_TRAIN_ROWS or 'ALL'
    print(f"Loading FULL datasets (Train Rows: {shown_train_rows})...")

    # Split name -> (csv path, row cap); insertion order is kept throughout.
    split_sources = {
        'train': (TRAIN_FILE, N_TRAIN_ROWS),
        'validation': (VAL_FILE, N_TEST_ROWS),
        'test': (TEST_FILE, N_TEST_ROWS),
    }
    try:
        frames = {
            split: pd.read_csv(path, nrows=row_cap)
            for split, (path, row_cap) in split_sources.items()
        }
    except FileNotFoundError:
        print("Error: Dataset files not found. Cannot proceed.")
        return

    print(
        f"Train shape: {frames['train'].shape}, "
        f"Val shape: {frames['validation'].shape}, "
        f"Test shape: {frames['test'].shape}"
    )

    print(f"Loading tokenizer: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def encode_batch(batch: Dict[str, List[Any]]) -> Dict[str, Any]:
        """Build the prompt text for a batch, tokenize it and attach label ids."""

        def as_text(values: List[Any]) -> List[str]:
            # Missing values become empty strings instead of "nan"/"None".
            return [str(value) if pd.notna(value) else "" for value in values]

        complaints = as_text(batch["chief_complaint"])
        histories = as_text(batch["history_of_present_illness"])
        prompts = [
            f"CHIEF COMPLAINT: {complaint} | HISTORY: {history}"
            for complaint, history in zip(complaints, histories)
        ]

        encoded = tokenizer(
            prompts,
            truncation=True,
            padding="max_length",
            max_length=MAX_TOKEN_LENGTH
        )

        if TARGET_COLUMN in batch:
            fallback_id = label2id.get('Unknown', 0)
            encoded["labels"] = [
                label2id.get(
                    str(raw).strip() if pd.notna(raw) else 'Unknown',
                    fallback_id,
                )
                for raw in batch[TARGET_COLUMN]
            ]

        return encoded

    print("Converting to Hugging Face Datasets...")
    raw_splits = {split: Dataset.from_pandas(frame) for split, frame in frames.items()}

    print("Tokenizing datasets (this may take a while)...")
    processed_dataset = DatasetDict({
        split: raw.map(encode_batch, batched=True)
        for split, raw in raw_splits.items()
    })

    # Save to disk, replacing whatever an earlier run left behind.
    print(f"Saving processed dataset to {PROCESSED_DATA_DIR}...")
    if os.path.exists(PROCESSED_DATA_DIR):
        shutil.rmtree(PROCESSED_DATA_DIR)
    processed_dataset.save_to_disk(PROCESSED_DATA_DIR)

    print("--- PREPROCESSING COMPLETE ---")


# MODEL FINE-TUNING

In [10]:
def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
    """Compute accuracy and weighted F1/precision/recall for one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (predictions, labels). The
            predicted class is taken as the argmax over axis 1 of the
            predictions (presumably per-class scores, one row per sample).

    Returns:
        Dict with keys 'accuracy', 'f1_weighted', 'precision_weighted' and
        'recall_weighted'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # Shared options: support-weighted averaging; undefined ratios count as 0.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1_weighted': f1_score(gold, predicted, **weighted),
        'precision_weighted': precision_score(gold, predicted, **weighted),
        'recall_weighted': recall_score(gold, predicted, **weighted),
    }

def finetune(label_info: Dict[str, Any], device: torch.device):
    """Fine-tune MODEL_NAME on the processed dataset and save the best model.

    Loads the DatasetDict from PROCESSED_DATA_DIR, trains with per-epoch
    evaluation and early stopping on eval_loss, reloads the best checkpoint,
    evaluates it on the validation split, and writes the model plus its
    tokenizer to MODEL_OUTPUT_DIR. Any existing MODEL_OUTPUT_DIR is removed
    before training.

    Args:
        label_info: dict with 'num_labels', 'label2id' and 'id2label'.
        device: device the model is moved to before the Trainer is built.

    Returns:
        None. If PROCESSED_DATA_DIR cannot be found, an error is printed and
        the function returns without training.
    """
    print("\n--- STARTING MODEL FINE-TUNING ---")

    try:
        splits = load_from_disk(PROCESSED_DATA_DIR)
    except FileNotFoundError:
        print(f"Error: No processed data found at {PROCESSED_DATA_DIR}. Run preprocessing first.")
        return

    print(f"Loading pre-trained model: {MODEL_NAME}...")
    label_config = {key: label_info[key] for key in ('num_labels', 'label2id', 'id2label')}
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)
    model.to(device)

    print("Configuring Best Performance Training Arguments...")

    # Start from an empty output directory; failure to delete is only a warning.
    if os.path.exists(MODEL_OUTPUT_DIR):
        print(f"Cleaning up old model directory {MODEL_OUTPUT_DIR} to free disk space...")
        try:
            shutil.rmtree(MODEL_OUTPUT_DIR)
        except Exception as cleanup_error:
            print(f"Warning: Could not fully delete old directory: {cleanup_error}")

    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

    # Mixed precision is enabled only when CUDA is available.
    use_fp16 = torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,

        # Optimization schedule
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=use_fp16,

        # Batching
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,

        # Evaluate and checkpoint once per epoch; keep the lowest eval_loss.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        # Logging
        logging_strategy="steps",
        logging_steps=50,
        report_to="none"
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=PATIENCE)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=splits['train'],
        eval_dataset=splits['validation'],
        compute_metrics=compute_metrics,
        callbacks=[early_stopping]
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating best model on validation set...")
    validation_metrics = trainer.evaluate()
    print(json.dumps(validation_metrics, indent=2))

    print(f"Saving final model to {MODEL_OUTPUT_DIR}...")
    trainer.save_model(MODEL_OUTPUT_DIR)

    # Store the tokenizer next to the weights so the directory is self-contained.
    AutoTokenizer.from_pretrained(MODEL_NAME).save_pretrained(MODEL_OUTPUT_DIR)

    print("--- FINE-TUNING COMPLETE ---")

# EVALUATION 

In [11]:
def evaluate_and_infer(label_info: Dict[str, Any], device: torch.device):
    """Evaluate the saved model on the test split and plot a confusion matrix.

    Loads the model and tokenizer from MODEL_OUTPUT_DIR, predicts on the
    'test' split stored under PROCESSED_DATA_DIR, prints a classification
    report, and saves a confusion-matrix heatmap to CONFUSION_MATRIX_FILE.

    Args:
        label_info: dict with 'id2label' (keys may be ints or numeric
            strings) and 'num_labels'.
        device: device the model is moved to before prediction.

    Returns:
        None. If the model or tokenizer cannot be loaded (OSError), an error
        is printed and the function returns without evaluating.
    """
    print("\n--- STARTING EVALUATION ---")

    try:
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_OUTPUT_DIR)
        # The tokenizer is not used further in this function; loading it
        # here routes a missing/unreadable tokenizer through the same
        # OSError handling as the model.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR)
        model.to(device)
    except OSError:
        print(f"Error: Could not load model from {MODEL_OUTPUT_DIR}")
        return

    processed_dataset = load_from_disk(PROCESSED_DATA_DIR)
    test_dataset = processed_dataset['test']

    # int(k) accepts both int keys and string keys (as produced by a JSON
    # round trip of the label info).
    id2label = {int(k): v for k, v in label_info['id2label'].items()}
    all_label_indices = list(range(label_info['num_labels']))
    all_label_names = [id2label.get(i, f"UNK_{i}") for i in all_label_indices]

    print("Generating predictions on Test Set...")
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./temp_eval",
            per_device_eval_batch_size=BATCH_SIZE,
            # Match finetune(): keep experiment-tracking integrations from
            # attaching to this prediction-only Trainer.
            report_to="none",
        )
    )

    predictions = trainer.predict(test_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids

    # Passing the full label list keeps classes absent from the test split in
    # the report and the matrix.
    print("\n--- Classification Report ---")
    print(classification_report(y_true, y_pred, labels=all_label_indices, target_names=all_label_names, zero_division=0))

    print("Generating Confusion Matrix...")
    cm = confusion_matrix(y_true, y_pred, labels=all_label_indices)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=all_label_names, yticklabels=all_label_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Triage Confusion Matrix')

    # `or '.'` covers a CONFUSION_MATRIX_FILE with no directory component.
    os.makedirs(os.path.dirname(CONFUSION_MATRIX_FILE) or '.', exist_ok=True)
    plt.savefig(CONFUSION_MATRIX_FILE)
    print(f"Saved plot to {CONFUSION_MATRIX_FILE}")


# MAIN EXECUTION

In [None]:
def main():
    """Run the whole pipeline: label maps, preprocessing, training, evaluation, zip.

    Any exception raised by a stage is caught here, reported with a
    traceback, and not re-raised.

    NOTE(review): preprocess(), finetune() and evaluate_and_infer() print an
    error and return (rather than raise) on some failures, so the SUCCESS
    message below can still appear after a stage bailed out.
    """
    print("====== STARTING CLINICAL TRIAGE PIPELINE ======")

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Device: {device}")

    try:
        label_info = get_label_maps()
        label2id = label_info['label2id']

        preprocess(label2id)
        finetune(label_info, device)
        evaluate_and_infer(label_info, device)

        print(" ZIPPING MODEL FOR DOWNLOAD ")

        # Writes '<ZIP_NAME>.zip' containing the contents of MODEL_OUTPUT_DIR.
        shutil.make_archive(ZIP_NAME, 'zip', MODEL_OUTPUT_DIR)

        print("SUCCESS! The model has been saved and zipped.")
        print(f"You should see '{ZIP_NAME}.zip' in your file browser now.")

    except Exception as err:
        print(f"\nAn error occurred: {err}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()