# PDF Sentence Extraction and Multi-Label Theme Classification
## Fine-tuned Model for Prevention of Future Deaths Report Analysis

**Version 2.0 - Fixed for small datasets with class imbalance**

This notebook:
1. Extracts sentences from PDF documents
2. Fine-tunes a transformer model for multi-label classification
3. Handles encoding issues and class imbalance
4. Includes data augmentation for rare classes
5. Annotates sentences with theme codes

## 1. Install Required Libraries

In [1]:
!pip install -q transformers datasets torch pdfplumber nltk scikit-learn pandas numpy accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Import Libraries and Setup

In [2]:
import pandas as pd
import numpy as np
import pdfplumber
import nltk
import torch
import re
import random
from pathlib import Path
from typing import List, Dict, Tuple
from collections import Counter

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Download NLTK data for sentence tokenization
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

Using device: cuda


## 3. Upload Your Files

Upload:
1. Your ground truth CSV file (`ground_truth_Annotations_.csv`)
2. PDF files you want to process

In [3]:
from google.colab import files

print("Upload your ground truth CSV file:")
uploaded = files.upload()
csv_filename = list(uploaded.keys())[0]
print(f"Uploaded: {csv_filename}")

Upload your ground truth CSV file:


Saving ground_truth(Annotations).csv to ground_truth(Annotations).csv
Uploaded: ground_truth(Annotations).csv


## 4. Load and Explore Ground Truth Data

In [4]:
# Load the CSV file with encoding handling
def load_csv_with_encoding(filename):
    """Try multiple encodings to load CSV file"""
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']

    for encoding in encodings:
        try:
            df = pd.read_csv(filename, encoding=encoding)
            print(f"Successfully loaded with {encoding} encoding")
            return df
        except UnicodeDecodeError:
            continue

    # If all fail, use latin-1 with error handling
    print("Using latin-1 encoding with error handling")
    return pd.read_csv(filename, encoding='latin-1', encoding_errors='replace')

df = load_csv_with_encoding(csv_filename)

print("\nDataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nColumn names:")
print(df.columns.tolist())

print("\nMissing values:")
print(df.isnull().sum())

Successfully loaded with latin-1 encoding

Dataset shape: (163, 4)

First few rows:
                                         document_id  \
0  George-Fraser-Prevention-of-Future-Deaths-Repo...   
1  George-Fraser-Prevention-of-Future-Deaths-Repo...   
2  George-Fraser-Prevention-of-Future-Deaths-Repo...   
3  Harry-Southern-Prevention-of-Future-Deaths-Rep...   
4  Hayley-Beavington-Prevention-of-Future-Deaths-...   

                                            sentence  themes  \
0                  no clear and documented care plan      O3   
1                          no robust risk assessment  C1, O4   
2                                No action was taken      H1   
3  contact numbers are not answered and do not ca...      O5   
4               did not give the FY1 any instruction      O3   

                                         theme_names  
0             Organisational Factors - Care Planning  
1  Communication and Culture - Safety Culture, Or...  
2                      Human 

## 5. Prepare Data for Multi-Label Classification

In [5]:
def parse_theme_codes(theme_str):
    """Parse theme codes from string format (e.g., 'C1, O4' or 'H1')"""
    if pd.isna(theme_str):
        return []
    # Remove quotes and split by comma
    themes = [t.strip().strip('"') for t in str(theme_str).split(',')]
    return [t for t in themes if t]  # Remove empty strings

def clean_text(text):
    """Clean text by replacing encoding artifacts"""
    if pd.isna(text):
        return text
    replacements = {
        '\x91': "'",  # Left single quote
        '\x92': "'",  # Right single quote
        '\x93': '"',  # Left double quote
        '\x94': '"',  # Right double quote
        '\x96': '-',  # En dash
        '\x97': '-',  # Em dash
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

# Clean sentences in the dataframe
df['sentence'] = df['sentence'].apply(clean_text)

# Parse theme codes
df['theme_list'] = df['themes'].apply(parse_theme_codes)

# Get all unique theme codes
all_themes = set()
for themes in df['theme_list']:
    all_themes.update(themes)
all_themes = sorted(list(all_themes))

print(f"Total unique theme codes: {len(all_themes)}")
print(f"\nTheme codes: {all_themes}")

# Count theme frequencies
theme_counts = Counter([theme for themes in df['theme_list'] for theme in themes])
print("\nTheme distribution:")
for theme, count in theme_counts.most_common():
    print(f"{theme}: {count}")

Total unique theme codes: 17

Theme codes: ['C1', 'C2', 'E2', 'H1', 'H2', 'L1', 'L4', 'L5', 'O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'S1', 'S2', 'S4']

Theme distribution:
C2: 28
O3: 24
O4: 19
H1: 15
O6: 13
H2: 13
O5: 11
O1: 9
E2: 9
C1: 8
O2: 8
L1: 7
S2: 4
L4: 4
S1: 3
S4: 3
L5: 1


In [6]:
# Create multi-label binary encoding
mlb = MultiLabelBinarizer(classes=all_themes)
labels_encoded = mlb.fit_transform(df['theme_list'])

print(f"Label matrix shape: {labels_encoded.shape}")
print(f"Number of labels per sample (min/mean/max): {labels_encoded.sum(axis=1).min():.2f} / {labels_encoded.sum(axis=1).mean():.2f} / {labels_encoded.sum(axis=1).max():.2f}")

# Create mapping dictionaries
id2label = {idx: label for idx, label in enumerate(all_themes)}
label2id = {label: idx for idx, label in enumerate(all_themes)}

print(f"\nLabel mappings created for {len(all_themes)} themes")

Label matrix shape: (163, 17)
Number of labels per sample (min/mean/max): 1.00 / 1.10 / 3.00

Label mappings created for 17 themes


## 6. Create Training and Validation Datasets

In [7]:
# ==================== IMPROVED DATA SPLIT WITH ITERATIVE STRATIFICATION ====================
from sklearn.model_selection import train_test_split
from collections import Counter

# Prepare data
sentences = df['sentence'].tolist()

# For very small datasets, use a different split ratio
# Use 70/15/15 instead of 80/10/10 to ensure test set has enough samples
print(f"Total samples: {len(sentences)}")

# First split: 70% train, 30% temp
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    sentences, labels_encoded,
    test_size=0.30,  # Changed from 0.20
    random_state=42,
    stratify=None
)

# Second split: 15% validation, 15% test (from the 30% temp)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.50,  # 50% of 30% = 15% overall
    random_state=42
)

print(f"Training samples: {len(train_texts)} ({len(train_texts)/len(sentences)*100:.1f}%)")
print(f"Validation samples: {len(val_texts)} ({len(val_texts)/len(sentences)*100:.1f}%)")
print(f"Test samples: {len(test_texts)} ({len(test_texts)/len(sentences)*100:.1f}%)")

# Convert labels to float32
train_labels = train_labels.astype(np.float32)
val_labels = val_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)

# Check test set theme distribution
test_theme_counts = Counter([theme for idx in range(len(test_texts))
                             for j in range(len(all_themes))
                             if test_labels[idx][j] == 1
                             for theme in [all_themes[j]]])

print("\nTest set theme distribution:")
for theme in all_themes:
    count = test_theme_counts.get(theme, 0)
    print(f"  {theme}: {count}")

# Create datasets
train_dataset = Dataset.from_dict({'text': train_texts, 'labels': train_labels.tolist()})
val_dataset = Dataset.from_dict({'text': val_texts, 'labels': val_labels.tolist()})
test_dataset = Dataset.from_dict({'text': test_texts, 'labels': test_labels.tolist()})

print("\n✓ Datasets created with improved split ratio")

Total samples: 163
Training samples: 114 (69.9%)
Validation samples: 24 (14.7%)
Test samples: 25 (15.3%)

Test set theme distribution:
  C1: 0
  C2: 5
  E2: 1
  H1: 6
  H2: 3
  L1: 0
  L4: 1
  L5: 0
  O1: 0
  O2: 0
  O3: 4
  O4: 2
  O5: 2
  O6: 1
  S1: 0
  S2: 0
  S4: 0

✓ Datasets created with improved split ratio


## 7. Data Augmentation for Minority Classes

In [8]:
# ==================== MORE AGGRESSIVE DATA AUGMENTATION ====================
print("\n" + "="*70)
print("AGGRESSIVE DATA AUGMENTATION FOR SMALL DATASET")
print("="*70)

def augment_sentence_aggressive(sentence):
    """More aggressive augmentation"""
    replacements = {
        'did not': ['failed to', 'neglected to', 'omitted to', 'refused to'],
        'lack of': ['absence of', 'insufficient', 'inadequate', 'deficiency in'],
        'no ': ['absence of ', 'lacking ', 'without ', 'missing '],
        'delay': ['postponement', 'wait', 'holdup', 'lag'],
        'staff': ['personnel', 'workers', 'employees', 'practitioners', 'clinicians'],
        'training': ['education', 'instruction', 'preparation', 'development'],
        'communication': ['information sharing', 'dialogue', 'correspondence', 'contact'],
        'assessment': ['evaluation', 'review', 'examination', 'appraisal'],
        'not': ['never', 'rarely', 'seldom'],
        'poor': ['inadequate', 'insufficient', 'substandard', 'deficient'],
    }

    variations = [sentence]
    sentence_lower = sentence.lower()

    for orig, syns in replacements.items():
        if orig in sentence_lower:
            for syn in syns[:2]:  # Use top 2 synonyms
                new_sent = sentence.replace(orig, syn)
                if new_sent != sentence:
                    variations.append(new_sent)

    return variations

# MUCH more aggressive augmentation
augmented_texts = []
augmented_labels = []

for i, text in enumerate(train_texts):
    # Add original
    augmented_texts.append(text)
    augmented_labels.append(train_labels[i])

    # Check themes
    themes = [j for j in range(len(all_themes)) if train_labels[i][j] == 1]
    theme_names = [all_themes[j] for j in themes]
    min_count = min([theme_counts[name] for name in theme_names], default=100)

    # For ALL samples with rare themes (< 20), create multiple augmentations
    if min_count < 20:
        variations = augment_sentence_aggressive(text)

        # Add 3-5 variations depending on rarity
        num_to_add = max(3, min(5, 20 - min_count))

        for var in variations[1:num_to_add+1]:  # Skip first (original)
            augmented_texts.append(var)
            augmented_labels.append(train_labels[i])

print(f"\nOriginal training size: {len(train_texts)}")
print(f"Augmented training size: {len(augmented_texts)}")
print(f"Added {len(augmented_texts) - len(train_texts)} synthetic examples")

# Update training data
train_texts = augmented_texts
train_labels = np.array(augmented_labels, dtype=np.float32)

# Recreate dataset
train_dataset = Dataset.from_dict({'text': train_texts, 'labels': train_labels.tolist()})
print("✓ Training dataset updated with aggressive augmentation")


AGGRESSIVE DATA AUGMENTATION FOR SMALL DATASET

Original training size: 114
Augmented training size: 215
Added 101 synthetic examples
✓ Training dataset updated with aggressive augmentation


In [9]:
# Install translation library for back-translation augmentation
!pip install -q deep-translator

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
# ==================== ENHANCED BACK-TRANSLATION AUGMENTATION ====================
print("\n" + "="*70)
print("ENHANCED SYNTHETIC DATA GENERATION - BACK-TRANSLATION")
print("="*70)

from deep_translator import GoogleTranslator
import time

def back_translate_text(text, intermediate_lang='de'):
    """
    Back-translate text for augmentation
    """
    try:
        # Translate to intermediate language
        translator_to = GoogleTranslator(source='en', target=intermediate_lang)
        intermediate = translator_to.translate(text)

        # Small delay to avoid rate limits
        time.sleep(0.1)

        # Translate back to English
        translator_back = GoogleTranslator(source=intermediate_lang, target='en')
        back_translated = translator_back.translate(intermediate)

        # Only return if meaningfully different and reasonable length
        if back_translated != text and len(back_translated) > 10 and len(back_translated) < 200:
            return back_translated
    except Exception as e:
        pass

    return None

# Enhanced augmentation combining original simple augmentation + back-translation
print("\nGenerating enhanced synthetic data...")

enhanced_texts = []
enhanced_labels = []

# Start with all original training data
for i, text in enumerate(train_texts):
    enhanced_texts.append(text)
    enhanced_labels.append(train_labels[i])

# Count synthetic examples by method
back_translation_count = 0

# Generate back-translations for rare themes
for i, text in enumerate(train_texts):
    # Check which themes this sample has
    themes = [j for j in range(len(all_themes)) if train_labels[i][j] == 1]
    theme_names = [all_themes[j] for j in themes]

    # Get minimum count for these themes
    theme_counts_dict = {all_themes[j]: theme_counts[all_themes[j]] for j in themes}
    min_count = min(theme_counts_dict.values(), default=100)

    # Generate back-translations for themes with < 12 examples
    if min_count < 12:
        # Use multiple languages for diversity
        languages = ['de', 'fr', 'es']  # German, French, Spanish

        # More augmentations for rarer themes
        num_augments = max(1, min(3, 12 - min_count))

        for lang in languages[:num_augments]:
            bt_text = back_translate_text(text, intermediate_lang=lang)
            if bt_text:
                enhanced_texts.append(bt_text)
                enhanced_labels.append(train_labels[i])
                back_translation_count += 1

print(f"\n✓ Original training size: {len(train_texts)}")
print(f"✓ Added via back-translation: {back_translation_count}")
print(f"✓ Enhanced training size: {len(enhanced_texts)}")
print(f"✓ Total synthetic examples: {len(enhanced_texts) - len(train_texts)}")

# Update training data
train_texts = enhanced_texts
train_labels = np.array(enhanced_labels, dtype=np.float32)

# Recreate training dataset with enhanced data
train_dataset = Dataset.from_dict({'text': train_texts, 'labels': train_labels.tolist()})
print("\n✓ Training dataset updated with enhanced synthetic data")
print("="*70)


ENHANCED SYNTHETIC DATA GENERATION - BACK-TRANSLATION

Generating enhanced synthetic data...

✓ Original training size: 215
✓ Added via back-translation: 272
✓ Enhanced training size: 487
✓ Total synthetic examples: 272

✓ Training dataset updated with enhanced synthetic data


## 8. Initialize Model and Tokenizer

In [11]:
# ==================== MODEL SELECTION ====================
# Choose your model by setting the selected_model variable
# Recommended: PathologyBERT for pathology reports

AVAILABLE_MODELS = {
    # General models
    'bert-base': 'bert-base-uncased',
    'distilbert': 'distilbert-base-uncased',  # Faster, smaller

    # Pathology-specific model ⭐⭐⭐
    'pathology-bert': 'tsantos/PathologyBERT',  # SPECIALIZED PATHOLOGY

    # Healthcare/Clinical models
    'bio-clinical-bert': 'emilyalsentzer/Bio_ClinicalBERT',  # Clinical notes
    'pubmed-bert': 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
    'biobert': 'dmis-lab/biobert-v1.1',  # Biomedical text
    'clinical-bert': 'medicalai/ClinicalBERT',  # Clinical text
    'bluebert': 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12',
    'gatortron': 'UFNLP/gatortron-base',  # Clinical notes (large)
}

MODEL_DESCRIPTIONS = {
    'bert-base': 'Standard BERT - Good baseline, general English',
    'distilbert': 'Smaller/faster BERT - 40% faster, 97% performance',
    'pathology-bert': '⭐⭐⭐ SPECIALIZED PATHOLOGY - Trained on pathology reports',
    'bio-clinical-bert': '⭐ Clinical notes (MIMIC-III) - For healthcare',
    'pubmed-bert': '⭐ PubMed biomedical - For medical terminology',
    'biobert': 'Biomedical text (PubMed)',
    'clinical-bert': 'Clinical text',
    'bluebert': 'PubMed + MIMIC clinical',
    'gatortron': 'Large clinical model - Slower but powerful',
}

print("Available Models for Healthcare/Clinical/Pathology Text:")
print("=" * 80)
for key, desc in MODEL_DESCRIPTIONS.items():
    print(f"{key:20s} | {desc}")
print("=" * 80)

# ==================== SELECT YOUR MODEL HERE ====================
# Change this to try different models:
selected_model = 'pathology-bert'  # ⭐⭐⭐ SPECIALIZED PATHOLOGY BERT
# selected_model = 'bio-clinical-bert'  # Alternative: clinical notes
# selected_model = 'pubmed-bert'        # Alternative: medical terminology
# selected_model = 'bert-base'          # Baseline comparison

model_name = AVAILABLE_MODELS[selected_model]
print(f"\n✓ Selected: {selected_model}")
print(f"  Model: {model_name}")
print(f"  Description: {MODEL_DESCRIPTIONS[selected_model]}")

# Load tokenizer
print(f"\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"✓ Tokenizer loaded successfully")

Available Models for Healthcare/Clinical/Pathology Text:
bert-base            | Standard BERT - Good baseline, general English
distilbert           | Smaller/faster BERT - 40% faster, 97% performance
pathology-bert       | ⭐⭐⭐ SPECIALIZED PATHOLOGY - Trained on pathology reports
bio-clinical-bert    | ⭐ Clinical notes (MIMIC-III) - For healthcare
pubmed-bert          | ⭐ PubMed biomedical - For medical terminology
biobert              | Biomedical text (PubMed)
clinical-bert        | Clinical text
bluebert             | PubMed + MIMIC clinical
gatortron            | Large clinical model - Slower but powerful

✓ Selected: pathology-bert
  Model: tsantos/PathologyBERT
  Description: ⭐⭐⭐ SPECIALIZED PATHOLOGY - Trained on pathology reports

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

✓ Tokenizer loaded successfully


In [12]:
# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch with float labels
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Ensure labels are float type
def convert_labels_to_float(batch):
    batch['labels'] = batch['labels'].float()
    return batch

train_dataset = train_dataset.map(convert_labels_to_float, batched=False)
val_dataset = val_dataset.map(convert_labels_to_float, batched=False)
test_dataset = test_dataset.map(convert_labels_to_float, batched=False)

print("Datasets tokenized and formatted with float labels")

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Datasets tokenized and formatted with float labels


In [13]:
# Load model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(all_themes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

model.to(device)
print(f"Model loaded with {len(all_themes)} output labels")

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/384M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tsantos/PathologyBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/384M [00:00<?, ?B/s]

Model loaded with 17 output labels


## 9. Define Training Configuration with Class Weights

In [14]:
def compute_metrics(eval_pred):
    """Compute metrics for multi-label classification"""
    predictions, labels = eval_pred

    # Apply sigmoid and threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = (probs > 0.5).int().numpy()
    y_true = labels.astype(np.float32).astype(int)

    # Calculate metrics
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # Per-label F1 scores
    f1_per_label = f1_score(y_true, y_pred, average=None, zero_division=0)

    metrics = {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
    }

    # Add per-label F1 scores
    for idx, label in id2label.items():
        metrics[f'f1_{label}'] = f1_per_label[idx]

    return metrics

In [15]:
# Calculate class weights to handle imbalance
class_weights = []
for i in range(len(all_themes)):
    pos_samples = train_labels[:, i].sum()
    neg_samples = len(train_labels) - pos_samples
    if pos_samples > 0:
        # Use square root to make weights less extreme
        weight = np.sqrt(neg_samples / pos_samples)
    else:
        weight = 1.0
    class_weights.append(weight)

class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
print(f"\nClass weights computed:")
for theme, weight in zip(all_themes, class_weights.cpu().numpy()):
    print(f"{theme}: {weight:.2f}")

# Custom Trainer with weighted loss
class WeightedMultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Weighted BCE loss
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Training arguments - optimized for small datasets
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=25,  # More epochs for small dataset
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=8,
    learning_rate=2e-5,  # Lower learning rate
    warmup_ratio=0.1,  # Warm up 10% of steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=5,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
    greater_is_better=True,
    save_total_limit=2,
    push_to_hub=False,
    report_to='none',
    fp16=torch.cuda.is_available(),
)

# Initialize weighted trainer
trainer = WeightedMultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

print("\nWeighted trainer configured and ready")


Class weights computed:
C1: 2.16
C2: 3.39
E2: 3.39
H1: 6.04
H2: 3.30
L1: 2.92
L4: 7.74
L5: 4.83
O1: 3.10
O2: 3.17
O3: 3.44
O4: 2.77
O5: 3.26
O6: 3.71
S1: 4.96
S2: 4.60
S4: 6.58

Weighted trainer configured and ready


## 10. Train the Model

In [16]:
# Train the model
print("Starting training...")
train_result = trainer.train()

print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")

# Save the model
trainer.save_model('./best_model')
tokenizer.save_pretrained('./best_model')
print("Model saved to ./best_model")

Starting training...


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 C1,F1 C2,F1 E2,F1 H1,F1 H2,F1 L1,F1 L4,F1 L5,F1 O1,F1 O2,F1 O3,F1 O4,F1 O5,F1 O6,F1 S1,F1 S2,F1 S4
1,0.6369,0.590925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.4794,0.555159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.3722,0.5061,0.0625,0.019608,0.061728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
4,0.2092,0.495907,0.358974,0.133894,0.330511,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0
5,0.1628,0.48821,0.341463,0.133894,0.330511,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0
6,0.1277,0.485258,0.341463,0.131092,0.321693,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0
7,0.104,0.493176,0.368421,0.131092,0.321693,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0
8,0.0911,0.5072,0.368421,0.133894,0.330511,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0
9,0.0757,0.542916,0.358974,0.133894,0.330511,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.857143,0.285714,0.0,0.0,0.0,0.0,0.0



Training completed!
Training loss: 0.2757
Model saved to ./best_model


## 11. Evaluate on Test Set

In [17]:
# Evaluate on test set
test_results = trainer.evaluate(test_dataset)

print("\n" + "="*60)
print("BERT MODEL TEST SET RESULTS")
print("="*60)
print(f"F1 Micro: {test_results['eval_f1_micro']:.4f}")
print(f"F1 Macro: {test_results['eval_f1_macro']:.4f}")
print(f"F1 Weighted: {test_results['eval_f1_weighted']:.4f}")

print("\nPer-label F1 Scores:")
for label in all_themes:
    score = test_results.get(f'eval_f1_{label}', 0)
    print(f"{label}: {score:.4f}")


BERT MODEL TEST SET RESULTS
F1 Micro: 0.1176
F1 Macro: 0.0882
F1 Weighted: 0.1000

Per-label F1 Scores:
C1: 0.0000
C2: 0.0000
E2: 0.0000
H1: 0.0000
H2: 0.5000
L1: 0.0000
L4: 0.0000
L5: 0.0000
O1: 0.0000
O2: 0.0000
O3: 0.0000
O4: 0.0000
O5: 0.0000
O6: 1.0000
S1: 0.0000
S2: 0.0000
S4: 0.0000


In [18]:
# Detailed predictions on test set
predictions = trainer.predict(test_dataset)
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(predictions.predictions))
y_pred = (probs > 0.5).int().numpy()
y_true = test_labels.astype(int)

# Per-label classification report
print("\nDetailed Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    target_names=all_themes,
    zero_division=0
))


Detailed Classification Report:
              precision    recall  f1-score   support

          C1       0.00      0.00      0.00         0
          C2       0.00      0.00      0.00         5
          E2       0.00      0.00      0.00         1
          H1       0.00      0.00      0.00         6
          H2       1.00      0.33      0.50         3
          L1       0.00      0.00      0.00         0
          L4       0.00      0.00      0.00         1
          L5       0.00      0.00      0.00         0
          O1       0.00      0.00      0.00         0
          O2       0.00      0.00      0.00         0
          O3       0.00      0.00      0.00         4
          O4       0.00      0.00      0.00         2
          O5       0.00      0.00      0.00         2
          O6       1.00      1.00      1.00         1
          S1       0.00      0.00      0.00         0
          S2       0.00      0.00      0.00         0
          S4       0.00      0.00      0.00     

In [19]:
# ==================== RE-EVALUATE WITH LOWER THRESHOLD ====================
print("\n" + "="*70)
print("RE-EVALUATION WITH LOWER THRESHOLD")
print("="*70)

# Get predictions with different thresholds
predictions_output = trainer.predict(test_dataset)
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(predictions_output.predictions))

# Try multiple thresholds
thresholds = [0.5, 0.4, 0.3, 0.2, 0.15]

best_f1 = 0
best_threshold = 0.5

for threshold in thresholds:
    y_pred = (probs > threshold).int().numpy()
    y_true = test_labels.astype(int)

    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"\nThreshold: {threshold}")
    print(f"  F1 Micro:    {f1_micro:.4f}")
    print(f"  F1 Macro:    {f1_macro:.4f}")
    print(f"  F1 Weighted: {f1_weighted:.4f}")

    # Track predictions per sample
    preds_per_sample = y_pred.sum(axis=1).mean()
    print(f"  Avg predictions per sample: {preds_per_sample:.2f}")

    if f1_weighted > best_f1:
        best_f1 = f1_weighted
        best_threshold = threshold

print(f"\n{'='*70}")
print(f"BEST THRESHOLD: {best_threshold} with F1 Weighted: {best_f1:.4f}")
print(f"{'='*70}")

# Use best threshold for detailed report
y_pred_best = (probs > best_threshold).int().numpy()
print(f"\nDetailed Report with Threshold={best_threshold}:")
print(classification_report(
    test_labels.astype(int),
    y_pred_best,
    target_names=all_themes,
    zero_division=0
))


RE-EVALUATION WITH LOWER THRESHOLD



Threshold: 0.5
  F1 Micro:    0.1176
  F1 Macro:    0.0882
  F1 Weighted: 0.1000
  Avg predictions per sample: 0.36

Threshold: 0.4
  F1 Micro:    0.1538
  F1 Macro:    0.1118
  F1 Weighted: 0.1320
  Avg predictions per sample: 0.56

Threshold: 0.3
  F1 Micro:    0.2909
  F1 Macro:    0.1706
  F1 Weighted: 0.3160
  Avg predictions per sample: 1.20

Threshold: 0.2
  F1 Micro:    0.2000
  F1 Macro:    0.1044
  F1 Weighted: 0.2521
  Avg predictions per sample: 3.00

Threshold: 0.15
  F1 Micro:    0.1401
  F1 Macro:    0.1015
  F1 Weighted: 0.2348
  Avg predictions per sample: 5.28

BEST THRESHOLD: 0.3 with F1 Weighted: 0.3160

Detailed Report with Threshold=0.3:
              precision    recall  f1-score   support

          C1       0.00      0.00      0.00         0
          C2       0.60      0.60      0.60         5
          E2       0.00      0.00      0.00         1
          H1       0.00      0.00      0.00         6
          H2       1.00      0.33      0.50         3
      

## 12. Baseline Comparison: TF-IDF + Logistic Regression

In [20]:
# Train baseline TF-IDF + Logistic Regression model for comparison
print("Training baseline TF-IDF + Logistic Regression model...\n")

# Use original non-augmented data for fair comparison
train_texts_orig, temp_texts, train_labels_orig, temp_labels = train_test_split(
    sentences, labels_encoded, test_size=0.2, random_state=42, stratify=None
)
val_texts_orig, test_texts_orig, val_labels_orig, test_labels_orig = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3), min_df=1)
X_train_tfidf = vectorizer.fit_transform(train_texts_orig)
X_test_tfidf = vectorizer.transform(test_texts_orig)

# Train one-vs-rest classifier
baseline_model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42, C=1.0)
)
baseline_model.fit(X_train_tfidf, train_labels_orig)

# Predict
y_pred_baseline = baseline_model.predict(X_test_tfidf)

# Evaluate
f1_micro_baseline = f1_score(test_labels_orig, y_pred_baseline, average='micro', zero_division=0)
f1_macro_baseline = f1_score(test_labels_orig, y_pred_baseline, average='macro', zero_division=0)
f1_weighted_baseline = f1_score(test_labels_orig, y_pred_baseline, average='weighted', zero_division=0)

print("="*60)
print("BASELINE (TF-IDF + LOGISTIC REGRESSION) TEST SET RESULTS")
print("="*60)
print(f"F1 Micro: {f1_micro_baseline:.4f}")
print(f"F1 Macro: {f1_macro_baseline:.4f}")
print(f"F1 Weighted: {f1_weighted_baseline:.4f}")

# Per-label scores
f1_per_label_baseline = f1_score(test_labels_orig, y_pred_baseline, average=None, zero_division=0)
print("\nPer-label F1 Scores:")
for idx, label in enumerate(all_themes):
    print(f"{label}: {f1_per_label_baseline[idx]:.4f}")

print("\n" + "="*60)
print("COMPARISON")
print("="*60)
print(f"BERT F1 Weighted: {test_results['eval_f1_weighted']:.4f}")
print(f"Baseline F1 Weighted: {f1_weighted_baseline:.4f}")
improvement = ((test_results['eval_f1_weighted'] - f1_weighted_baseline) / f1_weighted_baseline * 100) if f1_weighted_baseline > 0 else 0
print(f"BERT improvement: {improvement:+.1f}%")

Training baseline TF-IDF + Logistic Regression model...

BASELINE (TF-IDF + LOGISTIC REGRESSION) TEST SET RESULTS
F1 Micro: 0.4800
F1 Macro: 0.2176
F1 Weighted: 0.3889

Per-label F1 Scores:
C1: 0.0000
C2: 0.8000
E2: 0.0000
H1: 0.4000
H2: 0.0000
L1: 0.0000
L4: 0.0000
L5: 0.0000
O1: 0.0000
O2: 0.0000
O3: 0.5000
O4: 1.0000
O5: 0.0000
O6: 1.0000
S1: 0.0000
S2: 0.0000
S4: 0.0000

COMPARISON
BERT F1 Weighted: 0.1000
Baseline F1 Weighted: 0.3889
BERT improvement: -74.3%


## 13. PDF Extraction Functions

In [21]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from PDF file"""
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

def clean_sentence(sentence: str) -> str:
    """Clean extracted sentence"""
    # Replace common encoding artifacts (Windows-1252 smart quotes, etc.)
    replacements = {
        '\x91': "'",  # Left single quote
        '\x92': "'",  # Right single quote
        '\x93': '"',  # Left double quote
        '\x94': '"',  # Right double quote
        '\x96': '-',  # En dash
        '\x97': '-',  # Em dash
    }
    for old, new in replacements.items():
        sentence = sentence.replace(old, new)

    # Remove extra whitespace
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    # Remove very short sentences (likely artifacts)
    if len(sentence) < 10:
        return ""
    return sentence

def extract_sentences_from_text(text: str) -> List[str]:
    """Extract and clean sentences from text"""
    # Use NLTK sentence tokenizer
    sentences = nltk.sent_tokenize(text)
    # Clean sentences
    cleaned_sentences = [clean_sentence(s) for s in sentences]
    # Remove empty sentences
    return [s for s in cleaned_sentences if s]

def extract_sentences_from_pdf(pdf_path: str) -> List[str]:
    """Extract sentences from PDF file"""
    text = extract_text_from_pdf(pdf_path)
    return extract_sentences_from_text(text)

print("PDF extraction functions defined")

PDF extraction functions defined


## 14. Inference Function

In [None]:
def predict_themes(texts: List[str], threshold: float = 0.3) -> List[Dict]:
    """
    Predict themes for a list of texts

    Args:
        texts: List of text strings to classify
        threshold: Probability threshold for positive prediction (default 0.3)

    Returns:
        List of dictionaries containing predictions for each text
    """
    # Tokenize
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get predictions
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Apply sigmoid and threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(outputs.logits).cpu().numpy()

    # Process results
    results = []
    for i, text in enumerate(texts):
        pred_labels = []
        pred_probs = {}

        for j, prob in enumerate(probs[i]):
            label = id2label[j]
            pred_probs[label] = float(prob)
            if prob > threshold:
                pred_labels.append(label)

        results.append({
            'text': text,
            'predicted_themes': pred_labels,
            'probabilities': pred_probs,
            'top_themes': sorted(pred_probs.items(), key=lambda x: x[1], reverse=True)[:5]
        })

    return results

print("Inference function defined")

## 15. Test Inference on Sample Sentences

In [None]:
# Test with some sample sentences
test_sentences = [
    "The staff did not receive adequate training on the new procedures.",
    "Communication between departments was poor and led to delays.",
    "The physical environment was unsuitable for patient safety.",
    "No robust risk assessment was conducted.",
    "The care plan was not properly documented."
]

# Use lower threshold for better recall
predictions = predict_themes(test_sentences, threshold=0.25)

print("\nSample Predictions:")
print("=" * 80)
for pred in predictions:
    print(f"\nText: {pred['text']}")
    print(f"Predicted Themes: {', '.join(pred['predicted_themes']) if pred['predicted_themes'] else 'None'}")
    print("Top 5 theme probabilities:")
    for theme, prob in pred['top_themes'][:5]:
        print(f"  {theme}: {prob:.4f}")
    print("-" * 80)

## 16. Process PDF Files

In [None]:
# Upload PDF files
print("Upload PDF files to process:")
uploaded_pdfs = files.upload()

pdf_files = list(uploaded_pdfs.keys())
print(f"\nUploaded {len(pdf_files)} PDF file(s)")

In [None]:
def process_pdf_and_annotate(pdf_path: str, threshold: float = 0.3, batch_size: int = 32) -> pd.DataFrame:
    """
    Process PDF file: extract sentences and annotate with themes

    Args:
        pdf_path: Path to PDF file
        threshold: Probability threshold for predictions
        batch_size: Number of sentences to process at once

    Returns:
        DataFrame with sentences and their predicted themes
    """
    print(f"\nProcessing: {pdf_path}")

    # Extract sentences
    sentences = extract_sentences_from_pdf(pdf_path)
    print(f"Extracted {len(sentences)} sentences")

    if not sentences:
        print("No sentences extracted from PDF")
        return pd.DataFrame()

    # Process in batches
    all_results = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        batch_results = predict_themes(batch, threshold=threshold)
        all_results.extend(batch_results)

    # Create DataFrame
    results_df = pd.DataFrame([
        {
            'document': Path(pdf_path).stem,
            'sentence': r['text'],
            'predicted_themes': ', '.join(r['predicted_themes']),
            'num_themes': len(r['predicted_themes']),
            'top_theme': r['top_themes'][0][0] if r['top_themes'] else '',
            'top_probability': r['top_themes'][0][1] if r['top_themes'] else 0.0
        }
        for r in all_results
    ])

    return results_df

# Process all uploaded PDFs
all_annotations = []

for pdf_file in pdf_files:
    df_annotations = process_pdf_and_annotate(pdf_file, threshold=0.25)
    if not df_annotations.empty:
        all_annotations.append(df_annotations)
        print(f"Annotated {len(df_annotations)} sentences from {pdf_file}")

# Combine all results
if all_annotations:
    final_df = pd.concat(all_annotations, ignore_index=True)
    print(f"\nTotal sentences annotated: {len(final_df)}")
    print(f"\nTheme distribution:")
    print(final_df['num_themes'].value_counts().sort_index())
else:
    print("No annotations generated")
    final_df = pd.DataFrame()

## 17. View and Export Results

In [None]:
# Display sample results
if not final_df.empty:
    print("\nSample Annotations:")
    print(final_df.head(20))

    # Save to CSV
    output_filename = 'annotated_sentences.csv'
    final_df.to_csv(output_filename, index=False)
    print(f"\nResults saved to {output_filename}")

    # Download the file
    files.download(output_filename)
else:
    print("No results to display")

## 18. Save and Download Model

In [None]:
# Create a zip file of the model
import shutil
import json

# Save label mappings
with open('./best_model/label_mappings.json', 'w') as f:
    json.dump({
        'id2label': id2label,
        'label2id': label2id,
        'all_themes': all_themes,
        'class_weights': class_weights.cpu().tolist()
    }, f, indent=2)

print("Label mappings saved")

# Zip the model directory
shutil.make_archive('trained_model', 'zip', './best_model')
print("\nModel packaged as trained_model.zip")

# Download
files.download('trained_model.zip')
print("Model zip file ready for download")

## 19. Interactive Annotation Interface

In [None]:
# Interactive annotation function
def annotate_text_interactive(text: str, threshold: float = 0.25):
    """
    Annotate a single piece of text interactively
    """
    predictions = predict_themes([text], threshold=threshold)
    result = predictions[0]

    print("\n" + "="*80)
    print("TEXT:")
    print(text)
    print("\n" + "-"*80)
    print("PREDICTED THEMES:")
    if result['predicted_themes']:
        for theme in result['predicted_themes']:
            prob = result['probabilities'][theme]
            print(f"  • {theme}: {prob:.4f}")
    else:
        print("  No themes predicted above threshold")

    print("\n" + "-"*80)
    print("ALL THEME PROBABILITIES (Top 10):")
    for theme, prob in result['top_themes'][:10]:
        print(f"  {theme}: {prob:.4f}")
    print("="*80)

# Example usage - uncomment and modify to test
# annotate_text_interactive("The patient was not properly assessed for risk.")
print("Interactive annotation function ready. Use annotate_text_interactive('your text here') to test.")