# Install Dependencies

In [1]:
!pip install -q transformers datasets torch seqeval evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [3

# Env Variables

In [2]:
base_path = 'data/'
max_token_length = 1024

# Establish Google Drive Connection (if needed)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

base_path = 'drive/MyDrive/dataset/'

MessageError: Error: credential propagation was unsuccessful

# Imports

In [None]:
import json
import re
import numpy as np
import os
import math
from datasets import Dataset
from transformers import (
    LongformerTokenizerFast,
    LongformerForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    EarlyStoppingCallback
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
import torch
from collections import Counter
from typing import Dict, Any

# Helper Functions

## Load Data
Loads all json files in a specified path and combines them in one aggregated list

## Convert Numpy Floats

## Save Model Output

In [None]:
def load_json_data(folder_path):
    aggregated_data = []

    # loop through all files in the given folder
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            with open(f"{folder_path}/{file_name}", "r") as f:
                data = json.load(f)

            aggregated_data.append(data)

    return aggregated_data

# Convert NumPy float32 to native Python floats before JSON serialization
def convert_numpy_floats(obj):
    if isinstance(obj, np.float32):
        return float(obj)
    raise TypeError

def save_model_output(output, output_path):
    with open(output_path, "w") as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=convert_numpy_floats)
    print(f"Saved validation NER predictions to {output_path}")

def compute_f1(predictions_file_path, output_path, validation_dataset):
  # Load saved predictions
  with open(predictions_file_path, 'r') as f:
      saved_preds = json.load(f)

  # Prepare gold and predicted lists
  all_gold = []
  all_pred = []
  for pred in saved_preds:
      idx = pred['index']
      gold_entities = validation_dataset[idx]['entities']
      # flatten gold mentions: (mention_text, type)
      gold_set = set()
      for ent in gold_entities:
          for m in ent['mentions']:
              gold_set.add((m, ent['type']))
      # flatten predicted mentions: pipeline outputs 'word' and 'entity_group'
      pred_list = pred['predictions']
      pred_set = set()
      for p in pred_list:
          w = p.get('word')
          et = p.get('entity_group')
          pred_set.add((w, et))
      # Append to global lists
      all_gold.append(gold_set)
      all_pred.append(pred_set)

  # Compute micro-level counts
  tp = 0
  pred_count = 0
  gold_count = 0
  for gold_set, pred_set in zip(all_gold, all_pred):
      tp += len(gold_set & pred_set)
      pred_count += len(pred_set)
      gold_count += len(gold_set)

  precision = tp / pred_count if pred_count > 0 else 0.0
  recall = tp / gold_count if gold_count > 0 else 0.0
  f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

  # Print and save metrics
  metrics = {
      'precision': precision,
      'recall': recall,
      'f1': f1,
      'true_positives': tp,
      'predicted': pred_count,
      'gold': gold_count
  }
  print("NER Validation Mention-level Metrics:")
  print(metrics)

  # Save metrics to JSON
  with open(output_path, 'w') as f:
      json.dump(metrics, f, indent=2)
  print(f"Saved evaluation metrics to {output_path}")

# Load data into Memory

In [None]:
# Load JSON files and store them in memory
aggregated_data = []
folder_path = f'{base_path}raw/train'

# loop through all files in the given folder
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        with open(f"{folder_path}/{file_name}", "r") as f:
            data = json.load(f)

        for d in data:
          aggregated_data.append(d)

dataset = Dataset.from_list(aggregated_data)
print("Sample example:")
print(dataset[0])

# Model Initialization

In [None]:
model_name = 'allenai/longformer-base-4096'

# Prepare label mappings
entity_labels = dataset[0]['entity_label_set']  # list of entity types
label_list = ['O'] + [f"B-{l}" for l in entity_labels] + [f"I-{l}" for l in entity_labels]
print(label_list)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Tokenizer and model init
tokenizer = LongformerTokenizerFast.from_pretrained(
    model_name,
    max_length = max_token_length
)

model = LongformerForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Split Data into Validation and Training Data

In [None]:
# Function to tokenize and align labels
trunc_count = 0

whitespace = re.compile(r"\s")

def is_word_start(text: str, char_idx: int) -> bool:
    """
    Heuristic: a character is the start of a word if it is at position 0
    or the previous character is any whitespace.
    Works well for normal prose tokenised with a BPE WordPiece tokenizer.
    """
    return char_idx == 0 or bool(whitespace.match(text[char_idx - 1]))

def tokenize_and_align_labels(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    • Pads / truncates to `max_token_length`.
    • Sets label = -100 on:
        – special tokens ([CLS], [SEP], etc.)
        – all sub-tokens *except* the first piece of each word
        – all padding tokens
    • Emits exactly one label per word, using your B-/I- scheme.
    """

    encoding = tokenizer(
        example["doc"],
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=max_token_length,
    )

    # ------------------------------------------------------------------
    # 1. Init every position with ignore_index (-100)
    # ------------------------------------------------------------------
    labels = [-100] * len(encoding["input_ids"])
    doc_text = example["doc"]

    # ------------------------------------------------------------------
    # 2. Mark the first sub-token of every *word* as O
    # ------------------------------------------------------------------
    for idx, (off_start, off_end) in enumerate(encoding["offset_mapping"]):
        if off_start == off_end:          # special tokens → keep -100
            continue
        if is_word_start(doc_text, off_start):
            labels[idx] = label2id["O"]   # will be overwritten if it is an entity

    # ------------------------------------------------------------------
    # 3. Overwrite labels for entity mentions
    # ------------------------------------------------------------------
    for ent in example["entities"]:
        ent_type = ent["type"]            # e.g. "PERSON"
        for m_text in ent["mentions"]:
            for match in re.finditer(re.escape(m_text), doc_text):
                start_char, end_char = match.start(), match.end()

                for idx, (off_start, off_end) in enumerate(encoding["offset_mapping"]):
                    if off_start >= start_char and off_end <= end_char:
                        if off_start == start_char:
                            labels[idx] = label2id[f"B-{ent_type}"]
                        elif is_word_start(doc_text, off_start):
                            labels[idx] = label2id[f"I-{ent_type}"]
                        # every other sub-token stays -100

    # ------------------------------------------------------------------
    # 4. Remove the offsets (Trainer doesn’t need them) and attach labels
    # ------------------------------------------------------------------
    encoding.pop("offset_mapping")
    encoding["labels"] = labels
    return encoding

# Split original dataset into train and validation (preserve raw columns)
all_indices = list(range(len(dataset)))
train_idx, val_idx = train_test_split(all_indices, test_size=0.1, random_state=42)
train_orig = dataset.select(train_idx)
val_orig = dataset.select(val_idx)
print(f"Original train size: {len(train_orig)}, validation size: {len(val_orig)}")

# Tokenize & align labels separately, removing raw columns only from tokenized sets
train_tok = train_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
val_tok = val_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
print(f"Documents truncated in training: {trunc_count} / {len(train_tok)}")

# Use tokenized datasets for training and evaluation
train_ds = train_tok
val_ds = val_tok
print(f"Train set size: {len(train_ds)}, Validation set size: {len(val_ds)}")

# Baseline NER with Untrained Model

In [None]:
# Create NER pipeline using the fine-tuned model
ner_pipe_untrained = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_untrained(example['doc'])
    val_results.append({
        'index': idx,
        'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_untrained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_untrained_scores.json'
compute_f1(output_path, scores_path, val_orig)

# Model Training

In [42]:
def make_safe_weights(
    train_dataset: Dataset,
    label_column: str = "labels",
    o_label_id: int = 0,
    clip_range: tuple = (0.05, 5.0),
    o_label_weight: float = 0.10,
) -> torch.Tensor:
    """
    Build a class-weight tensor for token-level NER that

    1. is inverse-frequency based (rare labels ↑ weight);
    2. has mean weight = 1 (keeps the overall loss scale stable);
    3. is clipped to `clip_range` to avoid huge gradients;
    4. overwrites the 'O' label weight with `o_label_weight`.

    Parameters
    ----------
    train_dataset : datasets.Dataset
        Your training split after any `-100` masking.
    label_column : str
        Column that holds the integer tag sequences.
    o_label_id : int
        ID of the majority 'O' label (usually 0).
    clip_range : (float, float)
        Min / max weight allowed after normalisation.
    o_label_weight : float
        Final weight assigned to the 'O' label.

    Returns
    -------
    torch.Tensor  shape = (num_labels,)
    """

    # ------------------------------------------------------------------
    # Count how many *labelled* tokens of each class you have
    # ------------------------------------------------------------------
    counts: Counter[int] = Counter()
    for seq in train_dataset[label_column]:
        for lbl in seq:
            if lbl != -100:          # ignore the sub-token masks
                counts[lbl] += 1

    num_labels = max(counts) + 1     # assumes label ids are 0 … N-1

    # ------------------------------------------------------------------
    # Inverse-frequency weighting
    # ------------------------------------------------------------------
    total = sum(counts.values())
    inv_freq = {lbl: total / cnt for lbl, cnt in counts.items()}

    # ------------------------------------------------------------------
    # Normalise so that mean(weight)=1, then clip
    # ------------------------------------------------------------------
    mean_w = sum(inv_freq.values()) / len(inv_freq)
    weights = {}
    low, high = clip_range
    for lbl in range(num_labels):
        w = inv_freq.get(lbl, 1.0) / mean_w     # unseen lbls → weight 1
        w = max(low, min(w, high))              # clip to safe range
        weights[lbl] = w

    # ------------------------------------------------------------------
    # Down-weight the 'O' label explicitly
    # ------------------------------------------------------------------
    weights[o_label_id] = o_label_weight

    # ------------------------------------------------------------------
    # Build tensor
    # ------------------------------------------------------------------
    weight_vector = torch.tensor(
        [weights[i] for i in range(num_labels)],
        dtype=torch.float32
    )

    return weight_vector

In [30]:
class WeightedTrainer(Trainer):
    def __init__(self, *args, loss_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(
            weight=self.loss_weights.to(model.device),
            ignore_index=-100
        )
        # reshape to (batch_size*seq_len, num_labels)
        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [43]:
# Training arguments
train_batch_size = 2
gradient_accumulation_steps = 8
num_epochs = 5
learning_rate = 1e-5
total_steps = math.ceil(len(train_ds) / train_batch_size / gradient_accumulation_steps) * num_epochs
warmup_steps = int(total_steps * 0.1)
weight_vector = make_safe_weights(train_ds)
print(f"Using learning_rate={learning_rate}, batch_size={train_batch_size}, epochs={num_epochs}, warmup_steps={warmup_steps}, weight_decay=0.01")

"""
training_args = TrainingArguments(
    output_dir='./models/longformer-ner',
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    gradient_checkpointing=True,
    fp16=True,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)
"""

training_args = TrainingArguments(
    output_dir='./models/longformer-ner-hp',
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type='linear',
    warmup_ratio=0.2,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    max_grad_norm=1.0,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=1000,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    fp16=True,

)

# Metric computation
evaluator = evaluate.load('seqeval')

def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)
    true_labels = [[id2label[l] for l in label_seq if l != -100] for label_seq in labels]
    true_preds = [[id2label[p_] for (p_, l) in zip(pred_seq, label_seq) if l != -100]
                  for pred_seq, label_seq in zip(preds, labels)]
    results = evaluator.compute(predictions=true_preds, references=true_labels)
    return {
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1']
    }

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    loss_weights=weight_vector,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Using learning_rate=1e-05, batch_size=2, epochs=5, warmup_steps=30, weight_decay=0.01


  super().__init__(*args, **kwargs)


In [44]:
print(weight_vector)

tensor([0.1000, 0.0500, 0.0500, 1.9211, 1.3049, 0.0500, 5.0000, 0.0500, 1.2575,
        0.4405, 0.0500, 0.0637, 0.0500, 0.4269, 0.0500, 0.6916, 1.2350, 4.9401,
        0.5716, 0.0500, 0.2640, 0.0850, 0.7130, 0.7950, 0.1383, 5.0000, 0.8755,
        0.9741, 0.1462, 0.3047, 2.0958, 0.0500, 0.1695, 0.0597, 0.5320, 0.5812,
        3.6400, 0.2203, 0.0500])


In [46]:
trainer.train()

print(f"Total truncated documents: {trunc_count} / {len(dataset)}")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.998394,0.017717,0.002634,0.004586
2,1.149300,0.963859,0.018194,0.002685,0.004679
3,1.149300,,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

# NER on Validation Data

In [26]:
# Create NER pipeline using the fine-tuned model
ner_pipe_finetuned = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_finetuned(example['doc'])
    val_results.append({
        'index': idx,
        #'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_trained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_trained_scores.json'
compute_f1(output_path, scores_path, val_orig)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input ids are automatically padded to be a multiple of `config.attention_window`: 512
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saved validation NER predictions to drive/MyDrive/dataset/processed/ner_trained_predictions.json
NER Validation Mention-level Metrics:
{'precision': 0.0016970725498515061, 'recall': 0.007380073800738007, 'f1': 0.0027595722662987236, 'true_positives': 4, 'predicted': 2357, 'gold': 542}
Saved evaluation metrics to drive/MyDrive/dataset/processed/ner_trained_scores.json


In [47]:
train_ds.save_to_disk(f'{base_path}processed/train_ds')

Saving the dataset (0/1 shards):   0%|          | 0/945 [00:00<?, ? examples/s]