# Install Dependencies

In [1]:
!pip install -q transformers datasets torch seqeval evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m127.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36

# Env Variables

In [2]:
base_path = 'data/'
max_token_length = 1024

# Establish Google Drive Connection (if needed)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

base_path = 'drive/MyDrive/NLP_project_data/'

Mounted at /content/drive


# Imports

In [4]:
import json
import re
import numpy as np
import os
import math
from datasets import Dataset
from transformers import (
    LongformerTokenizerFast,
    LongformerForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
import torch
from collections import Counter
import itertools
from typing import Dict, Any
import re, string

# Helper Functions

## Load Data
Loads all json files in a specified path and combines them in one aggregated list

## Convert Numpy Floats

## Save Model Output

In [5]:
def load_json_data(folder_path):
    aggregated_data = []

    # loop through all files in the given folder
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            with open(f"{folder_path}/{file_name}", "r") as f:
                data = json.load(f)

            aggregated_data.append(data)

    return aggregated_data

# Convert NumPy float32 to native Python floats before JSON serialization
def convert_numpy_floats(obj):
    if isinstance(obj, np.float32):
        return float(obj)
    raise TypeError

def save_model_output(output, output_path):
    with open(output_path, "w") as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=convert_numpy_floats)
    print(f"Saved validation NER predictions to {output_path}")

def compute_f1(predictions_file_path, output_path, validation_dataset):
  # Load saved predictions
  with open(predictions_file_path, 'r') as f:
      saved_preds = json.load(f)

  # Prepare gold and predicted lists
  all_gold = []
  all_pred = []
  for pred in saved_preds:
      idx = pred['index']
      gold_entities = validation_dataset[idx]['entities']
      # flatten gold mentions: (mention_text, type)
      gold_set = set()
      for ent in gold_entities:
          for m in ent['mentions']:
              gold_set.add((m, ent['type']))
      # flatten predicted mentions: pipeline outputs 'word' and 'entity_group'
      pred_list = pred['predictions']
      pred_set = set()
      for p in pred_list:
          w = p.get('word').lstrip()
          et = p.get('entity_group')
          pred_set.add((w, et))
      # Append to global lists
      all_gold.append(gold_set)
      all_pred.append(pred_set)

  # Compute micro-level counts
  tp = 0
  pred_count = 0
  gold_count = 0
  for gold_set, pred_set in zip(all_gold, all_pred):
      tp += len(gold_set & pred_set)
      pred_count += len(pred_set)
      gold_count += len(gold_set)

  precision = tp / pred_count if pred_count > 0 else 0.0
  recall = tp / gold_count if gold_count > 0 else 0.0
  f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

  # Print and save metrics
  metrics = {
      'precision': precision,
      'recall': recall,
      'f1': f1,
      'true_positives': tp,
      'predicted': pred_count,
      'gold': gold_count
  }
  print("NER Validation Mention-level Metrics:")
  print(metrics)

  # Save metrics to JSON
  with open(output_path, 'w') as f:
      json.dump(metrics, f, indent=2)
  print(f"Saved evaluation metrics to {output_path}")

# Load data into Memory

In [6]:
# Load JSON files and store them in memory
aggregated_data = []
folder_path = f'{base_path}raw/train'

# loop through all files in the given folder
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        with open(f"{folder_path}/{file_name}", "r") as f:
            data = json.load(f)

        for d in data:
          if len(d["entities"])>0:
            aggregated_data.append(d)



dataset = Dataset.from_list(aggregated_data)
print("Sample example:")
print(dataset[0])

Sample example:
{'domain': 'Energy', 'title': 'Advanced_thermal_recycling_system', 'doc': 'An advanced thermal recycling system (or an ATR system) is the commercial brand name of the waste-to-energy incineration offering by Klean Power, which has been implemented in a single plant in Germany in 1999. WtE facilities such as the ATR transforms municipal solid waste (MSW) into electricity or steam for district heating or industrial customers. The combustion bottom ash, and the combustion fly ash, along with the air pollution control system fly ash, are treated to produce products that can be beneficially reused. Specifically, ATR systems consist of the following:\nSolid waste combustion, boiler and combustion control system, energy recovery and air pollution control equipment;\nCombustion bottom ash and fly ash treatment systems that produce commercially reusable products; and\nAn optional pre-processing system to recover recyclable materials contained in the MSW delivered to the facility

# Model Initialization

In [7]:
model_name = 'allenai/longformer-base-4096'

# Prepare label mappings
entity_labels = dataset[0]['entity_label_set']  # list of entity types
label_list = ['O'] + [f"B-{l}" for l in entity_labels] + [f"I-{l}" for l in entity_labels]
print(label_list)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Tokenizer and model init
tokenizer = LongformerTokenizerFast.from_pretrained(
    model_name,
    max_length = max_token_length
)

model = LongformerForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

['O', 'B-CARDINAL', 'B-DATE', 'B-EVENT', 'B-FAC', 'B-GPE', 'B-LANGUAGE', 'B-LAW', 'B-LOC', 'B-MONEY', 'B-NORP', 'B-ORDINAL', 'B-ORG', 'B-PERCENT', 'B-PERSON', 'B-PRODUCT', 'B-QUANTITY', 'B-TIME', 'B-WORK_OF_ART', 'B-MISC', 'I-CARDINAL', 'I-DATE', 'I-EVENT', 'I-FAC', 'I-GPE', 'I-LANGUAGE', 'I-LAW', 'I-LOC', 'I-MONEY', 'I-NORP', 'I-ORDINAL', 'I-ORG', 'I-PERCENT', 'I-PERSON', 'I-PRODUCT', 'I-QUANTITY', 'I-TIME', 'I-WORK_OF_ART', 'I-MISC']


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Split Data into Validation and Training Data

In [8]:
# Function to tokenize and align labels
trunc_count = 0

word_boundary = re.compile(r"\w")
punct = set(string.punctuation) - {"-"}

def tokenize_and_align_labels(example):
    # Tokenise the raw document and keep character offsets
    enc = tokenizer(
        example["doc"],
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_token_length,
    )
    offsets   = enc.pop("offset_mapping")      # [(char_start, char_end), …]
    word_ids  = enc.word_ids()                 # per-token → word-index (or None)

    # Prepare one BIO label per *word*, initialised to "O"
    n_words   = max(w for w in word_ids if w is not None) + 1
    word_tags = ["O"] * n_words
    used_wids = set()                          # remember word-ids already labelled

    text = example["doc"]

    # Loop over every annotated entity and its surface mentions
    for ent in example["entities"]:
        ent_type = ent["type"]
        for mention in ent["mentions"]:

            # ----- guard: ignore very short, non-acronym mentions --------------
            if len(mention) < 3 and not mention.isupper():
                continue

            # ------------------------------------------------------------------
            #  GUARD A   (no bleed across word chars, hyphens or apostrophes)
            # ------------------------------------------------------------------
            pattern = r"(?<![\w-])" + re.escape(mention) + r"(?![\w-])"

            for m in re.finditer(pattern, text):
                s, e = m.span()

                # map this char-span to the set of word-ids it overlaps
                covered = {
                    wid for tidx, (cs, ce) in enumerate(offsets)
                    if (cs < e and ce > s) and (wid := word_ids[tidx]) is not None
                }

                # ------------------------------------------------------------------
                #  GUARD B   (skip if any word already labelled by another entity)
                # ------------------------------------------------------------------
                if not covered or used_wids.intersection(covered):
                    continue

                # ------------------------------------------------------------------
                #  GUARD C   (discard matches that start/end on punctuation)
                # ------------------------------------------------------------------
                if text[s] in punct or text[e - 1] in punct:
                    continue

                # assign BIO tags to the *new* span
                first, *rest = sorted(covered)
                word_tags[first] = f"B-{ent_type}"
                for wid in rest:
                    word_tags[wid] = f"I-{ent_type}"
                used_wids.update(covered)

    # 4️⃣  Expand word-level tags back to token-level labels
    labels, prev_wid = [], None
    for wid in word_ids:
        if wid is None:
            labels.append(-100)                # special token or padding
        elif wid != prev_wid:                  # first sub-token of a word
            labels.append(label2id[word_tags[wid]])
        else:                                  # continuation sub-token
            labels.append(-100)
        prev_wid = wid

    enc["labels"] = labels
    return enc


# Split original dataset into train and validation (preserve raw columns)
all_indices = list(range(len(dataset)))
train_idx, val_idx = train_test_split(all_indices, test_size=0.1, random_state=42)
train_orig = dataset.select(train_idx)
val_orig = dataset.select(val_idx)
print(f"Original train size: {len(train_orig)}, validation size: {len(val_orig)}")

# Tokenize & align labels separately, removing raw columns only from tokenized sets
train_tok = train_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
val_tok = val_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
print(f"Documents truncated in training: {trunc_count} / {len(train_tok)}")

# Use tokenized datasets for training and evaluation
train_ds = train_tok
val_ds = val_tok
print(f"Train set size: {len(train_ds)}, Validation set size: {len(val_ds)}")

Original train size: 29842, validation size: 3316


Map:   0%|          | 0/29842 [00:00<?, ? examples/s]

Map:   0%|          | 0/3316 [00:00<?, ? examples/s]

Documents truncated in training: 0 / 29842
Train set size: 29842, Validation set size: 3316


# Baseline NER with Untrained Model

In [None]:
# Create NER pipeline using the fine-tuned model
ner_pipe_untrained = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_untrained(example['doc'])
    val_results.append({
        'index': idx,
        'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_untrained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_untrained_scores.json'
compute_f1(output_path, scores_path, val_orig)

# Model Training

In [None]:
def make_safe_weights(
    train_dataset: Dataset,
    label_column: str = "labels",
    o_label_id: int = 0,
    clip_range: tuple = (0.05, 3.0),
    o_label_weight: float = 0.25,
) -> torch.Tensor:
    """
    Build a class-weight tensor for token-level NER that

    1. is inverse-frequency based (rare labels ↑ weight);
    2. has mean weight = 1 (keeps the overall loss scale stable);
    3. is clipped to `clip_range` to avoid huge gradients;
    4. overwrites the 'O' label weight with `o_label_weight`.

    Parameters
    ----------
    train_dataset : datasets.Dataset
        Your training split after any `-100` masking.
    label_column : str
        Column that holds the integer tag sequences.
    o_label_id : int
        ID of the majority 'O' label (usually 0).
    clip_range : (float, float)
        Min / max weight allowed after normalisation.
    o_label_weight : float
        Final weight assigned to the 'O' label.

    Returns
    -------
    torch.Tensor  shape = (num_labels,)
    """

    # ------------------------------------------------------------------
    # Count how many *labelled* tokens of each class you have
    # ------------------------------------------------------------------
    counts: Counter[int] = Counter()
    for seq in train_dataset[label_column]:
        for lbl in seq:
            if lbl != -100:          # ignore the sub-token masks
                counts[lbl] += 1

    num_labels = max(counts) + 1     # assumes label ids are 0 … N-1

    # ------------------------------------------------------------------
    # Inverse-frequency weighting
    # ------------------------------------------------------------------
    total = sum(counts.values())
    inv_freq = {lbl: total / cnt for lbl, cnt in counts.items()}

    # ------------------------------------------------------------------
    # Normalise so that mean(weight)=1, then clip
    # ------------------------------------------------------------------
    mean_w = sum(inv_freq.values()) / len(inv_freq)
    weights = {}
    low, high = clip_range
    for lbl in range(num_labels):
        w = inv_freq.get(lbl, 1.0) / mean_w     # unseen lbls → weight 1
        w = max(low, min(w, high))              # clip to safe range
        weights[lbl] = w

    # ------------------------------------------------------------------
    # Down-weight the 'O' label explicitly
    # ------------------------------------------------------------------
    weights[o_label_id] = o_label_weight

    # ------------------------------------------------------------------
    # Build tensor
    # ------------------------------------------------------------------
    weight_vector = torch.tensor(
        [weights[i] for i in range(num_labels)],
        dtype=torch.float32
    )

    return weight_vector

In [None]:
class WeightedTrainer(Trainer):
    def __init__(self, *args, loss_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(
            weight=self.loss_weights.to(model.device),
            ignore_index=-100
        )
        # reshape to (batch_size*seq_len, num_labels)
        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers.trainer_callback import TrainerState

class StepPrinter(TrainerCallback):
    def on_step_end(self, args, state: TrainerState, control, **kw):
        if state.global_step % 10 == 0:
            print("step", state.global_step, "lr", trainer.optimizer.param_groups[0]['lr'])



In [None]:
def get_param_groups(model, base_lr=3e-5, decay=0.95):
    groups = []
    for n, p in model.named_parameters():
        depth = n.count("encoder.layer")    # 0 for top, 11 for bottom
        lr = base_lr * (decay ** depth)
        groups.append({"params": [p], "lr": lr})
    return groups


In [9]:
# Training arguments
train_batch_size = 2
gradient_accumulation_steps = 16
num_epochs = 30
learning_rate = 1e-5
total_steps = math.ceil(len(train_ds) / train_batch_size / gradient_accumulation_steps) * num_epochs
warmup_steps = int(total_steps * 0.1)
#weight_vector = make_safe_weights(train_ds)
print(f"Using learning_rate={learning_rate}, batch_size={train_batch_size}, epochs={num_epochs}, warmup_steps={warmup_steps}, weight_decay=0.01")

training_args = TrainingArguments(
    output_dir="longformer-ner-clean_full_a100_cleands",
    num_train_epochs        = 12,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size  = 64,
    gradient_accumulation_steps = 2,   # 16 k tokens / update
    learning_rate           = 1e-5,
    warmup_ratio            = 0.1,
    lr_scheduler_type       = "linear",
    weight_decay            = 0.01,
    fp16                    = True,
    gradient_checkpointing  = True,
    eval_strategy     = "steps",     # ← simpler & larger dev sample
    eval_steps = 500,
    logging_steps           = 100,          # ← granular train-loss
    save_strategy           = "steps",
    save_total_limit        = 2,
    load_best_model_at_end  = True,
    metric_for_best_model   = "f1",
    greater_is_better       = True,
    group_by_length          = True,
    seed                    = 42,
)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

collator = DataCollatorForTokenClassification(
    tokenizer,
    pad_to_multiple_of=None,        # dynamic
    return_tensors="pt",
)


# Metric computation
evaluator = evaluate.load('seqeval')

def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)
    true_labels = [[id2label[l] for l in label_seq if l != -100] for label_seq in labels]
    true_preds = [[id2label[p_] for (p_, l) in zip(pred_seq, label_seq) if l != -100]
                  for pred_seq, label_seq in zip(preds, labels)]
    results = evaluator.compute(predictions=true_preds, references=true_labels)
    return {
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1']
    }


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=collator,
    #loss_weights=weight_vector,
    #optimizers=(optimizer, None),
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Using learning_rate=1e-05, batch_size=2, epochs=30, warmup_steps=2799, weight_decay=0.01


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


In [None]:
print(weight_vector)

In [10]:
trainer.train()

print(f"Total truncated documents: {trunc_count} / {len(dataset)}")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdaniellocher6[0m ([33mdaniellocher6-university-of-st-gallen-student-union-shsg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# NER on Validation Data

In [None]:
# Create NER pipeline using the fine-tuned model
ner_pipe_finetuned = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_finetuned(example['doc'])
    val_results.append({
        'index': idx,
        #'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_trained_predictions_full_cleands.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_trained_scores_full_cleands.json'
compute_f1(output_path, scores_path, val_orig)

In [None]:
model.save_pretrained(f'{base_path}models/longformer-final-cleands');