# Install Dependencies

In [1]:
!pip install -q transformers datasets torch seqeval evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m128.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [3

# Env Variables

In [2]:
base_path = 'data/'
max_token_length = 1024

# Establish Google Drive Connection (if needed)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

base_path = 'drive/MyDrive/dataset/'

Mounted at /content/drive


# Imports

In [4]:
import json
import re
import numpy as np
import os
import math
from datasets import Dataset
from transformers import (
    LongformerTokenizerFast,
    LongformerForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback,
    TrainerState,
    TrainerControl
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss

# Helper Functions

## Load Data
Loads all json files in a specified path and combines them in one aggregated list

## Convert Numpy Floats

## Save Model Output

In [5]:
def load_json_data(folder_path):
    aggregated_data = []

    # loop through all files in the given folder
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            with open(f"{folder_path}/{file_name}", "r") as f:
                data = json.load(f)

            aggregated_data.append(data)

    return aggregated_data

# Convert NumPy float32 to native Python floats before JSON serialization
def convert_numpy_floats(obj):
    if isinstance(obj, np.float32):
        return float(obj)
    raise TypeError

def save_model_output(output, output_path):
    with open(output_path, "w") as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=convert_numpy_floats)
    print(f"Saved validation NER predictions to {output_path}")

def compute_f1(predictions_file_path, output_path, validation_dataset):
  # Load saved predictions
  with open(predictions_file_path, 'r') as f:
      saved_preds = json.load(f)

  # Prepare gold and predicted lists
  all_gold = []
  all_pred = []
  for pred in saved_preds:
      idx = pred['index']
      gold_entities = validation_dataset[idx]['entities']
      # flatten gold mentions: (mention_text, type)
      gold_set = set()
      for ent in gold_entities:
          for m in ent['mentions']:
              gold_set.add((m, ent['type']))
      # flatten predicted mentions: pipeline outputs 'word' and 'entity_group'
      pred_list = pred['predictions']
      pred_set = set()
      for p in pred_list:
          w = p.get('word')
          et = p.get('entity_group')
          pred_set.add((w, et))
      # Append to global lists
      all_gold.append(gold_set)
      all_pred.append(pred_set)

  # Compute micro-level counts
  tp = 0
  pred_count = 0
  gold_count = 0
  for gold_set, pred_set in zip(all_gold, all_pred):
      tp += len(gold_set & pred_set)
      pred_count += len(pred_set)
      gold_count += len(gold_set)

  precision = tp / pred_count if pred_count > 0 else 0.0
  recall = tp / gold_count if gold_count > 0 else 0.0
  f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

  # Print and save metrics
  metrics = {
      'precision': precision,
      'recall': recall,
      'f1': f1,
      'true_positives': tp,
      'predicted': pred_count,
      'gold': gold_count
  }
  print("NER Validation Mention-level Metrics:")
  print(metrics)

  # Save metrics to JSON
  with open(output_path, 'w') as f:
      json.dump(metrics, f, indent=2)
  print(f"Saved evaluation metrics to {output_path}")

# Load data into Memory

In [6]:
# Load JSON files and store them in memory
aggregated_data = []
folder_path = f'{base_path}raw/train'

# loop through all files in the given folder
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        with open(f"{folder_path}/{file_name}", "r") as f:
            data = json.load(f)

        for d in data:
          aggregated_data.append(d)

dataset = Dataset.from_list(aggregated_data)
print("Sample example:")
print(dataset[0])

Sample example:
{'domain': 'Energy', 'title': 'Advanced_thermal_recycling_system', 'doc': 'An advanced thermal recycling system (or an ATR system) is the commercial brand name of the waste-to-energy incineration offering by Klean Power, which has been implemented in a single plant in Germany in 1999. WtE facilities such as the ATR transforms municipal solid waste (MSW) into electricity or steam for district heating or industrial customers. The combustion bottom ash, and the combustion fly ash, along with the air pollution control system fly ash, are treated to produce products that can be beneficially reused. Specifically, ATR systems consist of the following:\nSolid waste combustion, boiler and combustion control system, energy recovery and air pollution control equipment;\nCombustion bottom ash and fly ash treatment systems that produce commercially reusable products; and\nAn optional pre-processing system to recover recyclable materials contained in the MSW delivered to the facility

# Model Initialization

In [7]:
model_name = 'allenai/longformer-base-4096'

# Prepare label mappings
entity_labels = dataset[0]['entity_label_set']  # list of entity types
label_list = ['O'] + [f"B-{l}" for l in entity_labels] + [f"I-{l}" for l in entity_labels]
print(label_list)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Tokenizer and model init
tokenizer = LongformerTokenizerFast.from_pretrained(
    model_name,
    max_length = max_token_length
)

model = LongformerForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

['O', 'B-CARDINAL', 'B-DATE', 'B-EVENT', 'B-FAC', 'B-GPE', 'B-LANGUAGE', 'B-LAW', 'B-LOC', 'B-MONEY', 'B-NORP', 'B-ORDINAL', 'B-ORG', 'B-PERCENT', 'B-PERSON', 'B-PRODUCT', 'B-QUANTITY', 'B-TIME', 'B-WORK_OF_ART', 'B-MISC', 'I-CARDINAL', 'I-DATE', 'I-EVENT', 'I-FAC', 'I-GPE', 'I-LANGUAGE', 'I-LAW', 'I-LOC', 'I-MONEY', 'I-NORP', 'I-ORDINAL', 'I-ORG', 'I-PERCENT', 'I-PERSON', 'I-PRODUCT', 'I-QUANTITY', 'I-TIME', 'I-WORK_OF_ART', 'I-MISC']


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Split Data into Validation and Training Data

In [8]:
# Function to tokenize and align labels
trunc_count = 0

def tokenize_and_align_labels(example):
    global trunc_count
    encoding = tokenizer(
        example['doc'],
        return_offsets_mapping=True,
        padding='max_length',
        truncation=True,
        max_length=max_token_length
    )
    if len(encoding['input_ids']) == max_token_length:
        trunc_count += 1
    labels = [label2id['O']] * len(encoding['input_ids'])
    doc_text = example['doc']
    for ent in example['entities']:
        ent_type = ent['type']
        for m_text in ent['mentions']:
            # find all occurrences of mention string
            for match in re.finditer(re.escape(m_text), doc_text):
                start_char, end_char = match.start(), match.end()
                for idx, (off_start, off_end) in enumerate(encoding['offset_mapping']):
                    if off_start >= start_char and off_end <= end_char:
                        prefix = 'B-' if off_start == start_char else 'I-'
                        labels[idx] = label2id.get(f"{prefix}{ent_type}", label2id['O'])
    encoding.pop('offset_mapping')
    encoding['labels'] = labels
    return encoding

# Split original dataset into train and validation (preserve raw columns)
all_indices = list(range(len(dataset)))
train_idx, val_idx = train_test_split(all_indices, test_size=0.1, random_state=42)
train_orig = dataset.select(train_idx)
val_orig = dataset.select(val_idx)
print(f"Original train size: {len(train_orig)}, validation size: {len(val_orig)}")

# Tokenize & align labels separately, removing raw columns only from tokenized sets
train_tok = train_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
val_tok = val_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=['domain','title','doc','triples','entities','label_set','entity_label_set']
)
print(f"Documents truncated in training: {trunc_count} / {len(train_tok)}")

# Use tokenized datasets for training and evaluation
train_ds = train_tok
val_ds = val_tok
print(f"Train set size: {len(train_ds)}, Validation set size: {len(val_ds)}")

Original train size: 945, validation size: 106


Map:   0%|          | 0/945 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Documents truncated in training: 1051 / 945
Train set size: 945, Validation set size: 106


# Baseline NER with Untrained Model

In [9]:
# Create NER pipeline using the fine-tuned model
ner_pipe_untrained = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_untrained(example['doc'])
    val_results.append({
        'index': idx,
        'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_untrained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_untrained_scores.json'
compute_f1(output_path, scores_path, val_orig)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Input ids are automatically padded to be a multiple of `config.attention_window`: 512
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saved validation NER predictions to drive/MyDrive/dataset/processed/ner_untrained_predictions.json
NER Validation Mention-level Metrics:
{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'true_positives': 0, 'predicted': 5133, 'gold': 542}
Saved evaluation metrics to drive/MyDrive/dataset/processed/ner_untrained_scores.json


# Model Training

In [None]:
all_labels = []
for ex in train_ds:
    for lbl in ex["labels"]:
        if lbl != -100:
            all_labels.append(lbl)

classes = sorted(set(all_labels))
cw = compute_class_weight(class_weight="balanced", classes=classes, y=all_labels)

# build a full-weight vector (default=1.0) then replace for our classes
weight_vector = torch.ones(len(label_list), dtype=torch.float)
for cls_id, w in zip(classes, cw):
    weight_vector[cls_id] = w

In [None]:
class WeightedTrainer(Trainer):
    def __init__(self, *args, loss_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(
            weight=self.loss_weights.to(model.device),
            ignore_index=-100
        )
        # reshape to (batch_size*seq_len, num_labels)
        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [10]:
# Training arguments
train_batch_size = 2
gradient_accumulation_steps = 8
num_epochs = 10
learning_rate = 2e-5
total_steps = math.ceil(len(train_ds) / train_batch_size / gradient_accumulation_steps) * num_epochs
warmup_steps = int(total_steps * 0.1)
print(f"Using learning_rate={learning_rate}, batch_size={train_batch_size}, epochs={num_epochs}, warmup_steps={warmup_steps}, weight_decay=0.01")

"""
training_args = TrainingArguments(
    output_dir='./models/longformer-ner',
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    gradient_checkpointing=True,
    fp16=True,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)
"""

training_args = TrainingArguments(
    output_dir='./models/longformer-ner-hp',
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=1000,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    fp16=True,

)

# Metric computation
evaluator = evaluate.load('seqeval')

def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)
    true_labels = [[id2label[l] for l in label_seq if l != -100] for label_seq in labels]
    true_preds = [[id2label[p_] for (p_, l) in zip(pred_seq, label_seq) if l != -100]
                  for pred_seq, label_seq in zip(preds, labels)]
    results = evaluator.compute(predictions=true_preds, references=true_labels)
    return {
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1']
    }

class StopOnZeroLossCallback(TrainerCallback):
    """Stop training when training loss stays at 0 for more than one logging step."""
    def __init__(self, patience=1):
        self.patience = patience
        self.counter = 0
        self.last_loss = None

    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        loss = logs.get('loss')
        if loss is not None:
            if loss == 0:
                if self.last_loss == 0:
                    self.counter += 1
                else:
                    self.counter = 1
                self.last_loss = loss
                if self.counter > self.patience:
                    print("Stopping training as loss has remained at 0 for {} steps.".format(self.counter))
                    control.should_training_stop = True
            else:
                # reset counter if loss > 0
                self.counter = 0
                self.last_loss = loss
        return control

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    loss_weights=weight_vector,
    callbacks=[StopOnZeroLossCallback(patience=1)]
)

Using learning_rate=1e-06, batch_size=2, epochs=10, warmup_steps=60, weight_decay=0.01


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


In [11]:
trainer.train()

print(f"Total truncated documents: {trunc_count} / {len(dataset)}")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdaniellocher6[0m ([33mdaniellocher6-university-of-st-gallen-student-union-shsg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.5226,1.914421,0.004118,0.001216,0.001877
2,1.7464,1.062556,0.030612,0.000304,0.000602
3,1.0634,0.928535,0.0,0.0,0.0
4,1.0158,0.89682,0.0,0.0,0.0
5,0.9298,0.883213,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

# NER on Validation Data

In [12]:
# Create NER pipeline using the fine-tuned model
ner_pipe_finetuned = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy='simple'
)

# Run NER on validation documents and collect aggregated results
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_finetuned(example['doc'])
    val_results.append({
        'index': idx,
        #'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

# Save to JSON in Google Drive folder
output_path = f'{base_path}processed/ner_trained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_trained_scores.json'
compute_f1(output_path, scores_path, val_orig)

Device set to use cuda:0


Saved validation NER predictions to drive/MyDrive/dataset/processed/ner_trained_predictions.json
NER Validation Mention-level Metrics:
{'precision': 0.06829268292682927, 'recall': 0.06334841628959276, 'f1': 0.06572769953051644, 'true_positives': 14, 'predicted': 205, 'gold': 221}
Saved evaluation metrics to drive/MyDrive/dataset/processed/ner_trained_scores.json


In [16]:
from datasets import DatasetDict

print(val_ds)

raw_ds = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': val_ds
})
print(dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 56
})
Dataset({
    features: ['domain', 'title', 'doc', 'entities', 'triples', 'label_set', 'entity_label_set'],
    num_rows: 551
})
