# Install Dependencies

In [None]:
# Notebook shell command: quietly (-q) install the third-party packages this notebook relies on.
!pip install -q transformers datasets torch seqeval evaluate

# Env Variables

In [None]:
# Root directory prefix for all data paths built in this notebook
# (reassigned by the Google Drive cell when that cell is run).
base_path = 'data/'
# Maximum number of tokens kept per document when tokenizing.
max_token_length = 1024

# Establish Google Drive Connection (if needed)

In [None]:
# Colab only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Point all data paths at the project folder on the mounted drive.
# NOTE(review): this is a relative path - presumably resolved against /content; confirm the working directory.
base_path = 'drive/MyDrive/NLP_project_data/'

# Imports

In [None]:
import json
import re
import numpy as np
import os
import math
from datasets import Dataset
from transformers import (
    LongformerTokenizerFast,
    LongformerForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
import torch
from collections import Counter
import itertools
from typing import Dict, Any
import re, string

# Helper Functions

In [None]:
def load_json_data(folder_path):
    """Load every file found under ``folder_path`` as JSON.

    The directory tree is walked recursively, so files in nested
    sub-directories are loaded as well.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list holding one parsed JSON payload per file, in the order
        ``os.walk`` yields the files. Empty if no files are found.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    aggregated_data = []

    for root, _dirs, files in os.walk(folder_path):
        for file_name in files:
            # Join with ``root`` rather than ``folder_path``: os.walk reports
            # file names relative to the directory currently being visited.
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                aggregated_data.append(json.load(f))

    return aggregated_data

# Convert NumPy scalars to native Python numbers before JSON serialization
def convert_numpy_floats(obj):
    """``json.dump`` ``default=`` hook that unwraps NumPy scalar values.

    Args:
        obj: The object the JSON encoder could not serialize on its own.

    Returns:
        The equivalent built-in ``float``, ``int`` or ``bool``.

    Raises:
        TypeError: If ``obj`` is not a NumPy float, integer or boolean scalar,
            which is how the JSON encoder expects unsupported values to be
            reported.
    """
    # ``.item()`` yields the matching built-in type for any scalar width
    # (float16/32/64, int32/64, ...), not just float32.
    if isinstance(obj, (np.floating, np.integer, np.bool_)):
        return obj.item()
    raise TypeError(
        f'Object of type {type(obj).__name__} is not JSON serializable'
    )

def save_model_output(output, output_path):
    """Write ``output`` to ``output_path`` as indented JSON.

    NumPy scalars inside ``output`` are converted through
    ``convert_numpy_floats``. The parent directory is created when missing.

    Args:
        output: JSON-serializable structure to save.
        output_path: Destination file path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of depending on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=convert_numpy_floats)
    print(f'Saved validation NER predictions to {output_path}')

def compute_f1(predictions_file_path, output_path, validation_dataset):
    """Score saved NER predictions against gold mentions and save the metrics.

    Matching is exact: a prediction counts as a true positive when its
    ``(word, entity_group)`` pair equals a gold ``(mention, type)`` pair of the
    same document. Pairs are de-duplicated per document, and counts are summed
    over all documents before precision/recall/F1 are computed.

    Args:
        predictions_file_path: JSON file containing a list of records. Each
            record has an ``'index'`` into ``validation_dataset`` and the
            predicted entities under ``'predictions'`` (or ``'entities'``).
        output_path: Where the metrics dictionary is written as JSON.
        validation_dataset: Indexable collection whose items expose
            ``'entities'``, each with a ``'type'`` and a list of ``'mentions'``.

    Returns:
        The metrics dictionary that was written to ``output_path``.
    """
    with open(predictions_file_path, 'r', encoding='utf-8') as f:
        saved_preds = json.load(f)

    tp = 0
    pred_count = 0
    gold_count = 0
    for pred in saved_preds:
        gold_entities = validation_dataset[pred['index']]['entities']
        gold_set = {
            (mention, ent['type'])
            for ent in gold_entities
            for mention in ent['mentions']
        }

        # The baseline cell stores pipeline output under 'predictions' while
        # the fine-tuned cell stores it under 'entities'; accept either.
        pred_list = pred['predictions'] if 'predictions' in pred else pred['entities']
        pred_set = {
            # Strip leading whitespace from the predicted word before comparing;
            # a missing word is treated as empty.
            ((p.get('word') or '').lstrip(), p.get('entity_group'))
            for p in pred_list
        }

        tp += len(gold_set & pred_set)
        pred_count += len(pred_set)
        gold_count += len(gold_set)

    precision = tp / pred_count if pred_count > 0 else 0.0
    recall = tp / gold_count if gold_count > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

    metrics = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'true_positives': tp,
        'predicted': pred_count,
        'gold': gold_count
    }
    print('NER Validation Mention-level Metrics:')
    print(metrics)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2)
    print(f'Saved evaluation metrics to {output_path}')

    return metrics

# Load data into Memory

In [None]:
# Collect every training document that has at least one annotated entity.
aggregated_data = []
folder_path = f'{base_path}raw/train'

for root, _dirs, files in os.walk(folder_path):
    for file_name in files:
        # Join with ``root`` so files inside nested directories resolve correctly.
        with open(os.path.join(root, file_name), 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Each file holds a list of documents; drop those without entities.
        aggregated_data.extend(d for d in data if len(d['entities']) > 0)

if not aggregated_data:
    # Fail here with a clear message instead of an IndexError on dataset[0].
    raise FileNotFoundError(f'No documents with entities found under {folder_path}')

dataset = Dataset.from_list(aggregated_data)
print('Sample example:')
print(dataset[0])

# Model Initialization

In [None]:
model_name = 'allenai/longformer-base-4096'

# Build the BIO tag inventory from the entity types listed on the first
# document: 'O', then every B- tag, then every I- tag.
# NOTE(review): assumes all documents share the same 'entity_label_set' - confirm.
entity_labels = dataset[0]['entity_label_set']
label_list = [
    'O',
    *(f'B-{entity_type}' for entity_type in entity_labels),
    *(f'I-{entity_type}' for entity_type in entity_labels),
]
print(label_list)

# Forward and reverse lookups between tag names and class ids.
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict((idx, tag) for tag, idx in label2id.items())

# NOTE(review): `max_length` is forwarded to the tokenizer as-is; confirm it has
# the intended effect here (tokenization below passes max_length explicitly).
tokenizer = LongformerTokenizerFast.from_pretrained(
    model_name, max_length=max_token_length
)

# Pretrained encoder with a token-classification head sized to the tag set.
model = LongformerForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize, Align Labels, and Split Data into Training and Validation Sets

In [None]:
# Counter reported after tokenization; nothing in this cell updates it.
trunc_count = 0

# Not referenced by the code in this cell.
word_boundary = re.compile(r'\w')
# Punctuation that disqualifies a match when it is the first or last
# character of the matched text; hyphens are allowed.
punct = set(string.punctuation) - {'-'}


def tokenize_and_align_labels(example):
    """Tokenize one document and attach BIO label ids aligned to its tokens.

    Gold mentions are located in the raw text by exact string search (not
    adjacent to word characters or hyphens). Every word touched by a match is
    tagged: ``B-<type>`` for the first word, ``I-<type>`` for the rest. A match
    is ignored when it

    * comes from a mention shorter than 3 characters that is not all upper-case,
    * starts or ends with punctuation other than ``-``,
    * covers no token (e.g. it lies in the truncated part of the document), or
    * overlaps a word that an earlier match already tagged.

    Only the first sub-token of each word receives a label id; special tokens
    and continuation sub-tokens get ``-100``.

    Args:
        example: Mapping with ``'doc'`` (text) and ``'entities'`` (each with a
            ``'type'`` and a list of ``'mentions'``).

    Returns:
        The tokenizer encoding with an added ``'labels'`` list.
    """
    text = example['doc']
    enc = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_token_length,
    )
    offsets = enc.pop('offset_mapping')
    word_ids = enc.word_ids()

    # default=-1 yields zero words for a document that produced only special
    # tokens, instead of raising ValueError from max() on an empty sequence.
    n_words = max((w for w in word_ids if w is not None), default=-1) + 1
    word_tags = ['O'] * n_words
    used_wids = set()

    # Character span and word id of every non-special token, computed once
    # rather than re-filtered for every regex match.
    token_spans = [
        (cs, ce, wid)
        for (cs, ce), wid in zip(offsets, word_ids)
        if wid is not None
    ]

    for ent in example['entities']:
        ent_type = ent['type']
        for mention in ent['mentions']:

            # Very short mentions are kept only when fully upper-case.
            if len(mention) < 3 and not mention.isupper():
                continue

            # The match must not be glued to a word character or a hyphen.
            pattern = r'(?<![\w-])' + re.escape(mention) + r'(?![\w-])'

            for m in re.finditer(pattern, text):
                s, e = m.span()

                # Cheap check first: skip before scanning the token spans.
                if text[s] in punct or text[e - 1] in punct:
                    continue

                # Words having at least one token overlapping [s, e).
                covered = {
                    wid for cs, ce, wid in token_spans
                    if cs < e and ce > s
                }

                if not covered or used_wids.intersection(covered):
                    continue

                first, *rest = sorted(covered)
                word_tags[first] = f'B-{ent_type}'
                for wid in rest:
                    word_tags[wid] = f'I-{ent_type}'
                used_wids.update(covered)

    labels, prev_wid = [], None
    for wid in word_ids:
        if wid is None:
            labels.append(-100)
        elif wid != prev_wid:
            # First sub-token of a word carries the word's tag.
            labels.append(label2id[word_tags[wid]])
        else:
            labels.append(-100)
        prev_wid = wid

    enc['labels'] = labels
    return enc


# Hold out 10% of the documents for validation (fixed seed).
all_indices = list(range(len(dataset)))
train_idx, val_idx = train_test_split(all_indices, test_size=0.1, random_state=42)
train_orig = dataset.select(train_idx)
val_orig = dataset.select(val_idx)
print(f'Original train size: {len(train_orig)}, validation size: {len(val_orig)}')

# Raw columns dropped after tokenization so only the encoding and labels remain.
raw_columns = ['domain','title','doc','triples','entities','label_set','entity_label_set']

train_tok = train_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=raw_columns
)
val_tok = val_orig.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=raw_columns
)

# The counter was never updated during tokenization, so the report always
# showed 0. Derive it from the encodings instead: a truncated document has
# exactly max_token_length tokens. (A document that is exactly that long
# without truncation is counted too, so this is an upper bound.)
trunc_count = sum(len(ids) >= max_token_length for ids in train_tok['input_ids'])
print(f'Documents truncated in training: {trunc_count} / {len(train_tok)}')

train_ds = train_tok
val_ds = val_tok
print(f'Train set size: {len(train_ds)}, Validation set size: {len(val_ds)}')

# Baseline NER with Untrained Model

In [None]:
# Baseline: run the model through the NER pipeline before any fine-tuning.
ner_pipe_untrained = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    # Use GPU 0 when CUDA is available, otherwise fall back to CPU (-1)
    # instead of failing on a CPU-only runtime.
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy='simple'
)

# One record per validation document; 'index' is the position in val_orig,
# which compute_f1 uses to look up the gold entities.
val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_untrained(example['doc'])
    val_results.append({
        'index': idx,
        'doc_title': example.get('title', f'doc_{idx}'),
        'predictions': preds
    })

output_path = f'{base_path}processed/ner_untrained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_untrained_scores.json'
compute_f1(output_path, scores_path, val_orig)

# Model Training

In [None]:
def make_safe_weights(
    train_dataset: 'Dataset',
    label_column: str = 'labels',
    o_label_id: int = 0,
    clip_range: tuple = (0.05, 3.0),
    o_label_weight: float = 0.25,
    num_labels: 'int | None' = None,
) -> torch.Tensor:
    """Build clipped inverse-frequency class weights for a weighted loss.

    Each label's weight is ``total / count``, divided by the mean of those
    values over the labels that occur, then clipped to ``clip_range``. Labels
    that never occur use ``1.0`` in place of the inverse frequency. The weight
    of ``o_label_id`` is finally overridden with ``o_label_weight``.

    Args:
        train_dataset: Object whose ``[label_column]`` yields sequences of
            integer label ids; ``-100`` entries are ignored.
        label_column: Name of the label column.
        o_label_id: Id of the outside ('O') label.
        clip_range: ``(low, high)`` bounds applied to every weight.
        o_label_weight: Fixed weight assigned to ``o_label_id``.
        num_labels: Length of the returned vector. Defaults to the highest
            observed label id plus one. Pass the model's label count so the
            vector still matches when the highest ids never occur.

    Returns:
        A float32 tensor of shape ``(num_labels,)``.

    Raises:
        ValueError: If no labels other than ``-100`` are present, or if an
            observed label id does not fit into ``num_labels``.
    """
    counts = Counter(
        lbl
        for seq in train_dataset[label_column]
        for lbl in seq
        if lbl != -100
    )
    if not counts:
        raise ValueError(
            f'No labels other than -100 found in column {label_column!r}'
        )

    observed = max(counts) + 1
    if num_labels is None:
        num_labels = observed
    elif num_labels < observed:
        raise ValueError(
            f'num_labels={num_labels} is smaller than the highest observed '
            f'label id + 1 ({observed})'
        )

    total = sum(counts.values())
    inv_freq = {lbl: total / cnt for lbl, cnt in counts.items()}

    # Normalize by the mean so weights are centred around 1 before clipping.
    mean_w = sum(inv_freq.values()) / len(inv_freq)
    low, high = clip_range
    weights = [
        max(low, min(inv_freq.get(lbl, 1.0) / mean_w, high))
        for lbl in range(num_labels)
    ]

    # The 'O' class gets a fixed weight; an id outside the vector is ignored.
    if 0 <= o_label_id < num_labels:
        weights[o_label_id] = o_label_weight

    return torch.tensor(weights, dtype=torch.float32)

In [None]:
class WeightedTrainer(Trainer):
    """Trainer that computes a class-weighted cross-entropy loss over tokens."""

    def __init__(self, *args, loss_weights=None, **kwargs):
        """Create the trainer.

        Args:
            *args: Forwarded to ``Trainer``.
            loss_weights: Optional 1-D tensor with one weight per class.
                ``None`` gives an unweighted cross-entropy loss.
            **kwargs: Forwarded to ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally weighted) token-level cross-entropy loss.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch dictionary; ``'labels'`` is removed from it before
                the forward pass.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments supplied by the caller; unused here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        # Take device and class count from the logits rather than from
        # ``model`` so this also works when the model is wrapped and does not
        # expose ``.device`` / ``.config`` directly.
        weight = None
        if self.loss_weights is not None:
            weight = self.loss_weights.to(logits.device)

        # Positions labelled -100 do not contribute to the loss.
        loss_fct = CrossEntropyLoss(weight=weight, ignore_index=-100)
        loss = loss_fct(
            logits.view(-1, logits.size(-1)),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers.trainer_callback import TrainerState

class StepPrinter(TrainerCallback):
    """Callback that prints the step number and learning rate every 10 steps."""

    def on_step_end(self, args, state: TrainerState, control, **kw):
        """Print ``global_step`` and the first parameter group's learning rate.

        The optimizer is read from the callback keyword arguments rather than
        from a global ``trainer`` variable, so the callback does not depend on
        a name that is only defined later in the notebook.
        """
        if state.global_step % 10 != 0:
            return

        optimizer = kw.get('optimizer')
        # Print None for the rate when no optimizer was passed.
        lr = optimizer.param_groups[0]['lr'] if optimizer is not None else None
        print('step', state.global_step, 'lr', lr)



In [None]:
def get_param_groups(model, base_lr=3e-5, decay=0.95):
    """Build per-parameter optimizer groups with layer-wise learning-rate decay.

    A parameter's learning rate is ``base_lr * decay ** depth``, where depth is
    the distance from the top of the encoder:

    * ``encoder.layer.<i>`` parameters: ``num_layers - 1 - i`` (0 for the last
      layer),
    * other parameters whose name contains ``embeddings``: ``num_layers``
      (below the whole encoder),
    * everything else (e.g. the classification head): 0.

    Args:
        model: Object exposing ``named_parameters()``.
        base_lr: Learning rate used at depth 0.
        decay: Multiplicative factor applied once per depth level.

    Returns:
        A list with one ``{'params': [p], 'lr': lr}`` dictionary per parameter,
        in ``named_parameters()`` order.
    """
    # ``str.count`` only reports whether the substring occurs (0 or 1); the
    # layer index has to be parsed out of the parameter name.
    layer_pattern = re.compile(r'encoder\.layer\.(\d+)\.')

    named_params = list(model.named_parameters())
    layer_matches = [layer_pattern.search(n) for n, _ in named_params]
    layer_ids = [int(m.group(1)) for m in layer_matches if m]
    num_layers = max(layer_ids) + 1 if layer_ids else 0

    groups = []
    for (n, p), match in zip(named_params, layer_matches):
        if match:
            depth = num_layers - 1 - int(match.group(1))
        elif 'embeddings' in n:
            depth = num_layers
        else:
            depth = 0
        groups.append({'params': [p], 'lr': base_lr * (decay ** depth)})
    return groups


In [None]:
# Training arguments
train_batch_size = 32
eval_batch_size = 64
gradient_accumulation_steps = 2
num_epochs = 12
learning_rate = 1e-5
warmup_ratio = 0.1
weight_decay = 0.01
weight_vector = make_safe_weights(train_ds)

# The weighted loss needs one weight per class. make_safe_weights sizes the
# vector from the highest label id seen in training, so pad it with neutral
# weights when the trailing label ids never occur in the training labels.
missing_labels = len(label_list) - weight_vector.numel()
if missing_labels > 0:
    weight_vector = torch.cat(
        [weight_vector, torch.ones(missing_labels, dtype=weight_vector.dtype)]
    )

print(f'Using learning_rate={learning_rate}, batch_size={train_batch_size}, epochs={num_epochs}')

training_args = TrainingArguments(
    output_dir='longformer-ner',
    num_train_epochs        = num_epochs,
    per_device_train_batch_size = train_batch_size,
    per_device_eval_batch_size  = eval_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    learning_rate = learning_rate,
    warmup_ratio = warmup_ratio,
    lr_scheduler_type = 'linear',
    weight_decay = weight_decay,
    fp16 = True,
    gradient_checkpointing = True,
    eval_strategy = 'steps',
    eval_steps = 500,
    logging_steps = 100,
    save_strategy = 'steps',
    save_total_limit = 2,
    # Keep the checkpoint with the best validation F1 (as returned by compute_metrics).
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    greater_is_better = True,
    group_by_length = True,
    seed = 42,
)

# Batch collator handed to the trainer as data_collator; configured to return
# PyTorch tensors with no pad-to-multiple constraint.
collator = DataCollatorForTokenClassification(
    tokenizer,
    pad_to_multiple_of=None,
    return_tensors='pt',
)

# seqeval metric object used by compute_metrics.
evaluator = evaluate.load('seqeval')

def compute_metrics(p):
    """Turn raw evaluation output into overall precision, recall and F1.

    Args:
        p: Pair ``(predictions, labels)``; the class with the highest score on
            the last axis of ``predictions`` is taken as the predicted id.

    Returns:
        Dict with ``'precision'``, ``'recall'`` and ``'f1'`` taken from the
        evaluator's overall scores.
    """
    scores, gold_ids = p
    pred_ids = scores.argmax(-1)

    # Gold tag names, skipping positions marked -100.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[g] for g in gold_row if g != -100])

    # Predicted tag names at exactly the positions kept above.
    true_preds = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [id2label[q] for q, g in zip(pred_row, gold_row) if g != -100]
        true_preds.append(kept)

    results = evaluator.compute(predictions=true_preds, references=true_labels)

    summary = {}
    summary['precision'] = results['overall_precision']
    summary['recall'] = results['overall_recall']
    summary['f1'] = results['overall_f1']
    return summary


# Stop training after 5 consecutive evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Trainer using the class-weighted loss defined by WeightedTrainer.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    loss_weights=weight_vector,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# NER on Validation Data

In [None]:
# Run the fine-tuned model through the NER pipeline on the validation documents.
ner_pipe_finetuned = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    # Use GPU 0 when CUDA is available, otherwise fall back to CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy='simple'
)

val_results = []
for idx, example in enumerate(val_orig):
    preds = ner_pipe_finetuned(example['doc'])
    val_results.append({
        'index': idx,
        'doc_title': example.get('title', f'doc_{idx}'),
        # compute_f1 reads the pipeline output from 'predictions' (the key the
        # baseline cell writes). 'entities' is kept so the saved file's
        # existing fields are unchanged.
        'predictions': preds,
        'entities': preds,
        'doc': example.get('doc')
    })

output_path = f'{base_path}processed/ner_trained_predictions.json'
save_model_output(val_results, output_path)

scores_path = f'{base_path}processed/ner_trained_scores.json'
compute_f1(output_path, scores_path, val_orig)

In [None]:
# Persist the model under the project data folder.
# NOTE(review): only the model is saved here, not the tokenizer - confirm that is intended.
model.save_pretrained(f'{base_path}models/longformer');