<a href="https://colab.research.google.com/github/dantheman625/nlp_doc_info_extraction/blob/complete_pipe/complete_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install seqeval scikit-learn datasets wandb

In [None]:
# Presumably a quick check that the runtime executes cells; nothing later in
# the notebook uses this output.
print("hello world")

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    LongformerTokenizerFast,
    pipeline
)
from datasets import Dataset
import numpy as np
import os
import json

## Wandb login

In [None]:
import wandb
# Authenticate with Weights & Biases up front; the run itself is started later
# with wandb.init().
wandb.login()

# Datasets

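Import the challenge data set. Note: `Final_eval.json` is the named evaluation file, but the loading cell below currently reads every file under `raw/dev` instead.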


## Mount Drive

In [None]:
from google.colab import drive   # only in Colab; skip if you’re on a different setup
import os
import json

# Mount Google Drive under /content/drive. force_remount=True requests a fresh
# mount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

## Set Project folder

In [None]:
# Name of the project folder inside MyDrive; the data path is built from it.
drive_folder = "NLP_project_data"

## Load file

In [None]:
# Locate the project folder on the mounted Drive. The path is relative, so it
# resolves against the current working directory.
base_path   = os.path.join('drive/MyDrive/', drive_folder)
eval_path   = os.path.join(base_path, 'Final_eval.json')  # NOTE(review): not read in this cell

eval_data = []
folder_path = f'{base_path}/raw/dev'

print(folder_path)

# Collect the examples from every file below folder_path, including subfolders.
for root, dirs, files in os.walk(folder_path):
    # Sorting directories in place and files by name fixes the traversal order,
    # so the example order (and therefore dataset[0]) is reproducible.
    dirs.sort()
    for file_name in sorted(files):
        # Join with `root`, not folder_path: for files inside a subfolder,
        # os.walk reports the containing directory in `root`.
        file_path = os.path.join(root, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Each file holds a collection of examples; flatten them into one list.
        eval_data.extend(data)

# Fail with a clear message instead of an opaque error further down when the
# folder is missing or empty (os.walk silently yields nothing in that case).
if not eval_data:
    raise FileNotFoundError(f"No evaluation examples found under {folder_path!r}")

dataset = Dataset.from_list(eval_data)
print("Sample example:")
print(dataset[0])

In [None]:
# Build the BIO tag inventory from the entity types listed on the first
# example: 'O' first, then every B- tag, then every I- tag.
entity_labels = dataset[0]['entity_label_set']
label_list = [
    'O',
    *(f"B-{entity_type}" for entity_type in entity_labels),
    *(f"I-{entity_type}" for entity_type in entity_labels),
]

# Forward map (tag -> index) and its inverse (index -> tag).
label2id = {tag: index for index, tag in enumerate(label_list)}
id2label = {index: tag for tag, index in label2id.items()}

# Define models

## Baseline models
Define the model used as the baseline for each task (NER and RE).

In [None]:
# Baseline checkpoint names, one per task.
baseline_ner_name = "allenai/longformer-base-4096"  # NER baseline
baseline_re_name = "SpanBERT/spanbert-large-cased"  # RE baseline


## Trained models

Define your trained model for the specific task

In [None]:
# Fine-tuned checkpoint locations, one per task.
trained_ner_name = f"{base_path}/models/longformer/1/"  # directory below the project folder
# NOTE(review): placeholder path; replace it before selecting the trained RE model.
trained_re_name = "/path/to/your/re/checkpoint"

# Model selection

Choose which model to use for NER and which for RE. Four combinations are possible: baseline/baseline, trained/trained, trained/baseline, and baseline/trained. Run only the one cell below that matches the combination you want.

## Both baseline

In [None]:
# Baseline checkpoints for both tasks. Run only one selection cell: whichever
# cell runs last decides the active models.
ner_model_name, re_model_name = baseline_ner_name, baseline_re_name

## Both trained

In [None]:
# Fine-tuned checkpoints for both tasks. Run only one selection cell: whichever
# cell runs last decides the active models.
ner_model_name, re_model_name = trained_ner_name, trained_re_name

## NER: trained, RE: baseline

In [None]:
# Fine-tuned NER with baseline RE. Run only one selection cell: whichever cell
# runs last decides the active models.
ner_model_name, re_model_name = trained_ner_name, baseline_re_name

## NER: baseline, RE: trained

In [None]:
# Baseline NER with fine-tuned RE. Run only one selection cell: whichever cell
# runs last decides the active models.
ner_model_name, re_model_name = baseline_ner_name, trained_re_name

# Load Models and Tokenizer

## NER

In [None]:
# Show which NER checkpoint is currently selected before loading it.
print(ner_model_name)

In [None]:
# TODO: update once Daniel has finished setup.
# The tokenizer is always loaded from the baseline checkpoint name, whichever
# NER model is selected.
ner_tokenizer = LongformerTokenizerFast.from_pretrained(baseline_ner_name)

# Load the selected NER checkpoint with a token-classification head sized to
# the BIO tag inventory, and attach the tag <-> id maps to its config.
ner_model     = AutoModelForTokenClassification.from_pretrained(
    ner_model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Run on the first CUDA GPU when one is available; otherwise fall back to CPU
# (-1), which is what this cell previously hard-coded.
ner_device = 0 if torch.cuda.is_available() else -1

ner_pipe = pipeline(
    'ner',
    model=ner_model,
    tokenizer=ner_tokenizer,
    device=ner_device,
    aggregation_strategy='simple'
)


## RE

In [None]:
# Load the RE tokenizer from the same checkpoint as the RE model so that the
# vocabulary matches the model's embeddings. It was previously loaded from the
# NER checkpoint name.
re_tokenizer  = AutoTokenizer.from_pretrained(re_model_name)
re_model      = AutoModelForSequenceClassification.from_pretrained(re_model_name)

# Initialize Wandb

In [None]:
# Short tags for the run name: the last path segment of each model identifier.
# Trailing slashes are stripped first, because a checkpoint path that ends in
# "/" would otherwise yield an empty last segment.
ner_run_tag = ner_model_name.rstrip('/').split('/')[-1]
re_run_tag = re_model_name.rstrip('/').split('/')[-1]

# Start the evaluation run and record the model choice in its config.
# NOTE(review): batch_size, max_length and seed are only logged here; no
# visible cell reads them back. Confirm they match the actual evaluation.
wandb.init(
    project="model-eval",
    name=f"eval_{ner_run_tag}_{re_run_tag}",
    config={
        "ner_model": ner_model_name,
        "re_model": re_model_name,
        "dataset": "Final_eval.json",
        "batch_size": 32,
        "max_length": 256,
        "seed": 42,
    }
)

# NER Eval

Output: Entity file -> content

In [None]:
# Run the NER pipeline over every evaluation example and keep the predictions
# next to the example's domain, title and text. Examples without a 'title'
# get a positional fallback name.
ner_val_results = [
    {
        'domain': example.get('domain'),
        'doc_title': example.get('title', f'doc_{idx}'),
        'entities': ner_pipe(example['doc']),
        'doc': example.get('doc'),
    }
    for idx, example in enumerate(eval_data)
]

print(ner_val_results[0])

In [None]:
# placeholder: dummy gold and predicted tag sequences (a single sequence of two
# "O" tags each). They are not derived from ner_val_results, so the metrics
# computed from them are not real results.
true_ner_labels = [["O", "O"]]
pred_ner_labels = [["O", "O"]]

## Log Metrics in Wandb

In [None]:
from seqeval.metrics import precision_score as ner_prec, recall_score as ner_rec, f1_score as ner_f1

# Score the predicted tag sequences against the gold ones with each metric in
# turn: precision, recall, F1.
prec_ner, rec_ner, f1_ner = (
    metric(true_ner_labels, pred_ner_labels)
    for metric in (ner_prec, ner_rec, ner_f1)
)

# Report the three NER scores to the active W&B run.
ner_metrics = {
    "ner/precision": prec_ner,
    "ner/recall":    rec_ner,
    "ner/f1":        f1_ner,
}
wandb.log(ner_metrics)


# RE Eval

Input: entity file and the original challenge test file -> match entities to sentences (see WA) -> list of dicts

## Extract unique label values for matching to docred

In [None]:
# Gather every distinct label that appears in any example's "label_set"
# (examples without that key contribute nothing), sorted for readability.
unique_labels = sorted({
    label
    for item in eval_data
    for label in item.get("label_set", [])
})

print(unique_labels)


## Log metrics in Wandb

In [None]:
# placeholder: dummy gold and predicted relation labels (two "O" entries each).
# They are not produced by the RE model, so the metrics computed from them are
# not real results.
true_re_labels = ["O", "O"]
pred_re_labels = ["O", "O"]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Micro-averaged precision, recall and F1 for the relation labels; the fourth
# element of the result (support) is not needed.
re_scores = precision_recall_fscore_support(
    true_re_labels,
    pred_re_labels,
    average="micro",
)
prec_re, rec_re, f1_re = re_scores[:3]

# Report the three RE scores to the active W&B run.
re_metrics = {
    "re/precision": prec_re,
    "re/recall":    rec_re,
    "re/f1":        f1_re,
}
wandb.log(re_metrics)


# One-row table that puts the NER and RE scores side by side.
summary_columns = [
    "ner_precision",
    "ner_recall",
    "ner_f1",
    "re_precision",
    "re_recall",
    "re_f1",
]
summary_row = [prec_ner, rec_ner, f1_ner, prec_re, rec_re, f1_re]
summary_table = wandb.Table(columns=summary_columns, data=[summary_row])
wandb.log({"metrics_summary": summary_table})

# Wrap Up


In [None]:
# Mark the W&B run started by wandb.init() as finished.
wandb.finish()