<a href="https://colab.research.google.com/github/dantheman625/nlp_doc_info_extraction/blob/complete_pipe/complete_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install seqeval scikit-learn datasets wandb nltk

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    LongformerTokenizerFast,
    pipeline
)
from datasets import Dataset
import numpy as np
import os
import json

## Wandb login

In [None]:
import wandb
# Log in to Weights & Biases once, before any run is initialised further down.
wandb.login()

# Datasets

Import Challenge data set (Final_eval.json)


## Mount Drive

In [None]:
from google.colab import drive   # only in Colab; skip if you’re on a different setup
import os
import json

# Mount Google Drive at /content/drive so the project folder can be read from it.
# The commented-out variant passes force_remount=True; kept for reference.
#drive.mount('/content/drive', force_remount=True)
drive.mount('/content/drive')

## Set Project folder in Google Drive

In [None]:
# Name of the project folder inside Google Drive (MyDrive); data and model paths are built from it.
drive_folder = "NLP_project_data"


## Load file

In [None]:
# Paths are relative to the working directory (Drive is mounted under ./drive in Colab's /content).
base_path   = os.path.join('drive/MyDrive/', drive_folder)
eval_path   = os.path.join(base_path, 'Final_eval.json')

eval_data = []
folder_path = f'{base_path}/raw/dev'

print(folder_path)

# Collect the records of every file below folder_path (sub-folders included).
for root, dirs, files in os.walk(folder_path):
    dirs.sort()  # in-place sort → os.walk descends in a deterministic order
    for file_name in sorted(files):
        # Join with `root`, not `folder_path`: os.walk also yields files that live in
        # sub-folders, and those cannot be opened relative to the top-level folder.
        with open(os.path.join(root, file_name), "r", encoding="utf-8") as f:
            data = json.load(f)

        eval_data.extend(data)

# Fail with a clear message instead of an IndexError on dataset[0] below.
if not eval_data:
    raise FileNotFoundError(
        f"No records loaded from '{folder_path}'; check that Drive is mounted and the folder exists."
    )

dataset = Dataset.from_list(eval_data)
print("Sample example:")
print(dataset[0])

In [None]:
# BIO tag inventory for NER: 'O' first, then every B- tag, then every I- tag,
# all derived from the entity types listed on the first example.
entity_labels = dataset[0]['entity_label_set']  # list of entity types
label_list = [
    'O',
    *(f"B-{entity_type}" for entity_type in entity_labels),
    *(f"I-{entity_type}" for entity_type in entity_labels),
]
label2id = dict(zip(label_list, range(len(label_list))))
id2label = {index: tag for tag, index in label2id.items()}

# Define models

# Baseline models
Define which model you used as a baseline model for the specific task

In [None]:
# Checkpoint names handed to from_pretrained(...) when the baseline models are selected.
baseline_ner_name = "allenai/longformer-base-4096"
baseline_re_name = "SpanBERT/spanbert-large-cased"


# Trained models

Define your trained model for the specific task

In [None]:
# Locations of the fine-tuned checkpoints, both below the project folder (base_path).
# Note that the NER path ends with a trailing slash.
trained_ner_name = f"{base_path}/models/longformer/1/"
trained_re_name = f"{base_path}/Model_checkpoints/checkpoint-306"

# Model selection

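Choose which model to use for NER and which for RE. The four cells below cover every combination (baseline/baseline, trained/trained, trained NER with baseline RE, baseline NER with trained RE). Run exactly one of them: each cell assigns the same two variables, so with "Run All" the last cell (NER: baseline, RE: trained) wins.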

## Both baseline

In [None]:
# Selection: baseline NER + baseline RE.
# Every selection cell assigns these same two names, so the cell executed last decides.
ner_model_name = baseline_ner_name
re_model_name  = baseline_re_name

## Both trained

In [None]:
# Selection: trained NER + trained RE.
# Every selection cell assigns these same two names, so the cell executed last decides.
ner_model_name = trained_ner_name
re_model_name  = trained_re_name

## NER: trained, RE: baseline

In [None]:
# Selection: trained NER + baseline RE.
# Every selection cell assigns these same two names, so the cell executed last decides.
ner_model_name = trained_ner_name
re_model_name  = baseline_re_name

## NER: baseline, RE: trained

In [None]:
# Selection: baseline NER + trained RE.
# This is the last selection cell, so it is the combination in effect after "Run All".
ner_model_name = baseline_ner_name
re_model_name  = trained_re_name

# Load Models and Tokenizer

## NER

In [None]:
# Sanity check: show which NER checkpoint the selection cells left in effect.
print(ner_model_name)

In [None]:
## Update once Daniel has finished setup
# The tokenizer is always taken from the baseline checkpoint, also when a trained
# NER model is selected; only the model weights follow ner_model_name.
ner_tokenizer = LongformerTokenizerFast.from_pretrained(baseline_ner_name)
ner_model     = AutoModelForTokenClassification.from_pretrained(
    ner_model_name
)

# Run on the first GPU when one is available; -1 keeps the pipeline on the CPU.
ner_device = 0 if torch.cuda.is_available() else -1

# aggregation_strategy='simple' makes the pipeline return grouped entities
# (the evaluation below reads their 'entity_group', 'start' and 'end' fields).
ner_pipe = pipeline(
    'ner',
    model=ner_model,
    tokenizer=ner_tokenizer,
    device=ner_device,
    aggregation_strategy='simple'
)


## RE

In [None]:
# Only the RE tokenizer is prepared here. The RE model itself is loaded further down by
# load_re_model(...) in the RE preprocessing cell, once the relation label map exists.
# Loading the model in this cell was broken in two ways:
#   * at this point `label2id` / `id2label` still hold the NER BIO tags, so the baseline
#     classification head would be sized for the wrong label set;
#   * `AutoConfig` is not imported until the RE preprocessing cell, so the baseline branch
#     raised a NameError on a fresh kernel.
# Nothing between here and that later load uses `re_model`.
re_tokenizer  = AutoTokenizer.from_pretrained(re_model_name)


# Initialize Wandb

In [None]:
def _run_tag(name_or_path):
    """Return the last path component of a model id or path, ignoring trailing slashes.

    A plain split('/')[-1] yields an empty string for paths that end in '/'
    (as trained_ner_name does), which left the run name without its NER part.
    """
    return os.path.basename(name_or_path.rstrip('/'))


# Start the W&B run that all later wandb.log calls report to.
# NOTE(review): "dataset" is logged as Final_eval.json, while the load cell reads every
# file under raw/dev — confirm that this label matches the data actually evaluated.
wandb.init(
    project="model-eval",
    name=f"eval_{_run_tag(ner_model_name)}_{_run_tag(re_model_name)}",
    config={
        "ner_model": ner_model_name,
        "re_model": re_model_name,
        "dataset": "Final_eval.json",
        "batch_size": 32,
        "max_length": 256,
        "seed": 42,
    }
)

# NER Eval

Output: entity file — one record per document with its domain, title, predicted entities, and the document text

In [None]:
# Run the NER pipeline over every evaluation document and keep, per document,
# its domain, title (fallback: doc_<index>), predicted entities and raw text.
ner_val_results = [
    {
        'domain': record.get('domain'),
        'doc_title': record.get('title', f'doc_{position}'),
        'entities': ner_pipe(record['doc']),
        'doc': record.get('doc'),
    }
    for position, record in enumerate(eval_data)
]

print(ner_val_results[0])

Save the NER output to a JSON file for Nina to check

In [None]:
import json
import numpy as np


def _json_default(value):
    """json `default` hook: convert NumPy scalars to plain Python numbers.

    Anything else raises TypeError, as the json module expects from a `default`
    hook. Returning the unsupported object unchanged makes the encoder report a
    misleading "Circular reference detected" error instead.
    """
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the NER predictions so they can be inspected outside the notebook.
with open('/content/ner_val_results.json','w') as f:
    json.dump(
        ner_val_results,
        f,
        default=_json_default
    )


In [None]:
def _char_to_token_index(text, tokens):
    """Map each character offset covered by a whitespace token to that token's index.

    `tokens` must be `text.split()`; tokens are located left to right so repeated
    words map to their own positions. Whitespace offsets are absent from the map.
    """
    char2tok = {}
    cursor = 0
    for tok_idx, tok in enumerate(tokens):
        start = text.find(tok, cursor)
        end = start + len(tok)
        for char_pos in range(start, end):
            char2tok[char_pos] = tok_idx
        cursor = end
    return char2tok


def _tag_span(labels, char2tok, start, end, ent_type):
    """Write B-/I- tags for the character span [start, end) into `labels` in place.

    Does nothing when the first or last character of the span is not inside a token
    (e.g. the span starts or ends on whitespace).
    """
    first_tok = char2tok.get(start)
    last_tok = char2tok.get(end - 1)
    if first_tok is None or last_tok is None:
        return
    labels[first_tok] = f'B-{ent_type}'
    for tok_idx in range(first_tok + 1, last_tok + 1):
        labels[tok_idx] = f'I-{ent_type}'


# 2. Index them by (domain, title) for easy lookup.
# The gold key uses the same fallbacks as the NER results (missing title → doc_<index>),
# so documents without a title still find their prediction instead of raising KeyError.
gt_index = {
    (ex.get('domain'), ex.get('title', f'doc_{idx}')): ex
    for idx, ex in enumerate(eval_data)
}
pred_index = {(p['domain'], p['doc_title']): p for p in ner_val_results}

true_ner_labels = []
pred_ner_labels = []

for key, gt in gt_index.items():
    pred = pred_index.get(key)
    if pred is None:
        continue

    text = gt['doc']
    # simple whitespace tokenization
    tokens = text.split()
    n = len(tokens)

    # map each character position → token idx
    char2tok = _char_to_token_index(text, tokens)

    # initialize all O's
    true_labels = ['O'] * n
    pred_labels = ['O'] * n

    # 3. Fill in ground-truth labels: tag every occurrence of every mention string
    for ent in gt['entities']:
        ent_type = ent['type']
        for mention in ent['mentions']:
            if not mention:
                # text.find('') never returns -1, so an empty mention would loop forever
                continue
            start = text.find(mention)
            while start != -1:
                end = start + len(mention)
                _tag_span(true_labels, char2tok, start, end, ent_type)
                start = text.find(mention, end)

    # 4. Fill in predicted labels from the pipeline's character offsets
    for ent in pred['entities']:
        _tag_span(pred_labels, char2tok, ent['start'], ent['end'], ent['entity_group'])

    true_ner_labels.append(true_labels)
    pred_ner_labels.append(pred_labels)


## Log Metrics in Wandb

In [None]:
from seqeval.metrics import precision_score as ner_prec, recall_score as ner_rec, f1_score as ner_f1

# Entity-level precision / recall / F1 over the aligned BIO sequences.
prec_ner, rec_ner, f1_ner = (
    scorer(true_ner_labels, pred_ner_labels)
    for scorer in (ner_prec, ner_rec, ner_f1)
)

for ner_score in (prec_ner, rec_ner, f1_ner):
    print(ner_score)

# Report the three scores to the active W&B run.
ner_metrics = {
    "ner/precision": prec_ner,
    "ner/recall":    rec_ner,
    "ner/f1":        f1_ner,
}
wandb.log(ner_metrics)


# RE Eval

Input: the NER entity file and the original challenge test file -> entities are matched to the sentences they occur in (see WA chat) -> list of dicts

## Preprocessing

In [None]:
# Mapping: challenge labels (Set 2) → docred labels (Set 1)
# Keys are the relation names used in the challenge data; values are the target labels
# used for RE evaluation. The mapping is many-to-one: several challenge labels can share
# a target (e.g. "Affiliation" and "MemberOf" both map to "member of"). Where key and
# value are identical (e.g. "ApprovedBy"), the challenge label is kept unchanged.
# Relations that have no entry here are skipped when the RE examples are built.
mapping_challenge_to_docred = {
    # ── Direct one-to-ones ───────────────────────────────────────────────────────
    "Affiliation":                         "member of",
    "ApprovedBy":                          "ApprovedBy",
    "Author":                              "author",
    "AwardReceived":                       "award received",
    "BasedOn":                             "BasedOn",
    "Capital":                             "capital",
    "Causes":                              "Causes",
    "Continent":                           "continent",
    "ContributedToCreativeWork":           "ContributedToCreativeWork",
    "Country":                             "country",
    "CountryOfCitizenship":                "country of citizenship",
    "Creator":                             "creator",
    "Developer":                           "developer",
    "DifferentFrom":                       "DifferentFrom",
    "Director":                            "director",
    "EducatedAt":                          "educated at",
    "Employer":                            "employer",
    "FieldOfWork":                         "FieldOfWork",
    "FollowedBy":                          "followed by",
    "Follows":                             "follows",
    "Founded":                             "founded",
    "FoundedBy":                           "founded by",
    "HasCause":                            "HasCause",
    "HasEffect":                           "HasEffect",
    "HasPart":                             "HasPart",
    "HasWorksInTheCollection":             "HasWorksInTheCollection",
    "InfluencedBy":                        "influenced by",
    "IssuedBy":                            "IssuedBy",
    "LocatedIn":                           "located in the administrative territorial entity",
    "Location":                            "location",
    "MemberOf":                            "member of",
    "NamedBy":                             "NamedBy",
    "NominatedFor":                        "nominated for",
    "OfficialLanguage":                    "official language",
    "OwnedBy":                             "owned by",
    "OwnerOf":                             "owner of",
    "ParentOrganization":                  "parent organization",
    "PartOf":                              "part of",
    "Partner":                             "partner",
    "PlaceOfBirth":                        "place of birth",
    "PositionHeld":                        "position held",
    "PublishedIn":                         "PublishedIn",
    "Replaces":                            "replaces",
    "SaidToBeTheSameAs":                   "SaidToBeTheSameAs",
    "Studies":                             "Studies",
    "UsedBy":                              "UsedBy",
    "Uses":                                "Uses",
    "WorkLocation":                        "work location",

    # ── Very close synonyms ────────────────────────────────────────────────────
    "LanguageOfWorkOrName":                "original language of work",
    "LanguageUsed":                        "languages spoken, written or signed",
    "OriginalLanguageOfFilmOrTvShow":      "original language of work",
    "PartyChiefRepresentative":            "head of government",
    "PrimeFactor":                         "part of",
    "TwinnedAdministrativeBody":           "sister city",

    # ── Functional / looser mappings ──────────────────────────────────────────
    # NOTE(review): these are approximate matches (e.g. "DiplomaticRelation" → "conflict");
    # confirm each one is acceptable for the reported scores.
    "AcademicDegree":                      "educated at",
    "AdjacentStation":                     "shares border with",
    "AppliesToPeople":                     "applies to jurisdiction",
    "CitesWork":                           "present in work",
    "ContainsAdministrativeTerritorialEntity":     "contains administrative territorial entity",
    "ContainsTheAdministrativeTerritorialEntity":  "contains administrative territorial entity",
    "DiplomaticRelation":                  "conflict",
    "HasQuality":                          "genre",
    "InOppositionTo":                      "separated from",
    "InspiredBy":                          "BasedOn",
    "InterestedIn":                        "Studies",
    "NamedAfter":                          "NamedBy",
    "NativeLanguage":                      "languages spoken, written or signed",
    "OperatingSystem":                     "platform",
    "PhysicallyInteractsWith":             "shares border with",
    "PracticedBy":                         "UsedBy",
    "PresentedIn":                         "present in work",
    "Promoted":                            "HasEffect",
    "RegulatedBy":                         "IssuedBy",
    "SharesBorderWith":                    "shares border with",
    "SignificantEvent":                    "location",
}



In [None]:
import json
import re
from collections import defaultdict
from datasets import Dataset
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# === 0) Fixed relation label space: the distinct mapping targets in sorted order ===
# Note: this rebinds label2id / id2label, which held the NER tag maps until here.
target_labels = sorted({*mapping_challenge_to_docred.values()})
id2label = dict(enumerate(target_labels))
label2id = {name: index for index, name in id2label.items()}
print(f"Using fixed RE labels (len={len(label2id)}): {label2id}")

# === 1) Load RE model & tokenizer with conditional head sizing ===
def load_re_model(name_or_path, baseline_name):
    """Load the RE sequence classifier and its tokenizer.

    Args:
        name_or_path: checkpoint name or path to load.
        baseline_name: name of the baseline checkpoint; when `name_or_path` equals
            it, the classification head is configured from the module-level
            `label2id` / `id2label` maps. Any other checkpoint is loaded with the
            configuration it ships with.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(name_or_path)

    # Trained checkpoint: keep the head exactly as stored.
    if name_or_path != baseline_name:
        model = AutoModelForSequenceClassification.from_pretrained(name_or_path)
        print(f"Loaded trained RE model '{name_or_path}' with head size num_labels={model.config.num_labels}")
        return model, tokenizer

    # Baseline checkpoint: size and name the head after the fixed relation labels.
    head_config = AutoConfig.from_pretrained(
        name_or_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label=id2label,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        name_or_path,
        config=head_config,
    )
    print(f"Loaded baseline RE model '{name_or_path}' with overridden head size num_labels={model.config.num_labels}")
    return model, tokenizer

re_model, re_tokenizer = load_re_model(re_model_name, baseline_re_name)

# The sentence splitter used below needs NLTK's punkt resources.
import nltk
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# === 2) Gold triples grouped by normalised (stripped, lower-cased) document title ===
# A title only becomes a key once it contributes at least one triple.
gold_by_doc = defaultdict(list)
for record in dataset:
    doc_key = record["title"].strip().lower()
    for triple in record.get("triples", []):
        gold_by_doc[doc_key].append(triple)

# === Helpers to split text into sentences and extract containing sentence ===
from nltk.tokenize import sent_tokenize

def split_sentences(text):
    """Split `text` into a list of sentences using NLTK's `sent_tokenize`."""
    return sent_tokenize(text)

def get_sentence(text, a, b, sentences=None):
    """Return the sentence of `text` that contains both character offsets `a` and `b`.

    Sentences are located left to right with a moving cursor, so a sentence whose
    text occurs more than once is matched at its own position rather than always
    at the first occurrence.

    Args:
        text: the full document text.
        a: character offset into `text` (e.g. start of the head entity).
        b: character offset into `text` (e.g. start of the tail entity).
        sentences: optional pre-split sentences of `text`, in document order.
            Defaults to `split_sentences(text)`; pass it to avoid re-splitting the
            same document for every lookup.

    Returns:
        The matching sentence string, or None when no single sentence contains
        both offsets.
    """
    if sentences is None:
        sentences = split_sentences(text)

    cursor = 0
    for sent in sentences:
        idx = text.find(sent, cursor)
        if idx == -1:
            # Sentence not found verbatim from here on: it cannot be aligned, and
            # using -1 as a start offset could produce a false match.
            continue
        end = idx + len(sent)
        if idx <= a < end and idx <= b < end:
            return sent
        cursor = end
    # no single sentence contains both spans → skip example
    return None

# === 3) Positive-only RE examples: gold triples whose head and tail were both found by
#        NER and whose first predicted spans fall into the same sentence ===
examples = []
for ner_doc in ner_val_results:
    doc_key = ner_doc['doc_title'].strip().lower()
    doc_text = ner_doc.get('doc', '')

    # predicted surface form (stripped, lower-cased) → list of (start, end) offsets
    spans_by_word = defaultdict(list)
    for entity in ner_doc.get('entities', []):
        surface = entity.get('word', '').strip().lower()
        spans_by_word[surface].append((entity['start'], entity['end']))

    gold_triples = [
        (triple['head'], triple['relation'], triple['tail'])
        for triple in gold_by_doc.get(doc_key, [])
    ]
    for head_text, relation, tail_text in gold_triples:
        target_label = mapping_challenge_to_docred.get(relation)
        if target_label is None:
            continue  # relation has no mapping → not evaluated

        head_spans = spans_by_word.get(head_text.lower().strip(), [])
        tail_spans = spans_by_word.get(tail_text.lower().strip(), [])
        if not head_spans or not tail_spans:
            continue  # NER missed at least one of the two entities

        # only the first predicted span of each entity is considered
        shared_sentence = get_sentence(doc_text, head_spans[0][0], tail_spans[0][0])
        if shared_sentence is None:
            continue

        examples.append({
            'RE_sentence':    shared_sentence,
            'entity1_label':  head_text,
            'entity2_label':  tail_text,
            'relation_label': target_label
        })

print(f"Built {len(examples)} RE examples across {len(gold_by_doc)} docs")
if not examples:
    raise RuntimeError("No RE examples built; check your mapping and NER outputs.")

# === 4) Define transforms for HF Dataset ===
def make_example(ex):
    """Wrap both entities of an RE example in [E1]…[/E1] / [E2]…[/E2] markers.

    All occurrences are tagged in a single left-to-right pass. Chaining two
    `str.replace` calls re-tags text inside markers that were just inserted or
    inside the other entity (e.g. "App" inside "[E1]Apple[/E1]", or an entity
    literally called "E1"), which corrupts the input.

    Args:
        ex: dict with 'RE_sentence', 'entity1_label', 'entity2_label' and
            'relation_label'.

    Returns:
        dict with the marked-up sentence under 'text' plus the entity and
        relation labels copied from `ex`.
    """
    sent, e1, e2 = ex['RE_sentence'], ex['entity1_label'], ex['entity2_label']

    # entity text → marker name; empty strings are skipped, and E1 is written last
    # so that it takes precedence when both entities have the same text
    marker_for = {}
    if e2:
        marker_for[e2] = 'E2'
    if e1:
        marker_for[e1] = 'E1'

    if marker_for:
        # longest alternative first, so an entity wins over one that is its prefix
        alternatives = sorted(marker_for, key=len, reverse=True)
        pattern = '|'.join(re.escape(entity) for entity in alternatives)

        def _wrap(match):
            found = match.group(0)
            tag = marker_for[found]
            return f"[{tag}]{found}[/{tag}]"

        # only wrap the single sentence containing both entities
        text = re.sub(pattern, _wrap, sent)
    else:
        text = sent

    return {
        'text': text,
        'entity1_label': e1,
        'entity2_label': e2,
        'relation_label': ex['relation_label'],
    }

def add_label_ids(ex):
    """Return {'labels': id} for the example's relation label, using the fixed `label2id` map.

    NOTE(review): a trained checkpoint is loaded with its own config, so its id space may
    differ from this fixed map — confirm before comparing these ids with model predictions.
    """
    relation_name = ex['relation_label']
    return {'labels': label2id[relation_name]}

# === 5) Build HF Dataset and apply transforms ===
re_val_ds = Dataset.from_list(examples)
print("Columns before mapping:", re_val_ds.column_names)

# Mark the entities in each sentence (dropping the raw sentence column), then add label ids.
re_val_ds = (
    re_val_ds
    .map(make_example, remove_columns=['RE_sentence'])
    .map(add_label_ids)
)
print("→ Prepared RE validation set:", re_val_ds)



### Build RE validation examples

### Create HF Dataset

Mapping the challenge labels to the DocRED labels that the RE model was trained on

## Log metrics in Wandb

In [None]:
# First, load the correct RE model & tokenizer (baseline vs. trained).
# This goes through load_re_model (defined in the RE preprocessing cell) instead of
# repeating its baseline/trained branches inline, so the loading logic lives in one place.
re_model, re_tokenizer = load_re_model(re_model_name, baseline_re_name)

# 7) Tokenization function using the newly loaded tokenizer
def tokenize_fn(batch):
    """Tokenize the 'text' column of a batch with `re_tokenizer`.

    Every sequence is padded to, and truncated at, 256 tokens.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    return re_tokenizer(batch['text'], **tokenizer_options)

# Tokenize the whole validation set in batches.
tokenized_val = re_val_ds.map(tokenize_fn, batched=True)

# 8) Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for the Trainer.

    Args:
        eval_pred: a `(logits, labels)` pair; predictions are the argmax of the
            logits over the last axis.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        gold_ids, predicted_ids, average='weighted'
    )
    return {
        'accuracy': accuracy_score(gold_ids, predicted_ids),
        'precision': precision,
        'recall': recall,
        'f1': f_score,
    }

# 9) Set up Trainer for evaluation
# Evaluation-only arguments: training is disabled, eval batches hold 32 examples per
# device, and results are reported to Weights & Biases.
eval_args = TrainingArguments(
    output_dir='/content/re_eval_output',
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=True,
    logging_dir='/content/logs',
    report_to='wandb'
)
# NOTE(review): newer transformers releases may expect `processing_class` instead of
# `tokenizer` here — confirm against the installed version.
trainer = Trainer(
    model=re_model,
    args=eval_args,
    tokenizer=re_tokenizer,
    compute_metrics=compute_metrics
)

# 10) Run evaluation (metrics come from compute_metrics)
eval_result = trainer.evaluate(eval_dataset=tokenized_val)
print("🔍 RE Validation Results:", eval_result)

# 11) Generate and save predictions as before
preds_output = trainer.predict(tokenized_val)
# index of the highest-scoring entry along the last axis of the predictions
pred_ids = np.argmax(preds_output.predictions, axis=-1)

# Determine id2label mapping to use for predictions
# If model config defines id2label, prefer that (e.g., for trained model with 87 classes)
# Prefer the id→label map stored on the model config; fall back to the fixed map.
pred_id2label = getattr(re_model.config, 'id2label', None) or id2label

# sanitize filename & write out JSON
import os
safe_model_name = os.path.basename(re_model_name.rstrip('/'))
output_path = f'/content/re_{safe_model_name}_predictions.json'

# One record per validation example; ids missing from the map become 'UNKNOWN'.
outputs = [
    {
        'text': row['text'],
        'entity1_label': row['entity1_label'],
        'entity2_label': row['entity2_label'],
        'gold_relation': row['relation_label'],
        'predicted_relation': pred_id2label.get(predicted_id, 'UNKNOWN'),
    }
    for row, predicted_id in zip(re_val_ds, pred_ids)
]
with open(output_path, 'w') as f:
    json.dump(outputs, f, indent=2)
print(f"Wrote predictions to {output_path}")

### Tokenize Validation Examples

### Classification report

In [None]:
from sklearn.metrics import classification_report

# Gold and predicted relations are compared by label NAME, not by integer id.
# Gold ids come from the fixed `label2id` map, while predicted ids live in the model's
# own id space (a trained checkpoint keeps its own head and id2label). Comparing the raw
# ids, or naming gold ids through the model's id2label, is only valid when both spaces
# happen to be identical; comparing names is valid in every case.
gold_names = list(re_val_ds['relation_label'])

# pick the right id→label map for decoding the predictions
if hasattr(re_model.config, 'id2label') and re_model.config.id2label:
    # cast keys to int so the lookup works regardless of how the config stores them
    model_id2label = { int(k):v for k,v in re_model.config.id2label.items() }
else:
    model_id2label = id2label  # your 0–58 fixed mapping

# `pred_ids` (from trainer.predict()) is left untouched; later cells still use it
pred_names = [model_id2label.get(int(p), 'UNKNOWN') for p in pred_ids]

# only include the classes we actually see
unique_labels = sorted(set(gold_names) | set(pred_names))

# build and log the report (the labels are already names, so no target_names needed)
report = classification_report(
    gold_names,
    pred_names,
    labels=unique_labels,
    output_dict=True,
    zero_division=0
)
wandb.log({"classification_report": report})

# micro-average metrics
prec_re, rec_re, f1_re, _ = precision_recall_fscore_support(
    gold_names,
    pred_names,
    labels=unique_labels,
    average='micro'
)
wandb.log({
    "re/precision": prec_re,
    "re/recall":    rec_re,
    "re/f1":        f1_re,
})

# finally, the summary table combining the NER scores computed earlier with the RE scores
summary_table = wandb.Table(
    columns=[
      "ner_precision","ner_recall","ner_f1",
      "re_precision", "re_recall", "re_f1"
    ],
    data=[[prec_ner, rec_ner, f1_ner, prec_re, rec_re, f1_re]]
)
wandb.log({"metrics_summary": summary_table})


In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

# 1) Equivalence map: label name → canonical name. Labels not listed stay as they are.
# NOTE(review): "part of" and "HasPart" read like opposite directions of one relation
# rather than synonyms — confirm that collapsing them is intended.
equiv = {
    "HasPart": "HasPart",
    "part of": "HasPart",
    "BasedOn":      "BasedOn",
    "HasEffect":    "HasEffect",
    "Causes":       "Causes",
    "influenced by":"InfluencedBy",
    "InfluencedBy": "InfluencedBy",
}

# 2) Gold and predicted relations as strings (unknown prediction ids become "UNKNOWN")
gold_str = re_val_ds["relation_label"]
pred_str = [pred_id2label.get(predicted_id, "UNKNOWN") for predicted_id in pred_ids]

# 3) Send both sides through the equivalence map
gold_norm = [equiv.get(name, name) for name in gold_str]
pred_norm = [equiv.get(name, name) for name in pred_str]

# 4) Labels that actually occur on either side
unique_labels_str = sorted({*gold_norm, *pred_norm})

# 5) Per-class report, computed directly on the strings
report_options = dict(
    labels=unique_labels_str,
    target_names=unique_labels_str,
    output_dict=True,
    zero_division=0,
)
report = classification_report(gold_norm, pred_norm, **report_options)
wandb.log({"classification_report": report})

# 6) Micro-averaged precision / recall / F1 on the same strings
prec_re, rec_re, f1_re, _ = precision_recall_fscore_support(
    gold_norm,
    pred_norm,
    labels=unique_labels_str,
    average='micro',
)
re_scores = {
    "re/precision": prec_re,
    "re/recall":    rec_re,
    "re/f1":        f1_re,
}
wandb.log(re_scores)


# Wrap Up


In [None]:
# Close the W&B run started by wandb.init above.
wandb.finish()