## Installing requirements

In [1]:
pip install datasets transformers seqeval ipywidgets torch

Note: you may need to restart the kernel to use updated packages.


## Loading the dataset

In [2]:
from datasets import load_dataset

# Load the WIESP2022-NER dataset; the result is a DatasetDict keyed by split name.
dataset = load_dataset("adsabs/WIESP2022-NER")

#Checking: print the split names, column names and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1753
    })
    validation: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 1366
    })
    test: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id'],
        num_rows: 2505
    })
})


## Processing dataset

In [3]:
#Extracting all unique NER labels
# Tags are gathered from every split, not just train: the string->id conversion further
# down is applied to all splits and would raise KeyError on a tag that train never shows.
# Only the "ner_tags" column is read, so the other columns are not decoded per row.
all_labels = [
    label
    for split in dataset.keys()
    for tags in dataset[split]["ner_tags"]
    for label in tags
]
# Sorted so that the label -> id numbering is deterministic across runs.
unique_labels = sorted(set(all_labels))
print(unique_labels)

['B-Archive', 'B-CelestialObject', 'B-CelestialObjectRegion', 'B-CelestialRegion', 'B-Citation', 'B-Collaboration', 'B-ComputingFacility', 'B-Database', 'B-Dataset', 'B-EntityOfFutureInterest', 'B-Event', 'B-Fellowship', 'B-Formula', 'B-Grant', 'B-Identifier', 'B-Instrument', 'B-Location', 'B-Mission', 'B-Model', 'B-ObservationalTechniques', 'B-Observatory', 'B-Organization', 'B-Person', 'B-Proposal', 'B-Software', 'B-Survey', 'B-Tag', 'B-Telescope', 'B-TextGarbage', 'B-URL', 'B-Wavelength', 'I-Archive', 'I-CelestialObject', 'I-CelestialObjectRegion', 'I-CelestialRegion', 'I-Citation', 'I-Collaboration', 'I-ComputingFacility', 'I-Database', 'I-Dataset', 'I-EntityOfFutureInterest', 'I-Event', 'I-Fellowship', 'I-Formula', 'I-Grant', 'I-Identifier', 'I-Instrument', 'I-Location', 'I-Mission', 'I-Model', 'I-ObservationalTechniques', 'I-Observatory', 'I-Organization', 'I-Person', 'I-Proposal', 'I-Software', 'I-Survey', 'I-Tag', 'I-Telescope', 'I-TextGarbage', 'I-URL', 'I-Wavelength', 'O']


In [4]:
#Making lookup dictionaries for training
# label string -> integer id, numbered by position in unique_labels
label2id = dict(zip(unique_labels, range(len(unique_labels))))
# integer id -> label string, the exact inverse of label2id
id2label = dict(zip(label2id.values(), label2id.keys()))

In [5]:
#Converting string labels to IDs
def ner_tags_to_ids(example):
    """Add an ``ner_tag_ids`` field holding the integer id of each NER tag.

    Args:
        example: a single dataset row; its ``"ner_tags"`` entry is a list of
            label strings.

    Returns:
        The same ``example`` object, with ``"ner_tag_ids"`` set to the ids
        looked up in the notebook-level ``label2id`` mapping.

    Raises:
        KeyError: if a tag is not a key of ``label2id``.
    """
    tag_ids = []
    for tag in example["ner_tags"]:
        tag_ids.append(label2id[tag])
    example["ner_tag_ids"] = tag_ids
    return example

# Apply the conversion to every split, replacing each split with its mapped version.
# The split names are snapshotted first so the dict is not iterated while being reassigned.
for split_name in list(dataset.keys()):
    converted_split = dataset[split_name].map(ner_tags_to_ids)
    dataset[split_name] = converted_split

## Tokenizing and Aligning labels for BERT

In [10]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint that is loaded as the model below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Every sequence is truncated/padded to exactly 512 positions. A position
    receives its word's label id only when it is the first sub-token of that
    word; special/padding positions (word id ``None``) and continuation
    sub-tokens receive ``-100``, the value ``compute_metrics`` filters out.
    Words cut off by truncation therefore get no label at all.

    Args:
        examples: a batch (dict of lists) with ``"tokens"`` (list of word
            lists) and ``"ner_tag_ids"`` (list of per-word label-id lists).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry,
        one list of label ids per sequence.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,         #max_length for model
        padding='max_length',   #ensuring equal lengths
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tag_ids"]):
        sequence_labels = []
        last_word_idx = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word carries that word's label.
            starts_new_word = word_idx is not None and word_idx != last_word_idx
            sequence_labels.append(word_labels[word_idx] if starts_new_word else -100)
            last_word_idx = word_idx
        aligned_labels.append(sequence_labels)

    encoded["labels"] = aligned_labels
    return encoded

# batched=True passes a batch of examples per call, which the function's per-batch loop
# expects. The map runs over every split (one progress bar per split in the output).
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Map:   0%|          | 0/1366 [00:00<?, ? examples/s]

Map:   0%|          | 0/2505 [00:00<?, ? examples/s]

In [11]:
#Checking that it worked
# Each split should now also list the tokenizer outputs and the aligned 'labels' column.
print(tokenized_datasets)
# Field names of the first training example, as a spot check on a single row.
print(tokenized_datasets["train"][0].keys())

DatasetDict({
    train: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id', 'ner_tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1753
    })
    validation: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id', 'ner_tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1366
    })
    test: Dataset({
        features: ['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id', 'ner_tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2505
    })
})
dict_keys(['bibcode', 'label_studio_id', 'ner_ids', 'ner_tags', 'section', 'tokens', 'unique_id', 'ner_tag_ids', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])


## Loading model and preparing trainer

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Token-classification model built on the "bert-base-cased" checkpoint, with one output
# class per entry of unique_labels. The id<->label mappings are handed to the model too.
# The classifier weights are newly initialised (see the warning in the output), so the
# model has to be trained before use.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(unique_labels), id2label=id2label, label2id=label2id
)

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./models/ner-bert",   # directory given to the Trainer for its outputs
    eval_strategy="epoch",            # evaluate once per epoch (one row per epoch in the training table)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    """Compute entity-level seqeval metrics from raw model outputs.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-position
            class scores (argmax is taken over the last axis); ``labels`` holds
            the gold label ids, with ``-100`` marking positions to skip.

    Returns:
        A dict with ``"f1"`` (seqeval ``f1_score``) and ``"report"`` (the
        seqeval ``classification_report`` text).
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(-1)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        scored_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([id2label[pred_id] for pred_id, _ in scored_pairs])
        true_labels.append([id2label[gold_id] for _, gold_id in scored_pairs])

    return {
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions),
    }

# Bundle the model, hyper-parameters, train/validation splits and metric function.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated in recent transformers releases and is what triggers the warning echoed
# under this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


## Training model

In [13]:
# Run the fine-tuning loop; the validation metrics from compute_metrics are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Report
1,No log,0.284338,0.553672,precision recall f1-score support  Archive 0.00 0.00 0.00 144  CelestialObject 0.39 0.61 0.48 1574  CelestialObjectRegion 0.00 0.00 0.00 101  CelestialRegion 0.00 0.00 0.00 65  Citation 0.66 0.77 0.71 3512  Collaboration 0.00 0.00 0.00 200  ComputingFacility 0.02 0.00 0.00 352  Database 0.00 0.00 0.00 176  Dataset 0.00 0.00 0.00 176  EntityOfFutureInterest 0.00 0.00 0.00 43  Event 0.00 0.00 0.00 35  Fellowship 0.10 0.12 0.11 312  Formula 0.28 0.59 0.38 1052  Grant 0.48 0.67 0.56 2645  Identifier 0.00 0.00 0.00 69  Instrument 0.00 0.00 0.00 494  Location 0.51 0.48 0.50 1034  Mission 0.00 0.00 0.00 85  Model 0.18 0.15 0.17 989 ObservationalTechniques 0.00 0.00 0.00 77  Observatory 0.20 0.25 0.22 609  Organization 0.59 0.82 0.69 5359  Person 0.94 0.94 0.94 3146  Proposal 0.00 0.00 0.00 72  Software 0.49 0.28 0.36 613  Survey 0.44 0.14 0.21 487  Tag 0.00 0.00 0.00 32  Telescope 0.22 0.34 0.27 951  TextGarbage 0.00 0.00 0.00 26  URL 0.99 0.88 0.93 189  Wavelength 0.43 0.24 0.31 1940  micro avg 0.53 0.58 0.55 26559  macro avg 0.22 0.23 0.22 26559  weighted avg 0.50 0.58 0.53 26559
2,No log,0.226519,0.614551,precision recall f1-score support  Archive 0.00 0.00 0.00 144  CelestialObject 0.53 0.71 0.61 1574  CelestialObjectRegion 0.00 0.00 0.00 101  CelestialRegion 0.00 0.00 0.00 65  Citation 0.67 0.81 0.73 3512  Collaboration 0.24 0.17 0.20 200  ComputingFacility 0.11 0.11 0.11 352  Database 0.43 0.15 0.22 176  Dataset 0.00 0.00 0.00 176  EntityOfFutureInterest 0.00 0.00 0.00 43  Event 0.00 0.00 0.00 35  Fellowship 0.44 0.49 0.46 312  Formula 0.31 0.63 0.42 1052  Grant 0.58 0.70 0.63 2645  Identifier 0.00 0.00 0.00 69  Instrument 0.00 0.00 0.00 494  Location 0.63 0.71 0.67 1034  Mission 0.00 0.00 0.00 85  Model 0.23 0.23 0.23 989 ObservationalTechniques 0.00 0.00 0.00 77  Observatory 0.49 0.54 0.51 609  Organization 0.66 0.83 0.74 5359  Person 0.96 0.96 0.96 3146  Proposal 0.00 0.00 0.00 72  Software 0.54 0.44 0.48 613  Survey 0.42 0.31 0.36 487  Tag 0.00 0.00 0.00 32  Telescope 0.28 0.58 0.38 951  TextGarbage 0.00 0.00 0.00 26  URL 0.93 0.97 0.95 189  Wavelength 0.42 0.48 0.45 1940  micro avg 0.57 0.66 0.61 26559  macro avg 0.29 0.32 0.29 26559  weighted avg 0.56 0.66 0.60 26559
3,0.381200,0.212977,0.638099,precision recall f1-score support  Archive 0.03 0.03 0.03 144  CelestialObject 0.54 0.74 0.63 1574  CelestialObjectRegion 0.00 0.00 0.00 101  CelestialRegion 0.00 0.00 0.00 65  Citation 0.69 0.83 0.75 3512  Collaboration 0.34 0.27 0.30 200  ComputingFacility 0.27 0.29 0.28 352  Database 0.52 0.29 0.37 176  Dataset 0.00 0.00 0.00 176  EntityOfFutureInterest 0.00 0.00 0.00 43  Event 0.00 0.00 0.00 35  Fellowship 0.45 0.55 0.50 312  Formula 0.33 0.63 0.43 1052  Grant 0.59 0.71 0.65 2645  Identifier 0.00 0.00 0.00 69  Instrument 0.44 0.01 0.02 494  Location 0.67 0.71 0.69 1034  Mission 0.00 0.00 0.00 85  Model 0.25 0.24 0.25 989 ObservationalTechniques 0.00 0.00 0.00 77  Observatory 0.56 0.60 0.58 609  Organization 0.68 0.83 0.75 5359  Person 0.96 0.96 0.96 3146  Proposal 0.00 0.00 0.00 72  Software 0.49 0.49 0.49 613  Survey 0.49 0.43 0.46 487  Tag 0.00 0.00 0.00 32  Telescope 0.34 0.54 0.42 951  TextGarbage 0.00 0.00 0.00 26  URL 0.93 0.97 0.95 189  Wavelength 0.49 0.50 0.49 1940  micro avg 0.60 0.68 0.64 26559  macro avg 0.32 0.34 0.32 26559  weighted avg 0.59 0.68 0.62 26559


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=660, training_loss=0.3369264487064246, metrics={'train_runtime': 500.4089, 'train_samples_per_second': 10.509, 'train_steps_per_second': 1.319, 'total_flos': 1374917488966656.0, 'train_loss': 0.3369264487064246, 'epoch': 3.0})

## Evaluating

In [14]:
# Evaluate the trained model on the validation split that was given to the Trainer as eval_dataset.
trainer.evaluate()

{'eval_loss': 0.21297726035118103,
 'eval_f1': 0.6380986165306846,
 'eval_report': '                         precision    recall  f1-score   support\n\n                Archive       0.03      0.03      0.03       144\n        CelestialObject       0.54      0.74      0.63      1574\n  CelestialObjectRegion       0.00      0.00      0.00       101\n        CelestialRegion       0.00      0.00      0.00        65\n               Citation       0.69      0.83      0.75      3512\n          Collaboration       0.34      0.27      0.30       200\n      ComputingFacility       0.27      0.29      0.28       352\n               Database       0.52      0.29      0.37       176\n                Dataset       0.00      0.00      0.00       176\n EntityOfFutureInterest       0.00      0.00      0.00        43\n                  Event       0.00      0.00      0.00        35\n             Fellowship       0.45      0.55      0.50       312\n                Formula       0.33      0.63      0.43  

## Predicting on test set

In [18]:
# Run the trained model over the tokenized test split.
test_results = trainer.predict(tokenized_datasets["test"])

import numpy as np

# Pick the highest-scoring class id at every position (argmax over the last axis).
preds = np.argmax(test_results.predictions, axis=-1)

## Analysis of results: NER gold vs. predicted labels comparison

In [34]:
def collect_ner_comparison(dataset_split, tokenized_split, preds, id2label):
    """Build a word-level table of gold vs. predicted NER labels.

    For each sentence, the positions whose gold label is not ``-100`` are
    walked in order and paired, one by one, with the sentence's original
    words.

    Args:
        dataset_split: indexable collection of rows exposing ``"tokens"``
            (the original words of each sentence).
        tokenized_split: indexable collection of rows exposing ``"labels"``
            (aligned gold label ids, ``-100`` on positions to skip).
        preds: per-sentence sequences of predicted label ids, positionally
            aligned with the ``"labels"`` lists.
        id2label: mapping from label id to label string.

    Returns:
        A ``pandas.DataFrame`` with columns ``sentence_id``, ``token``,
        ``gold_label`` and ``predicted_label``; the columns are present even
        when no rows were collected.
    """
    # Imported locally: no cell in the notebook imports pandas, so relying on a
    # notebook-level `pd` fails with NameError on a fresh kernel.
    import pandas as pd

    columns = ["sentence_id", "token", "gold_label", "predicted_label"]
    rows = []
    for i, pred_ids in enumerate(preds):
        tokens = dataset_split[i]["tokens"]
        gold_label_ids = tokenized_split[i]["labels"]
        token_idx = 0
        for label_id, pred_id in zip(gold_label_ids, pred_ids):
            if label_id == -100:
                # Special/padding position or a continuation sub-token: not scored.
                continue
            if token_idx >= len(tokens):
                # More labelled positions than words; nothing left to pair with.
                break
            # NOTE(review): assumes the n-th labelled position belongs to the n-th
            # word, i.e. every word produced at least one sub-token. Confirm for
            # words the tokenizer might drop entirely.
            rows.append({
                "sentence_id": i,
                "token": tokens[token_idx],
                "gold_label": id2label[label_id],
                "predicted_label": id2label[pred_id]
            })
            token_idx += 1
    return pd.DataFrame(rows, columns=columns)

In [37]:
# Word-level gold vs. predicted labels for the test split (raw words come from `dataset`, aligned labels from `tokenized_datasets`).
df_compare = collect_ner_comparison(dataset["test"], tokenized_datasets["test"], preds, id2label)

In [40]:
#Displaying table for two sentences
from IPython.display import display, HTML

def show_ner_table(df, num_sentences=2):
    """Display one colour-coded comparison table per sentence.

    Rows where the gold and predicted labels agree are shaded with
    ``#d4edda``; rows where they differ are shaded with ``#f8d7da``.

    Args:
        df: DataFrame with ``sentence_id``, ``gold_label`` and
            ``predicted_label`` columns.
        num_sentences: how many distinct sentence ids (in order of first
            appearance) to display.
    """
    match_css = 'background-color: #d4edda'
    mismatch_css = 'background-color: #f8d7da'

    def highlight_row(row):
        # One CSS string per cell, so the whole row gets the same shade.
        if row['gold_label'] == row['predicted_label']:
            css = match_css
        else:
            css = mismatch_css
        return [css] * len(row)

    shown_ids = df["sentence_id"].unique()[:num_sentences]
    for sent_id in shown_ids:
        sentence_rows = df[df["sentence_id"] == sent_id]
        styled = sentence_rows.style.apply(highlight_row, axis=1)
        # Heading first, then the styled table for that sentence.
        display(HTML(f"<h4>Sentence {sent_id}</h4>"))
        display(styled)

# Render the colour-coded tables for the first two sentences of the comparison frame.
show_ner_table(df_compare, num_sentences=2)

Unnamed: 0,sentence_id,token,gold_label,predicted_label
0,0,The,O,O
1,0,authors,O,O
2,0,would,O,O
3,0,like,O,O
4,0,to,O,O
5,0,thank,O,O
6,0,Adam,B-Person,B-Person
7,0,"Burgasser,",I-Person,I-Person
8,0,Brendan,B-Person,B-Person
9,0,"Bowler,",I-Person,I-Person


Unnamed: 0,sentence_id,token,gold_label,predicted_label
153,1,Acknowledgments,O,O
154,1,I,O,O
155,1,thank,O,O
156,1,Dustin,B-Person,B-Person
157,1,Lang,I-Person,I-Person
158,1,for,O,O
159,1,providing,O,O
160,1,the,O,O
161,1,TGAS-matched,B-Survey,B-Telescope
162,1,APASS,B-Survey,B-Telescope


## Saving results

In [39]:
# Write the full comparison table to CSV in the working directory; index=False leaves out the row index column.
df_compare.to_csv("ner_test_comparison.csv", index=False)