# Comparison of NER frameworks/approaches


In [15]:
from typing import List, Tuple
import json
import os
import re
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

# Huggingface datasets is needed to load the CONLL 2003 data
from datasets import load_dataset, Dataset
# A Hugging library for easily evaluating machine learning models and datasets.
import evaluate

In [2]:
# Load the CONLL 2003 dataset
conll = load_dataset("conll2003")
tag_names = conll["test"].features[f"ner_tags"].feature.names
test = conll["test"]

tag_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## Use pre-trained BERT model to predict NER labels


see `CAVEAT` cell below
- This cell will download a large (400+MB) pre-trained model and likely take several minutes to do so.
- The Bert model will be saved to the local filesystem and not downloaded on subsequent invocations. 

In [39]:
# import needed dependences
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers.pipelines.token_classification import TokenClassificationPipeline

In [122]:
# CAVEAT: Long-running cell on initial invocation

# load pre-trained BERT base cased model
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
bert_nlp: TokenClassificationPipeline = pipeline("ner", model=bert_model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


transformers.pipelines.token_classification.TokenClassificationPipeline

In [105]:
test1 = test.select(range(10))


In [40]:
def annotate_bert(result: int) -> str:
    """ Helper function to tranlate NER int to class label """
    if len(result) == 0:
        return 'O'
    return result[0]['entity']

def predict_ner_bert(bert_nlp: TokenClassificationPipeline, labeled_dataset: Dataset) -> Tuple[list[list[str]], list[list[str]]]:
    """ Run inference on the tokens using trained BERT model """
    references: list[list[str]] = []
    predictions: list[list[str]] = []

    for row in tqdm(labeled_dataset, desc=str(len(labeled_dataset))):
        # add ground truth labels to references
        references.append([tag_names[id] for id in row['ner_tags']])
        # recognize named entity in a test tokens
        ner_results = bert_nlp(row['tokens'])
        # translate numerical index to NER class label
        predicted_tags = [annotate_bert(x) for x in ner_results]
        predictions.append(predicted_tags)
    return references, predictions

########################################
# I/O Helper Functions
########################################
def _load_persisted_json(inpath: str) -> dict:
    return json.loads(open(inpath).read())

def save_ner_results(outpath: str, references: list[list[str]], predictions: list[list[str]]) -> None:
    """ Helper function for persisting true and predicted NER labels """
    print(f"Saving NER results to {outpath}")
    d = {"references": references, "predictions": predictions}
    with open(outpath, "w") as fo:
        fo.write(json.dumps(d))

def load_ner_results(inpath: str) -> Tuple[list[list[str]], list[list[str]]]:
    """ Helper function for loading previously persisted true and predicted NER labels """
    d = json.loads(open(inpath).read())
    return d["references"], d["predictions"]


def evaluate_results(references: list[list[str]], predictions: list[list[str]]):
    seqeval = evaluate.load("seqeval")
    return seqeval.compute(predictions=predictions, references=references)

def save_evaluation_results(outpath: str, results: dict) -> None:
    print(f"Saving evaluation results to {outpath}")
    with open(outpath, "w") as fo:
        # fo.write(json.dumps(results))
        fo.write(json.dumps(results, indent=2, default=float))

def load_evaluation_results(inpath: str) -> dict:
    return _load_persisted_json(inpath)




In [125]:
# Generate a list of ground truth NER labels and predictions
references, bert_predictions = predict_ner_bert(bert_nlp, test1)

10:   0%|          | 0/10 [00:00<?, ?it/s]

10: 100%|██████████| 10/10 [00:27<00:00,  2.77s/it]


In [None]:
# Save NER results to disk
interim_dir = "/Users/chagerman/MyProjects/NER4news/data/interim"
bert_results_path = os.path.join(interim_dir, "ner_results_bert.json")

save_ner_results(bert_results_path, references, bert_predictions)

# Load persisted NER results
# references, bert_predictions = load_ner_results(bert_results_path)


Saving NER results to /Users/chagerman/MyProjects/NER4news/data/interim/ner_results_bert.json


In [None]:

results = evaluate_results(references, bert_predictions)

bert_evaluation_path = os.path.join(interim_dir, "evaluation_results_bert.json")
save_evaluation_results(bert_evaluation_path, results)


Saving evaluation results to /Users/chagerman/MyProjects/NER4news/data/interim/evaluation_results_bert.json


# Spacy NER

In [None]:
import spacy

In [None]:
nlp_lg = spacy.load("en_core_web_lg")
nlp_trf = spacy.load("en_core_web_trf")

In [None]:
labeled_dataset = test1

In [None]:

archive_dir = "/Users/chagerman/MyProjects/NER4news/data/external/archive"
test_data_path = os.path.join(archive_dir, "test.txt")
spacy_test_data_path = os.path.join(archive_dir, "test_spacy.txt")
test_data = [x.strip() for x in open(test_data_path)]

def process_line(line):
    return re.sub("([BI])-(PER)", r"\g<1>-PERSON", line )

test_data = [process_line(line) for line in test_data]

with open(spacy_test_data_path, "w") as fo:
    fo.write("\n".join(test_data))


In [None]:
%%bash
cd Users/chagerman/MyProjects/NER4news/data/external/archive
python -m spacy convert "test_spacy.txt" spacyNER_data -c ner
python -m spacy evaluate en_core_web_lg spacyNER_data/test.spacy

python -m spacy evaluate en_core_web_lg spacyNER_data/test_spacy.spacy > spacy_lg_results.txt
python -m spacy evaluate en_core_web_trf spacyNER_data/test_spacy.spacy > spacy_trf_results.txt


In [None]:
spacy_data_path = "/Users/chagerman/MyProjects/NER4news/data/external/archive/spacyNER_data/test.spacy"

# nlp_lg.from_disk(spacy_data_path)

In [None]:

# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
# doc = nlp_lg(text)
# for ent in doc.ents:
#     print(ent.text, ent.label_)


# Stanford NER tagger w. NLTK

In [3]:
from nltk.tag.stanford import StanfordNERTagger
stanford_dir = "/Users/chagerman/MyProjects/NER4news/ner4news/stanford-ner-2015-04-20"
jar = os.path.join(stanford_dir, "stanford-ner-3.5.2.jar")

stanford_model_path = os.path.join(stanford_dir, "classifiers/")

In [34]:
labeled_dataset = test.select(range(10))

In [36]:
stanford_model = f"{stanford_model_path}english.conll.4class.distsim.crf.ser.gz" 
stanford_tagger: StanfordNERTagger = StanfordNERTagger(stanford_model, jar, encoding="utf8")

def annotate_stanford(result: str) -> str:
    """ Helper function to tranlate NER int to class label """
    match result:
        case "LOCATION":
            return "LOC"
        case "PERSON":
            return "PER"
        case "ORGANIZATION":
            return "ORG"
        case "MISC":
            return "MISC"
        case "O":
            return "O"
        case _:
            return "X"
            
def predict_ner_stanford(stanford_tagger: StanfordNERTagger, labeled_dataset: Dataset) -> Tuple[list[list[str]], list[list[str]]]:
    """ Run inference on the tokens using trained BERT model """
    references: list[list[str]] = []
    st_predictions: list[list[str]] = []

    for row in tqdm(labeled_dataset, desc=str(len(labeled_dataset))):
        # add ground truth labels to references
        references.append([re.sub("^[BI]-", "", tag_names[id]) for id in row['ner_tags']])
        # recognize named entity in a test tokens
        ner_results = stanford_tagger.tag(row['tokens'])
        # translate numerical index to NER class label
        predicted_tags = [annotate_stanford(y) for x, y in ner_results]
        st_predictions.append(predicted_tags)
    return references, st_predictions


In [43]:

# Run NER inference using Stanford NER tagger
references, st_predictions = predict_ner_stanford(stanford_tagger, labeled_dataset)

10: 100%|██████████| 10/10 [00:43<00:00,  4.40s/it]


In [45]:
results = evaluate_results(references, st_predictions)
results

{'ER': {'precision': 1.0,
  'recall': 0.75,
  'f1': 0.8571428571428571,
  'number': 4},
 'ISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 6},
 'OC': {'precision': 0.9166666666666666,
  'recall': 1.0,
  'f1': 0.9565217391304348,
  'number': 11},
 'overall_precision': 0.9523809523809523,
 'overall_recall': 0.9523809523809523,
 'overall_f1': 0.9523809523809523,
 'overall_accuracy': 0.9949494949494949}

# NER using Flair

In [44]:
from flair.data import Sentence
from flair.nn import Classifier
from flair.models import SequenceTagger


In [81]:


# load the 4-class NER tagger
# tagger = Classifier.load("ner")
# tagger = SequenceTagger.load("flair/ner-english")
flair_tagger = Classifier.load('ner')

2024-01-28 17:33:59,275 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [98]:
sample

text = "Nadim Ladki"
sentence = Sentence(text)
# run NER over sentence
flair_tagger.predict(sentence)

fl_preds = ["O" for _ in range(len(text.split()))]

for entity in sentence.get_spans('ner'):
    tag = entity.tag
    for token in entity:
        idx = token.idx - 1
        print(f"{entity.text} / {tag}  idx: {idx}")
        fl_preds[idx] = tag   

fl_preds
    

Nadim Ladki / PER  idx: 0
Nadim Ladki / PER  idx: 1


['PER', 'PER']

In [83]:
tokens = labeled_dataset.select(range(1))["tokens"][0]
sample = " ".join(tokens)
# make a flair sentence object
sentence = Sentence(sample)
# run NER over sentence
flair_tagger.predict(sentence)

fl_predictions = ["O" for _ in range(len(tokens))]

for entity in sentence.get_spans('ner'):
    tag = entity.tag
    for token in entity:
        idx = token.idx - 1
        # print(f"{entity.text} / {tag}  idx: {idx}")
        fl_predictions[idx] = tag   

print(tokens)
print(fl_predictions)

JAPAN / LOC  idx: 3
CHINA / LOC  idx: 8
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
['O', 'O', 'O', 'LOC', 'O', 'O', 'O', 'O', 'LOC', 'O', 'O', 'O']


In [105]:
def predict_ner_flair(flair_tagger: SequenceTagger, labeled_dataset: Dataset) -> Tuple[list[list[str]], list[list[str]]]:
    """ Run inference on the tokens using trained BERT model """
    references: list[list[str]] = []
    fl_predictions: list[list[str]] = []

    for row in tqdm(labeled_dataset, desc=str(len(labeled_dataset))):
        # add ground truth labels to references
        references.append([re.sub("^[BI]-", "", tag_names[id]) for id in row['ner_tags']])
        tokens = row["tokens"]
        # make a flair sentence object
        sentence = Sentence(" ".join(tokens))
        # run NER over sentence
        flair_tagger.predict(sentence)
        ner_results = ["O" for _ in range(len(tokens))]
        # print(sentence.get_spans("ner"))
        for entity in sentence.get_spans('ner'):
            tag = entity.tag
            for token in entity:
                idx = token.idx - 1
                ner_results[idx] = tag   
        # translate numerical index to NER class label
        predicted_tags = ner_results
        fl_predictions.append(predicted_tags)
    return references, fl_predictions


In [106]:
references, fl_predictions = predict_ner_flair(flair_tagger, labeled_dataset.select(range(5)))

5:   0%|          | 0/5 [00:00<?, ?it/s]

5: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


In [107]:
results = evaluate_results(references, fl_predictions)
results

{'ER': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ISC': {'precision': 0.5,
  'recall': 1.0,
  'f1': 0.6666666666666666,
  'number': 1},
 'OC': {'precision': 0.875,
  'recall': 1.0,
  'f1': 0.9333333333333333,
  'number': 7},
 'overall_precision': 0.8181818181818182,
 'overall_recall': 0.9,
 'overall_f1': 0.8571428571428572,
 'overall_accuracy': 0.9571428571428572}