# Stanford NER tagger w. NLTK



In [1]:
%pip install datasets
%pip install evaluate
%pip install nltk

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [10]:
%pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting scikit-learn>=0.21.3 (from seqeval)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn>=0.21.3->seqeval)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=2.0.0 (from scikit-learn>=0.21.3->seqeval)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [3

In [2]:
# %%bash
# wget http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
# unzip stanford-ner-2015-04-20.zip

In [3]:
from utils import *
from pathlib import Path
import time
ts = time.time()

In [4]:
from nltk.tag.stanford import StanfordNERTagger

CWD = Path().resolve()
stanford_dir = os.path.join(CWD, "stanford-ner-2015-04-20")
jar = os.path.join(stanford_dir, "stanford-ner-3.5.2.jar")
stanford_model_path = os.path.join(stanford_dir, "classifiers")

In [5]:
stanford_model = os.path.join(stanford_model_path, "english.conll.4class.distsim.crf.ser.gz")
stanford_tagger: StanfordNERTagger = StanfordNERTagger(stanford_model, jar, encoding="utf8")

In [6]:

def annotate_stanford(result: str) -> str:
    """ Helper function to tranlate NER int to class label """
    match result:
        case "LOCATION":
            return "LOC"
        case "PERSON":
            return "PER"
        case "ORGANIZATION":
            return "ORG"
        case "MISC":
            return "MISC"
        case "O":
            return "O"
        case _:
            return "X"
            
def predict_ner_stanford(stanford_tagger: StanfordNERTagger, labeled_dataset: Dataset) -> Tuple[list[list[str]], list[list[str]]]:
    """ Run inference on the tokens using trained BERT model """
    references: list[list[str]] = []
    st_predictions: list[list[str]] = []

    for row in tqdm(labeled_dataset, desc=str(len(labeled_dataset))):
        # add ground truth labels to references
        references.append([re.sub("^[BI]-", "", tag_names[id]) for id in row['ner_tags']])
        # recognize named entity in a test tokens
        ner_results = stanford_tagger.tag(row['tokens'])
        # translate numerical index to NER class label
        predicted_tags = [annotate_stanford(y) for x, y in ner_results]
        st_predictions.append(predicted_tags)
    return references, st_predictions


In [7]:

# Run NER inference using Stanford NER tagger
references, st_predictions = predict_ner_stanford(stanford_tagger, test)

3453: 100%|██████████| 3453/3453 [1:01:42<00:00,  1.07s/it]


In [8]:
# Save NER results to disk
stanford_results_path = os.path.join(interim_dir, "ner_results_stanford.json")
save_ner_results(stanford_results_path, references, st_predictions)

# Load persisted NER results
# references, st_predictions = load_ner_results(stanford_results_path)

Saving NER results to ../data/interim/ner_results_stanford.json


In [11]:
results = evaluate_results(references, st_predictions)
results

{'ER': {'precision': 0.9509360877985797,
  'recall': 0.9109461966604824,
  'f1': 0.930511686670878,
  'number': 1617},
 'ISC': {'precision': 0.8191027496382055,
  'recall': 0.8167388167388168,
  'f1': 0.8179190751445087,
  'number': 693},
 'OC': {'precision': 0.9049881235154394,
  'recall': 0.9169675090252708,
  'f1': 0.9109384339509863,
  'number': 1662},
 'overall_precision': 0.9080020387359837,
 'overall_recall': 0.8970292044310171,
 'overall_f1': 0.902482269503546,
 'overall_accuracy': 0.9758587272531496}

In [12]:
stanford_evaluation_path = os.path.join(interim_dir, "evaluation_results_stanford.json")
save_evaluation_results(stanford_evaluation_path, results)

Saving evaluation results to ../data/interim/evaluation_results_stanford.json


In [14]:
te = time.time()
duration = te-ts
duration = float(f"{duration:.2f}")
print(f"Total running time: {duration} sec ")

Total running time: 4013.37 sec 
