In [1]:
import csv
import datetime
import json
import os.path

import numpy as np
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter

# Entity labels that are kept from the tagger output, with their meaning.
relevant_ent_types = [
    'FAC',     # building name
    'GPE',     # geo-political entity
    'LOC',     # location name
    'NORP',    # affiliation
    'PERSON',  # person name
    'ORG',     # organization name
    # NOTE(review): 'MISC' has no description in the original notes; presumably
    # it is kept so the 4-class model below can be swapped in -- confirm.
    'MISC',
]

# Active tagger: the large English OntoNotes NER model.
flair_18class = SequenceTagger.load('flair/ner-english-ontonotes-large')
# Alternative taggers, currently disabled:
# flair_12class = SequenceTagger.load('ner-ontonotes-fast')
# flair_4class = SequenceTagger.load('ner')

IOStream.flush timed out
  from .autonotebook import tqdm as notebook_tqdm


2022-06-28 10:19:16,940 loading file /Users/kodymoodley/.flair/models/ner-english-ontonotes-large/2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1
2022-06-28 10:19:44,470 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [2]:
# Get data (full texts of documents including acknowledgements, foreword, executive summary and body)
# The `with` block closes the file handle as soon as parsing is done (the bare
# open() previously left it open for the lifetime of the kernel), and the
# explicit encoding keeps the read independent of the platform default.
# NOTE(review): `data` is presumably a mapping of document id -> full text -- confirm.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Do NER tagging on a given document
def flair_ner(document, model, docid):
    """Tag one document and return its unique relevant entities.

    The document is split into sentences, the model is run over them, and
    every 'ner' span whose label is listed in the module-level
    ``relevant_ent_types`` is collected.

    Args:
        document: Text of the document to tag.
        model: Tagger exposing ``predict(sentences)``.
        docid: Identifier stored alongside every entity found; must be
            hashable because it is part of the de-duplication key.

    Returns:
        A list of unique ``(entity_text, entity_label, docid)`` tuples in
        order of first appearance in the document.
    """
    splitter = SegtokSentenceSplitter()
    sentences = splitter.split(document)
    # predict() is called for its effect on `sentences`; the spans are read
    # back from the same sentence objects below.
    model.predict(sentences)

    wanted_labels = set(relevant_ent_types)
    # A dict is used as an ordered set: duplicates collapse, but unlike
    # list(set(...)) the result order is stable from run to run.
    unique_entities = {}
    for sentence in sentences:
        for entity in sentence.get_spans('ner'):
            label = entity.get_label("ner").value
            if label in wanted_labels:
                unique_entities[(entity.text, label, docid)] = None

    return list(unique_entities)

# Write tagging results to CSV file
def write_results_to_file(results, file):
    """Append NER results to a CSV file, creating it with a header if needed.

    Args:
        results: Iterable of indexable items whose first three elements are
            the entity text, the entity type and the document id.
        file: Path of the CSV file. If it does not exist yet it is created
            and a header row is written first; if it already exists the rows
            are appended and no header is written.

    Each row gets a fixed fourth column naming the model that produced it.
    """
    model_label = 'flair - FLERT and XML embeddings'

    # Decide before opening: append mode creates the file, so checking
    # afterwards would always report that it exists.
    write_header = not os.path.exists(file)

    # newline='' is what the csv module expects for both creation and append;
    # without it the writer's '\r\n' terminators become '\r\r\n' on Windows.
    # UTF-8 keeps non-ASCII entity text writable regardless of locale.
    with open(file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['entity', 'entity_type', 'docid', 'model'])
        for item in results:
            writer.writerow([str(item[0]), str(item[1]), str(item[2]), model_label])

In [None]:
# Run the NER tagging on each document in the corpus
# NOTE: results are appended to 'master-ner-results.csv' when that file already
# exists, so re-running this cell adds the same rows again.
ct = datetime.datetime.now()
print()
print("Started entire run at:-", ct)
print("---")
print()

# Take the total from the corpus itself so the progress messages stay correct
# when the number of documents changes (it used to be hard-coded as 55).
total_docs = len(data)
for index, key in enumerate(data, start=1):
    ct = datetime.datetime.now()
    print(f"Started processing Doc ({index} / {total_docs}) at:-", ct)
    ner_results = flair_ner(data[key], flair_18class, key)
    # Written per document, so finished documents are already on disk if a
    # later one fails.
    write_results_to_file(ner_results, 'master-ner-results.csv')
    ct = datetime.datetime.now()
    print(f"Finished processing Doc ({index} / {total_docs}) at:-", ct)

print()
ct = datetime.datetime.now()
print("---")
print("Finished entire run at:-", ct)


Started entire run at:- 2022-06-28 10:21:20.585983
---

Started processing Doc (1 / 55) at:- 2022-06-28 10:21:20.587040
