In [None]:
from flair.models import SequenceTagger
from flair.data import Sentence
from flair.tokenization import SegtokSentenceSplitter
import numpy as np
import csv
import os.path
import json

# Relevant entity types:
# ----------------------
# FAC	building name
# GPE	geo-political entity
# LOC	location name
# NORP	affiliation
# ORG	organization name
# PERSON	person name
#
# NOTE(review): 'MISC' is also present in the list below although it is not
# described in the legend above. Presumably it is kept for the 4-class 'ner'
# model that is commented out further down -- confirm whether it is needed.

# Labels that flair_ner() keeps; spans carrying any other label are discarded.
relevant_ent_types = ['FAC', 'GPE', 'LOC', 'NORP', 'PERSON', 'ORG', 'MISC']

# Tagger passed to flair_ner() for every document in the run. It is loaded
# once here so the model is not re-loaded per document.
flair_18class = SequenceTagger.load('flair/ner-english-ontonotes-large')
# Alternative taggers, currently disabled:
# flair_12class = SequenceTagger.load('ner-ontonotes-fast')
# flair_4class = SequenceTagger.load('ner')

In [None]:
# Get data (full texts of documents including acknowledgements, foreword, executive summary and body)
# Read the corpus through a context manager so the file handle is closed as
# soon as parsing finishes (the bare open() left it open for the whole
# lifetime of the kernel).
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Preprocessing
import re
import gensim
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces

# Tokens dropped from every document: citation / pagination residue and stray
# single letters. Kept as a module-level frozenset so it is built once and
# membership tests are constant-time.
_IRRELEVANT_TOKENS = frozenset([
    'et', 'al.', 'x', 'pdf', 'yes', 'abbrev',
    'page', 'pp', 'p', 'er', 'doi', 'can', 'b', 'c', 'd', 'e',
    'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'y', 'z',
])


def preprocess(text):
    """Clean one document's raw text before NER tagging.

    Steps, in order:
      1. remove URLs,
      2. strip HTML/XML-like tags and collapse whitespace (gensim filters),
      3. drop the tokens listed in ``_IRRELEVANT_TOKENS``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    # Remove every run of non-whitespace that starts with "http". This single
    # pass also covers the explicit "http://..." / "https://..." patterns that
    # used to be applied first, so those redundant passes are gone.
    # Raw strings are used so that \S, \/, \. and \- reach the regex engine
    # without relying on Python's deprecated invalid-escape fallback.
    text = re.sub(r'http\S+', '', text)

    # NOTE(review): this pattern is anchored with ^...$ and re.MULTILINE is not
    # set, so it only fires when the *whole* text is one lowercase bare
    # URL/domain. It looks intended to strip scheme-less URLs inside the text;
    # behaviour is kept unchanged here -- confirm the intent before loosening.
    text = re.sub(
        r'^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$',
        '',
        text,
    )

    # Remove HTML / XML-like tags in text and multiple whitespaces.
    custom_filters = [strip_tags, strip_multiple_whitespaces]
    text_tokens = preprocess_string(text, custom_filters)

    # Drop irrelevant tokens (exact, case-sensitive match on whole tokens).
    tokens = ' '.join(text_tokens).split()
    return ' '.join(word for word in tokens if word not in _IRRELEVANT_TOKENS)

In [None]:
# Clean every document in place. Only values are replaced, never keys, so
# updating the dict while iterating over its items is safe.
for doc_key, raw_text in data.items():
    data[doc_key] = preprocess(raw_text)

In [None]:
# Do NER tagging on a given document
def flair_ner(document, model, docid, ent_types=None, splitter=None):
    """Tag one document and return the entities of interest.

    Parameters
    ----------
    document : str
        Text to split into sentences and tag.
    model
        Tagger exposing ``predict(sentences)``. Its return value is ignored;
        the spans are read back from the sentence objects afterwards.
    docid
        Identifier copied unchanged into every result tuple.
    ent_types : collection of str, optional
        Entity labels to keep. Defaults to the notebook-level
        ``relevant_ent_types``.
    splitter : optional
        Object with a ``split(document)`` method returning the sentences.
        Defaults to a fresh ``SegtokSentenceSplitter()``; pass one in to
        reuse a single splitter across many documents.

    Returns
    -------
    list of tuple
        ``(entity_text, entity_type, sentence, docid)`` for every 'ner' span
        whose label is in ``ent_types`` and whose text is longer than one
        character. Double quotes are removed from ``entity_text``;
        ``sentence`` is the sentence object the span was found in.
    """
    if ent_types is None:
        ent_types = relevant_ent_types
    if splitter is None:
        splitter = SegtokSentenceSplitter()

    sentences = splitter.split(document)
    model.predict(sentences)

    results = []
    for sentence in sentences:
        for entity in sentence.get_spans('ner'):
            # Look the label up once instead of once per use.
            label = entity.get_label("ner").value
            if label not in ent_types:
                continue
            # One-character entities are disregarded. The length is checked
            # before the quotes are stripped, as before.
            if len(entity.text) <= 1:
                continue
            results.append((entity.text.replace('"', ''), label, sentence, docid))

    return results

# Write tagging results to CSV file
def write_results_to_file(results, file, model_name='flair - FLERT and XML embeddings'):
    """Append NER results to a CSV file, writing the header when needed.

    Parameters
    ----------
    results : iterable
        Items indexable at positions 0-3, holding
        (entity, entity_type, sentence, docid). Each field is written via
        ``str()``.
    file : str
        Path of the CSV file. It is created if it does not exist.
    model_name : str, optional
        Value written to the ``model`` column of every row. Defaults to the
        label this notebook has always used.

    Notes
    -----
    The file is always opened with ``newline=''`` (required by the ``csv``
    module; without it the create path produced blank lines between rows on
    Windows) and with an explicit UTF-8 encoding, so that non-ASCII entity
    text does not depend on the platform's default encoding.
    """
    # A header is needed for a brand-new file, and also for a pre-existing
    # but empty one, which would otherwise end up with no header at all.
    needs_header = not os.path.exists(file) or os.path.getsize(file) == 0

    # Append mode creates the file when it is missing, so one code path
    # serves both the "create" and the "append" case.
    with open(file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['entity', 'entity_type', 'sentence', 'docid', 'model'])
        writer.writerows(
            [str(item[0]), str(item[1]), str(item[2]), str(item[3]), model_name]
            for item in results
        )

In [None]:
# Run the NER tagging on each document in the corpus
import datetime

# Total used in the progress messages. It is taken from the corpus itself
# rather than hard-coded, so the log stays correct when the corpus changes.
total_docs = len(data)

ct = datetime.datetime.now()
print()
print("Started entire run at:-", ct)
print("---")
print()

# NOTE(review): results are appended to 'master-ner-results.csv', so running
# this cell again adds the same rows a second time -- presumably the file is
# meant to be removed between full runs; confirm.
for index, key in enumerate(data, start=1):
    ct = datetime.datetime.now()
    print("Started processing Doc (" + str(index) + " / " + str(total_docs) + ") at:-", ct)
    ner_results = flair_ner(data[key], flair_18class, key)
    write_results_to_file(ner_results, 'master-ner-results.csv')
    ct = datetime.datetime.now()
    print("Finished processing Doc (" + str(index) + " / " + str(total_docs) + ") at:-", ct)

print()
ct = datetime.datetime.now()
print("---")
print("Finished entire run at:-", ct)