In [None]:
from flair.models import SequenceTagger
from flair.data import Sentence
from flair.tokenization import SegtokSentenceSplitter
import numpy as np
import csv
import os.path
import json

# relevant entity types:
# ----------------------
# FAC	building name
# GPE	geo-political entity
# LOC	location name
# NORP	affiliation
# ORG	organization name
# PERSON	person name

# Labels an entity must carry to be kept (checked by membership in flair_ner).
# NOTE(review): 'MISC' is not among the entity types documented above;
# presumably it is only produced by the 4-class model - confirm it is needed.
relevant_ent_types = [
    'FAC', 'GPE', 'LOC', 'NORP',
    'PERSON', 'ORG', 'MISC',
]

# Tagger used for the run; the commented-out loads are the alternative models.
flair_18class = SequenceTagger.load(
    'flair/ner-english-ontonotes-large'
)
# flair_12class = SequenceTagger.load('ner-ontonotes-fast')
# flair_4class = SequenceTagger.load('ner')

In [None]:
# get data (full texts of documents including acknowledgements, foreword, executive summary and body)
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('data.json') as f:
    data = json.load(f)

# get data (the structured data which Malte processed into lines with metadata)
with open('studies_on_water_scraped.json') as sf:
    malte_data = json.load(sf)

In [None]:
import re
import gensim
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces

# define preprocessing steps
def preprocess(text):
    """Strip URLs, markup and a fixed list of noise tokens from ``text``.

    Args:
        text: Raw document text (a string).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # remove URLs
    # Raw strings stop the regex backslashes from being parsed as (invalid)
    # string escapes. The first three patterns overlap heavily, but all are
    # kept in the original order so the output stays exactly the same.
    text = re.sub(r'http://\S+|https://\S+', '', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    # Anchored with ^...$ and used without flags, so this pattern can only
    # match when the whole text is a single bare URL / domain name.
    text = re.sub(r'^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$', '', text)

    # remove HTML / XML-like tags in text and multiple whitespaces
    CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces]
    text_tokens = preprocess_string(text, CUSTOM_FILTERS)

    # remove niche irrelevant tokens (a set gives constant-time membership tests)
    irrelevant_tokens = {
        'et', 'al.', 'x', 'pdf', 'yes', 'abbrev', 'fe',
        'page', 'pp', 'p', 'er', 'doi', 'can', 'b', 'c', 'd', 'e',
        'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's',
        't', 'u', 'v', 'w', 'y', 'z', 'www', 'com', 'org', 'de', 'dx', 'th', 'ii', 'le',
    }

    stripped_tokens = (word.strip() for word in text_tokens)
    tokens_without_sw = [token for token in stripped_tokens if token not in irrelevant_tokens]
    return ' '.join(tokens_without_sw)

In [None]:
# preprocess data
# Each value is replaced under its existing key, so the set of keys does not
# change while the dict is being iterated.
for key, raw_text in data.items():
    data[key] = preprocess(raw_text)

In [None]:
# define function to lookup correct ID for document in studies_on_water_scraped.json
# before this, I was using the INDEX of the document in the JSON array of this file as its ID.
def lookup_correct_docid(old_key):
    """Translate a positional document key into the document's real ID.

    ``old_key`` is converted with ``int()`` and used as an index into the
    module-level ``malte_data``; the ID is read from that entry's
    ``['meta']['id']`` field.
    """
    entry = malte_data[int(old_key)]
    return entry['meta']['id']

In [None]:
# function to split sentence list into chunks for batch processing by GPU
def split(list_a, chunk_size):
    """Yield consecutive slices of ``list_a`` with at most ``chunk_size`` items each.

    The last slice is shorter when ``len(list_a)`` is not a multiple of
    ``chunk_size``; an empty input yields nothing.
    """
    total = len(list_a)
    for start in range(0, total, chunk_size):
        stop = start + chunk_size
        yield list_a[start:stop]

# do NER tagging on a given document
def flair_ner(document, model, docid, batch_size=20):
    """Run NER tagging over one document and collect the relevant entities.

    Args:
        document: Full text of the document.
        model: Tagger whose ``predict`` method is called with a list of
            sentences (here a flair ``SequenceTagger``).
        docid: Identifier copied unchanged into every result tuple.
        batch_size: Number of sentences passed to ``model.predict`` per call.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        A list of tuples ``(entity_text, label, sentence_text, "start:end", docid)``,
        one per entity whose label is in ``relevant_ent_types`` and whose text
        is longer than one character.
    """
    results = []
    sentences = SegtokSentenceSplitter().split(document)

    for batch in split(sentences, batch_size):
        model.predict(batch)
        for sentence in batch:
            for entity in sentence.get_spans('ner'):
                # Look the label up once and reuse it for the filter and the result.
                label = entity.get_label("ner").value
                if label not in relevant_ent_types:
                    continue
                # one character entities disregarded (length is measured
                # before the double quotes are removed below)
                if len(entity.text) <= 1:
                    continue
                span = str(entity.start_position) + ":" + str(entity.end_position)
                results.append((
                    entity.text.replace('"', ''),
                    label,
                    sentence.to_plain_string(),
                    span,
                    docid,
                ))

    return results

# write tagging results to CSV file
def write_results_to_file(results, file):
    """Append NER results to a CSV file, writing the header row first if needed.

    Args:
        results: Iterable of indexable items; elements 0-4 of each item are
            written as entity, entity_type, sentence, span and docid.
        file: Path of the CSV file. It is created when missing; otherwise
            rows are appended to it.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single table.
    """
    header_needed = not os.path.exists(file) or os.path.getsize(file) == 0

    # newline='' lets the csv module control line endings itself (otherwise
    # blank rows appear on Windows); utf-8 keeps non-ASCII entity text
    # writable regardless of the platform's default encoding.
    with open(file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if header_needed:
            writer.writerow(['entity', 'entity_type', 'sentence', 'span', 'docid', 'model'])
        for item in results:
            row = [str(item[index]) for index in range(5)]
            row.append('flair - FLERT and XML embeddings')
            writer.writerow(row)

In [None]:
# run the NER tagging on each document in the corpus
import datetime

ct = datetime.datetime.now()
print()
print("Started entire run at:-", ct)
print("---")
print()

# NOTE(review): the total "55" in the progress messages is hard-coded -
# confirm it matches the number of documents in `data`.
for key in data:
    ct = datetime.datetime.now()
    # Resolve the real document ID once and reuse it for logging and tagging.
    docid = lookup_correct_docid(key)
    doc_label = f"{key!s}-{docid!s}"
    print(f"Started processing Doc ({doc_label} / 55) at:-", ct)

    ner_results = flair_ner(data[key], flair_18class, docid)
    write_results_to_file(ner_results, 'master-ner-results.csv')

    ct = datetime.datetime.now()
    print(f"Finished processing Doc ({doc_label} / 55) at:-", ct)

print()
ct = datetime.datetime.now()
print("---")
print("Finished entire run at:-", ct)