## Named Entity Recognition (NER) 

This notebook identifies the main organisations and actors in the OECD corpus of texts. It uses the Flair NLP framework with the 'ner-english-ontonotes-large' model, which can be found [here](https://huggingface.co/flair/ner-english-ontonotes-large).


### 1. Import relevant libraries and load the NER model

In [17]:
import sys
sys.path.append('../util/') # import python preprocessing script

from flair.models import SequenceTagger
from flair.data import Sentence
from flair.tokenization import SegtokSentenceSplitter
import numpy as np
import csv
import os.path

# Entity labels that are kept from the tagger output:
# ---------------------------------------------------
# FAC     building name
# GPE     geo-political entity
# LOC     location name
# NORP    affiliation
# ORG     organization name
# PERSON  person name
#
# NOTE(review): 'MISC' is also kept although it is not described above;
# presumably it is there for the alternative 4-class model below - confirm.
relevant_ent_types = [
    'FAC',
    'GPE',
    'LOC',
    'NORP',
    'PERSON',
    'ORG',
    'MISC',
]

# Tagger used by the rest of the notebook (downloaded/loaded from the model hub).
flair_18class = SequenceTagger.load('flair/ner-english-ontonotes-large')

# Alternative taggers, currently disabled:
# flair_12class = SequenceTagger.load('ner-ontonotes-fast')
# flair_4class = SequenceTagger.load('ner')



2022-10-25 18:22:09,557 loading file /Users/kodymoodley/.flair/models/ner-english-ontonotes-large/2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1
2022-10-25 18:22:38,576 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


### 2. Load the corpus of OECD texts

In [18]:
import json
import os
from pathlib import Path

# The input files live in "data-files", located in the parent of the current
# working directory (i.e. next to the directory this notebook is run from).
path = Path(os.getcwd())
data_dir = os.path.join(path.parents[0], "data-files")

# get data (full texts of documents including acknowledgements, foreword, executive summary and body)
# The encoding is passed explicitly so that reading the JSON does not depend on
# the platform's locale default (which is not UTF-8 on every system).
with open(os.path.join(data_dir, "data.json"), encoding="utf-8") as f:
    data = json.load(f)

# get data (the structured data which Malte processed into lines with metadata)
with open(os.path.join(data_dir, "studies_on_water_scraped.json"), encoding="utf-8") as sf:
    raw_data = json.load(sf)

### 3. Do the NER-specific preprocessing required 

In [5]:
from preprocessing import preprocess_ner, lookup_correct_docid

# Replace every document text in `data` with its NER-preprocessed version.
# `data` must already have been loaded by the corpus-loading cell. The update
# is done in place, so re-running this cell preprocesses the already
# preprocessed text again.
for doc_key, doc_text in data.items():
    data[doc_key] = preprocess_ner(doc_text, stopwords=None)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kodymoodley/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


NameError: name 'data' is not defined

### 4. Functions to extract the named entities in the corpus

In [20]:
# function to split sentence list into chunks for batch processing by GPU
def split(list_a, chunk_size):
    """Lazily yield consecutive slices of ``list_a`` of length ``chunk_size``.

    The final slice is shorter when ``len(list_a)`` is not a multiple of
    ``chunk_size``; an empty input yields nothing.
    """
    chunk_starts = range(0, len(list_a), chunk_size)
    yield from (list_a[start:start + chunk_size] for start in chunk_starts)

# do NER tagging on a given document
def flair_ner(document, model, docid, batch_size=20):
    """Tag one document and collect the entities of the relevant types.

    The document is split into sentences, the sentences are passed to
    ``model.predict`` in batches, and every predicted 'ner' span whose label
    is listed in ``relevant_ent_types`` is recorded.

    Parameters
    ----------
    document : str
        Text of the document to tag.
    model
        Tagger exposing ``predict(sentences)`` (e.g. a loaded SequenceTagger).
    docid
        Identifier stored unchanged with every result row.
    batch_size : int, optional
        Number of sentences handed to ``model.predict`` at once (default 20).

    Returns
    -------
    list of tuple
        ``(entity_text, label, sentence_text, "start:end", docid)`` for each
        retained entity; ``entity_text`` has double quotes removed.
    """
    results = []
    sentences = SegtokSentenceSplitter().split(document)

    for batch in split(sentences, batch_size):
        model.predict(batch)
        for sentence in batch:
            for entity in sentence.get_spans('ner'):
                label = entity.get_label("ner").value
                if label not in relevant_ent_types:
                    continue

                # Strip quotes first so the length filter sees the text that is
                # actually stored; one character entities are disregarded.
                entity_text = entity.text.replace('"', '')
                if len(entity_text) <= 1:
                    continue

                span = str(entity.start_position) + ":" + str(entity.end_position)
                results.append((entity_text, label, sentence.to_plain_string(), span, docid))

    return results

# write tagging results to CSV file
def write_results_to_file(results, file):
    """Append NER results to a CSV file, creating it (with a header) if needed.

    Parameters
    ----------
    results : iterable
        Items indexable at positions 0-4: entity, entity type, sentence,
        span and document id. Each field is written as ``str(field)``.
    file : str or path-like
        Path of the CSV file. If it does not exist yet, or exists but is
        empty, the header row is written before the data rows.
    """
    header = ['entity', 'entity_type', 'sentence', 'span', 'docid', 'model']
    model_label = 'flair - FLERT and XML embeddings'

    needs_header = (not os.path.exists(file)) or os.path.getsize(file) == 0

    # newline='' is required by the csv module to avoid blank rows on some
    # platforms; the explicit encoding keeps non-ASCII entity names writable
    # regardless of the locale default.
    with open(file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        for item in results:
            writer.writerow([str(item[i]) for i in range(5)] + [model_label])

### 5. Run the NER extraction process on the corpus

**Note:** it is highly recommended to run this analysis on a GPU for best performance. For reference, on Google Colab (free edition), using one GPU, it takes approximately 2.5 hours to process the 55 input documents in this corpus. Without a GPU, processing can take much longer.

In [21]:
# run the NER tagging on each document in the corpus
import datetime

# NOTE: results are appended to this CSV, so re-running the cell while the
# file already exists adds the same rows to it again.
results_file = os.path.join(data_dir, "master-ner-results.csv")

# Derive the total from the corpus instead of hard-coding it, so the progress
# message stays correct if documents are added or removed.
total_docs = len(data)

ct = datetime.datetime.now()
print()
print("Started entire run at:-", ct)
print("---")
print()

for idx, key in enumerate(data, start=1):
    # Resolve the document id once and reuse it for logging and tagging.
    docid = lookup_correct_docid(key)

    ct = datetime.datetime.now()
    print("Started processing Doc (" + str(idx) + " / " + str(total_docs) + ") -> DOC ID: " + str(docid) + " at:-", ct)
    ner_results = flair_ner(data[key], flair_18class, docid)
    write_results_to_file(ner_results, results_file)
    ct = datetime.datetime.now()
    print("Finished doc at:-", ct)

print()
ct = datetime.datetime.now()
print("---")
print("Finished entire run at:-", ct)


Started entire run at:- 2022-10-25 18:22:55.767029
---

Started processing Doc (1 / 55) -> DOC ID: 31 at:- 2022-10-25 18:22:55.767244



KeyboardInterrupt

