In [1]:
import pandas as pd
import csv
import os.path
import json

# NER entity list for the OECD corpus, loaded from a CSV in the working directory
df = pd.read_csv(filepath_or_buffer='full-flair-ner-list-oecd-corpus.csv')

In [2]:
# get data (full texts of documents including acknowledgements, foreword, executive summary and body)
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('data.json') as f:
    data = json.load(f)

# get data (the structured data which Malte processed into lines with metadata)
with open('studies_on_water_scraped.json') as sf:
    malte_data = json.load(sf)

In [3]:
import re
import gensim
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces

# define preprocessing steps

# Tokens that are dropped from every text after tokenisation (citation debris,
# stray single letters, URL fragments, ...). A frozenset gives O(1) membership
# tests and is built once instead of on every call.
_IRRELEVANT_TOKENS = frozenset([
    'et', 'al.', 'x', 'pdf', 'yes', 'abbrev', 'fe',
    'page', 'pp', 'p', 'er', 'doi', 'can', 'b', 'c', 'd', 'e',
    'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'y', 'z', 'www', 'com', 'org', 'de', 'dx', 'th', 'ii', 'le',
])


def preprocess(text):
    """Clean one document text and return it as a single space-joined string.

    Steps, in order:
      1. remove URLs,
      2. strip HTML/XML-like tags and collapse repeated whitespace while
         tokenising with gensim's ``preprocess_string``,
      3. drop every token listed in ``_IRRELEVANT_TOKENS`` (exact,
         case-sensitive match after stripping surrounding whitespace).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # remove URLs
    # Raw strings keep the patterns byte-identical while avoiding the invalid
    # escape sequences ('\S', '\/', ...) of plain string literals.
    text = re.sub(r'http://\S+|https://\S+', '', text)
    # (a former second pass with 'http[s]?://\S+' matched exactly the same
    # strings as the line above and could never find anything afterwards)
    text = re.sub(r'http\S+', '', text)
    # This pattern is anchored with ^...$ and used without re.MULTILINE, so it
    # only fires when the whole text is one single URL-like string.
    # NOTE(review): possibly meant to be applied per line/token - confirm.
    text = re.sub(r'^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$', '', text)

    # remove HTML / XML-like tags in text and multiple whitespaces
    CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces]
    text_tokens = preprocess_string(text, CUSTOM_FILTERS)

    # remove niche irrelevant tokens
    stripped_tokens = (word.strip() for word in text_tokens)
    return ' '.join(word for word in stripped_tokens if word not in _IRRELEVANT_TOKENS)

In [4]:
# preprocess data: overwrite every document text with its cleaned version,
# keeping the original keys (a snapshot of the items is iterated while the
# dict's values are being reassigned)
for key, raw_text in list(data.items()):
    data[key] = preprocess(raw_text)

In [5]:
# Look up the correct ID for a document in studies_on_water_scraped.json.
# Before this, the INDEX of the document in that file's JSON array was used as its ID.
def lookup_correct_docid(old_key):
    """Return the ``meta -> id`` value of the ``malte_data`` entry at index ``old_key``.

    ``old_key`` is converted with ``int()`` first, so both integers and
    numeric strings are accepted.
    """
    position = int(old_key)
    entry = malte_data[position]
    return entry['meta']['id']

In [6]:
def replace_all(text, dic):
    """Apply every ``old -> new`` substitution in ``dic`` to ``text``.

    The substitutions are applied one after another in the mapping's
    iteration order, each one operating on the result of the previous one.
    Returns the resulting string.
    """
    replaced = text
    for pair in dic.items():
        target, substitute = pair
        replaced = replaced.replace(target, substitute)
    return replaced

def process(datafr, doctext, docid):
    """Join frequent multi-word ORG/PERSON entities of one document into single tokens.

    For the rows of ``datafr`` that belong to ``docid`` and whose
    ``entity_type`` is ``'ORG'`` or ``'PERSON'``, every entity that is
    mentioned more than twice in that document and consists of more than one
    whitespace-separated token is rewritten inside its sentence as one
    underscore-joined token (quotes and commas are dropped from the joined
    form). Each rewritten sentence then replaces the original sentence in
    ``doctext``.

    Parameters
    ----------
    datafr : pandas.DataFrame
        Entity mentions; the columns ``docid``, ``entity_type``, ``entity``
        and ``sentence`` are read.
    doctext : str
        Text of the document to rewrite.
    docid
        Value compared against the ``docid`` column.

    Returns
    -------
    str
        ``doctext`` with the qualifying entities joined; unchanged when no
        entity qualifies.
    """
    # keep only this document's organisation and person mentions
    in_document = datafr['docid'] == docid
    is_org_or_person = datafr['entity_type'].isin(['ORG', 'PERSON'])
    mentions = datafr[in_document & is_org_or_person]
    if mentions.empty:
        return doctext

    # keep only entities that are mentioned more than twice in this document
    mention_counts = mentions['entity'].map(mentions['entity'].value_counts())
    mentions = mentions[mention_counts > 2]

    # one pass over the sentences, in order of first appearance
    for sentence, entities in mentions.groupby('sentence', sort=False)['entity']:
        if not isinstance(sentence, str):
            continue  # a non-text sentence cannot be located in doctext

        replace_patterns = {}
        for entity in entities:
            if not isinstance(entity, str):
                continue
            named_entity_tokens = (
                entity.strip().replace('"', '').replace("'", '').replace(',', '').split()
            )
            if len(named_entity_tokens) > 1:
                # form single token from multiple ones
                replace_patterns[entity] = '_'.join(named_entity_tokens)

        if not replace_patterns:
            continue

        # Replace the longest entities first: otherwise rewriting "World Bank"
        # would stop "World Bank Group" from matching in the same sentence.
        new_sentence = sentence
        for entity in sorted(replace_patterns, key=len, reverse=True):
            new_sentence = new_sentence.replace(entity, replace_patterns[entity])

        doctext = doctext.replace(sentence, new_sentence)

    return doctext

In [7]:
# run the NER tagging on each document in the corpus
import datetime

ct = datetime.datetime.now()
print()
print("Started entire run at:-", ct)
print("---")
print()

processed_data = {}
for key in data:
    ct = datetime.datetime.now()
    # resolve the real document ID once; it labels both log lines and keys the output
    docid = lookup_correct_docid(key)
    doc_label = str(key) + "-" + str(docid)
    print("Started processing Doc (" + doc_label + " / " + "55) at:-", ct)
    processed_data[docid] = process(df, data[key], docid)
    ct = datetime.datetime.now()
    print("Finished processing Doc (" + doc_label + " / " + "55) at:-", ct)

# persist the rewritten texts, keyed by the looked-up document ID
with open('processed_ngram_ner_data.json', 'w') as fp:
    json.dump(processed_data, fp)

print()
ct = datetime.datetime.now()
print("---")
print("Finished entire run at:-", ct)


Started entire run at:- 2022-07-11 20:08:51.597747
---

Started processing Doc (48-50 / 55) at:- 2022-07-11 20:08:51.599351
Finished processing Doc (48-50 / 55) at:- 2022-07-11 20:08:51.775210
Started processing Doc (0-0 / 55) at:- 2022-07-11 20:08:51.775239
Finished processing Doc (0-0 / 55) at:- 2022-07-11 20:08:51.815897
Started processing Doc (26-28 / 55) at:- 2022-07-11 20:08:51.816010
Finished processing Doc (26-28 / 55) at:- 2022-07-11 20:08:51.953708
Started processing Doc (42-44 / 55) at:- 2022-07-11 20:08:51.953738
Finished processing Doc (42-44 / 55) at:- 2022-07-11 20:08:52.173708
Started processing Doc (49-51 / 55) at:- 2022-07-11 20:08:52.173729
Finished processing Doc (49-51 / 55) at:- 2022-07-11 20:08:52.233480
Started processing Doc (11-12 / 55) at:- 2022-07-11 20:08:52.233566
Finished processing Doc (11-12 / 55) at:- 2022-07-11 20:08:52.320783
Started processing Doc (23-25 / 55) at:- 2022-07-11 20:08:52.320799
Finished processing Doc (23-25 / 55) at:- 2022-07-11 20:0