In [None]:
import logging
# Show log records of level INFO and above, each prefixed with a timestamp and its level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import csv

def load_csv(filepath, encoding='utf-8'):
    """Read a CSV file with a header row into a list of row dicts.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            (the same encoding json_load/json_save use) rather than the
            platform's locale default, so results do not vary by machine.

    Returns:
        A list with one dict per data row, mapping column header -> cell text.
    """
    # newline='' leaves line-ending handling to the csv module, which is
    # required for quoted fields that contain embedded newlines.
    with open(filepath, newline='', encoding=encoding) as csvfile:
        return list(csv.DictReader(csvfile))

In [None]:
import simplejson

def json_load(filename):
    """Parse the UTF-8 encoded JSON file at `filename` and return the decoded object."""
    with open(filename, 'r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

def json_save(data, filename):
    """Write `data` to `filename` as compact, UTF-8 encoded JSON (overwriting any existing file)."""
    dump_options = {
        'separators': (',', ':'),  # no padding after ',' or ':' keeps the file small
        'iterable_as_array': True,
    }
    with open(filename, 'w', encoding='utf-8') as handle:
        simplejson.dump(data, handle, **dump_options)

In [None]:
import re

# Input corpus. `specifier` (the file name without directory or extension)
# is reused to name the JSON files written later in the notebook.
csv_file_path = '../data/BBairline200722_coreffed.csv'
# Raw string: in a normal literal '\/' and '\.' are invalid escape sequences
# (a warning today, scheduled to become an error). Inside a character class
# neither '/' nor '.' needs escaping, so the pattern matches the same text.
# Splitting on '/' and '.' leaves the extension last and the stem second to last.
specifier = re.split(r'[/.]', csv_file_path)[-2]

In [None]:
# Load every row of the input CSV into memory as a list of dicts keyed by column header.
csv_data = load_csv(csv_file_path)

In [None]:
single_quote_unicode = ord("'")
# Translation table that turns the backtick and every curly quote variant
# (single and double) into a plain ASCII apostrophe.
translation_table_text = str.maketrans(
    {
        quote_char: single_quote_unicode
        for quote_char in ('`', '‘', '’', '“', '”')
    }
)

# Parallel lists, one entry per CSV row: the quote-normalized text and the
# unmodified title.
corpus_texts_full = [row['text'].translate(translation_table_text) for row in csv_data]
corpus_titles_full = [row['title'] for row in csv_data]

In [None]:
%%time
from allennlp.predictors import Predictor
# Names of the model archives to load; each one is fetched from the
# allennlp-public-models bucket by the URL built below.
predictor_models = (
    'ner-model-2020.02.10',
    'fine-grained-ner.2021-02-11',
    'fgner-transformer.2021-02-11',
)
# One Predictor per model name, keyed by that name.
predictors = {
    m: Predictor.from_path(f"https://storage.googleapis.com/allennlp-public-models/{m}.tar.gz")
    for m in predictor_models
}

In [None]:
def get_ents(predictor, doc):
    """Tag one document and pair every token with its predicted tag.

    Args:
        predictor: Object whose ``predict(sentence=...)`` returns a mapping
            with parallel ``'words'`` and ``'tags'`` sequences.
        doc: The text to tag.

    Returns:
        A list of ``(word, tag)`` tuples in token order.
    """
    output = predictor.predict(sentence=doc)
    words, tags = output['words'], output['tags']
    return [(word, tag) for word, tag in zip(words, tags)]

In [None]:
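# Optional: uncomment to reload predictions saved by an earlier run (see the
# json_save call for unmerged_ents) instead of recomputing them in the next cell.
# unmerged_ents = json_load(f'unmerged_ents-{specifier}.json')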

In [None]:
%%time

from collections import defaultdict

# Tag every document with every loaded model. unmerged_ents[model] collects one
# list of (word, tag) pairs per document, in the order of corpus_texts_full.
unmerged_ents = defaultdict(list)
for model, predictor in predictors.items():
    for doc in corpus_texts_full:
        unmerged_ents[model].append(get_ents(predictor, doc))

In [None]:
def get_ents_ids(ents):
    """Group the token indices of one tagged document into spans.

    Tags are either ``'O'`` or ``'<prefix>-<type>'``. A ``B`` prefix opens a
    multi-token span, ``I`` continues it and ``L`` closes it; every token
    outside such a span (``O``, ``U-...``, or a stray ``I``/``L``) becomes a
    span of its own.

    Args:
        ents: Sequence of ``(word, tag)`` pairs for a single document.

    Returns:
        A list of lists of token indices; concatenated, the inner lists cover
        ``range(len(ents))`` in order.
    """
    ent_ids = []
    processing_ent = False
    for idx, (_, tag) in enumerate(ents):
        # Only the positional prefix matters here. Split once so that entity
        # types which themselves contain '-' do not break the parsing.
        pos = None if tag == 'O' else tag.split('-', 1)[0]
        if processing_ent and pos in ('I', 'L'):
            ent_ids[-1].append(idx)
            processing_ent = pos != 'L'
        else:
            # Either no span is open, or the open span was never closed by an
            # L- tag. Start a fresh span instead of letting the unterminated
            # one swallow every following token.
            ent_ids.append([idx])
            processing_ent = pos == 'B'
    return ent_ids

In [None]:
import itertools

def get_merged_ents(ents):
    """Merge token-level tags into span-level (text, type) pairs per document.

    Args:
        ents: Iterable of documents, each a sequence of ``(word, tag)`` pairs.

    Returns:
        One list per document. Each element is a tuple
        ``(span_text, span_type)``: the span's words joined by single spaces,
        and its tag types (positional prefix removed) joined by single spaces
        after collapsing consecutive duplicates.
    """
    all_ents = []
    for ents_doc in ents:
        doc_ents = []
        for ent_ids in get_ents_ids(ents_doc):
            ent_words = [ents_doc[ent_id][0] for ent_id in ent_ids]
            # Drop only the leading positional prefix (B-, I-, L-, U-).
            # Splitting once keeps entity types that contain '-' intact;
            # a bare 'O' passes through unchanged.
            ent_tags = [ents_doc[ent_id][1].split('-', 1)[-1] for ent_id in ent_ids]
            combined_ent_words = ' '.join(ent_words)
            # groupby collapses runs of identical types, so a well-formed
            # multi-token entity yields a single type label.
            collapsed_ent_tags = ' '.join(tag for tag, _ in itertools.groupby(ent_tags))
            doc_ents.append((combined_ent_words, collapsed_ent_tags))
        all_ents.append(doc_ents)
    return all_ents

In [None]:
%%time
# For each model, merge the token-level tags of every document into
# span-level (text, type) pairs.
merged_ents = {
    model: get_merged_ents(ents)
    for model, ents in unmerged_ents.items()
}

In [None]:
# Write the merged entities (keyed by model) to the working directory; the
# file name carries the corpus specifier.
json_save(merged_ents, f'merged_ents-{specifier}.json')

In [None]:
# Write the raw per-token predictions (keyed by model) to the working
# directory; the file name carries the corpus specifier.
json_save(unmerged_ents, f'unmerged_ents-{specifier}.json')