In [1]:
import logging
# Configure the root logger: INFO and above, rendered as "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import csv

def load_csv(filepath, encoding='utf-8'):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform locale and agrees with
            the JSON helpers in this notebook, which also pin UTF-8.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for files passed to a reader.
    with open(filepath, newline='', encoding=encoding) as csvfile:
        return list(csv.DictReader(csvfile))

In [3]:
import simplejson

def json_load(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return its content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

def json_save(data, filename):
    """Write ``data`` as compact JSON to ``filename`` (UTF-8, overwriting).

    No whitespace is emitted after the item and key separators, and
    simplejson is asked to serialise arbitrary iterables as JSON arrays.
    """
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as handle:
        simplejson.dump(
            data,
            handle,
            separators=compact_separators,
            iterable_as_array=True,
        )

In [4]:
# Input corpus; a relative path, so it resolves against the kernel's working directory.
csv_file_path = '../data/BBairline200722_coreffed.csv'
# One dict per CSV row, keyed by the header names (see load_csv).
csv_data = load_csv(csv_file_path)

In [5]:
# Translation table that turns the backtick and every typographic quote
# (single or double) into a plain apostrophe.
single_quote_unicode = ord("'")
translation_table_text = str.maketrans(
    dict.fromkeys('`‘’“”', single_quote_unicode)
)
     
# Parallel lists: the quote-normalised text and the title of every CSV row.
corpus_texts_full = []
corpus_titles_full = []
for row in csv_data:
    text = row['text'].translate(translation_table_text)
    title = row['title']
    corpus_texts_full.append(text)
    corpus_titles_full.append(title)

In [6]:
def get_ents_ids(ents):
    """Group token indices into entity spans based on BIOUL-style tags.

    Args:
        ents: Sequence of ``(word, tag)`` pairs. A tag is either ``'O'`` or
            ``'<position>-<type>'`` with position one of B, I, L, U.

    Returns:
        A list of index lists, one per entity, in order of appearance. A span
        opened by ``B-`` collects the following ``I-``/``L-`` tokens; any
        other tagged token outside an open span forms a span of its own.
    """
    ent_ids = []
    processing_ent = False  # True while a span opened by a B- tag is still open
    for idx, (_, tag) in enumerate(ents):
        # Only the position prefix matters here. partition() cannot fail to
        # unpack when the entity type itself contains a hyphen.
        pos = tag.partition('-')[0] if tag != 'O' else None
        if processing_ent and pos in ('I', 'L'):
            ent_ids[-1].append(idx)
            processing_ent = pos != 'L'
            continue
        # No span is open, or the open span was cut short by an 'O' or by the
        # start of another entity (malformed sequence). In the latter case the
        # open span is closed instead of swallowing the unrelated token.
        processing_ent = False
        if pos:
            ent_ids.append([idx])
            processing_ent = pos == 'B'
    return ent_ids

In [8]:
%%time
from allennlp.predictors import Predictor
# Archive names (without the .tar.gz suffix) of the AllenNLP models to run side by side.
predictor_models = ('ner-model-2020.02.10', 'fine-grained-ner.2021-02-11', 'fgner-transformer.2021-02-11',)
# One Predictor per archive, keyed by archive name. The archives are addressed by URL;
# the captured log of this cell shows them being resolved through a local cache.
predictors = {m: Predictor.from_path(f"https://storage.googleapis.com/allennlp-public-models/{m}.tar.gz") for m in predictor_models}

2021-09-25 20:04:41.813373: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-25 20:04:41.813404: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-09-25 20:04:46,186 : INFO : NumExpr defaulting to 4 threads.
2021-09-25 20:04:46,902 : INFO : Plugin allennlp_models available
2021-09-25 20:04:47,320 : INFO : cache of https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz is up-to-date
2021-09-25 20:04:47,321 : INFO : loading archive file https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz from cache at /home/dnk8n/.allennlp/cache/298798bbebac87143a3b421ff478cca929a5e43926e19f879c053473827e55c6.dad72088b0fc59006a39cb50270a64476d176006a3054327423c35096d8c4265
2021-09-25 20:04:47,322 : INFO : e

2021-09-25 20:04:53,093 : INFO : model.regularizer.regexes.0.1.alpha = 0.1
2021-09-25 20:04:53,094 : INFO : model.ddp_accelerator = None
2021-09-25 20:04:53,096 : INFO : model.text_field_embedder.type = ref
2021-09-25 20:04:53,098 : INFO : model.text_field_embedder.type = basic
2021-09-25 20:04:53,100 : INFO : model.text_field_embedder.token_embedders.type = ref
2021-09-25 20:04:53,103 : INFO : model.text_field_embedder.token_embedders.elmo.type = elmo_token_embedder
2021-09-25 20:04:53,104 : INFO : model.text_field_embedder.token_embedders.elmo.type = elmo_token_embedder
2021-09-25 20:04:53,106 : INFO : model.text_field_embedder.token_embedders.elmo.options_file = https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json
2021-09-25 20:04:53,116 : INFO : model.text_field_embedder.token_embedders.elmo.weight_file = https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_w

2021-09-25 20:05:16,016 : INFO : model.text_field_embedder.token_embedders.tokens.vocab_namespace = tokens
2021-09-25 20:05:16,017 : INFO : model.text_field_embedder.token_embedders.tokens.pretrained_file = None
2021-09-25 20:05:16,036 : INFO : model.encoder.type = lstm
2021-09-25 20:05:16,040 : INFO : model.encoder.type = lstm
2021-09-25 20:05:16,042 : INFO : model.encoder.input_size = 1202
2021-09-25 20:05:16,043 : INFO : model.encoder.hidden_size = 200
2021-09-25 20:05:16,044 : INFO : model.encoder.num_layers = 2
2021-09-25 20:05:16,044 : INFO : model.encoder.bias = True
2021-09-25 20:05:16,045 : INFO : model.encoder.dropout = 0.5
2021-09-25 20:05:16,047 : INFO : model.encoder.bidirectional = True
2021-09-25 20:05:16,049 : INFO : model.encoder.stateful = False
2021-09-25 20:05:16,074 : INFO : model.label_namespace = labels
2021-09-25 20:05:16,075 : INFO : model.feedforward = None
2021-09-25 20:05:16,076 : INFO : model.label_encoding = BIOUL
2021-09-25 20:05:16,077 : INFO : type = BI

2021-09-25 20:05:16,170 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.0
2021-09-25 20:05:16,171 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.1
2021-09-25 20:05:16,175 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.2
2021-09-25 20:05:16,176 : INFO :    text_field_embedder.token_embedder_token_characters._embedding._module.weight
2021-09-25 20:05:16,177 : INFO :    text_field_embedder.token_embedder_token_characters._encoder._module.conv_layer_0.bias
2021-09-25 20:05:16,181 : INFO :    text_field_embedder.token_embedder_token_characters._encoder._module.conv_layer_0.weight
2021-09-25 20:05:16,182 : INFO :    text_field_embedder.token_embedder_tokens.weight
2021-09-25 20:05:16,516 : INFO : removing temporary unarchived model dir at /tmp/tmpk6qw2vlh
2021-09-25 20:05:17,280 : INFO : Plugin allennlp_models available
2021-09-25 20:05:17,728 : INFO : cache of https:/

2021-09-25 20:05:20,878 : INFO : model.text_field_embedder.token_embedders.elmo.type = elmo_token_embedder
2021-09-25 20:05:20,879 : INFO : model.text_field_embedder.token_embedders.elmo.options_file = https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json
2021-09-25 20:05:20,883 : INFO : model.text_field_embedder.token_embedders.elmo.weight_file = https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
2021-09-25 20:05:20,883 : INFO : model.text_field_embedder.token_embedders.elmo.do_layer_norm = False
2021-09-25 20:05:20,884 : INFO : model.text_field_embedder.token_embedders.elmo.dropout = 0
2021-09-25 20:05:20,885 : INFO : model.text_field_embedder.token_embedders.elmo.requires_grad = False
2021-09-25 20:05:20,886 : INFO : model.text_field_embedder.token_embedders.elmo.projection_dim = None
2021-09-25 20:05:20,887 : INFO : model.text_field_embedder.tok

2021-09-25 20:05:42,801 : INFO : model.encoder.hidden_size = 200
2021-09-25 20:05:42,802 : INFO : model.encoder.num_layers = 2
2021-09-25 20:05:42,803 : INFO : model.encoder.recurrent_dropout_probability = 0.5
2021-09-25 20:05:42,805 : INFO : model.encoder.layer_dropout_probability = 0.0
2021-09-25 20:05:42,806 : INFO : model.encoder.use_highway = True
2021-09-25 20:05:42,807 : INFO : model.encoder.stateful = False
2021-09-25 20:05:42,970 : INFO : model.label_namespace = labels
2021-09-25 20:05:42,971 : INFO : model.feedforward.type = ref
2021-09-25 20:05:42,973 : INFO : model.feedforward.input_dim = 400
2021-09-25 20:05:42,975 : INFO : model.feedforward.num_layers = 1
2021-09-25 20:05:42,976 : INFO : model.feedforward.hidden_dims = 400
2021-09-25 20:05:42,979 : INFO : model.feedforward.activations = tanh
2021-09-25 20:05:42,980 : INFO : type = tanh
2021-09-25 20:05:42,982 : INFO : type = tanh
2021-09-25 20:05:42,983 : INFO : type = tanh
2021-09-25 20:05:42,984 : INFO : model.feedforwa

2021-09-25 20:05:43,080 : INFO :    text_field_embedder.token_embedder_elmo._elmo._elmo_lstm._token_embedder.char_conv_6.weight
2021-09-25 20:05:43,081 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.gamma
2021-09-25 20:05:43,082 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.0
2021-09-25 20:05:43,085 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.1
2021-09-25 20:05:43,085 : INFO :    text_field_embedder.token_embedder_elmo._elmo.scalar_mix_0.scalar_parameters.2
2021-09-25 20:05:43,086 : INFO :    text_field_embedder.token_embedder_token_characters._embedding._module.weight
2021-09-25 20:05:43,088 : INFO :    text_field_embedder.token_embedder_token_characters._encoder._module._module.bias_hh_l0
2021-09-25 20:05:43,092 : INFO :    text_field_embedder.token_embedder_token_characters._encoder._module._module.bias_ih_l0
2021-09-25 20:05:43,093 : INFO :    text_field_embedder.token_em

2021-09-25 20:06:02,781 : INFO : model.initializer = <allennlp.nn.initializers.InitializerApplicator object at 0x7f63ac92a250>
2021-09-25 20:06:02,782 : INFO : model.top_k = 1
2021-09-25 20:06:02,785 : INFO : model.ignore_loss_on_o_tags = False
2021-09-25 20:06:02,805 : INFO : Initializing parameters
2021-09-25 20:06:02,807 : INFO : Done initializing parameters; the following parameters are using their default initialization from their code
2021-09-25 20:06:02,808 : INFO :    crf._constraint_mask
2021-09-25 20:06:02,809 : INFO :    crf.transitions
2021-09-25 20:06:02,812 : INFO :    tag_projection_layer._module.bias
2021-09-25 20:06:02,816 : INFO :    tag_projection_layer._module.weight
2021-09-25 20:06:02,817 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.embeddings.LayerNorm.bias
2021-09-25 20:06:02,818 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.embeddings.LayerNorm.weight
2021-09-25 20:06:02,81

2021-09-25 20:06:02,886 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.bias
2021-09-25 20:06:02,888 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.weight
2021-09-25 20:06:02,889 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.10.output.dense.bias
2021-09-25 20:06:02,890 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.10.output.dense.weight
2021-09-25 20:06:02,890 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias
2021-09-25 20:06:02,891 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight
2021-09-25 20:06:02,891 : INFO :    text_field_embedder.token_em

2021-09-25 20:06:02,945 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight
2021-09-25 20:06:02,946 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.bias
2021-09-25 20:06:02,947 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.weight
2021-09-25 20:06:02,952 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.bias
2021-09-25 20:06:02,953 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.weight
2021-09-25 20:06:02,954 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.4.attention.self.query.bias
2021-09-25 20:06:02,956 : INFO :    text_field_embed

2021-09-25 20:06:03,008 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.bias
2021-09-25 20:06:03,008 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.weight
2021-09-25 20:06:03,009 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.bias
2021-09-25 20:06:03,013 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.weight
2021-09-25 20:06:03,015 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.bias
2021-09-25 20:06:03,018 : INFO :    text_field_embedder.token_embedder_tokens._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.weight
2021-09-25 20:06:03,019 : INFO :    text_field_embedder.token_

CPU times: user 40.1 s, sys: 3.85 s, total: 43.9 s
Wall time: 1min 22s


In [9]:
def get_ents(predictor, doc):
    """Tag ``doc`` with ``predictor`` and return its ``(word, tag)`` pairs.

    The ``'words'`` and ``'tags'`` entries of the prediction are paired up
    positionally.
    """
    result = predictor.predict(sentence=doc)
    words, tags = result['words'], result['tags']
    return [pair for pair in zip(words, tags)]

In [10]:
%%time

from collections import defaultdict

# Token-level predictions: model name -> one list of (word, tag) pairs per document.
unmerged_ents = defaultdict(list)
for model, predictor in predictors.items():
    # NOTE(review): only the first two documents are tagged. The [:2] slice looks like a
    # limit kept for quick experiments — confirm whether the full corpus is intended.
    for doc in corpus_texts_full[:2]:
        unmerged_ents[model].append(get_ents(predictor, doc))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


CPU times: user 7min 13s, sys: 3.99 s, total: 7min 17s
Wall time: 3min 41s


In [14]:
import itertools

def get_merged_ents(ents):
    """Merge token-level predictions into ``(entity text, entity types)`` pairs.

    Args:
        ents: One list of ``(word, tag)`` pairs per document.

    Returns:
        One list per document holding an ``(entity_text, entity_types)``
        tuple for every span reported by ``get_ents_ids``. ``entity_text`` is
        the span's words joined by single spaces; ``entity_types`` is the
        span's types with consecutive repeats collapsed, also space-joined.
    """
    all_ents = []
    for ents_doc in ents:
        doc_ents = []
        for ent_ids in get_ents_ids(ents_doc):
            ent_words, ent_tags = [], []
            for ent_id in ent_ids:
                word_ent, tag_ent = ents_doc[ent_id]
                # Keep only the text before the first apostrophe, which drops
                # any 'll 'm 's that might be part of the ent.
                ent_words.append(word_ent.split("'")[0])
                # Drop the U- B- I- L- prefix. maxsplit=1 keeps a hyphenated
                # type whole instead of truncating it at its first hyphen.
                ent_tags.append(tag_ent.split("-", 1)[1])
            # A token that was nothing but a clitic is '' by now; joining it
            # would leave a stray space in the entity text, so skip it.
            combined_ent_words = ' '.join(word for word in ent_words if word)
            collapsed_ent_tags = ' '.join(key for key, _ in itertools.groupby(ent_tags))
            doc_ents.append((combined_ent_words, collapsed_ent_tags))
        all_ents.append(doc_ents)
    return all_ents

In [15]:
%%time
# Entity-level view of the predictions: model name -> merged entities per document.
merged_ents = {}
for model in unmerged_ents:
    ents = unmerged_ents[model]
    merged_ents[model] = get_merged_ents(ents)

CPU times: user 42 ms, sys: 1.94 ms, total: 43.9 ms
Wall time: 41.8 ms


In [18]:
import re

# Name the output after the input file: splitting the path on '/' and '.'
# leaves the file name without its extension as the second-to-last piece.
# The pattern is a raw string; the former '[\/\.]' relied on invalid string
# escapes, which Python reports as a DeprecationWarning/SyntaxWarning.
specifier = re.split(r'[/.]', csv_file_path)[-2]
json_save(merged_ents, f'merged_ents-{specifier}.json')