In [4]:
! pip install transformers
! pip install torch

Collecting transformers
  Using cached transformers-4.26.0-py3-none-any.whl (6.3 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp38-cp38-macosx_10_11_x86_64.whl (3.8 MB)
Collecting huggingface-hub<1.0,>=0.11.0
  Using cached huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.1
    Uninstalling tokenizers-0.10.1:
      Successfully uninstalled tokenizers-0.10.1
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.5.1
    Uninstalling huggingface-hub-0.5.1:
      Successfully uninstalled huggingface-hub-0.5.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.3.3
    Uninstalling transformers-4.3.3:
      Successfully uninstalled transformers-4.3.3
[31mERROR: After October 2020 you may experience errors when installing or updati

In [2]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Build the named entity recognizer from the pre-trained checkpoint on disk.
MODEL_DIR = "./model"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# NOTE(review): ignore_mismatched_sizes=True lets loading succeed even when some
# checkpoint weights do not fit the model config - confirm this is intentional.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_DIR,
    ignore_mismatched_sizes=True,
)

# The "ner" pipeline returns one dict per aggregated entity for a given text.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)

In [4]:
import json

# read and merge the two scenario datasets
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON has been parsed (the bare open() calls previously leaked both handles).
scenarios = []
for dataset_path in ('../datasets/scenarios1.json', '../datasets/scenarios2_200.json'):
    with open(dataset_path) as dataset_file:
        scenarios.extend(json.load(dataset_file))

In [5]:
# sanity check: number of scenarios in the merged list
print(len(scenarios))

300


In [72]:
# Run the NER pipeline over every scenario (long running process) and collect
# all detected entities in one flat list. Each entity dict is tagged with the
# full scenario text it came from so later cells can recover its context.
entities = []
for scenario in scenarios:
    source_text = scenario['text']
    for entity in nlp(source_text):
        entity['text'] = source_text
        entities.append(entity)

In [77]:
# report how many entities were collected over how many scenarios
print('Found %i entities across %i scenarios.' % (len(entities), len(scenarios)))

Found 4723 entities across 300 scenarios.


In [143]:
import nltk

# scans words for phrase, starting at the word that covers start_char
def _scan_for_phrase(words, phrase, start_char):
    """Return (first, last) word indices of ``phrase`` in ``words``, or None.

    The scan starts at the first word whose estimated end offset reaches
    ``start_char`` and moves forward; ``last`` is exclusive.
    """
    phrase_len = len(phrase.split())
    char_index = 0
    for i, word in enumerate(words):
        char_index += len(word)
        # char_index + i estimates the end offset of word i, assuming exactly
        # one separating character between consecutive words
        if char_index + i < start_char:
            continue
        # only positions where the whole phrase still fits are candidates
        for k in range(i, len(words) - phrase_len + 1):
            if ' '.join(words[k:k + phrase_len]) == phrase:
                return (k, k + phrase_len)
        return None
    return None


# returns the start and end word indices for the phrase in words
def word_index(words, phrase, start_char):
    """Locate ``phrase`` inside the token list ``words``.

    Args:
        words: list of word tokens for the full text.
        phrase: the phrase to find; spacing artefacts around apostrophes,
            hyphens and slashes ("' ", " - ", " / ") are repaired before
            matching.
        start_char: character offset of the phrase in the original text, used
            to decide where in ``words`` the search begins.

    Returns:
        A ``(first, last)`` tuple of word indices (``last`` exclusive) such
        that ``' '.join(words[first:last])`` equals the repaired phrase, or
        ``None`` when no match is found at or after the estimated position.
    """
    # correct certain parse errors
    extras = ["' ", " - ", " / "]
    first_only = phrase   # repairs the first occurrence of each artefact
    collapsed = phrase    # repairs every occurrence of each artefact
    for extra in extras:
        first_only = first_only.replace(extra, extra.strip(), 1)
        collapsed = collapsed.replace(extra, extra.strip())

    index = _scan_for_phrase(words, first_only, start_char)
    # Fallback for phrases with a repeated artefact ("state - of - the - art"):
    # only tried when the first-occurrence form found nothing, so every phrase
    # that already matched keeps exactly the same result.
    if index is None and collapsed != first_only:
        index = _scan_for_phrase(words, collapsed, start_char)
    return index

# construct word, tag lists for phrase and words to the left and right of phrase
errors = 0
parsed = {}  # scenario text -> its (word, tag) list, so each text is tagged once
for e in entities:
    # reuse the tagged scenario texts
    if e['text'] not in parsed:
        parsed[e['text']] = nltk.pos_tag(nltk.word_tokenize(e['text']))
    tags = parsed[e['text']]

    # create separate word list and use to estimate word indices from char indices
    words = [w for (w, p) in tags]
    index = word_index(words, e['word'], e['start'])

    # save the associated word, tag lists
    # p_words tags the entity's own character span in isolation
    e['p_words'] = nltk.pos_tag(nltk.word_tokenize(e['text'][e['start']:e['end']]))
    if not index:
        # phrase could not be located in the token list: no context available
        e['l_words'] = []
        e['r_words'] = []
        errors += 1
    else:
        first, last = index
        e['r_words'] = tags[last:last + 3]
        # Clamp the window start at 0: for a phrase within the first three
        # tokens, first - 3 is negative and the slice would wrap around to the
        # end of the list, silently dropping the left context.
        # NOTE(review): the window ends at `last`, so l_words also contains the
        # phrase's own tokens - confirm whether it should end at `first`.
        e['l_words'] = tags[max(0, first - 3):last]

# report any phrases that could not be indexed
print('Unable to find word %i/%i boundaries.' % (errors, len(entities)))

Unable to find word 45/4723 boundaries.


In [150]:
import nltk

# Drop phrases that look incomplete, using a few part-of-speech rules.
# `filtered` keeps the surviving entities; `excluded` records
# (rule number, phrase, tagged phrase) for every rejected one.
filtered, excluded = [], []
for e in entities:
    tagged = e['p_words']
    tail_tag = tagged[-1][1]
    head_word, head_tag = tagged[0][0], tagged[0][1]
    tail_is_noun_or_gerund = tail_tag.startswith(('NN', 'VBG'))

    if tail_tag.startswith(('DT', 'CC', 'PRP$')):
        # rule 1: ends in a determiner, conjunction or possessive pronoun
        rule = 1
    elif head_tag == 'POS' or head_word == 'of' or head_tag == 'CC':
        # rule 3: begins with a possessive marker, 'of' or a conjunction
        rule = 3
    elif len(tagged) == 1 and not tail_is_noun_or_gerund:
        # rule 2: single word that is neither a noun nor a gerund
        rule = 2
    elif len(tagged) == 2 and not head_tag.startswith('NN') and not tail_is_noun_or_gerund:
        # rule 2: two words, no leading noun, not ending in a noun or gerund
        rule = 2
    else:
        rule = None

    if rule is None:
        filtered.append(e)
    else:
        excluded.append((rule, e['word'], tagged))

# to inspect the rejected phrases and the rule that rejected each one,
# iterate over `excluded` and print its entries

print('Filtered from %i to %i unique entities' % (len(entities), len(filtered)))
print('Excluded %i entites' % len(excluded))

# False positives = 0.38 * 4723 = 1571, true positives = 3152
# Enhanced precision = 3152 / 4499 = 0.70

Filtered from 4723 to 4499 unique entities
Excluded 224 entites


In [145]:
# remove duplicate phrases for easier analysis
# Only the first entity seen for each lower-cased phrase is kept; each result
# row is [phrase, left context, right context, tagged phrase].
unique = set()
results = []
duplicate = 0
for e in filtered:
    phrase = e['word'].lower()
    if phrase in unique:
        duplicate += 1
        continue
    unique.add(phrase)
    # The tagged phrase is stored under 'p_words'; no cell in this notebook
    # sets a 'pos' key, so reading e['pos'] fails on a fresh run.
    results.append([phrase, e['l_words'], e['r_words'], e['p_words']])

print('Found %i unique entities.' % len(unique))

Found 1944 unique entities.


In [146]:
import csv

# sort phrases alphabetically
results.sort(key=lambda x:x[0])

# write phrases to a file
# newline='' is required by the csv module: without it the writer's own line
# endings get translated again and every row is followed by a blank line on
# platforms that use \r\n.
with open('entities_300.csv', 'w', newline='') as f:
    csv.writer(f).writerows(results)