In [1]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# functions to create NER features

def word2features(sent, i):
    """Return the feature dict for the token at position ``i`` of ``sent``.

    Each element of ``sent`` is read by index: ``[0]`` is the word string and
    ``[1]`` is its POS tag. Any further fields are ignored by this function.

    The dict always holds the features of the token itself. Features of the
    previous token are stored under ``-1:`` keys, or ``'BOS'`` is set when
    there is no previous token; features of the next token are stored under
    ``+1:`` keys, or ``'EOS'`` is set when there is no next token.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or a beginning-of-sentence marker when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or an end-of-sentence marker when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the third field (label) of every 3-item entry in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the first field (token) of every 3-item entry in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [2]:
import pickle

# Load the pickled model object from 'crf.model' in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load a 'crf.model' file that comes from a trusted source.
with open('crf.model', 'rb') as f:
    model = pickle.load(f)

In [24]:
import spacy, nltk, json

# load the spaCy NLP pipeline
nlp = spacy.load("en_core_web_sm")

def segment_text(scenario_text):
    """Split ``scenario_text`` into sentences of POS-tagged tokens.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. The
    token strings are then tagged in a single ``nltk.pos_tag`` call using the
    universal tagset.

    Returns:
        A list of sentences. Each sentence is a list of
        ``[token_text, pos_tag, token.idx]`` entries, where ``token.idx`` is
        the character offset spaCy reports for the token. Text that yields
        no tokens returns an empty list.
    """
    # parse scenario text to tokenize sentences
    doc = nlp(scenario_text)
    # Tag every token in one call; the result lines up one-to-one with ``doc``.
    tags = [tag for _, tag in nltk.pos_tag([token.text for token in doc], tagset='universal')]

    sentences = []
    sent = []
    for token, tag in zip(doc, tags):
        # A token flagged as a sentence start closes the sentence collected
        # so far (unless nothing has been collected yet).
        if token.is_sent_start and sent:
            sentences.append(sent)
            sent = []
        sent.append([token.text, tag, token.idx])
    # Flush the trailing sentence, but never emit an empty one: without this
    # guard, text with no tokens produced a phantom ``[[]]`` result.
    if sent:
        sentences.append(sent)
    return sentences

# load the scenario data
# A context manager closes the file handle as soon as the JSON is parsed.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the
# decoded text does not depend on the platform's default locale.
with open('../datasets/scenarios-labeled.json', 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)
scenarios = list(dataset.values())

# Segment the first scenario's text into sentences, then build one list of
# token feature dicts per sentence.
segments = segment_text(scenarios[0]['text'])
X = list(map(sent2features, segments))


In [25]:
# Run the loaded model on the per-sentence feature lists in X. The result is
# indexed below as y_pred[sentence][token] to get each token's label string.
y_pred = model.predict(X)

In [32]:
# Merge the per-token 'B-'/'I-'/other labels into entity spans. Each entity
# is a dict with 'start' and 'end' (offsets taken from the third field of the
# segment entries) and 'entity_group' (the label without its 'B-'/'I-' prefix).
entities = []
for i in range(len(segments)):
    # Entity currently being extended. It is reset for every sentence, so an
    # 'I-' label can never stretch an entity across a sentence boundary.
    open_entity = None
    for j in range(len(segments[i])):
        text = segments[i][j][0]
        text_start = segments[i][j][2]
        text_end = text_start + len(text)
        label = y_pred[i][j]
        print('%s\t%s' % (text.ljust(15), label))

        if label.startswith('B-') or label.startswith('I-'):
            group = label[2:]
            continues_open_entity = (
                label.startswith('I-')
                and open_entity is not None
                and open_entity['entity_group'] == group
            )
            if continues_open_entity:
                open_entity['end'] = text_end
            else:
                # 'B-' always opens a new entity. An 'I-' with no matching
                # open entity is also treated as a new entity: blindly
                # updating entities[-1] raised IndexError when no entity
                # existed yet, and otherwise silently extended an unrelated
                # earlier entity over the tokens in between.
                open_entity = {'start': text_start, 'end': text_end, 'entity_group': group}
                entities.append(open_entity)
        else:
            # Any other label ends the entity in progress.
            open_entity = None

From           	O
this           	O
screen         	O
,              	O
I              	O
like           	O
to             	O
search         	O
for            	O
anything       	O
from           	O
recipes        	B-SIM
,              	O
to             	O
home           	B-SIM
decor          	I-SIM
,              	O
to             	O
people         	B-SIM
,              	O
etc            	O
.              	O
,              	O
just           	O
depending      	O
on             	O
my             	O
mood           	O
.              	O
To             	O
get            	O
to             	O
this           	O
screen         	O
all            	O
I              	O
had            	O
to             	O
do             	O
was            	O
tap            	O
on             	O
the            	O
little         	O
magnifying     	O
glass          	O
next           	O
to             	O
the            	O
image          	O
of             	O
the            	O
home           	O
icon           	O
,           

In [33]:
# Report how many entities were collected, then print each one as the slice
# of the first scenario's text it covers, followed by its entity group.
entity_count = len(entities)
print('Found %i entities' % entity_count)
for entity in entities:
    covered_text = scenarios[0]['text'][entity['start']:entity['end']]
    print('%s (%s)' % (covered_text, entity['entity_group']))

Found 12 entities
recipes (SIM)
home decor (SIM)
people (SIM)
preferences (SIM)
trends (SIM)
search bar (SIM)
word (SIM)
phrase (SIM)
recipe (SIM)
recipe (SIM)
phrase (SIM)
board (SIM)
