In [1]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# functions to create NER features

def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each item of ``sent`` is indexed as ``item[0]`` (the word) and
    ``item[1]`` (its POS tag).  The result combines features of the token
    itself with features of its immediate left and right neighbours; when a
    neighbour is missing, a ``'BOS'`` / ``'EOS'`` flag is set instead.

    Args:
        sent: sequence of token items (word first, POS tag second).
        i: index of the token to describe.

    Returns:
        dict mapping feature names to string, bool or float values.
    """
    token, pos = sent[i][0], sent[i][1]

    # features of the token itself
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # left neighbour, or a beginning-of-sentence flag when there is none
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # right neighbour, or an end-of-sentence flag when there is none
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third element) of every triple in ``sent``."""
    labels = []
    # unpack exactly three fields per item; only the last one is kept
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the word (first element) of every triple in ``sent``."""
    words = []
    # unpack exactly three fields per item; only the first one is kept
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [2]:
import json

# Read the labelled scenarios.  The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open("../datasets/scenarios-labeled.json", 'r') as scenarios_file:
    scenarios = json.load(scenarios_file)

In [3]:
import spacy
import nltk

# load the spaCy nlp pipeline
nlp = spacy.load("en_core_web_sm")

# every sentence is a list of [word, POS tag, code] triples
sentences = []
for scenario_id, scenario in scenarios.items():

    # parse the scenario text so spaCy can mark where sentences start
    doc = nlp(scenario['text'])
    # universal POS tags for the scenario's word list
    tag = [t for w, t in nltk.pos_tag(scenario['words'], tagset='universal')]

    # NOTE(review): the position i of each spaCy token is used to index
    # scenario['words'], `tag` and scenario['codes'], which presumes spaCy's
    # tokenisation of the text lines up one-to-one with those lists -- confirm
    sent = []
    for i, token in enumerate(doc):
        # a new sentence begins: store the finished one and start afresh
        if token.is_sent_start and len(sent) > 0:
            sentences.append(sent)
            sent = []
        sent.append([scenario['words'][i], tag[i], scenario['codes'][i]])

    # store the final sentence, but skip it when the scenario produced no
    # tokens so that no empty sequence ends up in the data set
    if sent:
        sentences.append(sent)

In [4]:
# turn each sentence of triples into model inputs: X gets the per-token
# feature dicts, y gets the per-token labels

X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF

# Fixed seed for the shuffle: without it every run draws a different
# train/test partition and the evaluation scores cannot be reproduced.
RANDOM_SEED = 42

# split the data into train and test sets (20% of the sentences held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)

# create and fit the model
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [6]:
from sklearn.metrics import classification_report
import numpy as np

# predict a label sequence for every held-out test sentence
y_pred = crf.predict(X_test)

# collapse the per-sentence label lists into flat arrays (one entry per
# token) and print the per-label evaluation report
y_test_flat = np.array([label for seq in y_test for label in seq])
y_pred_flat = np.array([label for seq in y_pred for label in seq])

print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

       B-COM       0.27      0.12      0.17        57
       B-QUE       0.63      0.57      0.60        58
       B-SIM       0.73      0.70      0.71       673
       I-COM       0.33      0.17      0.23       349
       I-QUE       0.47      0.45      0.46       376
       I-SIM       0.74      0.63      0.68       188
           O       0.93      0.96      0.94      9233

    accuracy                           0.89     10934
   macro avg       0.58      0.51      0.54     10934
weighted avg       0.87      0.89      0.88     10934

