In [2]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# functions to create NER features

def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: indexable sequence of items whose element 0 is the word and
            element 1 is its POS tag (further elements are ignored here).
        i: index of the token to describe.

    Returns:
        dict mapping feature names to values. It always holds the features of
        the token itself, plus features of the previous / next token when
        they exist, or the ``'BOS'`` / ``'EOS'`` marker when they do not.
    """
    token, pos = sent[i][0], sent[i][1]

    # features of the token itself
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # left context: the first token has no predecessor
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # right context: the last token has no successor
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third element) of every ``(token, postag, label)`` item."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every ``(token, postag, label)`` item."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [3]:
import json

# Load the labeled scenarios. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open("../datasets/scenarios-labeled.json", 'r') as scenarios_file:
    scenarios = json.load(scenarios_file)

In [4]:
import spacy, nltk

# load the spaCy NLP pipeline (used below only for its sentence-start flags)
nlp = spacy.load("en_core_web_sm")

# list of sentences; each sentence is a list of [word, universal POS tag, code]
sentences = []
for scenario_id, scenario in scenarios.items():

    # parse the scenario text so spaCy marks where each sentence starts
    doc = nlp(scenario['text'])
    # universal POS tag for every pre-tokenized word of the scenario
    tag = [t for w, t in nltk.pos_tag(scenario['words'], tagset='universal')]

    # NOTE(review): spaCy token i is assumed to line up with
    # scenario['words'][i] and scenario['codes'][i] -- confirm the tokenization
    # matches; extra spaCy tokens raise IndexError, missing ones drop trailing words.
    sent = []
    for i, token in enumerate(doc):
        # a new sentence begins: flush the one collected so far
        if token.is_sent_start and len(sent) > 0:
            sentences.append(sent)
            sent = []
        sent.append([scenario['words'][i], tag[i], scenario['codes'][i]])

    # flush the last sentence, but never add an empty one (scenario without tokens)
    if sent:
        sentences.append(sent)

In [5]:
# turn every sentence of [word, postag, label] triples into the CRF inputs:
# X holds per-token feature dicts, y holds the matching label sequences

X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [6]:
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF

# fixed seed so the train/test split, and every metric computed from it,
# is the same on each Restart & Run All
RANDOM_SEED = 42

# split the data into train and test sets (20% of the sentences held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)

# create and fit the model
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [7]:
from sklearn.metrics import classification_report
import numpy as np

# evaluate the model on the test data
y_pred = crf.predict(X_test)

# flatten the per-sentence label lists into 1-D arrays so the report is
# computed per token
y_test_flat = np.array(
    [label for sent_labels in y_test for label in sent_labels])
y_pred_flat = np.array(
    [label for sent_labels in y_pred for label in sent_labels])

print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

       B-COM       0.20      0.07      0.11        70
       B-QUE       0.70      0.44      0.54        68
       B-SIM       0.72      0.70      0.71       726
       I-COM       0.21      0.10      0.13       364
       I-QUE       0.60      0.38      0.46       411
       I-SIM       0.68      0.60      0.64       200
           O       0.92      0.97      0.94      9288

    accuracy                           0.88     11127
   macro avg       0.58      0.46      0.50     11127
weighted avg       0.86      0.88      0.87     11127



In [8]:
import pickle

# persist the fitted CRF to disk with the newest pickle protocol available
with open('crf.model', mode='wb') as f:
    pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL).dump(crf)

In [10]:
from seqeval.metrics import classification_report

# NOTE: `classification_report` is the seqeval function imported in this cell;
# it rebinds the sklearn name imported in the earlier evaluation cell. It takes
# the nested per-sentence label lists and prints per-entity rows, not per-tag rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         COM       0.15      0.06      0.08        70
         QUE       0.56      0.35      0.43        68
         SIM       0.70      0.68      0.69       726

   micro avg       0.67      0.60      0.64       864
   macro avg       0.47      0.36      0.40       864
weighted avg       0.64      0.60      0.62       864

