In [92]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# functions to create NER features

def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Every element of ``sent`` is read as ``[0]`` = word and ``[1]`` = POS tag.
    The result describes the token itself (lower-cased form, suffixes,
    casing/digit flags, POS tag) plus the same kind of information for the
    immediate left and right neighbours. When a neighbour does not exist the
    flag ``'BOS'`` (no left neighbour) or ``'EOS'`` (no right neighbour) is
    set to ``True`` instead.

    Args:
        sent: sequence of token records, indexable as ``record[0]`` (word)
            and ``record[1]`` (POS tag).
        i: index of the token to describe.

    Returns:
        dict mapping feature names to values.
    """
    token = sent[i][0]
    tag = sent[i][1]

    # features of the token itself
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # left neighbour, or a beginning-of-sentence marker for the first token
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i-1][0]
        prev_tag = sent[i-1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # right neighbour, or an end-of-sentence marker for the last token
    if i >= len(sent)-1:
        feats['EOS'] = True
    else:
        next_token = sent[i+1][0]
        next_tag = sent[i+1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order.

    Each dict is produced by ``word2features(sent, position)``.
    """
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features

def sent2labels(sent):
    """Return the label (third item) of every ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    # three-way unpacking is kept on purpose: records of any other length raise
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every ``(token, postag, label)`` triple in ``sent``."""
    tokens = []
    # three-way unpacking is kept on purpose: records of any other length raise
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [93]:
import json

# convert the sentence-word triples into features
# Open the file in a context manager so the handle is closed as soon as the
# JSON is parsed, and read it explicitly as UTF-8 rather than with the
# platform's default encoding.
with open('../datasets/scenarios-training.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

# Every scenario stores a list of sentences under 'tokens'; the nested
# comprehensions flatten them so each sentence becomes one sequence.
X_train = [sent2features(s) for scenario in dataset['train'] for s in scenario['tokens']]
y_train = [sent2labels(s) for scenario in dataset['train'] for s in scenario['tokens']]
X_test = [sent2features(s) for scenario in dataset['test'] for s in scenario['tokens']]
y_test = [sent2labels(s) for scenario in dataset['test'] for s in scenario['tokens']]

In [80]:
import json

# convert the sentence-word triples into features
# NOTE(review): this cell assigns the same names (dataset, X_train, y_train,
# X_test, y_test) as the cell that loads 'scenarios-training.json'. Whichever
# of the two runs last decides what the model is trained on, so run only one.
# Open the file in a context manager so the handle is closed as soon as the
# JSON is parsed, and read it explicitly as UTF-8 rather than with the
# platform's default encoding.
with open('../datasets/sentences-training.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

# Here every entry of a split is a single sentence whose triples live under 'tokens'.
X_train = [sent2features(s['tokens']) for s in dataset['train']]
y_train = [sent2labels(s['tokens']) for s in dataset['train']]
X_test = [sent2features(s['tokens']) for s in dataset['test']]
y_test = [sent2labels(s['tokens']) for s in dataset['test']]

In [94]:
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF

# the dataset already provides 'train' and 'test' splits, so a random split is not needed here
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create the CRF tagger, then fit it on the training sequences
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    c1=0.1,  # presumably the L1 regularisation coefficient; confirm in the sklearn-crfsuite docs
    c2=0.1,  # presumably the L2 regularisation coefficient; confirm in the sklearn-crfsuite docs
    all_possible_transitions=False,
)

# kept as the last expression so the notebook displays the fitted estimator
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [95]:
from sklearn.metrics import classification_report
import numpy as np

# evaluate the model on the test data
y_pred = crf.predict(X_test)

# flatten the per-sentence label sequences into one flat array each, so the
# report below is computed over individual token labels
y_test_flat = np.array([tag for sentence_tags in y_test for tag in sentence_tags])
y_pred_flat = np.array([tag for sentence_tags in y_pred for tag in sentence_tags])

token_report = classification_report(y_test_flat, y_pred_flat)
print(token_report)

              precision    recall  f1-score   support

       B-COM       0.26      0.19      0.22        32
       B-QUE       0.77      0.51      0.62        47
       B-SIM       0.76      0.67      0.71       390
       I-COM       0.39      0.33      0.36       189
       I-QUE       0.73      0.51      0.60       257
       I-SIM       0.70      0.57      0.63        98
           O       0.92      0.96      0.94      4748

    accuracy                           0.89      5761
   macro avg       0.65      0.53      0.58      5761
weighted avg       0.88      0.89      0.88      5761



In [96]:
# NOTE: this import rebinds the name `classification_report`, which an earlier
# cell imported from sklearn.metrics; from here on it refers to seqeval's version.
from seqeval.metrics import classification_report

# seqeval takes the nested per-sentence label sequences directly (no flattening)
entity_report = classification_report(y_test, y_pred)
print(entity_report)

              precision    recall  f1-score   support

         COM       0.13      0.09      0.11        32
         QUE       0.74      0.49      0.59        47
         SIM       0.73      0.64      0.68       390

   micro avg       0.69      0.59      0.64       469
   macro avg       0.53      0.41      0.46       469
weighted avg       0.69      0.59      0.63       469



In [97]:
import pickle

# persist the fitted CRF to 'crf.model' using the highest pickle protocol
# available in this interpreter; only unpickle this file from a trusted source
with open('crf.model', 'wb') as model_file:
    pickle.dump(
        crf,
        model_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )