In [1]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# functions to create NER features

def word2features(sent, i):
    """Build the CRF feature dictionary for the token at index ``i`` of ``sent``.

    Every item of ``sent`` is read positionally: ``item[0]`` is used as the
    word string and ``item[1]`` as its POS tag; further fields are ignored.

    The result contains features of the token itself plus a reduced feature
    set for the previous and next tokens (keys prefixed ``-1:`` / ``+1:``).
    ``BOS`` is set instead of the ``-1:`` features when ``i > 0`` is false,
    and ``EOS`` instead of the ``+1:`` features when ``i`` is not below
    ``len(sent) - 1``.
    """
    def neighbour_features(offset, item):
        # Reduced feature set for an adjacent token; `offset` is '-1' or '+1'.
        text, tag = item[0], item[1]
        return {
            offset + ':word.lower()': text.lower(),
            offset + ':word.istitle()': text.istitle(),
            offset + ':word.isupper()': text.isupper(),
            offset + ':postag': tag,
            offset + ':postag[:2]': tag[:2],
        }

    token, pos = sent[i][0], sent[i][1]

    # Features of the current token: identity, suffixes, casing and POS tag.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i > 0:
        feats.update(neighbour_features('-1', sent[i - 1]))
    else:
        feats['BOS'] = True

    # Right context, or the end-of-sentence marker.
    if i < len(sent) - 1:
        feats.update(neighbour_features('+1', sent[i + 1]))
    else:
        feats['EOS'] = True

    return feats


def sent2features(sent):
    """Return one feature dict per position of ``sent``, in sentence order.

    Each dict is produced by ``word2features(sent, position)``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every item in ``sent``.

    Each item must unpack into exactly three values: token, POS tag, label.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every item in ``sent``.

    Each item must unpack into exactly three values: token, POS tag, label.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [2]:
import json

# Selects which corpus layout to load:
#   True  -> 'sentences-training-9.json': every entry of a split is one
#            sentence, stored under its 'tokens' key.
#   False -> 'scenarios-training-N.json': every entry of a split is a
#            scenario whose 'tokens' key holds a list of sentences.
USE_SENTENCE_DATASET = False

if USE_SENTENCE_DATASET:
    # `with` closes the file handle as soon as the JSON has been parsed
    with open('sentences-training-9.json') as dataset_file:
        dataset = json.load(dataset_file)

    train_sents = [s['tokens'] for s in dataset['train']]
    test_sents = [s['tokens'] for s in dataset['test']]
else:
    with open('scenarios-training-N.json') as dataset_file:
        dataset = json.load(dataset_file)

    # flatten scenarios into a single list of sentences per split
    train_sents = [s for scenario in dataset['train'] for s in scenario['tokens']]
    test_sents = [s for scenario in dataset['test'] for s in scenario['tokens']]

# convert the sentence-word triples into features and label sequences
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [3]:
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF

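# NOTE: no random split is performed here -- the train and test sets are taken
# from the 'train' / 'test' keys of the dataset file loaded above.
# Previous approach, kept for reference:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)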

# create and fit the model
# c1 / c2 are the L1 / L2 regularisation coefficients of the CRF.
# NOTE(review): these exact values look like the result of a hyperparameter
# search -- confirm their provenance before changing them.
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.335955589737437,
         c2 = 0.05948353066282411,
         max_iterations = 100,
         all_possible_transitions = False)

crf.fit(X_train, y_train)

# All learned labels except the 'O' tag. Filtering (rather than
# list.remove) avoids a ValueError when 'O' never occurred in y_train.
labels = [label for label in crf.classes_ if label != 'O']

In [4]:
from sklearn.metrics import classification_report
import numpy as np

# evaluate the model on the test data
y_pred = crf.predict(X_test)


def _flatten_labels(sequences):
    """Flatten per-sentence label lists into a single numpy array of labels."""
    return np.array([label for sequence in sequences for label in sequence])


# the report below scores individual tokens, so both the gold and the
# predicted label sequences are flattened across sentences first
y_test_flat = _flatten_labels(y_test)
y_pred_flat = _flatten_labels(y_pred)

print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

       B-COM       0.50      0.07      0.12        46
       B-QUE       0.77      0.62      0.69        77
       B-SIM       0.62      0.61      0.62       674
       I-COM       0.34      0.05      0.09       196
       I-QUE       0.66      0.57      0.61       499
       I-SIM       0.37      0.31      0.33       117
           O       0.92      0.96      0.94      7526

    accuracy                           0.88      9135
   macro avg       0.60      0.45      0.48      9135
weighted avg       0.86      0.88      0.86      9135



In [5]:
# NOTE: this import rebinds the name `classification_report`, replacing the
# sklearn.metrics function of the same name imported in an earlier cell.
from seqeval.metrics import classification_report

# seqeval takes the nested (per-sentence) label sequences directly and, as the
# printed table shows, reports per entity type (COM / QUE / SIM) rather than
# per B-/I- tag.
entity_report = classification_report(y_test, y_pred)
print(entity_report)

              precision    recall  f1-score   support

         COM       0.33      0.04      0.08        46
         QUE       0.55      0.44      0.49        78
         SIM       0.61      0.61      0.61       674

   micro avg       0.61      0.56      0.58       798
   macro avg       0.50      0.36      0.39       798
weighted avg       0.59      0.56      0.57       798



In [6]:
import pickle

# Serialise the fitted CRF to 'crf.model' using the newest pickle protocol
# available in this interpreter.
# NOTE: unpickling can execute arbitrary code -- only load this file back
# from a trusted location.
with open('crf.model', 'wb') as model_file:
    pickle.dump(crf, model_file, protocol=pickle.HIGHEST_PROTOCOL)