Conditional Random Fields have lost some of their popularity since the advent of neural-network models. Still, they can be very effective for named entity recognition, particularly when word embedding information is taken into account.

In [1]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import metrics

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

## Importing the CoNLL-2002 data from NLTK

In [2]:
# Load the three Dutch ("ned") CoNLL-2002 files via NLTK: train, testa (used
# as the development set) and testb (used as the test set). Each corpus view
# is materialised into a plain list.
train_sents, dev_sents, test_sents = (
    list(nltk.corpus.conll2002.iob_sents(fileid))
    for fileid in ('ned.train', 'ned.testa', 'ned.testb')
)

In [3]:
# Peek at the first training sentence: a list of (word, POS tag, label) tuples.
train_sents[0]

[('De', 'Art', 'O'),
 ('tekst', 'N', 'O'),
 ('van', 'Prep', 'O'),
 ('het', 'Art', 'O'),
 ('arrest', 'N', 'O'),
 ('is', 'V', 'O'),
 ('nog', 'Adv', 'O'),
 ('niet', 'Adv', 'O'),
 ('schriftelijk', 'Adj', 'O'),
 ('beschikbaar', 'Adj', 'O'),
 ('maar', 'Conj', 'O'),
 ('het', 'Art', 'O'),
 ('bericht', 'N', 'O'),
 ('werd', 'V', 'O'),
 ('alvast', 'Adv', 'O'),
 ('bekendgemaakt', 'V', 'O'),
 ('door', 'Prep', 'O'),
 ('een', 'Art', 'O'),
 ('communicatiebureau', 'N', 'O'),
 ('dat', 'Conj', 'O'),
 ('Floralux', 'N', 'B-ORG'),
 ('inhuurde', 'V', 'O'),
 ('.', 'Punc', 'O')]

## Feature Extraction

We use word embeddings trained on Dutch Wikipedia, clustered into 500 clusters. The cluster id of each word serves as an additional feature.

In [7]:
def read_clusters(cluster_file):
    """Load a word -> cluster-id mapping from a tab-separated file.

    Every non-blank line must hold exactly two tab-separated fields: the word
    and its cluster id. Blank lines (for example an empty line at the end of
    the file) are skipped instead of aborting the whole load.

    Args:
        cluster_file: Path of the UTF-8 encoded TSV file.

    Returns:
        dict mapping each word (str) to its cluster id (str). When a word
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields. The message names the offending line.
    """
    word2cluster = {}
    with open(cluster_file, encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                # Tolerate empty lines; they carry no mapping.
                continue
            fields = stripped.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s, line %d: expected 'word<TAB>cluster', got %r"
                    % (cluster_file, line_number, stripped)
                )
            word, cluster = fields
            word2cluster[word] = cluster
    return word2cluster

In [8]:
# Read the word -> cluster id mapping from the clusters TSV file.
word2cluster = read_clusters("embeddings/clusters_nl.tsv")

Assembling all the features we need for each token.

In [9]:
def _neighbour_features(token, offset):
    """Return the context features of a neighbouring token, prefixed with `offset` (e.g. '-1')."""
    word, postag = token[0], token[1]
    return [
        offset + ':word.lower=' + word.lower(),
        offset + ':word.istitle=%s' % word.istitle(),
        offset + ':word.isupper=%s' % word.isupper(),
        offset + ':postag=' + postag,
    ]


def word2features(sent, i, word2cluster):
    """Build the list of string features for the token at position `i` of `sent`.

    Args:
        sent: Sequence of tokens; each token is indexable with the word at
            index 0 and the POS tag at index 1, e.g. ('De', 'Art', 'O').
        i: Index of the target token in `sent`.
        word2cluster: Mapping from lowercased word to cluster id.

    Returns:
        list of str: features of the target token itself (lowercased form,
        last three / two characters, casing and digit flags, cluster id, POS
        tag), followed by features of up to two tokens on either side.
        'BOS' is added for the first token and 'EOS' for the last one.
    """
    word = sent[i][0]
    postag = sent[i][1]
    lowered = word.lower()

    # Words without a known cluster get the placeholder id "0". Looking the id
    # up *before* formatting matters: in
    #     'word.cluster=%s' % a if cond else "0"
    # the `%` binds tighter than the conditional, so unknown words would
    # produce the bare feature "0" instead of "word.cluster=0".
    cluster = word2cluster[lowered] if lowered in word2cluster else "0"

    features = [
        'bias',
        'word.lower=' + lowered,
        # the character trigram and bigram the word ends with
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        # binary features
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        # cluster id & postag
        'word.cluster=%s' % cluster,
        'postag=' + postag,
    ]

    # Previous token, or 'BOS' when the target is the beginning of the sentence.
    if i > 0:
        features.extend(_neighbour_features(sent[i - 1], '-1'))
    else:
        features.append('BOS')

    # One token further back, when available.
    if i > 1:
        features.extend(_neighbour_features(sent[i - 2], '-2'))

    # Next token, or 'EOS' to signify the end of the sentence.
    if i < len(sent) - 1:
        features.extend(_neighbour_features(sent[i + 1], '+1'))
    else:
        features.append('EOS')

    # One token further ahead, when available.
    if i < len(sent) - 2:
        features.extend(_neighbour_features(sent[i + 2], '+2'))

    return features

In [10]:
def sent2features(sent, word2cluster):
    """Return one feature list per token of `sent`, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position, word2cluster))
    return per_token

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in `sent`, in order."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in `sent`, in order."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [11]:
# Turn every split into parallel lists: per-sentence token features (X) and
# per-sentence label sequences (y).
X_train = [sent2features(sentence, word2cluster) for sentence in train_sents]
y_train = list(map(sent2labels, train_sents))

X_dev = [sent2features(sentence, word2cluster) for sentence in dev_sents]
y_dev = list(map(sent2labels, dev_sents))

X_test = [sent2features(sentence, word2cluster) for sentence in test_sents]
y_test = list(map(sent2labels, test_sents))

## Training

We create a CRF model and train it. We'll use the standard L-BFGS algorithm for parameter estimation and run it for at most 100 iterations.

L-BFGS is an optimization algorithm: a limited-memory version of the Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm. It uses less memory by storing only a few vectors that implicitly represent the approximation to the inverse Hessian,
instead of the dense $n \times n$ matrix.

In [12]:
# Baseline CRF trained with L-BFGS for at most 100 iterations.
# `verbose` is passed as a real boolean: the string 'true' only worked because
# any non-empty string is truthy (so 'false' would have enabled logging too).
crf = crfsuite.CRF(
    verbose=True,
    algorithm='lbfgs',
    max_iterations=100
)

# The dev set is handed to the trainer as well; the training log reports
# precision/recall/F1 on it after every iteration.
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

loading training data to CRFsuite: 100%|███████████████████████████████████████| 15806/15806 [00:09<00:00, 1682.22it/s]





loading dev data to CRFsuite: 100%|██████████████████████████████████████████████| 2895/2895 [00:01<00:00, 1452.88it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 152117
Seconds required: 2.511

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=1.26  loss=104214.83 active=152117 precision=0.100  recall=0.111  F1=0.105  Acc(item/seq)=0.901 0.496  feature_norm=1.00
Iter 2   time=0.68  loss=96997.81 active=152117 precision=0.100  recall=0.111  F1=0.105  Acc(item/seq)=0.901 0.496  feature_norm=1.13
Iter 3   time=0.70  loss=92085.38 active=152117 precision=0.100  recall=0.111  F1=0.105  Acc(item/seq)=0.901 0.496  feature_norm=1.26
Iter 4   time=0.69  loss=84277.67 active=152117 precision=0.100  recall=0.111  F1=0.105  Acc(item/seq)=0.901 0.496  feature_norm=1.51
Iter 5   time=0.80  loss=67577.53 active=15

Iter 60  time=0.69  loss=10712.24 active=152117 precision=0.779  recall=0.719  F1=0.744  Acc(item/seq)=0.970 0.788  feature_norm=44.41
Iter 61  time=0.71  loss=10602.81 active=152117 precision=0.789  recall=0.709  F1=0.740  Acc(item/seq)=0.970 0.789  feature_norm=44.68
Iter 62  time=0.69  loss=10508.84 active=152117 precision=0.782  recall=0.711  F1=0.739  Acc(item/seq)=0.970 0.787  feature_norm=45.37
Iter 63  time=0.73  loss=10458.88 active=152117 precision=0.783  recall=0.717  F1=0.744  Acc(item/seq)=0.970 0.788  feature_norm=45.51
Iter 64  time=0.99  loss=10420.78 active=152117 precision=0.763  recall=0.711  F1=0.730  Acc(item/seq)=0.969 0.786  feature_norm=45.67
Iter 65  time=0.91  loss=10315.28 active=152117 precision=0.766  recall=0.721  F1=0.735  Acc(item/seq)=0.969 0.786  feature_norm=46.30
Iter 66  time=0.82  loss=10204.10 active=152117 precision=0.769  recall=0.728  F1=0.740  Acc(item/seq)=0.970 0.786  feature_norm=47.10
Iter 67  time=0.74  loss=10134.54 active=152117 precisi

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose='true')

## Save the model

In [14]:
import joblib
import os

OUTPUT_PATH = "models/ner/"
OUTPUT_FILE = "crf_model"

# OUTPUT_PATH is nested, so create every missing parent directory as well
# (os.mkdir would fail when "models/" does not exist yet). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Persist the fitted CRF; joblib.dump returns the list of files it wrote.
joblib.dump(crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

['models/ner/crf_model']

## Evaluation

In [16]:
# Predict label sequences for the held-out test sentences.
y_pred = crf.predict(X_test)

# Evaluate entity labels only: drop the "O" (outside) label from the report.
labels = list(crf.classes_)
labels.remove("O")

# Order labels by entity type first and B/I prefix second, so that e.g.
# B-LOC and I-LOC end up next to each other in the report.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))

print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels))

              precision    recall  f1-score   support

       B-LOC       0.83      0.83      0.83       774
       I-LOC       0.29      0.41      0.34        49
      B-MISC       0.84      0.61      0.71      1187
      I-MISC       0.59      0.42      0.49       410
       B-ORG       0.80      0.69      0.74       882
       I-ORG       0.74      0.66      0.70       551
       B-PER       0.80      0.90      0.85      1098
       I-PER       0.87      0.95      0.91       807

   micro avg       0.80      0.74      0.77      5758
   macro avg       0.72      0.68      0.70      5758
weighted avg       0.80      0.74      0.76      5758



## Finding the optimal hyperparameters

Regularization prevents overfitting on the training data by adding a penalty to the loss function. In L1 regularization, this penalty is the sum of the absolute values of the weights; in L2 regularization, it is the sum of the squared weights. L1 regularization performs a type of feature selection, as it assigns 0 weight to irrelevant features. L2 regularization, by contrast, makes the weight of irrelevant features small, but not necessarily zero. L1 regularization is often called the Lasso method, L2 is called the Ridge method, and the linear combination of both is called Elastic Net regularization.

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded.
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed so the sampled (c1, c2) candidates are the same on every run.
SEARCH_SEED = 42

# NOTE: this rebinds `crf` to a fresh, unfitted estimator; the model fitted in
# the training cell is no longer reachable under that name afterwards.
crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# c1 is the L1 and c2 the L2 regularization coefficient; both are sampled
# from exponential distributions.
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Weighted F1 over the entity labels only (`labels` excludes "O").
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# 50 sampled candidates x 3 folds = 150 fits, run on all available cores.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=SEARCH_SEED)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [19]:
# Summarise the finished search. These attributes only exist once `rs.fit`
# has completed, so run this after the search cell is done.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print(f'model size: {rs.best_estimator_.size_ / 1000000:0.2f}M')

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [20]:
# Evaluate the best estimator found by the search on the test set, using the
# same label order as the baseline report but with three decimals.
best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        labels=sorted_labels,
        digits=3,
    )
)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'