In [59]:
%matplotlib inline  
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the plots drawn after this point.
plt.style.use('ggplot')

In [83]:
from itertools import chain 
import nltk  #To install this package in windows with conda please run:  'conda install -c anaconda nltk'. to install this package in windows with pip please run: 'pip install nltk'
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite  #to install this package in windows with pip please run: 'pip install sklearn-crfsuite'
from sklearn_crfsuite import metrics, scorers




## Let's use CoNLL 2002 data to build an NER system

The CoNLL 2002 corpus is available in NLTK. We use the Spanish data.

In [61]:
import nltk  
nltk.download('conll2002')    # fetch the CoNLL 2002 corpus via NLTK's downloader (reports "already up-to-date" when it is present)
# Uncomment the next line to list the files shipped with the corpus:
#nltk.corpus.conll2002.fileids()

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\moha\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [62]:
%%time
# The data consists of three files: one training file and two test files,
# testa and testb. The first test file is meant for the development phase
# (finding good parameters for the learning system); the second test file is
# meant for the final evaluation.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# Only the second test file ('esp.testb') is loaded here.
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

Wall time: 1.53 s


In [63]:
# Keep at most the first 4000 sentences of each split
# (presumably to keep the run time short - confirm before changing).
train_sents, test_sents = train_sents[:4000], test_sents[:4000]

In [64]:
len(train_sents)  # "sents" is an abbreviation of "sentences"

4000

## A Random Sample of data


In [65]:
train_sents[20]  # the sentence at index 20, i.e. the 21st training sentence

[('Como', 'CS', 'O'),
 ('contrapartida', 'NC', 'O'),
 (',', 'Fc', 'O'),
 ('Deutsche', 'NC', 'B-ORG'),
 ('Telekom', 'NC', 'I-ORG'),
 ('venderá', 'VMI', 'O'),
 ('al', 'SP', 'O'),
 ('consorcio', 'NC', 'O'),
 ('francés', 'AQ', 'O'),
 ('su', 'DP', 'O'),
 ('participación', 'NC', 'O'),
 ('del', 'SP', 'O'),
 ('25', 'Z', 'O'),
 ('por', 'SP', 'O'),
 ('ciento', 'PN', 'O'),
 ('en', 'SP', 'O'),
 ('el', 'DA', 'O'),
 ('empresa', 'NC', 'O'),
 ('mixta', 'AQ', 'O'),
 ('británica', 'AQ', 'O'),
 ('MetroHoldings', 'NC', 'B-ORG'),
 ('.', 'Fp', 'O')]

## Features

Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. 

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [66]:
def word2features(sent, i):
    """Return the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and
    its POS tag. The dict describes:

    * the token itself: a constant bias, the lower-cased word, its last
      three and last two characters, upper/title/digit flags, the POS tag
      and the first two characters of the POS tag;
    * the previous token (keys prefixed with ``-1:``) when ``i > 0``,
      otherwise ``'BOS'`` is set to ``True``;
    * the next token (keys prefixed with ``+1:``) when ``i`` is not the
      last position, otherwise ``'EOS'`` is set to ``True``.
    """
    current = sent[i]
    word, postag = current[0], current[1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i <= 0:
        # No left neighbour: flag the beginning of the sentence.
        features['BOS'] = True
    else:
        previous = sent[i - 1]
        prev_word, prev_postag = previous[0], previous[1]
        features['-1:word.lower()'] = prev_word.lower()
        features['-1:word.istitle()'] = prev_word.istitle()
        features['-1:word.isupper()'] = prev_word.isupper()
        features['-1:postag'] = prev_postag
        features['-1:postag[:2]'] = prev_postag[:2]

    if i >= len(sent) - 1:
        # No right neighbour: flag the end of the sentence.
        features['EOS'] = True
    else:
        following = sent[i + 1]
        next_word, next_postag = following[0], following[1]
        features['+1:word.lower()'] = next_word.lower()
        features['+1:word.istitle()'] = next_word.istitle()
        features['+1:word.isupper()'] = next_word.isupper()
        features['+1:postag'] = next_postag
        features['+1:postag[:2]'] = next_postag[:2]

    return features


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the third item (the label) of every 3-tuple in ``sent``."""
    tags = []
    for _word, _pos, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the first item (the token text) of every 3-tuple in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

### This is what word2features extracts:

In [67]:
word2features(train_sents[20],2)  # features of the token at index 2 of sentence 20

{'bias': 1.0,
 'word.lower()': ',',
 'word[-3:]': ',',
 'word[-2:]': ',',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'Fc',
 'postag[:2]': 'Fc',
 '-1:word.lower()': 'contrapartida',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:postag': 'NC',
 '-1:postag[:2]': 'NC',
 '+1:word.lower()': 'deutsche',
 '+1:word.istitle()': True,
 '+1:word.isupper()': False,
 '+1:postag': 'NC',
 '+1:postag[:2]': 'NC'}

### This is what sent2features extracts:

In [68]:
sent2features(train_sents[20])  # one feature dict per token of sentence 20


[{'bias': 1.0,
  'word.lower()': 'como',
  'word[-3:]': 'omo',
  'word[-2:]': 'mo',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'CS',
  'postag[:2]': 'CS',
  'BOS': True,
  '+1:word.lower()': 'contrapartida',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NC',
  '+1:postag[:2]': 'NC'},
 {'bias': 1.0,
  'word.lower()': 'contrapartida',
  'word[-3:]': 'ida',
  'word[-2:]': 'da',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'NC',
  'postag[:2]': 'NC',
  '-1:word.lower()': 'como',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'CS',
  '-1:postag[:2]': 'CS',
  '+1:word.lower()': ',',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'Fc',
  '+1:postag[:2]': 'Fc'},
 {'bias': 1.0,
  'word.lower()': ',',
  'word[-3:]': ',',
  'word[-2:]': ',',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': Fal

### This is what sent2labels extracts:

In [69]:
sent2labels(train_sents[20])  # the label of every token of sentence 20

['O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O']

Extract features from the data:

### This is what sent2tokens extracts:

In [70]:
sent2tokens(train_sents[20])  # the token text of every token of sentence 20

['Como',
 'contrapartida',
 ',',
 'Deutsche',
 'Telekom',
 'venderá',
 'al',
 'consorcio',
 'francés',
 'su',
 'participación',
 'del',
 '25',
 'por',
 'ciento',
 'en',
 'el',
 'empresa',
 'mixta',
 'británica',
 'MetroHoldings',
 '.']

In [71]:
%%time
# Inputs are lists of per-token feature dicts; targets are lists of per-token labels.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

Wall time: 480 ms


## Training

To see all possible CRF parameters check its docstring. Here we are using the L-BFGS training algorithm (it is the default) with Elastic Net (L1 + L2) regularization.

In [72]:
%%time

# Baseline model with fixed regularization strengths, fitted on the training split.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1,  # L1 regularization coefficient per the sklearn-crfsuite docs - confirm in the CRF docstring
    c2=0.1,  # L2 regularization coefficient per the sklearn-crfsuite docs - confirm in the CRF docstring
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 12.2 s


## Evaluation

There are many more O labels than entity labels in the data set, but we're more interested in the entities. To account for this we'll use an averaged F1 score computed for all labels except O. The ``sklearn_crfsuite.metrics`` package provides some useful metrics for sequence classification tasks, including this one.

In [73]:
# Copy the label set learned by the model and drop 'O', so that the metrics
# below are computed over the entity labels only.
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [74]:
y_pred = crf.predict(X_test)
# Weighted-average F1 restricted to `labels`, i.e. every label except 'O'.
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.7410293084158585

Inspect per-class results in more detail:

In [75]:
# group B and I results: sort by the entity part of the label first (name[1:])
# and by the B/I prefix second (name[0]), so B-X and I-X rows end up adjacent
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.753     0.750     0.752      1084
       I-LOC      0.548     0.471     0.507       325
      B-MISC      0.583     0.434     0.497       339
      I-MISC      0.568     0.478     0.519       557
       B-ORG      0.797     0.781     0.789      1400
       I-ORG      0.797     0.766     0.782      1104
       B-PER      0.800     0.861     0.830       735
       I-PER      0.858     0.924     0.890       634

   micro avg      0.758     0.734     0.746      6178
   macro avg      0.713     0.683     0.696      6178
weighted avg      0.751     0.734     0.741      6178



## Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

It takes quite a lot of CPU time and RAM (with ``n_iter=5`` below we're fitting a model ``5 * 3 = 15`` times; with ``n_iter=50`` it would be ``50 * 3 = 150`` times), so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit the model only on a subset of the training data.

In [76]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
# c1 and c2 are sampled from exponential distributions with these scales
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search: 5 sampled (c1, c2) candidates, each scored with 3-fold CV.
# random_state pins the sampled candidates so that a fresh
# "Restart Kernel -> Run All" repeats the same search instead of drawing
# new candidates on every run.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=5,
                        scoring=f1_scorer,
                        random_state=42)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.0min finished


Wall time: 2min 13s


Best result:

In [77]:
# Summarise the search outcome. `crf` is only switched to the best estimator
# later, right before the final evaluation on the test data.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
# size_ is divided by 1,000,000 so the figure is printed in millions ("M")
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.18217460758684698, 'c2': 0.07560257235551793}
best CV score: 0.7390181562702433
model size: 0.71M


In [102]:
rs.cv_results_  # raw per-candidate search results (fit/score times, sampled c1/c2, params, ...)

{'mean_fit_time': array([8.64951833, 8.5536921 , 8.57835658, 8.41075357, 8.46973372]),
 'std_fit_time': array([0.53735559, 0.34659121, 0.43728475, 0.27637101, 0.60209581]),
 'mean_score_time': array([0.42084988, 0.40203476, 0.43387318, 0.41166766, 0.40605036]),
 'std_score_time': array([0.04562385, 0.03890944, 0.0635739 , 0.0514427 , 0.03187468]),
 'param_c1': masked_array(data=[0.8356301624778462, 0.41920591435609644,
                    1.069675198012202, 0.22905813691576424,
                    0.18217460758684698],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_c2': masked_array(data=[0.012143369127797163, 0.21880474484859902,
                    0.017487806045237687, 0.13048480853187902,
                    0.07560257235551793],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'c1': 0.8356301624778462, 'c2': 0.012143369127797163},
  {'c1': 0.

### Check parameter space

A chart which shows which ``c1`` and ``c2`` values RandomizedSearchCV has checked. Red color means better results, blue means worse.

## Check best estimator on our test data

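In this run the tuned model does not beat the baseline: the weighted-average F1 below is 0.736 versus 0.741 for the initial model. Only 5 parameter candidates were sampled, so a larger ``n_iter`` may be needed to see an improvement.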

In [104]:
# Switch `crf` to the best model found by the search and re-score it on the
# test data with the same per-label report as before (y_pred is overwritten).
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.751     0.744     0.747      1084
       I-LOC      0.535     0.446     0.487       325
      B-MISC      0.586     0.434     0.498       339
      I-MISC      0.557     0.476     0.513       557
       B-ORG      0.791     0.778     0.785      1400
       I-ORG      0.786     0.769     0.777      1104
       B-PER      0.793     0.857     0.824       735
       I-PER      0.853     0.918     0.884       634

   micro avg      0.752     0.730     0.741      6178
   macro avg      0.707     0.678     0.689      6178
weighted avg      0.744     0.730     0.736      6178



## Let's check what classifier learned

In [107]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; labels are left-aligned and the weight is shown with six decimals.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(row_format % (label_from, label_to, weight))

# Rank the learned label-to-label transition weights once, then show both ends.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-MISC -> I-MISC  8.185305
I-MISC -> I-MISC  7.528064
B-LOC  -> I-LOC   6.343427
B-PER  -> I-PER   5.832125
B-ORG  -> I-ORG   5.709613
I-LOC  -> I-LOC   5.365236
I-ORG  -> I-ORG   4.828668
I-PER  -> I-PER   4.033823
O      -> O       3.702218
O      -> B-ORG   1.950945
O      -> B-PER   1.336051
O      -> B-LOC   1.013002
O      -> B-MISC  0.911307
I-PER  -> B-LOC   0.211734
B-LOC  -> B-LOC   0.103000
B-ORG  -> O       -0.053004
B-MISC -> O       -0.057276
B-MISC -> I-LOC   -0.178759
B-PER  -> O       -0.248781
B-PER  -> I-MISC  -0.255145

Top unlikely transitions:
I-PER  -> B-MISC  -2.013218
I-ORG  -> I-LOC   -2.075814
I-MISC -> B-ORG   -2.138798
B-LOC  -> I-ORG   -2.161732
I-PER  -> I-ORG   -2.164271
I-LOC  -> B-LOC   -2.178574
I-MISC -> B-LOC   -2.201608
I-LOC  -> B-PER   -2.211915
I-ORG  -> I-PER   -2.486622
I-ORG  -> B-MISC  -2.502074
I-PER  -> B-ORG   -2.597654
I-PER  -> B-PER   -2.602817
I-MISC -> I-ORG   -2.769440
B-PER  -> B-PER   -2.848963
I-ORG  -> B-

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized.

Check the state features:

In [108]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs;
    the weight is shown with six decimals and the label is left-aligned.
    """
    row_format = "%0.6f %-8s %s"
    for feature, weight in state_features:
        attr, label = feature
        print(row_format % (weight, label, attr))

# Rank the learned (attribute, label) state-feature weights once, then show both ends.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
7.592764 B-ORG    word.lower():psoe-progresistas
5.821067 O        BOS
4.785025 B-ORG    -1:word.lower():distancia
4.777626 B-ORG    word.lower():petrobras
4.760178 B-ORG    word[-2:]:-e
4.702719 B-MISC   word.lower():cc2305001730
4.702719 B-MISC   word[-3:]:730
4.498578 B-ORG    word.lower():coag-extremadura
4.483249 B-MISC   word.lower():diversia
4.454884 B-ORG    word.lower():telefónica
4.452901 O        bias
4.443639 O        word.lower():r.
4.443639 O        word[-3:]:R.
4.443032 B-ORG    word.lower():esquerra
4.440906 B-ORG    +1:word.lower():plasencia
4.422201 O        -1:word.lower():siglo
4.400474 B-MISC   word.lower():justicia
4.387753 B-ORG    word.lower():terra
4.386111 B-MISC   word.lower():exteriores
4.344909 O        word.lower():b
4.344909 O        word[-3:]:B
4.344909 O        word[-2:]:B
4.275126 B-PER    -1:word.lower():según
4.211182 B-MISC   word.lower():competencia
4.137659 B-LOC    -1:word.lower():cantabria
4.082948 B-LOC    +1:word.lower():finaliza



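Some observations (the weights quoted below do not match the output above - e.g. ``psoe-progresistas`` has weight 7.592764 here - and not every quoted feature is visible in the truncated output, so they appear to come from a different run):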

   * **9.385823 B-ORG word.lower():psoe-progresistas** - the model remembered names of some entities - maybe it is overfit, or maybe our features are not adequate, or maybe remembering is indeed helpful;
   * **4.636151 I-LOC -1:word.lower():calle:** "calle" is a street in Spanish; model learns that if a previous word was "calle" then the token is likely a part of location;
   * **-5.632036 O word.isupper()**, **-8.215073 O word.istitle()** : UPPERCASED or TitleCased words are likely entities of some kind;
   * **-2.097561 O postag:NP** - proper nouns (NP is a proper noun in the Spanish tagset) are often entities.

What to do next

    * Load 'testa' Spanish data.
    * Use it to develop better features and to find best model parameters.
    * Apply the model to 'testb' data again.

The model in this notebook is just a starting point; you certainly can do better!

