- http://nbviewer.jupyter.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

## CRF NER

In [31]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

## DATA LOAD

In [32]:
# The conll2002 corpus that ships with nltk: list the files it provides.
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [57]:
# Spanish files of the corpus: 'esp.train' is used for training and
# 'esp.testb' for evaluation. list() materialises the sentences so they can be
# indexed and measured with len() in the following cells.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [58]:
len(train_sents), len(test_sents)

(8323, 1517)

In [34]:
# Data sample
sample = train_sents[2]
sample

[('El', 'DA', 'O'),
 ('Abogado', 'NC', 'B-PER'),
 ('General', 'AQ', 'I-PER'),
 ('del', 'SP', 'I-PER'),
 ('Estado', 'NC', 'I-PER'),
 (',', 'Fc', 'O'),
 ('Daryl', 'VMI', 'B-PER'),
 ('Williams', 'NC', 'I-PER'),
 (',', 'Fc', 'O'),
 ('subrayó', 'VMI', 'O'),
 ('hoy', 'RG', 'O'),
 ('la', 'DA', 'O'),
 ('necesidad', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('tomar', 'VMN', 'O'),
 ('medidas', 'NC', 'O'),
 ('para', 'SP', 'O'),
 ('proteger', 'VMN', 'O'),
 ('al', 'SP', 'O'),
 ('sistema', 'NC', 'O'),
 ('judicial', 'AQ', 'O'),
 ('australiano', 'AQ', 'O'),
 ('frente', 'RG', 'O'),
 ('a', 'SP', 'O'),
 ('una', 'DI', 'O'),
 ('página', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('internet', 'NC', 'O'),
 ('que', 'PR', 'O'),
 ('imposibilita', 'VMI', 'O'),
 ('el', 'DA', 'O'),
 ('cumplimiento', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('los', 'DA', 'O'),
 ('principios', 'NC', 'O'),
 ('básicos', 'AQ', 'O'),
 ('de', 'SP', 'O'),
 ('la', 'DA', 'O'),
 ('Ley', 'NC', 'B-MISC'),
 ('.', 'Fp', 'O')]

In [35]:
' '.join([t for t, _ , _ in sample])

'El Abogado General del Estado , Daryl Williams , subrayó hoy la necesidad de tomar medidas para proteger al sistema judicial australiano frente a una página de internet que imposibilita el cumplimiento de los principios básicos de la Ley .'

## DATA PREPROCESSING
- 특징(Features)을 정의.
    - word identity 
    - word suffix
    - word shape
    - word POS tag
    - some information from nearby words is used.

In [None]:
train_sents[0]

In [43]:
sent = train_sents[0]
[(sent, i) for i in range(len(sent))]

[([('Melbourne', 'NP', 'B-LOC'),
   ('(', 'Fpa', 'O'),
   ('Australia', 'NP', 'B-LOC'),
   (')', 'Fpt', 'O'),
   (',', 'Fc', 'O'),
   ('25', 'Z', 'O'),
   ('may', 'NC', 'O'),
   ('(', 'Fpa', 'O'),
   ('EFE', 'NC', 'B-ORG'),
   (')', 'Fpt', 'O'),
   ('.', 'Fp', 'O')],
  0),
 ([('Melbourne', 'NP', 'B-LOC'),
   ('(', 'Fpa', 'O'),
   ('Australia', 'NP', 'B-LOC'),
   (')', 'Fpt', 'O'),
   (',', 'Fc', 'O'),
   ('25', 'Z', 'O'),
   ('may', 'NC', 'O'),
   ('(', 'Fpa', 'O'),
   ('EFE', 'NC', 'B-ORG'),
   (')', 'Fpt', 'O'),
   ('.', 'Fp', 'O')],
  1),
 ([('Melbourne', 'NP', 'B-LOC'),
   ('(', 'Fpa', 'O'),
   ('Australia', 'NP', 'B-LOC'),
   (')', 'Fpt', 'O'),
   (',', 'Fc', 'O'),
   ('25', 'Z', 'O'),
   ('may', 'NC', 'O'),
   ('(', 'Fpa', 'O'),
   ('EFE', 'NC', 'B-ORG'),
   (')', 'Fpt', 'O'),
   ('.', 'Fp', 'O')],
  2),
 ([('Melbourne', 'NP', 'B-LOC'),
   ('(', 'Fpa', 'O'),
   ('Australia', 'NP', 'B-LOC'),
   (')', 'Fpt', 'O'),
   (',', 'Fc', 'O'),
   ('25', 'Z', 'O'),
   ('may', 'NC', 'O'),
   

In [48]:
def word2features(sent, i):
    """Return the list of string features describing token ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first item is the word and whose
    second item is its POS tag; any further items are ignored here.

    The result always starts with features of the token itself (bias,
    lower-cased form, 3- and 2-character suffixes, case/digit flags, POS tag
    and its first two characters). It then carries the same kind of
    information for the previous token (prefixed ``-1:``) or the marker
    ``'BOS'`` when there is none, followed by the next token (prefixed
    ``+1:``) or the marker ``'EOS'`` when there is none.
    """
    def neighbour_features(offset, token):
        # One template shared by the previous ('-1') and next ('+1') token.
        text = token[0]
        tag = token[1]
        return [
            offset + ':word.lower=' + text.lower(),
            offset + ':word.istitle=' + str(text.istitle()),
            offset + ':word.isupper=' + str(text.isupper()),
            offset + ':postag=' + tag,
            offset + ':postag[:2]=' + tag[:2],
        ]

    word = sent[i][0]
    postag = sent[i][1]

    # Features of the current token.
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=' + str(word.isupper()),
        'word.istitle=' + str(word.istitle()),
        'word.isdigit=' + str(word.isdigit()),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]

    # Left context.
    if i > 0:
        features += neighbour_features('-1', sent[i - 1])
    else:
        features.append('BOS')  # first token of the sentence

    # Right context.
    if i < len(sent) - 1:
        features += neighbour_features('+1', sent[i + 1])
    else:
        features.append('EOS')  # last token of the sentence

    return features

In [49]:
def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third item) of every 3-tuple in ``sent``, in order."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every 3-tuple in ``sent``, in order."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [53]:
sent2features(train_sents[0])

[['bias',
  'word.lower=melbourne',
  'word[-3:]=rne',
  'word[-2:]=ne',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'postag=NP',
  'postag[:2]=NP',
  'BOS',
  '+1:word.lower=(',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=Fpa',
  '+1:postag[:2]=Fp'],
 ['bias',
  'word.lower=(',
  'word[-3:]=(',
  'word[-2:]=(',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=Fpa',
  'postag[:2]=Fp',
  '-1:word.lower=melbourne',
  '-1:word.istitle=True',
  '-1:word.isupper=False',
  '-1:postag=NP',
  '-1:postag[:2]=NP',
  '+1:word.lower=australia',
  '+1:word.istitle=True',
  '+1:word.isupper=False',
  '+1:postag=NP',
  '+1:postag[:2]=NP'],
 ['bias',
  'word.lower=australia',
  'word[-3:]=lia',
  'word[-2:]=ia',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'postag=NP',
  'postag[:2]=NP',
  '-1:word.lower=(',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=Fpa',
  '-1:post

In [39]:
# Turn every sentence into a feature sequence (X) and a label sequence (y).
# X_*[k] and y_*[k] come from the same sentence, so they line up token by token.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

# Same transformation for the held-out test sentences.
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

## TRAIN THE MODEL

In [54]:
%%time
# NOTE(review): verbose=False presumably silences CRFsuite's training log --
# confirm against the python-crfsuite documentation.
trainer = pycrfsuite.Trainer(verbose=False)

# Hand each training sentence to the trainer as a (features, labels) pair.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 3.5 s, sys: 56.2 ms, total: 3.56 s
Wall time: 3.56 s


In [60]:
# L-BFGS training algorithm (default) with Elastic Net (L1 + L2) regularization.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [61]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [62]:
%%time
trainer.train('conll2002-esp.crfsuite')

CPU times: user 16.9 s, sys: 78.6 ms, total: 16.9 s
Wall time: 17 s


- trainer의 logparser를 통해 최종상태에 대한 정보를 얻을 수 있음.
- If we had tagged our input data using the optional `group` argument of `trainer.append`, and had used the optional `holdout` argument of `trainer.train`, there would be information about the trainer's performance on the holdout set as well.

In [63]:
trainer.logparser.last_iteration

{'active_features': 11346,
 'error_norm': 1262.912078,
 'feature_norm': 79.110017,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 14807.577946,
 'num': 50,
 'scores': {},
 'time': 0.288}

In [64]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

50 {'num': 50, 'scores': {}, 'loss': 14807.577946, 'feature_norm': 79.110017, 'error_norm': 1262.912078, 'active_features': 11346, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.288}


## Make predictions

In [65]:
# Open the model file that the trainer.train(...) cell wrote earlier in this notebook.
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x114250be0>

In [67]:
# Tag a single test sentence and print the prediction next to the gold labels.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)))

# The tagger is fed the same features that were used for training.
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

La Coruña , 23 may ( EFECOM ) .
Predicted: B-LOC I-LOC O O O O B-ORG O O
Correct:   B-LOC I-LOC O O O O B-ORG O O


## Evaluate the model

In [68]:
def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for BIO-encoded label sequences.

    ``y_true`` and ``y_pred`` are iterables of label sequences (one inner
    sequence per sentence). Both are flattened, encoded with a
    ``LabelBinarizer`` fitted on the flattened ``y_true``, and passed to
    scikit-learn's ``classification_report``. The "O" label is left out of
    the report; the remaining tags are ordered by entity type first and by
    their B-/I- prefix second.

    As noted by the original author, scikit-learn 0.15+ (or a version from
    github master) is required to calculate averages properly.
    """
    binarizer = LabelBinarizer()
    gold_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    def entity_then_prefix(tag):
        # 'B-LOC' -> ['LOC', 'B'], so tags of one entity type stay together.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)

    # Position of every class in binarizer.classes_, used as its label index.
    column_of = {}
    for column, class_name in enumerate(binarizer.classes_):
        column_of[class_name] = column

    report_columns = [column_of[tag] for tag in reported_tags]
    return classification_report(
        gold_matrix,
        pred_matrix,
        labels=report_columns,
        target_names=reported_tags,
    )

In [69]:
%%time
# Predict a label sequence for every test sentence; y_pred[k] corresponds to y_test[k].
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 414 ms, sys: 2.97 ms, total: 417 ms
Wall time: 423 ms


In [70]:
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      B-LOC       0.78      0.75      0.76      1084
      I-LOC       0.66      0.60      0.63       325
     B-MISC       0.69      0.47      0.56       339
     I-MISC       0.61      0.49      0.54       557
      B-ORG       0.79      0.81      0.80      1400
      I-ORG       0.80      0.79      0.80      1104
      B-PER       0.82      0.87      0.84       735
      I-PER       0.87      0.93      0.90       634

avg / total       0.77      0.76      0.76      6178



## Let's check what the classifier learned

In [71]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    """Print one aligned line per ``((label_from, label_to), weight)`` item.

    The source label is left-justified to 6 characters, the target label to 7,
    and the weight is shown with six decimal places.
    """
    for label_pair, weight in trans_features:
        source, target = label_pair
        print("%-6s -> %-7s %0.6f" % (source, target, weight))

# Rank the transition weights once, highest first, and reuse the ranking for
# both ends instead of building and sorting a second Counter.
# most_common() with no argument returns every item; slicing its head gives
# the same items most_common(15) would.
# NOTE(review): assumes info.transitions maps (label_from, label_to) -> weight,
# which is the shape print_transitions unpacks -- confirm against pycrfsuite.
ranked_transitions = Counter(info.transitions).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:15])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-15:])

Top likely transitions:
B-ORG  -> I-ORG   8.631963
I-ORG  -> I-ORG   7.833706
B-PER  -> I-PER   6.998706
B-LOC  -> I-LOC   6.913675
I-MISC -> I-MISC  6.129735
B-MISC -> I-MISC  5.538291
I-LOC  -> I-LOC   4.983567
I-PER  -> I-PER   3.748358
B-ORG  -> B-LOC   1.727090
B-PER  -> B-LOC   1.388267
B-LOC  -> B-LOC   1.240278
O      -> O       1.197929
O      -> B-ORG   1.097062
I-PER  -> B-LOC   1.083332
O      -> B-MISC  1.046113

Top unlikely transitions:
I-PER  -> B-ORG   -2.056130
I-LOC  -> I-ORG   -2.143940
B-ORG  -> I-MISC  -2.167501
I-PER  -> I-ORG   -2.369380
B-ORG  -> I-PER   -2.378110
I-MISC -> I-PER   -2.458788
B-LOC  -> I-PER   -2.516414
I-ORG  -> I-MISC  -2.571973
I-LOC  -> B-PER   -2.697791
I-LOC  -> I-PER   -3.065950
I-ORG  -> I-PER   -3.364434
O      -> I-PER   -7.322841
O      -> I-MISC  -7.648246
O      -> I-ORG   -8.024126
O      -> I-LOC   -8.333815


We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside an organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. Also note the I-PER -> B-LOC transition: a positive weight means that the model thinks that a person name is often followed by a location.

Check the state features:

In [72]:
def print_state_features(state_features):
    """Print one line per ``((attr, label), weight)`` item.

    Each line shows the weight with six decimal places, then the label
    left-justified to 6 characters, then the attribute string.
    """
    for attr_and_label, weight in state_features:
        attr, label = attr_and_label
        print("%0.6f %-6s %s" % (weight, label, attr))

# Rank the state-feature weights once, highest first, and reuse the ranking
# for both ends instead of building and sorting a second Counter.
# most_common() with no argument returns every item; slicing its head gives
# the same items most_common(20) would.
# NOTE(review): assumes info.state_features maps (attr, label) -> weight,
# which is the shape print_state_features unpacks -- confirm against pycrfsuite.
ranked_state_features = Counter(info.state_features).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:20])

print("\nTop negative:")
print_state_features(ranked_state_features[-20:])

Top positive:
8.886516 B-ORG  word.lower=efe-cantabria
8.743642 B-ORG  word.lower=psoe-progresistas
5.769032 B-LOC  -1:word.lower=cantabria
5.195429 I-LOC  -1:word.lower=calle
5.116821 O      word.lower=mayo
4.990871 O      -1:word.lower=día
4.910915 I-ORG  -1:word.lower=l
4.721572 B-MISC word.lower=diversia
4.676259 B-ORG  word.lower=telefónica
4.334354 B-ORG  word[-2:]=-e
4.149862 B-ORG  word.lower=amena
4.141370 B-ORG  word.lower=terra
3.942852 O      word.istitle=False
3.926397 B-ORG  word.lower=continente
3.924672 B-ORG  word.lower=acesa
3.888706 O      word.lower=euro
3.856445 B-PER  -1:word.lower=según
3.812373 B-MISC word.lower=exteriores
3.807582 I-MISC -1:word.lower=1.9
3.807098 B-MISC word.lower=sanidad

Top negative:
-1.965379 O      word.lower=fundación
-1.981541 O      -1:word.lower=británica
-2.118347 O      word.lower=061
-2.190653 B-PER  word[-3:]=nes
-2.226373 B-ORG  postag=SP
-2.226373 B-ORG  postag[:2]=SP
-2.260972 O      word[-3:]=uia
-2.384920 O      -1:word.lower