In [3]:
from itertools import chain
import nltk
import os
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from ner_utils import corpus_as_iob_sents

# Print the installed scikit-learn version so the environment used for this
# run is recorded alongside the results.
print(sklearn.__version__)

0.16.1


In [10]:
# Absolute path of the corpus directory. The relative path is resolved against
# the working directory at the time this cell runs.
CORPUS_FOLDER = os.path.abspath('corpus_ner_big/square-enhanced/')

'/home/curtis/shoeboxed/py-classifier/corpus_ner_big/square-enhanced'

In [14]:
%%time
# Load the corpus as lists of sentences; the samples displayed further down
# show each sentence as a list of (token, POS tag, IOB label) triples.
# The first call relies on corpus_as_iob_sents' default set_name
# (presumably the training split -- TODO confirm in ner_utils); the second
# explicitly selects the 'test' set.
train_sents = corpus_as_iob_sents(CORPUS_FOLDER)
test_sents = corpus_as_iob_sents(CORPUS_FOLDER, set_name='test')

CPU times: user 32.7 ms, sys: 16 ms, total: 48.7 ms
Wall time: 67.3 ms


In [41]:
# Spot-check one training sentence.
train_sents[12]

[('LINE', 'LINE', 'O'), ('TAB', 'TAB', 'O'), ('TAB', 'TAB', 'O')]

In [37]:
# Spot-check one test sentence (the same sentence is tagged as an example later on).
test_sents[5]

[('LINE', 'LINE', 'O'),
 ('TAB', 'TAB', 'O'),
 ('Latte', 'EX', 'B-PROD'),
 ('Large', 'EX', 'I-PROD'),
 ('TAB', 'TAB', 'O'),
 ('$', '$', 'O'),
 ('4.78', 'CD', 'O')]

In [17]:
def word2features(sent, i):
    """Build the list of string features describing token ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        One sentence; each item exposes the token text at index 0 and its
        POS tag at index 1 (any further elements are not read here).
    i : int
        Position of the token to describe.

    Returns
    -------
    list of str
        Features of the token itself (suffixes, casing/digit flags, POS tag),
        followed by features of the previous token or ``'BOS'`` when there is
        none, followed by features of the next token or ``'EOS'`` when there
        is none.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the token itself.
    feats = [
        'bias',
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper={}'.format(token.isupper()),
        'word.istitle={}'.format(token.istitle()),
        'word.isdigit={}'.format(token.isdigit()),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context: either a beginning-of-sentence marker or the previous token.
    if i <= 0:
        feats.append('BOS')
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle={}'.format(prev_token.istitle()),
            '-1:word.isupper={}'.format(prev_token.isupper()),
            '-1:postag=' + prev_tag,
            '-1:postag[:2]=' + prev_tag[:2],
        ]

    # Right context: either an end-of-sentence marker or the next token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats.append('EOS')
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:postag=' + next_tag,
            '+1:postag[:2]=' + next_tag[:2],
        ]

    return feats

def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features

def sent2labels(sent):
    """Return the label (third element) of every item in ``sent``.

    Every item must unpack into exactly three values: token, POS tag, label.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token text (first element) of every item in ``sent``.

    Every item must unpack into exactly three values: token, POS tag, label.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [18]:
# Preview the features generated for the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'word[-3:]=INE',
 'word[-2:]=NE',
 'word.isupper=True',
 'word.istitle=False',
 'word.isdigit=False',
 'postag=LINE',
 'postag[:2]=LI',
 'BOS',
 '+1:word.lower=tab',
 '+1:word.istitle=False',
 '+1:word.isupper=True',
 '+1:postag=TAB',
 '+1:postag[:2]=TA']

In [19]:
%%time
# Turn every sentence into model inputs: one feature list per token (x_*)
# and the matching sequence of labels (y_*).
x_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

x_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

CPU times: user 1.33 s, sys: 48.3 ms, total: 1.38 s
Wall time: 1.41 s


In [20]:
%%time
# Create a CRFsuite trainer with verbose output disabled, then register every
# training sentence as a (feature sequence, label sequence) pair.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 464 ms, sys: 3.53 ms, total: 468 ms
Wall time: 480 ms


In [21]:
# Training hyperparameters handed to the CRFsuite trainer.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # cap the number of training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [22]:
# List the names of the parameters this trainer accepts.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [26]:
%%time
# Train on the appended sequences and save the model to
# 'square-enhanced.crfsuite' (a path relative to the working directory).
trainer.train('square-enhanced.crfsuite')

CPU times: user 1.77 s, sys: 7.54 ms, total: 1.78 s
Wall time: 1.81 s


In [27]:
# Confirm the model file was written and check its size.
!ls -lh ./square-enhanced.crfsuite

-rw-rw-r-- 1 curtis curtis 32K Jun  2 11:35 ./square-enhanced.crfsuite


In [28]:
# Load the trained model file into a tagger that is used for all predictions below.
tagger = pycrfsuite.Tagger()
tagger.open('square-enhanced.crfsuite')

<contextlib.closing at 0x7f064b65a5c0>

In [31]:
# Tag a single test sentence and print the predicted labels next to the
# labels stored in the corpus, for a quick visual comparison.
example_sent = test_sents[5]
print(' '.join(sent2tokens(example_sent)), end = '\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

LINE TAB Latte Large TAB $ 4.78

Predicted: O O B-PROD I-PROD O O O
Correct:   O O B-PROD I-PROD O O O


In [32]:
def bio_classification_report(y_true, y_pred):
    """Build a per-tag classification report for IOB-labelled sequences.

    Parameters
    ----------
    y_true, y_pred : iterable of iterables of str
        True and predicted label sequences, one inner sequence per sentence.

    Returns
    -------
    The value returned by scikit-learn's ``classification_report`` for all
    tags except ``'O'``, ordered by entity type first and by B/I prefix second.

    Notes
    -----
    Labels are flattened across sentences and encoded with a
    ``LabelBinarizer`` fitted on the true labels only.
    """
    binarizer = LabelBinarizer()

    flat_true = [tag for sequence in y_true for tag in sequence]
    true_matrix = binarizer.fit_transform(flat_true)
    flat_pred = [tag for sequence in y_pred for tag in sequence]
    pred_matrix = binarizer.transform(flat_pred)

    def entity_then_prefix(tag):
        # 'B-PROD' -> ['PROD', 'B'], so tags group by entity type before B/I.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)

    # Column index of each class in the binarized matrices.
    column_of = dict(zip(binarizer.classes_, range(len(binarizer.classes_))))
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

In [33]:
%%time
# Tag every test sentence; y_pred follows the same sentence order as x_test / y_test.
y_pred = [tagger.tag(xseq) for xseq in x_test]

CPU times: user 52.3 ms, sys: 0 ns, total: 52.3 ms
Wall time: 75.6 ms


In [34]:
# Per-tag scores on the test set; the 'O' tag is left out by bio_classification_report.
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

     B-PROD       1.00      0.80      0.89        25
     I-PROD       0.91      0.94      0.93        33

avg / total       0.95      0.88      0.91        58



In [35]:
from collections import Counter
# Model introspection object; its .transitions and .state_features mappings
# are ranked and printed in the cells below.
info = tagger.info()

def print_transitions(trans_features):
    """Print one aligned line per transition: ``from -> to weight``.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; weights are printed with six decimal places.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for label_pair, weight in trans_features:
        label_from, label_to = label_pair
        print(row_template % (label_from, label_to, weight))

# Rank all learned transitions once, highest weight first, then show both
# ends of the ranking (the two listings overlap when there are fewer than
# 30 transitions).
ranked_transitions = Counter(info.transitions).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:15])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-15:])

Top likely transitions:
B-PROD -> I-PROD  2.711294
O      -> O       2.026260
I-PROD -> I-PROD  1.935834
O      -> B-PROD  -0.241722
B-PROD -> B-PROD  -2.098638
B-PROD -> O       -2.963685
I-PROD -> O       -3.307693
I-PROD -> B-PROD  -3.589634
O      -> I-PROD  -6.361430

Top unlikely transitions:
B-PROD -> I-PROD  2.711294
O      -> O       2.026260
I-PROD -> I-PROD  1.935834
O      -> B-PROD  -0.241722
B-PROD -> B-PROD  -2.098638
B-PROD -> O       -2.963685
I-PROD -> O       -3.307693
I-PROD -> B-PROD  -3.589634
O      -> I-PROD  -6.361430


In [36]:
def print_state_features(state_features):
    """Print one line per state feature: ``weight label attribute``.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs;
    weights are printed with six decimal places.
    """
    row_template = "%0.6f %-6s %s"
    for attr_and_label, weight in state_features:
        attr, label = attr_and_label
        print(row_template % (weight, label, attr))

# Rank all state features once, highest weight first, then show the
# 20 highest-weighted and the 20 lowest-weighted entries.
ranked_state_features = Counter(info.state_features).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:20])

print("\nTop negative:")
print_state_features(ranked_state_features[-20:])

Top positive:
5.369755 I-PROD +1:word.lower=1733
4.648653 B-PROD BOS
4.465384 O      BOS
3.363478 B-PROD -1:word.lower=tab
3.159274 I-PROD +1:word.lower=(
2.900037 O      EOS
2.599837 O      word[-3:]=TAB
2.599837 O      word[-2:]=AB
2.508511 I-PROD +1:word.lower=welcome
2.266115 I-PROD +1:word.lower=tab
2.109153 O      word[-2:]=33
1.819385 O      word[-3:]=733
1.517955 O      word[-3:]=INE
1.462312 O      word[-2:]=NE
1.449820 I-PROD -1:word.lower=a
1.305649 I-PROD word[-2:]=nd
1.284116 O      word[-3:]=tal
1.133855 B-PROD -1:word.isupper=True
1.096099 O      word[-2:]=sa
1.096099 O      word[-3:]=isa

Top negative:
-0.606952 I-PROD word[-3:]=(
-0.606952 I-PROD word[-2:]=(
-0.607854 B-PROD word.isupper=False
-0.632614 B-PROD word.isdigit=False
-0.635250 O      word[-2:]=ch
-0.650254 O      -1:word.lower=4
-0.673550 B-PROD bias
-0.717411 B-PROD word.istitle=False
-0.718966 B-PROD postag[:2]=JJ
-0.825443 I-PROD word.isdigit=True
-0.837734 O      +1:word.isupper=True
-0.868834 I-PROD wo