# Named Entity Extraction Tutorial
This tutorial is a slight modification of the tutorial by Sam Galen.

In [1]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
import io
import nltk
import scipy
import codecs
import sklearn
import pycrfsuite
import pandas as pd
from itertools import chain
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

print('sklearn version:', sklearn.__version__)
print('Libraries succesfully loaded!')


sklearn version: 0.21.2
Libraries succesfully loaded!


In [136]:
def sent2features(sent, feature_func):
    return [feature_func(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [s[-1] for s in sent]

def sent2tokens(sent):
    return [s[0] for s in sent]

def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(y_true)
    y_pred_combined = lb.transform(y_pred)
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )
            
def word2simple_features(sent, i):
    '''
    This makes a simple baseline.  
    You can add and/or remove features to get (much?) better results.
    Experiment with it as you will need to do this for assignment.
    '''
    word = sent[i][0]

    features = {
        'bias': 1.0, # This feature is constant for all words.
        'word.lower()': word.lower(), # This feature is the word, ignoring case.
        'word[-2:]': word[-2:], # This feature is the last two characters of the word (i.e. the suffix).
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'len(word)': len(word)
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            'len(word)': len(word),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            'len(word)': len(word),
        })
    else:
        features['EOS'] = True

    return features

# load data and preprocess
def extract_data(path):
    """
    Extracting data from train file or test file. 
    path - the path of the file to extract
    
    return:
        res - a list of sentences, each sentence is a
              a list of tuples. For train file, each tuple
              contains token and label. For test file, each
              tuple only contains token.
        ids - a list of ids for the corresponding token. This
              is mainly for Kaggle submission.
    """
    file = io.open(path, mode="r", encoding="utf-8")
    next(file)
    res = []
    ids = []
    sent = []
    for line in file:
        if line != '\n':
            # Each line contains the position ID, the token, and (for the training set) the label.
            parts = line.strip().split(' ')
            sent.append(tuple(parts[1:]))
            ids.append(parts[0])
        else:
            res.append(sent)
            sent = []
                
    return res, ids
            

# Build a NER classifier

## Load data and extract features

In [145]:
# Load train and test data
train_data, train_ids = extract_data('train')
test_data, test_ids = extract_data('test')

# Load true labels for test data
test_labels = list(pd.read_csv('test_ground_truth').loc[:, 'label'])

print('Train and Test data loaded succesfully!')

# Feature extraction using the word2simple_features function
train_features = [sent2features(s, feature_func=word2simple_features) for s in train_data]
train_labels = [sent2labels(s) for s in train_data]
test_features = [sent2features(s, feature_func=word2simple_features) for s in test_data]

trainer = pycrfsuite.Trainer(algorithm='lbfgs', verbose=False)
for xseq, yseq in zip(train_features, train_labels):
    trainer.append(xseq, yseq)
print('Feature Extraction done!')    

# Explore the extracted features    
sent2features(train_data[0], word2simple_features)

Train and Test data loaded succesfully!
Feature Extraction done!


[{'bias': 1.0,
  'word.lower()': 'también',
  'word[-2:]': 'én',
  'word[-3:]': 'ién',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'len(word)': 7,
  'BOS': True,
  '+1:word.lower()': 'el',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'el',
  'word[-2:]': 'el',
  'word[-3:]': 'el',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'len(word)': 2,
  '-1:word.lower()': 'también',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'secretario',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'secretario',
  'word[-2:]': 'io',
  'word[-3:]': 'rio',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'len(word)': 10,
  '-1:word.lower()': 'el',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'general',
  '+1:word.istitle()': False,
  '+1:

## Explore the classifier parameters

In [138]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

## Set the classifier parameters

In [139]:
trainer.set_params({
    'c1': 0.1,   # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

## Train a NER model

In [140]:
%%time
trainer.train('ner-esp.model')

print('Training done :)')

Training done :)
Wall time: 19.3 s


## Make predictions with your NER model
Make predictions and evaluate your model on the test set.
To use your NER model, create pycrfsuite.Tagger, open the model, and use the "tag" method, as follows:

In [141]:
# Make predictions
tagger = pycrfsuite.Tagger()
tagger.open('ner-esp.model')
test_pred = [tagger.tag(xseq) for xseq in test_features]
test_pred = [s for w in test_pred for s in w]

## Print evaluation
print(bio_classification_report(test_pred, test_labels))


              precision    recall  f1-score   support

       B-LOC       0.82      0.82      0.82      2036
       I-LOC       0.73      0.77      0.75       725
      B-MISC       0.61      0.76      0.68       706
      I-MISC       0.62      0.65      0.63      1205
       B-ORG       0.85      0.87      0.86      3136
       I-ORG       0.84      0.81      0.82      2291
       B-PER       0.90      0.91      0.90      1865
       I-PER       0.94      0.94      0.94      1632

   micro avg       0.82      0.84      0.83     13596
   macro avg       0.79      0.82      0.80     13596
weighted avg       0.82      0.84      0.83     13596
 samples avg       0.10      0.10      0.10     13596



In [142]:
print (len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

100 {'num': 100, 'scores': {}, 'loss': 8014.592385, 'feature_norm': 143.515279, 'error_norm': 576.824024, 'active_features': 28145, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.178}


## Check what the classifier has learned

In [143]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
I-MISC -> I-MISC  4.700132
I-ORG  -> I-ORG   4.368079
B-ORG  -> I-ORG   4.291800
I-LOC  -> I-LOC   3.938575
B-LOC  -> I-LOC   3.811628
B-PER  -> I-PER   3.713824
B-MISC -> I-MISC  3.676691
O      -> O       3.540892
I-PER  -> I-PER   3.222277
O      -> B-ORG   1.580806
O      -> B-MISC  1.063171
O      -> B-PER   0.655464
O      -> B-LOC   0.532967
B-ORG  -> O       -0.122448
I-LOC  -> O       -0.133611

Top unlikely transitions:
B-ORG  -> I-LOC   -3.541001
B-MISC -> I-ORG   -3.640333
B-MISC -> B-MISC  -3.669086
B-PER  -> B-ORG   -3.685735
I-ORG  -> I-MISC  -3.696378
I-MISC -> B-ORG   -3.729409
B-ORG  -> B-ORG   -3.739666
I-ORG  -> I-PER   -3.892700
I-ORG  -> I-LOC   -4.104742
I-ORG  -> B-LOC   -4.748850
B-PER  -> B-PER   -5.236093
O      -> I-ORG   -5.487109
O      -> I-MISC  -5.678040
O      -> I-PER   -5.773493
O      -> I-LOC   -6.374917


We can see that, for example, it is very likely that the beginning of a person name (B-PER) will be followed by a token inside person name (I-PER). Also note O -> B-LOC are penalized.

## Check the state features

In [144]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
8.814194 B-ORG  word.lower():efe-cantabria
7.599136 O      word.lower():y
7.438450 B-ORG  word.lower():psoe-progresistas
7.059294 O      word.lower():a
5.752124 O      BOS
5.322476 I-PER  -1:word.lower():antoñete
5.189838 B-ORG  word.lower():bnp-paribas
5.040786 B-LOC  word.lower():líbano
4.941653 B-ORG  word.lower():petrobras
4.840271 B-PER  word.lower():franca
4.797150 B-MISC word.lower():firagran
4.731994 B-MISC word.lower():justicia
4.731486 B-ORG  word[-2:]:-e
4.680997 B-ORG  word.lower():gales
4.637589 B-ORG  +1:word.lower():deutsche
4.503533 I-ORG  -1:word.lower():l
4.279195 B-LOC  -1:word.lower():cantabria
4.275674 B-ORG  word.lower():telefónica
4.265343 O      word[-2:]:63
4.254343 B-ORG  word.lower():eu-ecologista

Top negative:
-2.393790 O      word.lower():bosque
-2.403469 B-PER  -1:word.lower():las
-2.408018 O      word.lower():2000
-2.452281 O      word[-3:]:and
-2.461585 O      word.lower():061
-2.519016 O      -1:word.lower():coi
-2.670017 I-MISC BOS
-2.67