In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
num_word=50000

In [3]:
path='https://github.com/duybluemind1988/NLP-with-Python/blob/master/data/ner_dataset.csv?raw=true'
df=pd.read_csv(path,encoding = "ISO-8859-1")
df = df[:num_word]
print(df.shape)
df.head()

(50000, 4)


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
df.isnull().sum()

Sentence #    47730
Word              0
POS               0
Tag               0
dtype: int64

In [5]:
df = df.fillna(method='ffill')
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
49995,Sentence: 2270,pushed,VBN,O
49996,Sentence: 2270,by,IN,O
49997,Sentence: 2270,South,NNP,B-geo
49998,Sentence: 2270,Africa,NNP,I-geo


Essential info about entities:
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon

We have 4,544 sentences that contain 10,922 unique words and tagged by 17 tags.

In [6]:
df['Sentence #'].nunique(), df.Word.nunique(), df.POS.nunique(), df.Tag.nunique()

(2270, 7464, 40, 17)

In [7]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [8]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,48
1,B-eve,39
2,B-geo,1490
3,B-gpe,968
4,B-nat,18
5,B-org,959
6,B-per,789
7,B-tim,880
8,I-art,27
9,I-eve,33


In [9]:
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Thousands,NNS
1,Sentence: 1,of,IN
2,Sentence: 1,demonstrators,NNS
3,Sentence: 1,have,VBP
4,Sentence: 1,marched,VBN


In [10]:
X.columns

Index(['Sentence #', 'Word', 'POS'], dtype='object')

In [11]:
X

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Thousands,NNS
1,Sentence: 1,of,IN
2,Sentence: 1,demonstrators,NNS
3,Sentence: 1,have,VBP
4,Sentence: 1,marched,VBN
...,...,...,...
49995,Sentence: 2270,pushed,VBN
49996,Sentence: 2270,by,IN
49997,Sentence: 2270,South,NNP
49998,Sentence: 2270,Africa,NNP


In [12]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(50000, 9774)

In [13]:
y = df.Tag.values
print(y.shape)
y

(50000,)


array(['O', 'O', 'O', ..., 'B-geo', 'I-geo', 'O'], dtype=object)

In [14]:
classes = np.unique(y)

In [15]:
classes = classes.tolist()
classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [16]:
X.shape, y.shape

((50000, 9774), (50000,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [18]:
X_train.shape, y_train.shape

((33500, 9774), (33500,))

### Perceptron

Because tag “O” (outside) is the most common tag and it will make our results look much better than they actual are. So we remove tag “O” when we evaluate classification metrics.

In [19]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

Out-of-core Algorithms

We will try some of the out-of-core algorithms that are designed to process data that is too large to fit into a single computer memory that support partial_fit method.

In [20]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1
-- Epoch 1
Norm: 10.20, NNZs: 77, Bias: -4.000000, T: 33500, Avg. loss: 0.001433
Total training time: 0.73 seconds.
-- Epoch 1
Norm: 10.34, NNZs: 101, Bias: -3.000000, T: 33500, Avg. loss: 0.002030
Total training time: 0.76 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s


Norm: 50.47, NNZs: 1434, Bias: -5.000000, T: 33500, Avg. loss: 0.042269
Total training time: 0.78 seconds.
-- Epoch 1
Norm: 40.36, NNZs: 932, Bias: -3.000000, T: 33500, Avg. loss: 0.020746
Total training time: 0.76 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.6s


Norm: 7.21, NNZs: 49, Bias: -2.000000, T: 33500, Avg. loss: 0.000955
Total training time: 0.73 seconds.
-- Epoch 1
Norm: 42.87, NNZs: 1146, Bias: -4.000000, T: 33500, Avg. loss: 0.036328
Total training time: 0.81 seconds.
-- Epoch 1
Norm: 35.41, NNZs: 871, Bias: -4.000000, T: 33500, Avg. loss: 0.024149
Total training time: 0.76 seconds.
-- Epoch 1
Norm: 35.31, NNZs: 695, Bias: -3.000000, T: 33500, Avg. loss: 0.017612
Total training time: 0.76 seconds.
-- Epoch 1
Norm: 9.70, NNZs: 80, Bias: -4.000000, T: 33500, Avg. loss: 0.001224
Total training time: 0.76 seconds.
-- Epoch 1
Norm: 9.64, NNZs: 76, Bias: -3.000000, T: 33500, Avg. loss: 0.001164
Total training time: 0.74 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.8s


Norm: 24.43, NNZs: 417, Bias: -3.000000, T: 33500, Avg. loss: 0.010478
Total training time: 0.79 seconds.
-- Epoch 1
Norm: 8.66, NNZs: 63, Bias: -3.000000, T: 33500, Avg. loss: 0.001313
Total training time: 0.79 seconds.
-- Epoch 1
Norm: 6.16, NNZs: 27, Bias: -2.000000, T: 33500, Avg. loss: 0.000209
Total training time: 0.73 seconds.
-- Epoch 1
Norm: 38.47, NNZs: 860, Bias: -6.000000, T: 33500, Avg. loss: 0.023343
Total training time: 0.76 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.4s


Norm: 44.52, NNZs: 1248, Bias: -6.000000, T: 33500, Avg. loss: 0.028746
Total training time: 0.77 seconds.
-- Epoch 1
Norm: 21.84, NNZs: 341, Bias: -3.000000, T: 33500, Avg. loss: 0.009881
Total training time: 0.78 seconds.
Norm: 51.33, NNZs: 1478, Bias: 3.000000, T: 33500, Avg. loss: 0.046090
Total training time: 0.67 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:    6.8s finished


Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter_no_change=5, n_jobs=-1,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=10, warm_start=False)

In [21]:
#DNN with 50,000 words training
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

       B-art       0.75      0.17      0.27        18
       B-eve       0.00      0.00      0.00        14
       B-geo       0.76      0.39      0.52       476
       B-gpe       0.26      0.82      0.39       320
       B-nat       0.00      0.00      0.00         3
       B-org       0.56      0.45      0.50       301
       B-per       0.68      0.47      0.55       258
       B-tim       0.86      0.75      0.81       272
       I-art       0.00      0.00      0.00         5
       I-eve       0.67      0.36      0.47        11
       I-geo       0.36      0.60      0.45       115
       I-gpe       0.00      0.00      0.00        11
       I-nat       0.50      0.33      0.40         3
       I-org       0.81      0.23      0.36       200
       I-per       0.86      0.08      0.15       306
       I-tim       0.44      0.09      0.15        90

   micro avg       0.48      0.44      0.46      2403
   macro avg       0.47   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#From ebook with 100,000 words
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

             precision    recall  f1-score   support

      B-art       0.15      0.12      0.14        24
      B-eve       0.46      0.32      0.37        19
      B-geo       0.42      0.91      0.57      1085
      B-gpe       0.89      0.78      0.83       556
      B-nat       0.11      0.25      0.15        12
      B-org       0.55      0.35      0.43       589
      B-per       0.72      0.43      0.53       564
      B-tim       0.65      0.78      0.71       611
      I-art       0.02      0.08      0.03        12
      I-eve       0.00      0.00      0.00        18
      I-geo       0.81      0.32      0.46       230
      I-gpe       0.00      0.00      0.00        14
      I-nat       0.50      0.50      0.50         2
      I-org       0.71      0.41      0.52       445
      I-per       0.76      0.20      0.32       591
      I-tim       0.26      0.05      0.09       194

avg / total       0.62      0.55      0.53      4966



  'precision', 'predicted', average, warn_for)


### Linear classifiers with SGD training

In [22]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

       B-art       0.33      0.33      0.33        18
       B-eve       1.00      0.14      0.25        14
       B-geo       0.48      0.87      0.62       476
       B-gpe       0.86      0.56      0.68       320
       B-nat       0.00      0.00      0.00         3
       B-org       0.67      0.38      0.48       301
       B-per       0.70      0.48      0.57       258
       B-tim       0.72      0.72      0.72       272
       I-art       0.00      0.00      0.00         5
       I-eve       0.03      0.45      0.05        11
       I-geo       0.63      0.45      0.53       115
       I-gpe       0.00      0.00      0.00        11
       I-nat       0.00      0.00      0.00         3
       I-org       0.80      0.18      0.29       200
       I-per       0.79      0.37      0.51       306
       I-tim       0.55      0.07      0.12        90

   micro avg       0.57      0.52      0.54      2403
   macro avg       0.47   

  _warn_prf(average, modifier, msg_start, len(result))


### Naive Bayes classifier for multinomial models

In [24]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [25]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.09      0.28      0.13        18
       B-eve       0.29      0.29      0.29        14
       B-geo       0.66      0.58      0.62       476
       B-gpe       0.65      0.73      0.68       320
       B-nat       0.18      0.67      0.29         3
       B-org       0.48      0.48      0.48       301
       B-per       0.38      0.48      0.42       258
       B-tim       0.60      0.72      0.65       272
       I-art       0.08      0.20      0.12         5
       I-eve       0.47      0.64      0.54        11
       I-geo       0.43      0.49      0.46       115
       I-gpe       0.00      0.00      0.00        11
       I-nat       0.00      0.00      0.00         3
       I-org       0.47      0.48      0.48       200
       I-per       0.52      0.45      0.49       306
       I-tim       0.16      0.26      0.20        90

   micro avg       0.50      0.54      0.52      2403
   macro avg       0.34   

### Passive Aggressive Classifier

In [26]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [27]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        18
       B-eve       0.00      0.00      0.00        14
       B-geo       0.87      0.14      0.25       476
       B-gpe       0.97      0.57      0.72       320
       B-nat       0.00      0.00      0.00         3
       B-org       0.18      0.92      0.30       301
       B-per       0.91      0.34      0.50       258
       B-tim       0.81      0.73      0.77       272
       I-art       0.00      0.00      0.00         5
       I-eve       0.00      0.00      0.00        11
       I-geo       0.80      0.21      0.33       115
       I-gpe       0.00      0.00      0.00        11
       I-nat       1.00      0.33      0.50         3
       I-org       0.69      0.06      0.10       200
       I-per       0.84      0.13      0.23       306
       I-tim       1.00      0.04      0.09        90

   micro avg       0.39      0.37      0.38      2403
   macro avg       0.50   

None of the above classifiers produced satisfying results. It is obvious that it is not going to be easy to classify named entities using regular classifiers.

### Conditional Random Fields (CRFs)

CRFs is often used for labeling or parsing of sequential data, such as natural language processing and CRFs find applications in POS Tagging, named entity recognition, among others.
sklearn-crfsuite
We will train a CRF model for named entity recognition using sklearn-crfsuite on our data set.

In [29]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.0MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [30]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

#### Get sentences

In [31]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [32]:
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
49995,Sentence: 2270,pushed,VBN,O
49996,Sentence: 2270,by,IN,O
49997,Sentence: 2270,South,NNP,B-geo
49998,Sentence: 2270,Africa,NNP,I-geo


In [33]:
getter = SentenceGetter(df)
getter

<__main__.SentenceGetter at 0x7f2cfa3b4898>

In [40]:
sent = getter.get_next()
sent[:20]

[('The', 'DT', 'O'),
 ('protest', 'NN', 'O'),
 ('comes', 'VBZ', 'O'),
 ('on', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('eve', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('annual', 'JJ', 'O'),
 ('conference', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('Britain', 'NNP', 'B-geo'),
 ("'s", 'POS', 'O'),
 ('ruling', 'VBG', 'O'),
 ('Labor', 'NNP', 'B-org'),
 ('Party', 'NNP', 'I-org'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('southern', 'JJ', 'O'),
 ('English', 'JJ', 'B-gpe')]

In [36]:
sentences = getter.sentences
sentences[:3]

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Iranian', 'JJ', 'B-gpe'),
  ('officials', 'NNS', 'O'),
  ('say', 'VBP', 'O'),
  ('they', 'PRP', 'O'),
  ('expect', 'VBP', 'O'),
  ('to', 'TO', 'O'),
  ('get', 'VB', 'O'),
  ('access', 'NN', 'O'),
  ('to', 'TO', 'O'),
  ('sealed', 'JJ', 'O'),
  ('sensitive', 'JJ', 'O'),
  ('parts', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('plant', 'NN', 'O'),
  ('Wednesday', 'NNP', 'B-tim'),
  ('

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklear-crfsuite format - each sentence should be converted to a list of dicts.

In [41]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

The above code were taken from sklearn-crfsuite official site.

Split train and test sets.

In [42]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [46]:
sentences[:1]

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')]]

In [47]:
X[:1]

[[{'+1:postag': 'IN',
   '+1:postag[:2]': 'IN',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'of',
   'BOS': True,
   'bias': 1.0,
   'postag': 'NNS',
   'postag[:2]': 'NN',
   'word.isdigit()': False,
   'word.istitle()': True,
   'word.isupper()': False,
   'word.lower()': 'thousands',
   'word[-2:]': 'ds',
   'word[-3:]': 'nds'},
  {'+1:postag': 'NNS',
   '+1:postag[:2]': 'NN',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'demonstrators',
   '-1:postag': 'NNS',
   '-1:postag[:2]': 'NN',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:word.lower()': 'thousands',
   'bias': 1.0,
   'postag': 'IN',
   'postag[:2]': 'IN',
   'word.isdigit()': False,
   'word.istitle()': False,
   'word.isupper()': False,
   'word.lower()': 'of',
   'word[-2:]': 'of',
   'word[-3:]': 'of'},
  {'+1:postag': 'VBP',
   '+1:postag[:2]': 'VB',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:w

In [48]:
y[:2]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [61]:
print(len(X_train)) #1520 sentences
print(len(X_train[0])) # first sentences have 31 words
print(len(X_train[0][0]))# first word have 15 features
print(len(X_train[0][1]))# second word have 19 features 
print(len(y_train)) # 1520 labels
print(len(y_train[0])) # 31 label for 31 words in first sentence
print(len(X_test))
print(len(X_test[0]))

1520
31
15
19
1520
31
750
21


In [58]:
X_train[:1]

[[{'+1:postag': 'VBD',
   '+1:postag[:2]': 'VB',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'cited',
   'BOS': True,
   'bias': 1.0,
   'postag': 'NNS',
   'postag[:2]': 'NN',
   'word.isdigit()': False,
   'word.istitle()': True,
   'word.isupper()': False,
   'word.lower()': 'officials',
   'word[-2:]': 'ls',
   'word[-3:]': 'als'},
  {'+1:postag': 'NN',
   '+1:postag[:2]': 'NN',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'warming',
   '-1:postag': 'NNS',
   '-1:postag[:2]': 'NN',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:word.lower()': 'officials',
   'bias': 1.0,
   'postag': 'VBD',
   'postag[:2]': 'VB',
   'word.isdigit()': False,
   'word.istitle()': False,
   'word.isupper()': False,
   'word.lower()': 'cited',
   'word[-2:]': 'ed',
   'word[-3:]': 'ted'},
  {'+1:postag': 'NNS',
   '+1:postag[:2]': 'NN',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1

In [62]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [63]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.7467577210509984

In [64]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.50      0.05      0.10        19
       B-eve       0.71      0.29      0.42        17
       B-geo       0.67      0.86      0.75       475
       B-gpe       0.83      0.73      0.78       334
       B-nat       0.00      0.00      0.00         5
       B-org       0.71      0.57      0.63       354
       B-per       0.79      0.79      0.79       243
       B-tim       0.87      0.86      0.87       291
       I-art       0.00      0.00      0.00        12
       I-eve       1.00      0.25      0.40        16
       I-geo       0.65      0.65      0.65        92
       I-gpe       0.00      0.00      0.00         9
       I-nat       0.00      0.00      0.00         1
       I-org       0.77      0.76      0.76       254
       I-per       0.84      0.93      0.88       283
       I-tim       0.71      0.62      0.66        88

   micro avg       0.76      0.75      0.76      2493
   macro avg       0.56   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 17.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022C766AD048>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022C76751978>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim']),
   

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.0036898984638244928, 'c2': 0.11585183551331574}
best CV score: 0.7737211773297741
model size: 1.30M


In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-art       1.00      0.03      0.07        29
      B-eve       0.83      0.21      0.33        24
      B-geo       0.75      0.87      0.81      1043
      B-gpe       0.88      0.78      0.83       588
      B-nat       0.67      0.20      0.31        10
      B-org       0.74      0.63      0.68       649
      B-per       0.81      0.80      0.81       546
      B-tim       0.90      0.84      0.87       589
      I-art       0.00      0.00      0.00         7
      I-eve       0.67      0.22      0.33        18
      I-geo       0.67      0.71      0.69       204
      I-gpe       0.39      0.53      0.45        17
      I-nat       1.00      0.50      0.67         2
      I-org       0.78      0.72      0.75       545
      I-per       0.81      0.89      0.85       574
      I-tim       0.79      0.66      0.72       185

avg / total       0.80      0.78      0.78      5030



In [65]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-geo  -> I-geo   6.025124
B-per  -> I-per   5.752572
I-tim  -> I-tim   5.537304
B-org  -> I-org   5.377582
I-org  -> I-org   5.225950
B-tim  -> I-tim   5.203619
B-gpe  -> I-gpe   4.811577
I-per  -> I-per   4.570931
B-art  -> I-art   4.515261
B-eve  -> I-eve   4.346047
I-gpe  -> I-gpe   4.146536
I-geo  -> I-geo   4.096209
I-art  -> I-art   3.888031
O      -> O       3.750474
B-nat  -> I-nat   3.301682
I-eve  -> I-eve   3.097023
B-org  -> B-art   2.426228
I-nat  -> I-nat   1.998332
O      -> B-eve   1.883422
O      -> B-per   1.803690

Top unlikely transitions:
I-geo  -> B-per   -1.034029
B-org  -> I-per   -1.076155
I-tim  -> B-tim   -1.083762
B-geo  -> I-org   -1.174412
B-geo  -> I-per   -1.183731
B-gpe  -> I-org   -1.188898
I-org  -> I-per   -1.284712
B-org  -> B-org   -1.326719
B-gpe  -> I-geo   -1.380641
B-tim  -> B-tim   -1.421861
O      -> I-art   -1.550078
B-tim  -> B-gpe   -1.830413
B-geo  -> B-per   -2.019120
B-per  -> B-per   -2.047799
B-gpe  -> B-gpe  

It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely.

In [66]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.507773 B-tim    word[-2:]:0s
4.825628 B-tim    word[-3:]:day
4.656911 O        bias
4.545841 O        BOS
3.855805 B-tim    word[-2:]:ay
3.588437 B-gpe    word.istitle()
3.468236 O        word.lower():jewish
3.242285 O        word.lower():kurdish
3.134375 B-tim    +1:word.lower():last
3.091690 I-tim    word[-3:]:day
2.953323 B-org    +1:word.lower():interim
2.876845 B-org    word.lower():al-qaida
2.871956 B-tim    word[-3:]:ber
2.845636 B-org    word.lower():parliament
2.814774 B-org    -1:word.lower():extremist
2.795836 B-gpe    word.lower():bosnia-herzegovina
2.775126 B-geo    -1:word.lower():sponsored
2.765393 B-per    word.lower():prime
2.712992 B-geo    +1:word.lower():province
2.685214 B-geo    +1:word.lower():palestinian
2.680229 O        -1:word.lower():prime
2.657546 B-per    word.lower():obama
2.655386 O        word[-2:]:ty
2.655353 B-gpe    -1:word.lower():recognize
2.638968 B-tim    word.lower():midnight
2.629440 B-org    -1:word.lower():nepal
2.616242 B-org

Observations: 

1). __```5.183603 B-tim word[-3]:day```__
The model learns that if a nearby word was “day” then the token is likely a part of a Time indicator.

2). __```3.370614 B-per word.lower():president```__
The model learns that token "president" is likely to be at the beginning of a person name.

3). __```-3.521244 O postag:NNP```__
The model learns that proper nouns are often entities.

4). __```-3.087828 O word.isdigit()```__
Digits are likely entities.

5). __```-3.233526 O word.istitle()```__
TitleCased words are likely entities.

### ELI5

ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models.

In [68]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 11.0MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.8MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.0MB/s eta 0:00:01[K     |████████████▍                   | 40kB 2.3MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.2MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 2.9MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [69]:
import eli5

eli5.show_weights(crf, top=10)



From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.75,1.159,-1.55,1.883,-0.856,1.673,-3.154,0.63,-0.865,0.001,-0.546,1.392,-3.307,1.804,-2.772,1.59,-2.981
B-art,-0.117,0.0,4.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016,-0.36,-0.102,-0.204,0.0
I-art,-0.066,0.0,3.888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.253,0.0,0.0,0.0
B-eve,-0.603,0.0,0.0,0.0,4.346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.207,-0.29,0.0,-0.34,0.0
I-eve,-0.08,0.0,0.0,-0.533,3.097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.043,0.0,0.0,0.0
B-geo,0.594,0.0,-0.403,0.0,-0.499,-0.477,6.025,1.06,-0.658,0.0,-0.236,-1.01,-1.174,-2.019,-1.184,1.492,-0.629
I-geo,0.127,0.0,-0.231,0.0,-0.038,-0.292,4.096,0.0,-0.125,0.0,0.0,-0.129,-0.628,-1.034,-0.611,0.0,-0.512
B-gpe,1.039,0.0,-0.363,0.0,-0.27,-0.592,-1.381,-2.106,4.812,0.0,0.0,1.774,-1.189,0.484,-0.962,-0.938,-0.455
I-gpe,-0.645,0.0,0.0,0.0,0.0,0.416,-0.023,-0.044,4.147,0.0,0.0,0.0,-0.05,-0.491,0.0,0.0,0.0
B-nat,-0.318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.302,0.0,0.0,-0.037,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+4.657,bias,,,,,,,,,,,,,,,
+4.546,BOS,,,,,,,,,,,,,,,
+3.468,word.lower():jewish,,,,,,,,,,,,,,,
+3.242,word.lower():kurdish,,,,,,,,,,,,,,,
+2.680,-1:word.lower():prime,,,,,,,,,,,,,,,
+2.655,word[-2:]:ty,,,,,,,,,,,,,,,
… 926 more positive …,… 926 more positive …,,,,,,,,,,,,,,,
… 541 more negative …,… 541 more negative …,,,,,,,,,,,,,,,
-2.969,+1:word.lower():years,,,,,,,,,,,,,,,
-3.264,+1:word.lower():months,,,,,,,,,,,,,,,

Weight?,Feature
+4.657,bias
+4.546,BOS
+3.468,word.lower():jewish
+3.242,word.lower():kurdish
+2.680,-1:word.lower():prime
+2.655,word[-2:]:ty
… 926 more positive …,… 926 more positive …
… 541 more negative …,… 541 more negative …
-2.969,+1:word.lower():years
-3.264,+1:word.lower():months

Weight?,Feature
+1.696,word[-3:]:oxx
+1.696,word[-2:]:xx
+1.696,word.lower():vioxx
+1.646,-1:word.lower():tamilnet
+1.516,word.lower():dodge
+1.420,word.lower():english
+1.361,word.lower():chrysler
+1.346,word[-3:]:ler
+1.333,word.lower():jeep
… 145 more positive …,… 145 more positive …

Weight?,Feature
+1.457,+1:word.lower():airport
+1.215,+1:postag[:2]:NN
+0.959,-1:word.lower():endless
+0.959,word.lower():wire
+0.957,-1:word.lower():international
+0.836,word[-3:]:ire
+0.813,word[-2:]:re
+0.795,-1:word.lower():france
+0.774,word.lower():handelsblad
+0.774,-1:word.lower():nrc

Weight?,Feature
+2.140,-1:word.lower():war
+2.056,word[-3:]:pic
+2.056,word.lower():olympic
+1.420,word.lower():ii
+1.420,word[-3:]:II
+1.414,word[-2:]:II
+1.350,-1:word.lower():since
+1.227,+1:word.lower():war
+1.084,+1:word.lower():open
+1.063,word[-3:]:ian

Weight?,Feature
+1.005,word.lower():open
+1.004,word[-3:]:pen
+0.994,-1:word.istitle()
+0.871,postag:NNPS
+0.811,word[-3:]:mes
+0.796,word[-3:]:War
+0.793,word.lower():war
+0.786,word.lower():games
+0.782,-1:postag[:2]:NN
+0.765,-1:word.lower():australian

Weight?,Feature
+2.775,-1:word.lower():sponsored
+2.713,+1:word.lower():province
+2.685,+1:word.lower():palestinian
+2.177,word[-3:]:the
+2.100,-1:word.lower():west
+2.011,-1:word.lower():near
+2.003,word.lower():china
+1.950,word.lower():ukraine
+1.925,-1:word.lower():neighboring
+1.889,-1:word.lower():in

Weight?,Feature
+2.043,+1:word.lower():peacekeepers
+1.870,-1:word.lower():western
+1.817,word[-3:]:nds
+1.787,word[-2:]:ds
+1.696,word.lower():city
+1.623,-1:postag:JJ
+1.601,+1:word.lower():space
+1.576,+1:word.lower():during
+1.566,-1:postag[:2]:JJ
+1.514,word[-3:]:ast

Weight?,Feature
+3.588,word.istitle()
+2.796,word.lower():bosnia-herzegovina
+2.655,-1:word.lower():recognize
+2.529,+1:word.lower():representative
+2.519,word[-3:]:ans
+2.221,-1:word.lower():strike
+2.210,word.lower():thailand
+2.120,postag:NNS
+2.031,word[-3:]:pal
+2.031,word.lower():nepal

Weight?,Feature
+1.981,+1:word.lower():returned
+1.845,word[-3:]:can
+1.498,-1:postag:NNP
+1.129,word.lower():african
+1.119,-1:word.lower():state
+1.099,+1:word.lower():iraq
+1.052,postag[:2]:JJ
+1.027,+1:word.lower():health
+1.009,postag:JJ
… 90 more positive …,… 90 more positive …

Weight?,Feature
+1.696,word.isupper()
+1.587,word.lower():h5n1
+1.587,word[-3:]:5N1
+1.587,word[-2:]:N1
+1.584,word.lower():katrina
+1.441,+1:word.lower():had
+1.330,+1:word.lower():katrina
+1.278,word[-2:]:TB
+1.278,word.lower():xdr-tb
+1.278,word[-3:]:-TB

Weight?,Feature
+1.205,word.lower():katrina
+1.184,-1:word.lower():hurricane
+0.948,word[-3:]:ina
+0.941,word[-2:]:na
+0.885,-1:postag:NNP
+0.842,-1:word.lower():jing
+0.831,word.lower():jing
+0.830,-1:word.istitle()
+0.741,word[-3:]:ing
+0.674,-1:word.lower():respiratory

Weight?,Feature
+2.953,+1:word.lower():interim
+2.877,word.lower():al-qaida
+2.846,word.lower():parliament
+2.815,-1:word.lower():extremist
+2.629,-1:word.lower():nepal
+2.616,word[-3:]:ban
+2.581,word.lower():hamas
+2.520,+1:word.lower():citizenship
+2.452,word.lower():senate
+2.392,word[-3:]:wat

Weight?,Feature
+2.205,-1:word.lower():mediterranean
+1.889,-1:word.lower():qaida
+1.739,+1:word.lower():accuses
+1.729,+1:word.lower():fighters
+1.695,-1:word.lower():television
+1.561,+1:postag:VBP
+1.484,word.lower():committee-chairman
+1.480,+1:word.lower():position
+1.467,+1:word.lower():mr.
… 661 more positive …,… 661 more positive …

Weight?,Feature
+2.765,word.lower():prime
+2.658,word.lower():obama
+2.425,word.lower():jupiter
+2.397,-1:word.lower():inspector
+2.244,BOS
+1.985,word.lower():khayam
+1.985,word[-3:]:yam
+1.937,word[-3:]:ime
+1.924,word.lower():secretary
+1.841,word.lower():bush

Weight?,Feature
+2.561,-1:word.lower():president
+2.076,-1:postag:NN
+1.485,word[-2:]:li
+1.485,word.lower():condoleezza
+1.435,+1:word.lower():condoleezza
+1.319,+1:word.lower():donald
+1.309,+1:word.lower():george
… 502 more positive …,… 502 more positive …
… 73 more negative …,… 73 more negative …
-1.317,-1:word.lower():prime

Weight?,Feature
+5.508,word[-2:]:0s
+4.826,word[-3:]:day
+3.856,word[-2:]:ay
+3.134,+1:word.lower():last
+2.872,word[-3:]:ber
+2.639,word.lower():midnight
+2.615,+1:word.lower():year
+2.614,word.lower():january
+2.584,word.isdigit()
+2.506,-1:word.lower():last

Weight?,Feature
+3.092,word[-3:]:day
+2.221,word[-2:]:ay
+2.143,+1:word.lower():the
+2.098,word.lower():quarter
+1.960,word.isdigit()
+1.898,-1:word.lower():since
+1.717,word.lower():decades
+1.699,-1:word.lower():end
+1.697,word.lower():century
+1.648,+1:word.lower():months


It does make sense that I-entity must follow B-entity, such as I-geo follows B-geo, I-org follows B-org, I-per follows B-per, and so on. 

We can also see that it is not common in this dataset to have a person right after an organization name (B-org -> I-per has a large negative weight).

If we regularize CRF more, we can expect that only features which are generic will remain, and memoized tokens will go. Let’s check what effect does regularization have on CRF weights:

In [70]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=200,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)
eli5.show_weights(crf, top=10)



From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,1.026,0.0,0.0,0.0,0.0,1.118,0.0,0.078,0.0,0.0,0.0,0.678,0.0,0.0,0.0,0.979,0.0
B-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-geo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-geo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-nat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
+4.808,bias,,,,,,,,,,,,,,,
+0.584,BOS,,,,,,,,,,,,,,,
+0.400,-1:postag[:2]:NN,,,,,,,,,,,,,,,
-2.037,word.istitle(),,,,,,,,,,,,,,,
-3.835,postag:NNP,,,,,,,,,,,,,,,
+0.814,postag:NNP,,,,,,,,,,,,,,,
+0.282,-1:postag:IN,,,,,,,,,,,,,,,
+0.282,-1:postag[:2]:IN,,,,,,,,,,,,,,,
+0.207,word.istitle(),,,,,,,,,,,,,,,
+1.584,postag:JJ,,,,,,,,,,,,,,,

Weight?,Feature
4.808,bias
0.584,BOS
0.4,-1:postag[:2]:NN
-2.037,word.istitle()
-3.835,postag:NNP

Weight?,Feature
0.814,postag:NNP
0.282,-1:postag:IN
0.282,-1:postag[:2]:IN
0.207,word.istitle()

Weight?,Feature
1.584,postag:JJ
1.554,postag[:2]:JJ
0.6,word.istitle()

Weight?,Feature
0.665,postag:NNP

Weight?,Feature
0.525,-1:postag:NNP
0.467,-1:word.istitle()
0.297,-1:postag[:2]:NN

Weight?,Feature
0.374,+1:postag:NNP
0.078,postag:NNP

Weight?,Feature
0.716,-1:postag:NNP
0.489,-1:postag[:2]:NN
0.439,-1:word.istitle()

Weight?,Feature
0.981,word[-2:]:ay
0.863,word[-3:]:day


In [71]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train);
eli5.show_weights(crf, top=5, show=['transition_features'])



From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,3.75,1.159,-1.55,1.883,-0.856,1.673,-3.154,0.63,-0.865,0.001,-0.546,1.392,-3.307,1.804,-2.772,1.59,-2.981
B-art,-0.117,0.0,4.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016,-0.36,-0.102,-0.204,0.0
I-art,-0.066,0.0,3.888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.253,0.0,0.0,0.0
B-eve,-0.603,0.0,0.0,0.0,4.346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.207,-0.29,0.0,-0.34,0.0
I-eve,-0.08,0.0,0.0,-0.533,3.097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.043,0.0,0.0,0.0
B-geo,0.594,0.0,-0.403,0.0,-0.499,-0.477,6.025,1.06,-0.658,0.0,-0.236,-1.01,-1.174,-2.019,-1.184,1.492,-0.629
I-geo,0.127,0.0,-0.231,0.0,-0.038,-0.292,4.096,0.0,-0.125,0.0,0.0,-0.129,-0.628,-1.034,-0.611,0.0,-0.512
B-gpe,1.039,0.0,-0.363,0.0,-0.27,-0.592,-1.381,-2.106,4.812,0.0,0.0,1.774,-1.189,0.484,-0.962,-0.938,-0.455
I-gpe,-0.645,0.0,0.0,0.0,0.0,0.416,-0.023,-0.044,4.147,0.0,0.0,0.0,-0.05,-0.491,0.0,0.0,0.0
B-nat,-0.318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.302,0.0,0.0,-0.037,0.0,0.0,0.0


The model learned large negative weights for impossible transitions like O -> I-geo, O -> I-org and O -> I-tim, and so on.

In order to easy to read, we can check only a subset of tags.

In [72]:
eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])



From \ To,O,B-org,I-per
O,3.75,1.392,-2.772
B-org,0.217,-1.327,-1.076
I-per,0.067,-0.755,4.571

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+4.657,bias,
+4.546,BOS,
+3.468,word.lower():jewish,
+3.242,word.lower():kurdish,
+2.680,-1:word.lower():prime,
+2.655,word[-2:]:ty,
… 926 more positive …,… 926 more positive …,
… 541 more negative …,… 541 more negative …,
-2.969,+1:word.lower():years,
-3.264,+1:word.lower():months,

Weight?,Feature
+4.657,bias
+4.546,BOS
+3.468,word.lower():jewish
+3.242,word.lower():kurdish
+2.680,-1:word.lower():prime
+2.655,word[-2:]:ty
… 926 more positive …,… 926 more positive …
… 541 more negative …,… 541 more negative …
-2.969,+1:word.lower():years
-3.264,+1:word.lower():months

Weight?,Feature
+2.953,+1:word.lower():interim
+2.877,word.lower():al-qaida
+2.846,word.lower():parliament
+2.815,-1:word.lower():extremist
+2.629,-1:word.lower():nepal
+2.616,word[-3:]:ban
+2.581,word.lower():hamas
+2.520,+1:word.lower():citizenship
+2.452,word.lower():senate
+2.392,word[-3:]:wat

Weight?,Feature
+2.561,-1:word.lower():president
+2.076,-1:postag:NN
+1.485,word[-2:]:li
+1.485,word.lower():condoleezza
+1.435,+1:word.lower():condoleezza
+1.319,+1:word.lower():donald
+1.309,+1:word.lower():george
… 502 more positive …,… 502 more positive …
… 73 more negative …,… 73 more negative …
-1.317,-1:word.lower():prime


Or check only some of the features for all tags.

In [73]:
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])



Weight?,Feature
-2.025,word.isdigit()
-2.099,word.isupper()
-3.445,word.istitle()

Weight?,Feature
0.097,word.istitle()
-0.316,word.isupper()

Weight?,Feature
0.65,word.isdigit()
0.126,word.istitle()
0.01,word.isupper()

Weight?,Feature
0.835,word.isupper()
-0.002,word.istitle()

Weight?,Feature
0.562,word.isupper()
0.151,word.istitle()

Weight?,Feature
1.783,word.istitle()
-0.073,word.isupper()

Weight?,Feature
1.012,word.istitle()
-0.467,word.isupper()

Weight?,Feature
3.588,word.istitle()
0.788,word.isupper()

Weight?,Feature
0.447,word.istitle()

Weight?,Feature
1.696,word.isupper()
-0.001,word.istitle()

Weight?,Feature
0.02,word.istitle()

Weight?,Feature
2.285,word.isupper()
-0.212,word.istitle()

Weight?,Feature
0.684,word.istitle()
-0.132,word.isdigit()
-0.304,word.isupper()

Weight?,Feature
0.167,word.istitle()
-0.516,word.isupper()

Weight?,Feature
0.281,word.istitle()
-0.057,word.isupper()

Weight?,Feature
2.584,word.isdigit()
0.039,word.istitle()
-0.773,word.isupper()

Weight?,Feature
1.96,word.isdigit()
-0.593,word.istitle()
-0.594,word.isupper()
