This notebook illustrates how to train a NER model using the well known CONLL dataset, and sklearn_crfsuite library. 

### Importing Necessary Libraries

In [None]:
#Make the necessary imports
from nltk.tag import pos_tag
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline
import string


### Loading The Data

In [41]:
"""
Load the training/testing data. 
input: conll format data, but with only 2 tab separated colums - words and NEtags.
output: A list where each item is 2 lists.  sentence as a list of tokens, NER tags as a list for each token.
"""
def load__data_conll(file_path):
    myoutput,words,tags = [],[],[]
    fh = open(file_path)
    for line in fh:
        line = line.strip()
        if "\t" not in line:
            #Sentence ended.
            myoutput.append([words,tags])
            words,tags = [],[]
        else:
            word, tag = line.split("\t")
            words.append(word)
            tags.append(tag)
    fh.close()
    return myoutput


In [40]:
"""
Get features for all words in the sentence
Features:
- word context: a window of 2 words on either side of the current word, and current word.
- POS context: a window of 2 POS tags on either side of the current word, and current tag. 
input: sentence as a list of tokens.
output: list of dictionaries. each dict represents features for that word.
"""
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence) #This format is specific to this POS tagger!
    for i in range(0,len(sentence)):
        word = sentence[i]
        wordfeats = {}
       #word features: word, prev 2 words, next 2 words in the sentence.
        wordfeats['word'] = word
        if i == 0:
            wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        elif i==1:
            wordfeats["prevWord"] = sentence[0]
            wordfeats["prevSecondWord"] = "</S>"
        else:
            wordfeats["prevWord"] = sentence[i-1]
            wordfeats["prevSecondWord"] = sentence[i-2]
        #next two words as features
        if i == len(sentence)-2:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = "</S>"
        elif i==len(sentence)-1:
            wordfeats["nextWord"] = "</S>"
            wordfeats["nextNextWord"] = "</S>"
        else:
            wordfeats["nextWord"] = sentence[i+1]
            wordfeats["nextNextWord"] = sentence[i+2]
        
        #POS tag features: current tag, previous and next 2 tags.
        wordfeats['tag'] = sen_tags[i][1]
        if i == 0:
            wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        elif i == 1:
            wordfeats["prevTag"] = sen_tags[0][1]
            wordfeats["prevSecondTag"] = "</S>"
        else:
            wordfeats["prevTag"] = sen_tags[i - 1][1]

            wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
            # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextTag"] = "</S>"
            wordfeats["nextNextTag"] = "</S>"
        else:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = sen_tags[i + 2][1]
        #That is it! You can add whatever you want!
        feats.append(wordfeats)
    return feats


### Extracting Features

In [39]:
#Extract features from the conll data, after loading it.
def get_feats_conll(conll_data):
    feats = []
    labels = []
    for sentence in conll_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

### Training a Model

In [38]:

#Train a sequence model
def train_seq(X_train,Y_train,X_dev,Y_dev):
   # crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50, all_possible_states=True)
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)#, all_possible_states=True)
    #Just to fit on training data
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    #testing:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_f1_score(Y_dev, y_pred,average='weighted', labels=labels))
    print(metrics.flat_classification_report(Y_dev, y_pred, labels=sorted_labels, digits=3))
    #print(metrics.sequence_accuracy_score(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred,labels=sorted_labels)


Confusion Matrix helper function . Source for this function is [here](https://gist.github.com/zachguo/10296432)

In [37]:
def print_cm(cm, labels):
    print("\n")
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(columnwidth) % cm[i, j]
            sum =  sum + int(cell)
            print(cell, end=" ")
        print(sum) #Prints the total number of instances per cat at the end.


In [36]:
#python-crfsuite does not have a confusion matrix function, 
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    trues,preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    print_cm(confusion_matrix(trues,preds,labels),labels)


### Call all our functions inside the main method

In [None]:
from google.colab import files
import requests  
import numpy as np
import pandas as pd

In [None]:
train_path = 'https://raw.githubusercontent.com/duybluemind1988/Data-science/master/Practical%20NLP%20Oreilly/Ch5/Data/conlldata/train.txt'
r1 = requests.get(train_path, stream = True)  
with open("train_1.txt", "wb") as file:  
    for block in r1.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block) 


In [None]:
test_path = 'https://raw.githubusercontent.com/duybluemind1988/Data-science/master/Practical%20NLP%20Oreilly/Ch5/Data/conlldata/test.txt'
r2 = requests.get(test_path, stream = True)  
with open("test_1.txt", "wb") as file:  
    for block in r2.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block) 

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def main():
    train_path = '/content/train_1.txt'
    test_path = '/content/test_1.txt'
    conll_train = load__data_conll(train_path)
    conll_dev = load__data_conll(test_path)
    
    print("Training a Sequence classification model with CRF")
    feats, labels = get_feats_conll(conll_train)
    devfeats, devlabels = get_feats_conll(conll_dev)
    train_seq(feats, labels, devfeats, devlabels)
    print("Done with sequence model")

if __name__=="__main__":
    main()


Training a Sequence classification model with CRF
0.9255103670420659
              precision    recall  f1-score   support

           O      0.973     0.981     0.977     38323
       B-LOC      0.694     0.765     0.728      1668
       I-LOC      0.738     0.482     0.584       257
      B-MISC      0.648     0.309     0.419       702
      I-MISC      0.626     0.505     0.559       216
       B-ORG      0.670     0.561     0.611      1661
       I-ORG      0.551     0.704     0.618       835
       B-PER      0.773     0.766     0.769      1617
       I-PER      0.819     0.886     0.851      1156

    accuracy                          0.928     46435
   macro avg      0.721     0.662     0.679     46435
weighted avg      0.926     0.928     0.926     46435



                O  B-LOC  I-LOC B-MISC I-MISC  B-ORG  I-ORG  B-PER  I-PER 
         O  37579    118      3     22     32    193    224     88     64 38323
     B-LOC    143   1276      1     36      1     95     14     98   

This is pretty good. We already have a model which has an F-score of 92%!!!

# EXPLAIN DNN

In [2]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |▍                               | 10kB 12.5MB/s eta 0:00:01[K     |▉                               | 20kB 8.0MB/s eta 0:00:01[K     |█▎                              | 30kB 2.6MB/s eta 0:00:01[K     |█▊                              | 40kB 3.3MB/s eta 0:00:01[K     |██▏                             | 51kB 3.6MB/s eta 0:00:01[K     |██▋                             | 61kB 3.8MB/s eta 0:00:01[K     |███                             | 71kB 4.3MB/s eta 0:00:01[K     |███▌                            | 81kB 3.2MB/s eta 0:00:01[K     |████                        

In [11]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [12]:
#Make the necessary imports
from nltk.tag import pos_tag
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline
import string
import requests  
import numpy as np
import pandas as pd

train_path = 'https://raw.githubusercontent.com/duybluemind1988/Data-science/master/Practical%20NLP%20Oreilly/Ch5/Data/conlldata/train.txt'
r1 = requests.get(train_path, stream = True)  
with open("train_1.txt", "wb") as file:  
    for block in r1.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block) 
test_path = 'https://raw.githubusercontent.com/duybluemind1988/Data-science/master/Practical%20NLP%20Oreilly/Ch5/Data/conlldata/test.txt'
r2 = requests.get(test_path, stream = True)  
with open("test_1.txt", "wb") as file:  
    for block in r2.iter_content(chunk_size = 1024): 
         if block:  
             file.write(block) 

In [13]:
train_path = '/content/train_1.txt'
f = open(train_path, "r")
print(f.read(200))

EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
lamb	O
.	O

Peter	B-PER
Blackburn	I-PER

BRUSSELS	B-LOC
1996-08-22	O

The	O
European	B-ORG
Commission	I-ORG
said	O
on	O
Thursday	O


In [14]:
#def load__data_conll(file_path):
myoutput_train,words,tags = [],[],[]
fh = open(train_path)
for line in fh:
    line = line.strip()
    if "\t" not in line:
        #Sentence ended.
        myoutput_train.append([words,tags])
        words,tags = [],[]
    else:
        word, tag = line.split("\t")
        words.append(word)
        tags.append(tag)
fh.close()
conll_train=myoutput_train
print(len(myoutput_train))
myoutput_train[0]

14041


[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']]

In [15]:
test_path = '/content/test_1.txt'
f = open(test_path, "r")
print(f.read(200))

SOCCER	O
-	O
JAPAN	B-LOC
GET	O
LUCKY	O
WIN	O
,	O
CHINA	B-PER
IN	O
SURPRISE	O
DEFEAT	O
.	O

Nadim	B-PER
Ladki	I-PER

AL-AIN	B-LOC
,	O
United	B-LOC
Arab	I-LOC
Emirates	I-LOC
1996-12-06	O

Japan	B-LOC
be


In [16]:
#def load__data_conll(file_path):
myoutput_test,words,tags = [],[],[]
fh = open(test_path)
for line in fh:
    line = line.strip()
    if "\t" not in line:
        #Sentence ended.
        myoutput_test.append([words,tags])
        words,tags = [],[]
    else:
        word, tag = line.split("\t")
        words.append(word)
        tags.append(tag)
fh.close()
conll_dev=myoutput_test
print(len(myoutput_test))
myoutput_test[0]
# cu moi myoutput (sentences) se co 2 list: 1 list text va 1 list label

3453


[['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']]

In [17]:
myoutput_test[1]

[['Nadim', 'Ladki'], ['B-PER', 'I-PER']]

In [18]:
conll_data=conll_train
for sentence in conll_data[:3]:
  print('sentence0',sentence[0])
  print('sentence1',sentence[1])

sentence0 ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
sentence1 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
sentence0 ['Peter', 'Blackburn']
sentence1 ['B-PER', 'I-PER']
sentence0 ['BRUSSELS', '1996-08-22']
sentence1 ['B-LOC', 'O']


Prepare train set

In [30]:

#Extract features from the conll data, after loading it.
conll_data=conll_train
feats_ = []
labels = []
for sentence in conll_data:
  #print('For loop 1')
  #def sent2feats(sentence):
  """
Get features for all words in the sentence
Features:
- word context: a window of 2 words on either side of the current word, and current word.
- POS context: a window of 2 POS tags on either side of the current word, and current tag. 
input: sentence as a list of tokens.
output: list of dictionaries. each dict represents features for that word.
"""
  sentence_=sentence[0] #feats
  #print('sentence_[0]',sentence_)
  feats = []
  sen_tags = pos_tag(sentence_) #This format is specific to this POS tagger!
  #print('sen_tags',sen_tags)
  #print('len(sentence_:',len(sentence_))
  for i in range(0,len(sentence_)):
    #print('For loop 2')
    #print('i',i)
    word = sentence_[i]
    #print('word',word)
    wordfeats = {}
    #word features: word, prev 2 words, next 2 words in the sentence.
    wordfeats['word'] = word
    if i == 0:
        wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
    elif i==1:
        wordfeats["prevWord"] = sentence_[0]
        wordfeats["prevSecondWord"] = "</S>"
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
        #print('wordfeats["prevSecondWord"]',wordfeats["prevSecondWord"])
    else:
        wordfeats["prevWord"] = sentence_[i-1]
        wordfeats["prevSecondWord"] = sentence_[i-2]
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
        #print('wordfeats["prevSecondWord"]',wordfeats["prevSecondWord"])
    #next two words as features
    if i == len(sentence_)-2:
        wordfeats["nextWord"] = sentence_[i+1]
        wordfeats["nextNextWord"] = "</S>"
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    elif i==len(sentence_)-1:
        wordfeats["nextWord"] = "</S>"
        wordfeats["nextNextWord"] = "</S>"
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    else:
        wordfeats["nextWord"] = sentence_[i+1]
        wordfeats["nextNextWord"] = sentence_[i+2]
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    
    #POS tag features: current tag, previous and next 2 tags.
    wordfeats['tag'] = sen_tags[i][1]
    if i == 0:
        wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
    elif i == 1:
        wordfeats["prevTag"] = sen_tags[0][1]
        wordfeats["prevSecondTag"] = "</S>"
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
        #print('wordfeats["prevSecondTag"]',wordfeats["prevSecondTag"])
    else:
        wordfeats["prevTag"] = sen_tags[i - 1][1]
        wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
        #print('wordfeats["prevSecondTag"]',wordfeats["prevSecondTag"])
        # next two words as features
    if i == len(sentence_) - 2:
        wordfeats["nextTag"] = sen_tags[i + 1][1]
        wordfeats["nextNextTag"] = "</S>"
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    elif i == len(sentence_) - 1:
        wordfeats["nextTag"] = "</S>"
        wordfeats["nextNextTag"] = "</S>"
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    else:
        wordfeats["nextTag"] = sen_tags[i + 1][1]
        wordfeats["nextNextTag"] = sen_tags[i + 2][1]
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    #That is it! You can add whatever you want!
    feats.append(wordfeats)
    #return feats
    # End func

  feats_.append(feats)
  labels.append(sentence[1])
#return feats, labels

In [31]:
X_train=feats_
#X_train

In [32]:
y_train=labels
#y_train

Prepare test set

In [33]:

#Extract features from the conll data, after loading it.
conll_data=conll_dev
feats_ = []
labels = []
for sentence in conll_data:
  #print('For loop 1')
  #def sent2feats(sentence):
  """
Get features for all words in the sentence
Features:
- word context: a window of 2 words on either side of the current word, and current word.
- POS context: a window of 2 POS tags on either side of the current word, and current tag. 
input: sentence as a list of tokens.
output: list of dictionaries. each dict represents features for that word.
"""
  sentence_=sentence[0] #feats
  #print('sentence_[0]',sentence_)
  feats = []
  sen_tags = pos_tag(sentence_) #This format is specific to this POS tagger!
  #print('sen_tags',sen_tags)
  #print('len(sentence_:',len(sentence_))
  for i in range(0,len(sentence_)):
    #print('For loop 2')
    #print('i',i)
    word = sentence_[i]
    #print('word',word)
    wordfeats = {}
    #word features: word, prev 2 words, next 2 words in the sentence.
    wordfeats['word'] = word
    if i == 0:
        wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
    elif i==1:
        wordfeats["prevWord"] = sentence_[0]
        wordfeats["prevSecondWord"] = "</S>"
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
        #print('wordfeats["prevSecondWord"]',wordfeats["prevSecondWord"])
    else:
        wordfeats["prevWord"] = sentence_[i-1]
        wordfeats["prevSecondWord"] = sentence_[i-2]
        #print('wordfeats["prevWord"]',wordfeats["prevWord"])
        #print('wordfeats["prevSecondWord"]',wordfeats["prevSecondWord"])
    #next two words as features
    if i == len(sentence_)-2:
        wordfeats["nextWord"] = sentence_[i+1]
        wordfeats["nextNextWord"] = "</S>"
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    elif i==len(sentence_)-1:
        wordfeats["nextWord"] = "</S>"
        wordfeats["nextNextWord"] = "</S>"
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    else:
        wordfeats["nextWord"] = sentence_[i+1]
        wordfeats["nextNextWord"] = sentence_[i+2]
        #print('wordfeats["nextWord"]',wordfeats["nextWord"])
        #print('wordfeats["nextNextWord"]',wordfeats["nextNextWord"])
    
    #POS tag features: current tag, previous and next 2 tags.
    wordfeats['tag'] = sen_tags[i][1]
    if i == 0:
        wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
    elif i == 1:
        wordfeats["prevTag"] = sen_tags[0][1]
        wordfeats["prevSecondTag"] = "</S>"
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
        #print('wordfeats["prevSecondTag"]',wordfeats["prevSecondTag"])
    else:
        wordfeats["prevTag"] = sen_tags[i - 1][1]
        wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
        #print('wordfeats["prevTag"]',wordfeats["prevTag"])
        #print('wordfeats["prevSecondTag"]',wordfeats["prevSecondTag"])
        # next two words as features
    if i == len(sentence_) - 2:
        wordfeats["nextTag"] = sen_tags[i + 1][1]
        wordfeats["nextNextTag"] = "</S>"
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    elif i == len(sentence_) - 1:
        wordfeats["nextTag"] = "</S>"
        wordfeats["nextNextTag"] = "</S>"
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    else:
        wordfeats["nextTag"] = sen_tags[i + 1][1]
        wordfeats["nextNextTag"] = sen_tags[i + 2][1]
        #print('wordfeats["nextTag"]',wordfeats["nextTag"])
        #print('wordfeats["nextNextTag"]',wordfeats["nextNextTag"])
    #That is it! You can add whatever you want!
    feats.append(wordfeats)
    #return feats
    # End func

  feats_.append(feats)
  labels.append(sentence[1])
#return feats, labels

In [34]:
X_dev=feats_
y_dev=labels

In [49]:
X_train[0]

[{'nextNextTag': 'JJ',
  'nextNextWord': 'German',
  'nextTag': 'VBZ',
  'nextWord': 'rejects',
  'prevSecondTag': '<S>',
  'prevSecondWord': '<S>',
  'prevTag': '<S>',
  'prevWord': '<S>',
  'tag': 'NNP',
  'word': 'EU'},
 {'nextNextTag': 'NN',
  'nextNextWord': 'call',
  'nextTag': 'JJ',
  'nextWord': 'German',
  'prevSecondTag': '</S>',
  'prevSecondWord': '</S>',
  'prevTag': 'NNP',
  'prevWord': 'EU',
  'tag': 'VBZ',
  'word': 'rejects'},
 {'nextNextTag': 'TO',
  'nextNextWord': 'to',
  'nextTag': 'NN',
  'nextWord': 'call',
  'prevSecondTag': 'NNP',
  'prevSecondWord': 'EU',
  'prevTag': 'VBZ',
  'prevWord': 'rejects',
  'tag': 'JJ',
  'word': 'German'},
 {'nextNextTag': 'VB',
  'nextNextWord': 'boycott',
  'nextTag': 'TO',
  'nextWord': 'to',
  'prevSecondTag': 'VBZ',
  'prevSecondWord': 'rejects',
  'prevTag': 'JJ',
  'prevWord': 'German',
  'tag': 'NN',
  'word': 'call'},
 {'nextNextTag': 'JJ',
  'nextNextWord': 'British',
  'nextTag': 'VB',
  'nextWord': 'boycott',
  'prevSec

In [48]:
print(pd.DataFrame(X_train).shape)
print(len(X_train))
pd.DataFrame(X_train).head()

(14041, 113)
14041


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112
0,"{'word': 'EU', 'prevWord': '<S>', 'prevSecondW...","{'word': 'rejects', 'prevWord': 'EU', 'prevSec...","{'word': 'German', 'prevWord': 'rejects', 'pre...","{'word': 'call', 'prevWord': 'German', 'prevSe...","{'word': 'to', 'prevWord': 'call', 'prevSecond...","{'word': 'boycott', 'prevWord': 'to', 'prevSec...","{'word': 'British', 'prevWord': 'boycott', 'pr...","{'word': 'lamb', 'prevWord': 'British', 'prevS...","{'word': '.', 'prevWord': 'lamb', 'prevSecondW...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,"{'word': 'Peter', 'prevWord': '<S>', 'prevSeco...","{'word': 'Blackburn', 'prevWord': 'Peter', 'pr...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"{'word': 'BRUSSELS', 'prevWord': '<S>', 'prevS...","{'word': '1996-08-22', 'prevWord': 'BRUSSELS',...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,"{'word': 'The', 'prevWord': '<S>', 'prevSecond...","{'word': 'European', 'prevWord': 'The', 'prevS...","{'word': 'Commission', 'prevWord': 'European',...","{'word': 'said', 'prevWord': 'Commission', 'pr...","{'word': 'on', 'prevWord': 'said', 'prevSecond...","{'word': 'Thursday', 'prevWord': 'on', 'prevSe...","{'word': 'it', 'prevWord': 'Thursday', 'prevSe...","{'word': 'disagreed', 'prevWord': 'it', 'prevS...","{'word': 'with', 'prevWord': 'disagreed', 'pre...","{'word': 'German', 'prevWord': 'with', 'prevSe...","{'word': 'advice', 'prevWord': 'German', 'prev...","{'word': 'to', 'prevWord': 'advice', 'prevSeco...","{'word': 'consumers', 'prevWord': 'to', 'prevS...","{'word': 'to', 'prevWord': 'consumers', 'prevS...","{'word': 'shun', 'prevWord': 'to', 'prevSecond...","{'word': 'British', 'prevWord': 'shun', 'prevS...","{'word': 'lamb', 'prevWord': 'British', 'prevS...","{'word': 'until', 'prevWord': 'lamb', 'prevSec...","{'word': 'scientists', 'prevWord': 'until', 'p...","{'word': 'determine', 'prevWord': 'scientists'...","{'word': 'whether', 'prevWord': 'determine', '...","{'word': 'mad', 'prevWord': 'whether', 'prevSe...","{'word': 'cow', 'prevWord': 'mad', 'prevSecond...","{'word': 'disease', 'prevWord': 'cow', 'prevSe...","{'word': 'can', 'prevWord': 'disease', 'prevSe...","{'word': 'be', 'prevWord': 'can', 'prevSecondW...","{'word': 'transmitted', 'prevWord': 'be', 'pre...","{'word': 'to', 'prevWord': 'transmitted', 'pre...","{'word': 'sheep', 'prevWord': 'to', 'prevSecon...","{'word': '.', 'prevWord': 'sheep', 'prevSecond...",,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,"{'word': 'Germany', 'prevWord': '<S>', 'prevSe...","{'word': ''s', 'prevWord': 'Germany', 'prevSec...","{'word': 'representative', 'prevWord': ''s', '...","{'word': 'to', 'prevWord': 'representative', '...","{'word': 'the', 'prevWord': 'to', 'prevSecondW...","{'word': 'European', 'prevWord': 'the', 'prevS...","{'word': 'Union', 'prevWord': 'European', 'pre...","{'word': ''s', 'prevWord': 'Union', 'prevSecon...","{'word': 'veterinary', 'prevWord': ''s', 'prev...","{'word': 'committee', 'prevWord': 'veterinary'...","{'word': 'Werner', 'prevWord': 'committee', 'p...","{'word': 'Zwingmann', 'prevWord': 'Werner', 'p...","{'word': 'said', 'prevWord': 'Zwingmann', 'pre...","{'word': 'on', 'prevWord': 'said', 'prevSecond...","{'word': 'Wednesday', 'prevWord': 'on', 'prevS...","{'word': 'consumers', 'prevWord': 'Wednesday',...","{'word': 'should', 'prevWord': 'consumers', 'p...","{'word': 'buy', 'prevWord': 'should', 'prevSec...","{'word': 'sheepmeat', 'prevWord': 'buy', 'prev...","{'word': 'from', 'prevWord': 'sheepmeat', 'pre...","{'word': 'countries', 'prevWord': 'from', 'pre...","{'word': 'other', 'prevWord': 'countries', 'pr...","{'word': 'than', 'prevWord': 'other', 'prevSec...","{'word': 'Britain', 'prevWord': 'than', 'prevS...","{'word': 'until', 'prevWord': 'Britain', 'prev...","{'word': 'the', 'prevWord': 'until', 'prevSeco...","{'word': 'scientific', 'prevWord': 'the', 'pre...","{'word': 'advice', 'prevWord': 'scientific', '...","{'word': 'was', 'prevWord': 'advice', 'prevSec...","{'word': 'clearer', 'prevWord': 'was', 'prevSe...","{'word': '.', 'prevWord': 'clearer', 'prevSeco...",,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [51]:
print(pd.DataFrame(y_train).shape)
print(len(y_train))
pd.DataFrame(y_train).head()

(14041, 113)
14041


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112
0,B-ORG,O,B-MISC,O,O,O,B-MISC,O,O,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,B-PER,I-PER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,B-LOC,O,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,O,B-ORG,I-ORG,O,O,O,O,O,O,B-MISC,O,O,O,O,O,B-MISC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,B-LOC,O,O,O,O,B-ORG,I-ORG,O,O,O,B-PER,I-PER,O,O,O,O,O,O,O,O,O,O,O,B-LOC,O,O,O,O,O,O,O,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:
# crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50, all_possible_states=True)
crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)#, all_possible_states=True)
#Just to fit on training data
crf.fit(X_train, y_train)
labels = list(crf.classes_)
#testing:
y_pred = crf.predict(X_dev)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_f1_score(y_dev, y_pred,average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=sorted_labels, digits=3))

0.9255103670420659
              precision    recall  f1-score   support

           O      0.973     0.981     0.977     38323
       B-LOC      0.694     0.765     0.728      1668
       I-LOC      0.738     0.482     0.584       257
      B-MISC      0.648     0.309     0.419       702
      I-MISC      0.626     0.505     0.559       216
       B-ORG      0.670     0.561     0.611      1661
       I-ORG      0.551     0.704     0.618       835
       B-PER      0.773     0.766     0.769      1617
       I-PER      0.819     0.886     0.851      1156

    accuracy                          0.928     46435
   macro avg      0.721     0.662     0.679     46435
weighted avg      0.926     0.928     0.926     46435



In [42]:
#print(metrics.sequence_accuracy_score(y_dev, y_pred))
get_confusion_matrix(y_dev, y_pred,labels=sorted_labels)



                O  B-LOC  I-LOC B-MISC I-MISC  B-ORG  I-ORG  B-PER  I-PER 
         O  37579    118      3     22     32    193    224     88     64 38323
     B-LOC    143   1276      1     36      1     95     14     98      4 1668
     I-LOC     32      6    124      1      5      0     52      0     37 257
    B-MISC    344     48      1    217      2     56     13     19      2 702
    I-MISC     58      1      4      4    109      2     29      0      9 216
     B-ORG    265    236      0     48      3    932     20    151      6 1661
     I-ORG     76     15     18      2     15     21    588      8     92 835
     B-PER     86    138      1      5      3     90     44   1238     12 1617
     I-PER     26      1     16      0      4      2     83      0   1024 1156
