<a href="https://colab.research.google.com/github/divyadass/NER/blob/develop_CRF/eda_conll_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This code is adapted from the book *Practical Natural Language Processing*, Chapter 5 (Information Extraction)

In [None]:
import pandas as pd
import numpy as np

In [None]:
def load__data_conll(file_path):
    """
    Load the training/testing data.

    input: path to CoNLL-format data with only 2 tab-separated columns per
           line - word and NE tag. Any line without a tab (normally a blank
           line) marks the end of the current sentence.
    output: a list where each item is 2 lists: the sentence as a list of
            tokens, and the NER tags as a list with one tag per token.

    Runs of separator lines do not produce empty sentences, and the last
    sentence is returned even when the file does not end with a blank line.

    Raises ValueError if a token line has more than two tab-separated fields.
    """
    myoutput, words, tags = [], [], []
    # The context manager closes the file even if a malformed line raises.
    with open(file_path) as fh:
        for line in fh:
            line = line.strip()
            if "\t" not in line:
                # Sentence ended. Only flush when tokens were collected, so
                # repeated separator lines do not emit [[], []] entries.
                if words:
                    myoutput.append([words, tags])
                    words, tags = [], []
            else:
                word, tag = line.split("\t")
                words.append(word)
                tags.append(tag)
    # Flush the final sentence when the file has no trailing separator line.
    if words:
        myoutput.append([words, tags])
    return myoutput

In [None]:
# Load both splits as lists of [tokens, tags] pairs.
# NOTE: the held-out set named `dev` is read from data/test.txt.
train = load__data_conll('data/train.txt')
dev = load__data_conll('data/test.txt')

In [None]:
# Number of loaded training sentences and the container type of the result.
len(train), type(train)

14041

In [None]:
# Number of loaded held-out sentences and the container type of the result.
len(dev), type(dev)

3452

In [None]:
# Preview only the first few sentences; displaying the whole list floods the
# notebook with output.
train[:3]

[[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']],
 [['Peter', 'Blackburn'], ['B-PER', 'I-PER']],
 [['BRUSSELS', '1996-08-22'], ['B-LOC', 'O']],
 [['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   'shun',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']],
 [['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
 

In [None]:
# Preview only the first few sentences; displaying the whole list floods the
# notebook with output.
dev[:3]

[[['SOCCER',
   '-',
   'JAPAN',
   'GET',
   'LUCKY',
   'WIN',
   ',',
   'CHINA',
   'IN',
   'SURPRISE',
   'DEFEAT',
   '.'],
  ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']],
 [['Nadim', 'Ladki'], ['B-PER', 'I-PER']],
 [['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
  ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']],
 [['Japan',
   'began',
   'the',
   'defence',
   'of',
   'their',
   'Asian',
   'Cup',
   'title',
   'with',
   'a',
   'lucky',
   '2-1',
   'win',
   'against',
   'Syria',
   'in',
   'a',
   'Group',
   'C',
   'championship',
   'match',
   'on',
   'Friday',
   '.'],
  ['B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-MISC',
   'I-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']],
 [['But',
   'China',
   'saw',
   'their',
   'luck',
   'desert',
   'them',
   'in',
   'the',
   'second',
   'match',
   'of',
   'the'

In [None]:
!conda install pip
!pip install sklearn_crfsuite

import nltk
# Download the NLTK resource named below (presumably the model pos_tag relies on).
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline
import string
import warnings
# Suppresses every warning from here on, including deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Sanity check: tag a short token list to see the (word, POS tag) pairs pos_tag returns.
pos_tag(['EU', 'rejects', 'German', 'call', 'to', 'boycott'])

[('EU', 'NNP'),
 ('rejects', 'VBZ'),
 ('German', 'JJ'),
 ('call', 'NN'),
 ('to', 'TO'),
 ('boycott', 'VB')]

In [None]:
def _context(items, index):
    """Return items[index], or the "<S>" / "</S>" padding marker when index lies before / after the sequence."""
    if index < 0:
        return "<S>"
    if index >= len(items):
        return "</S>"
    return items[index]


def sent2feats(sentence, tagger=None):
    """
    Get features for all words in the sentence.

    Features:
    - word context: a window of 2 words on either side of the current word,
      and the current word.
    - POS context: a window of 2 POS tags on either side of the current word,
      and the current tag.
    Positions before the sentence start are padded with "<S>" and positions
    past the sentence end with "</S>".

    input: sentence as a list of tokens; optionally `tagger`, a callable that
           maps the token list to a list of (token, tag) pairs. When omitted,
           the notebook's `pos_tag` is used.
    output: list of dictionaries. Each dict represents the features for the
            word at that position.
    """
    if tagger is None:
        tagger = pos_tag
    # Only the tag (second item of each pair) is used as a feature.
    tags = [pair[1] for pair in tagger(sentence)]

    feats = []
    for i, word in enumerate(sentence):
        feats.append({
            # Word features: word, previous 2 words, next 2 words.
            'word': word,
            'prevWord': _context(sentence, i - 1),
            # For the second token this slot lies before the sentence start,
            # so it gets the start marker "<S>" (not the end marker).
            'prevSecondWord': _context(sentence, i - 2),
            'nextWord': _context(sentence, i + 1),
            'nextNextWord': _context(sentence, i + 2),
            # POS tag features: current tag, previous 2 tags, next 2 tags.
            'tag': tags[i],
            'prevTag': _context(tags, i - 1),
            'prevSecondTag': _context(tags, i - 2),
            'nextTag': _context(tags, i + 1),
            'nextNextTag': _context(tags, i + 2),
        })
    return feats

#Extract features from the conll data, after loading it.
def get_feats_conll(conll_data):
    """
    Build per-sentence feature lists and label lists from loaded conll data.

    input: iterable of items whose element 0 is the token list and whose
           element 1 is the tag list for one sentence.
    output: (feats, labels) - feats holds sent2feats(tokens) for every
            sentence, labels holds the matching tag lists, in input order.
    """
    # Single pass over the data: featurise the tokens, keep the tags next to them.
    paired = [(sent2feats(item[0]), item[1]) for item in conll_data]
    feats = [sentence_feats for sentence_feats, _ in paired]
    labels = [sentence_tags for _, sentence_tags in paired]
    return feats, labels

In [None]:
#Train a sequence model
def train_seq(X_train,Y_train,X_dev,Y_dev):
    """
    Fit a CRF on the training data and print its scores on the dev data.

    input: X_train / X_dev - per-sentence lists of feature dicts;
           Y_train / Y_dev - per-sentence lists of tags.
    output: nothing is returned; the weighted flat F1 score, the flat
            classification report and the confusion matrix are printed.
    """
    model = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)
    #Fit on the training data only
    model.fit(X_train, Y_train)
    tag_names = list(model.classes_)

    #Evaluate on the held-out data
    predictions = model.predict(X_dev)
    #Order by the part after the first character, then by that first character
    ordered_tags = sorted(tag_names, key=lambda tag: (tag[1:], tag[0]))

    weighted_f1 = metrics.flat_f1_score(Y_dev, predictions, average='weighted', labels=tag_names)
    print('flat_f1_score', weighted_f1)

    report = metrics.flat_classification_report(Y_dev, predictions, labels=ordered_tags, digits=3)
    print('flat_classification_report \n', report)

    print('get_confusion_matrix called')
    get_confusion_matrix(Y_dev, predictions, labels=ordered_tags)
   

def print_cm(cm, labels):
    """Pretty-print a confusion matrix, with the row total after each row.

    cm: 2-D matrix indexable as cm[i, j], with rows and columns in the order
        given by `labels`.
    labels: list of label strings, used as column headers and row names.

    Nothing is returned; the table is written to stdout. Each row total is
    the sum of the cell values as printed (i.e. after rounding to 0 decimals).
    """
    print("\n")
    width = max([len(label) for label in labels] + [5])  # 5 is value length
    # Print header (left padded so it lines up with the row-name column)
    print("    " + " " * width, end=" ")
    for label in labels:
        print("%*s" % (width, label), end=" ")
    print()
    # Print rows
    for i, row_label in enumerate(labels):
        print("    " + "%*s" % (width, row_label), end=" ")
        row_total = 0  # named so the builtin sum() is not shadowed
        for j in range(len(labels)):
            cell = "%*.0f" % (width, cm[i, j])
            row_total += int(cell)
            print(cell, end=" ")
        print(row_total)  # Prints the total number of instances per cat at the end.


#python-crfsuite does not have a confusion matrix function,
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    """
    Print a token-level confusion matrix for sequence predictions.

    input: y_true / y_pred - lists of per-sentence tag sequences;
           labels - label order used for the matrix rows and columns.
    output: nothing is returned; the matrix is printed through print_cm.
    """
    #Flatten the per-sentence sequences into two flat token-level lists.
    trues, preds = [], []
    for true_seq, pred_seq in zip(y_true, y_pred):
        trues.extend(true_seq)
        preds.extend(pred_seq)
    #labels must be passed by keyword: it is keyword-only in current scikit-learn.
    print_cm(confusion_matrix(trues, preds, labels=labels), labels)

In [None]:
# Preview the features of the first training sentence only. This does not rely
# on a variable assigned in a later cell, and it avoids printing every
# feature dict in the dataset.
get_feats_conll(train[:1])[0]

[[{'word': 'EU',
   'prevWord': '<S>',
   'prevSecondWord': '<S>',
   'nextWord': 'rejects',
   'nextNextWord': 'German',
   'tag': 'NNP',
   'prevTag': '<S>',
   'prevSecondTag': '<S>',
   'nextTag': 'VBZ',
   'nextNextTag': 'JJ'},
  {'word': 'rejects',
   'prevWord': 'EU',
   'prevSecondWord': '</S>',
   'nextWord': 'German',
   'nextNextWord': 'call',
   'tag': 'VBZ',
   'prevTag': 'NNP',
   'prevSecondTag': '</S>',
   'nextTag': 'JJ',
   'nextNextTag': 'NN'},
  {'word': 'German',
   'prevWord': 'rejects',
   'prevSecondWord': 'EU',
   'nextWord': 'call',
   'nextNextWord': 'to',
   'tag': 'JJ',
   'prevTag': 'VBZ',
   'prevSecondTag': 'NNP',
   'nextTag': 'NN',
   'nextNextTag': 'TO'},
  {'word': 'call',
   'prevWord': 'German',
   'prevSecondWord': 'rejects',
   'nextWord': 'to',
   'nextNextWord': 'boycott',
   'tag': 'NN',
   'prevTag': 'JJ',
   'prevSecondTag': 'VBZ',
   'nextTag': 'TO',
   'nextNextTag': 'VB'},
  {'word': 'to',
   'prevWord': 'call',
   'prevSecondWord': 'Germ

In [None]:
# Build per-sentence feature dicts and tag lists for both splits.
feats, labels = get_feats_conll(train)
devfeats, devlabels = get_feats_conll(dev)

In [None]:
# Fit the CRF on the training features and print the metrics for the held-out set.
train_seq(feats, labels, devfeats, devlabels)

flat_f1_score 0.9255163144785534
flat_classification_report 
               precision    recall  f1-score   support

           O      0.973     0.981     0.977     38289
       B-LOC      0.694     0.765     0.728      1667
       I-LOC      0.738     0.482     0.584       257
      B-MISC      0.650     0.310     0.419       701
      I-MISC      0.624     0.505     0.558       214
       B-ORG      0.670     0.561     0.610      1660
       I-ORG      0.551     0.704     0.618       834
       B-PER      0.773     0.766     0.769      1616
       I-PER      0.819     0.886     0.851      1156

    accuracy                          0.928     46394
   macro avg      0.721     0.662     0.679     46394
weighted avg      0.926     0.928     0.926     46394

get_confusion_matrix called


                O  B-LOC  I-LOC B-MISC I-MISC  B-ORG  I-ORG  B-PER  I-PER 
         O  37545    118      3     22     32    193    224     88     64 38289
     B-LOC    143   1275      1     36      1   