# Named Entity Recognition using CRF model
In Natural Language Processing (NLP) an Entity Recognition is one of the common problem. The entity is referred to as the part of the text that is interested in. In NLP, NER is a method of extracting the relevant information from a large corpus and classifying those entities into predefined categories such as location, organization, name and so on. 
Information about lables: 
* geo = Geographical Entity
* org = Organization
* per = Person
* gpe = Geopolitical Entity
* tim = Time indicator
* art = Artifact
* eve = Event
* nat = Natural Phenomenon

        1. Total Words Count = 1354149 
        2. Target Data Column: Tag

#### Importing Libraries

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report



In [2]:
#Reading the csv file
df = pd.read_csv("Annotated/GMB_dataset.txt", sep="\t", encoding="latin1")
del df['Unnamed: 0']

In [3]:
#Display first 10 rows
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,1.0,Thousands,NNS,O
1,1.0,of,IN,O
2,1.0,demonstrators,NNS,O
3,1.0,have,VBP,O
4,1.0,marched,VBN,O
5,1.0,through,IN,O
6,1.0,London,NNP,B-geo
7,1.0,to,TO,O
8,1.0,protest,VB,O
9,1.0,the,DT,O


In [4]:
df[(df.Tag == 'B-tim') | (df.Tag == 'I-tim')]

Unnamed: 0,Sentence #,Word,POS,Tag
167,8.0,Wednesday,NNP,B-tim
211,10.0,Wednesday,NNP,B-tim
274,13.0,Tuesday,NNP,B-tim
341,15.0,Wednesday,NNP,B-tim
493,21.0,Wednesday,NNP,B-tim
...,...,...,...,...
66071,2996.0,13,CD,I-tim
66072,2996.0,",",",",I-tim
66073,2996.0,2005,CD,I-tim
66124,2997.0,1982,CD,B-tim


In [5]:
df.describe()

Unnamed: 0,Sentence #
count,66161.0
mean,1494.085594
std,864.286211
min,1.0
25%,746.0
50%,1484.0
75%,2255.0
max,2999.0


#### Observations : 
* There are total 47959 sentences in the dataset.
* Number unique words in the dataset are 35178.
* Total 17 lables (Tags).

In [6]:
#Displaying the unique Tags
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [7]:
#Checking null values, if any.
df.isnull().sum()

Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

There are lots of missing values in 'Sentence #' attribute. So we will use pandas fillna technique and use 'ffill' method which propagates last valid observation forward to next.

In [8]:
df = df.fillna(method = 'ffill')

In [9]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [10]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [11]:
sentences[1]

'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "'

In [12]:
#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

None


Getting all the sentences in the dataset.

In [13]:
all_sentences = getter.sentences

#### Feature Preparation
These are the default features used by the NER in nltk. We can also modify it for our customization.

In [14]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [15]:
X = [sent2features(s) for s in all_sentences]
y = [sent2labels(s) for s in all_sentences]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [17]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = True,
         keep_tempfiles=None)

In [18]:
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
y_pred = crf.predict(X_test)

In [None]:
X_test[0]

In [None]:
predicted_tags = crf.predict(X_test)

# Get marginal probabilities
marginals = crf.predict_marginals(X_test)

for i, seq in enumerate(X_test):
    for j, token in enumerate(seq):
        word = token['word.lower()']
        tag = predicted_tags[i][j]
        confidence = marginals[i][j][tag]
        print(f"{word}, {tag}, {confidence:.2f}")


In [21]:
# #Predicting on the test set.
# y_pred = crf.predict(X_test)

#### Evaluating the model performance.
We will use precision, recall and f1-score metrics to evaluate the performance of the model since the accuracy is not a good metric for this dataset because we have an unequal number of data points in each class.

In [22]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-org',
 'I-org',
 'B-gpe',
 'I-gpe',
 'B-tim',
 'B-geo',
 'I-geo',
 'B-per',
 'I-per',
 'I-tim',
 'B-nat',
 'I-nat',
 'B-eve',
 'I-eve',
 'B-art',
 'I-art']

In [23]:
from __future__ import absolute_import, division
from functools import wraps

from sklearn_crfsuite.utils import flatten

y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average='weighted')
print(f1_score)

0.9568286221325394


In [24]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
sorted_labels

['B-art',
 'I-art',
 'B-eve',
 'I-eve',
 'B-geo',
 'I-geo',
 'B-gpe',
 'I-gpe',
 'B-nat',
 'I-nat',
 'B-org',
 'I-org',
 'B-per',
 'I-per',
 'B-tim',
 'I-tim']

In [25]:
from sklearn.metrics import classification_report

y_test_flat = [tag for seq in y_test for tag in seq]
y_pred_flat = [tag for seq in y_pred for tag in seq]

print(classification_report(y_test_flat, y_pred_flat))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        15
       B-eve       0.80      0.67      0.73         6
       B-geo       0.75      0.87      0.81       398
       B-gpe       0.90      0.78      0.84       257
       B-nat       1.00      0.20      0.33         5
       B-org       0.71      0.65      0.68       244
       B-per       0.81      0.80      0.80       226
       B-tim       0.90      0.82      0.86       248
       I-art       0.00      0.00      0.00        12
       I-eve       0.75      0.75      0.75         4
       I-geo       0.74      0.70      0.72        81
       I-gpe       0.00      0.00      0.00         9
       I-nat       1.00      0.25      0.40         4
       I-org       0.67      0.71      0.69       176
       I-per       0.81      0.90      0.85       256
       I-tim       0.79      0.40      0.54        84
           O       0.99      0.99      0.99     11409

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
from seqeval.metrics import precision_score, recall_score, f1_score

print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Precision: 0.7811348563006633
Recall: 0.7576840600428878
F1-Score: 0.7692307692307693


In [27]:
# from matplotlib import pyplot as plt
# import random
# import matplotlib
# fig, ax = plt.subplots(figsize=(30,10))
# font = {'family' : 'normal',
#         'size'   : 16}
# matplotlib.rc('font', **font)
# final_text = []
# color = []
# samples = 10
# integer = random.randint(0,500)
# prediction = crf.predict(X_test[integer:integer+samples])
# for x,y in zip(sentences[integer:integer+samples], prediction):
#     for x1,y1, in zip(x,y):
#         if y1!='O':
#             final_text.append('<{}>'.format(x1[0]))
#             if y1[0]=='I':
#                 color.append(color[-1])
#             else:
#                   color.append ({'color':random.choice(['blue','green','red','magenta'])})
#         else:
#             final_text.append(x1[0])
#     final_text.append('\n')
# # You can either create a HighlightText object
# HighlightText(x=0, y=0.75,
#               s=' '.join(final_text),
#               highlight_textprops=color,
#               ax=ax)
# plt.axis('off')

This looks quite nice.