### NER Model Training
This notebook contains all the training runs and hyperparameter tuning used to train the NER model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import PassiveAggressiveClassifier
#from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle

In [2]:
PATH = '../GLG-Old Models, datasets/'
df = pd.read_csv(PATH + 'ner_dataset.csv', encoding = "ISO-8859-1")
#df = df[:500000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
df.shape

(1048575, 4)

In [4]:
df.fillna(method='ffill',inplace=True)

In [5]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [6]:
#X = df.drop('Tag',axis=1)
#v = DictVectorizer(sparse=False)
#X = v.fit_transform(X.to_dict('records'))
#y = df['Tag']

classes = list(np.unique(df['Tag']))

#X_train, X_text, y_train, y_test = train_test_split(X, y, test_size=0.15,random_state=4)

In [7]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

In [9]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [10]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=4)

In [11]:
## X_p = list(np.array([sent2features(s) for s in sentences]).flatten())
## y_p = list(np.array([sent2labels(s) for s in sentences]).flatten())
## X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_p, y_p, test_size=0.1, random_state=4,stratify=y)

In [12]:
# 08:54
crf = sklearn_crfsuite.CRF(
#    algorithm='lbfgs',
    algorithm='l2sgd',
#    c1=0.1,
#    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='l2sgd', all_possible_transitions=True, keep_tempfiles=None,
    max_iterations=100)

In [12]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [19]:
pd.DataFrame(y_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,O,B-geo,I-geo,O,O,O,O,O,O,O,...,,,,,,,,,,
1,O,O,B-gpe,O,O,O,O,O,O,O,...,,,,,,,,,,
2,O,O,O,O,O,B-gpe,O,O,O,O,...,,,,,,,,,,
3,O,O,O,O,O,O,O,O,B-geo,O,...,,,,,,,,,,
4,O,O,O,O,O,B-geo,O,O,O,O,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23974,O,B-org,I-org,O,O,B-org,O,O,O,B-per,...,,,,,,,,,,
23975,O,O,O,B-per,I-per,O,O,B-geo,O,O,...,,,,,,,,,,
23976,O,O,O,O,O,O,O,O,B-tim,O,...,,,,,,,,,,
23977,B-gpe,O,O,B-per,I-per,I-per,I-per,O,O,O,...,,,,,,,,,,


In [12]:
# Test size = 0.33
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       B-art       0.35      0.13      0.19       141
       B-eve       0.61      0.42      0.50        98
       B-geo       0.85      0.90      0.88     12313
       B-gpe       0.97      0.94      0.96      5345
       B-nat       0.51      0.43      0.47        53
       B-org       0.80      0.74      0.77      6596
       B-per       0.85      0.82      0.84      5642
       B-tim       0.92      0.88      0.90      6638
       I-art       0.09      0.03      0.04       106
       I-eve       0.51      0.31      0.38        85
       I-geo       0.80      0.79      0.79      2366
       I-gpe       0.87      0.57      0.69        72
       I-nat       0.64      0.37      0.47        19
       I-org       0.81      0.80      0.80      5541
       I-per       0.85      0.90      0.87      5741
       I-tim       0.84      0.77      0.80      2150
           O       0.99      0.99      0.99    293212

    accuracy              

In [15]:
# Test size = 0.33
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))



              precision    recall  f1-score   support

       B-art       0.35      0.13      0.19       141
       B-eve       0.61      0.42      0.50        98
       B-geo       0.85      0.90      0.88     12313
       B-gpe       0.97      0.94      0.96      5345
       B-nat       0.51      0.43      0.47        53
       B-org       0.80      0.74      0.77      6596
       B-per       0.85      0.82      0.84      5642
       B-tim       0.92      0.88      0.90      6638
       I-art       0.09      0.03      0.04       106
       I-eve       0.51      0.31      0.38        85
       I-geo       0.80      0.79      0.79      2366
       I-gpe       0.87      0.57      0.69        72
       I-nat       0.64      0.37      0.47        19
       I-org       0.81      0.80      0.80      5541
       I-per       0.85      0.90      0.87      5741
       I-tim       0.84      0.77      0.80      2150

   micro avg       0.86      0.85      0.85     52906
   macro avg       0.70   

In [13]:
#Default, test_size = 0.5
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       B-art       0.37      0.11      0.17       206
       B-eve       0.49      0.39      0.43       145
       B-geo       0.86      0.90      0.88     18698
       B-gpe       0.97      0.94      0.95      8066
       B-nat       0.52      0.49      0.51        85
       B-org       0.79      0.73      0.76     10053
       B-per       0.84      0.82      0.83      8531
       B-tim       0.92      0.87      0.90     10083
       I-art       0.16      0.04      0.06       157
       I-eve       0.35      0.26      0.30       121
       I-geo       0.80      0.80      0.80      3603
       I-gpe       0.83      0.50      0.63       103
       I-nat       0.55      0.48      0.51        25
       I-org       0.80      0.78      0.79      8458
       I-per       0.84      0.91      0.87      8655
       I-tim       0.84      0.75      0.79      3256
           O       0.99      0.99      0.99    444333

    accuracy              

In [16]:
filename = 'crf_ner_model.sav'
pickle.dump(crf, open(filename, 'wb'))

In [27]:
print(metrics.flat_classification_report(y_train, y_train))

              precision    recall  f1-score   support

       B-art       1.00      1.00      1.00       196
       B-eve       1.00      1.00      1.00       163
       B-geo       1.00      1.00      1.00     18946
       B-gpe       1.00      1.00      1.00      7804
       B-nat       1.00      1.00      1.00       116
       B-org       1.00      1.00      1.00     10090
       B-per       1.00      1.00      1.00      8459
       B-tim       1.00      1.00      1.00     10250
       I-art       1.00      1.00      1.00       140
       I-eve       1.00      1.00      1.00       132
       I-geo       1.00      1.00      1.00      3811
       I-gpe       1.00      1.00      1.00        95
       I-nat       1.00      1.00      1.00        26
       I-org       1.00      1.00      1.00      8326
       I-per       1.00      1.00      1.00      8596
       I-tim       1.00      1.00      1.00      3272
           O       1.00      1.00      1.00    443575

    accuracy              

In [13]:
#SGD, test_size = 0.5
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       B-art       0.50      0.02      0.04       206
       B-eve       0.62      0.36      0.45       145
       B-geo       0.86      0.89      0.88     18698
       B-gpe       0.96      0.93      0.95      8066
       B-nat       0.66      0.32      0.43        85
       B-org       0.78      0.74      0.76     10053
       B-per       0.84      0.81      0.82      8531
       B-tim       0.93      0.85      0.89     10083
       I-art       0.67      0.01      0.03       157
       I-eve       0.50      0.21      0.30       121
       I-geo       0.84      0.76      0.79      3603
       I-gpe       0.92      0.47      0.62       103
       I-nat       0.75      0.24      0.36        25
       I-org       0.78      0.80      0.79      8458
       I-per       0.83      0.92      0.87      8655
       I-tim       0.84      0.74      0.79      3256
           O       0.99      0.99      0.99    444333

    accuracy              

In [65]:
#SGD
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-art       0.50      0.06      0.11        49
       B-eve       0.50      0.35      0.41        23
       B-geo       0.85      0.92      0.88      3775
       B-gpe       0.97      0.94      0.96      1673
       B-nat       0.62      0.29      0.40        17
       B-org       0.82      0.73      0.77      1961
       B-per       0.84      0.82      0.83      1682
       B-tim       0.93      0.88      0.91      1978
       I-art       0.40      0.11      0.17        37
       I-eve       0.42      0.22      0.29        23
       I-geo       0.80      0.79      0.80       706
       I-gpe       0.95      0.55      0.69        33
       I-nat       0.67      0.40      0.50         5
       I-org       0.83      0.80      0.81      1641
       I-per       0.84      0.93      0.88      1710
       I-tim       0.84      0.78      0.81       622
           O       0.99      0.99      0.99     89109

    accuracy              

In [66]:
# SGD
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))

              precision    recall  f1-score   support

       B-art       0.50      0.06      0.11        49
       B-eve       0.50      0.35      0.41        23
       B-geo       0.85      0.92      0.88      3775
       B-gpe       0.97      0.94      0.96      1673
       B-nat       0.62      0.29      0.40        17
       B-org       0.82      0.73      0.77      1961
       B-per       0.84      0.82      0.83      1682
       B-tim       0.93      0.88      0.91      1978
       I-art       0.40      0.11      0.17        37
       I-eve       0.42      0.22      0.29        23
       I-geo       0.80      0.79      0.80       706
       I-gpe       0.95      0.55      0.69        33
       I-nat       0.67      0.40      0.50         5
       I-org       0.83      0.80      0.81      1641
       I-per       0.84      0.93      0.88      1710
       I-tim       0.84      0.78      0.81       622

   micro avg       0.86      0.85      0.86     15935
   macro avg       0.74   

In [61]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))

              precision    recall  f1-score   support

       B-art       0.40      0.12      0.19        49
       B-eve       0.60      0.39      0.47        23
       B-geo       0.86      0.91      0.88      3775
       B-gpe       0.97      0.95      0.96      1673
       B-nat       0.64      0.41      0.50        17
       B-org       0.81      0.74      0.78      1961
       B-per       0.86      0.84      0.85      1682
       B-tim       0.93      0.88      0.91      1978
       I-art       0.50      0.11      0.18        37
       I-eve       0.56      0.22      0.31        23
       I-geo       0.79      0.81      0.80       706
       I-gpe       0.90      0.55      0.68        33
       I-nat       0.40      0.40      0.40         5
       I-org       0.82      0.79      0.81      1641
       I-per       0.86      0.91      0.88      1710
       I-tim       0.84      0.78      0.81       622

   micro avg       0.87      0.85      0.86     15935
   macro avg       0.73   

In [60]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-art       0.40      0.12      0.19        49
       B-eve       0.60      0.39      0.47        23
       B-geo       0.86      0.91      0.88      3775
       B-gpe       0.97      0.95      0.96      1673
       B-nat       0.64      0.41      0.50        17
       B-org       0.81      0.74      0.78      1961
       B-per       0.86      0.84      0.85      1682
       B-tim       0.93      0.88      0.91      1978
       I-art       0.50      0.11      0.18        37
       I-eve       0.56      0.22      0.31        23
       I-geo       0.79      0.81      0.80       706
       I-gpe       0.90      0.55      0.68        33
       I-nat       0.40      0.40      0.40         5
       I-org       0.82      0.79      0.81      1641
       I-per       0.86      0.91      0.88      1710
       I-tim       0.84      0.78      0.81       622
           O       0.99      0.99      0.99     89109

    accuracy              

In [43]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       B-art       0.22      0.08      0.12        25
       B-eve       0.70      0.40      0.51        40
       B-geo       0.87      0.91      0.89      2716
       B-gpe       0.97      0.93      0.95      1180
       B-nat       0.58      0.50      0.54        14
       B-org       0.79      0.75      0.77      1338
       B-per       0.84      0.82      0.83      1203
       B-tim       0.92      0.87      0.89      1425
       I-art       0.00      0.00      0.00        20
       I-eve       0.57      0.43      0.49        30
       I-geo       0.84      0.76      0.80       548
       I-gpe       0.75      0.35      0.48        17
       I-nat       1.00      0.25      0.40         8
       I-org       0.79      0.77      0.78      1062
       I-per       0.84      0.88      0.86      1225
       I-tim       0.84      0.73      0.78       470
           O       0.99      0.99      0.99     63882

    accuracy              

In [29]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))#, labels = classes))

              precision    recall  f1-score   support

       B-art       0.50      0.33      0.40         6
       B-eve       0.60      0.38      0.46         8
       B-geo       0.72      0.78      0.75       225
       B-gpe       0.81      0.81      0.81       146
       B-nat       0.00      0.00      0.00         2
       B-org       0.63      0.56      0.59       149
       B-per       0.83      0.86      0.85       115
       B-tim       0.90      0.79      0.84       121
       I-art       0.00      0.00      0.00         1
       I-eve       0.50      0.33      0.40         6
       I-geo       0.65      0.47      0.55        55
       I-gpe       0.00      0.00      0.00         4
       I-org       0.79      0.60      0.68       147
       I-per       0.86      0.96      0.90       137
       I-tim       0.63      0.66      0.64        29
           O       0.99      0.99      0.99      6546

    accuracy                           0.95      7697
   macro avg       0.59   

### Predict on Some Sample Queries

In [13]:
import re

def prep_query(phrase):
    split_query = re.findall(r"[\w']+|[.,!?;]", phrase)
    
    pos_tags = pos_tag(split_query)
    
    df_query = pd.DataFrame({'Sentence #':['Sentence: 1'] * len(pos_tags),
                            'Word':[pair[0] for pair in pos_tags],
                            'POS':[pair[1] for pair in pos_tags],
                            'Tag':[None] * len(pos_tags)})
       
    return df_query

In [34]:
s = "Donald Trump is a former host on The Apprentice. He is an American businessman and former President."
s = 'hello how are you'
s = 'The Second World War started in 1914 and ended in 1918'
s = 'The Korean War started in 1939 and ended in 1945'
s = 'Iraq and Iran were once at war. Saddam Hussein was involved'
s = 'The World Cup is a quadrennial sporting event. FIFA is the governing body involved.'


x = prep_query(s)

In [35]:
getter_query = SentenceGetter(x)
sentences_query = getter_query.sentences

X_query = [sent2features(s) for s in sentences_query]
X_words = [s[0] for s in sentences_query[0]]

pred = crf.predict(X_query)

list(zip(pred[0],X_words))

[('O', 'The'),
 ('B-org', 'World'),
 ('I-org', 'Cup'),
 ('O', 'is'),
 ('O', 'a'),
 ('O', 'quadrennial'),
 ('O', 'sporting'),
 ('O', 'event'),
 ('O', '.'),
 ('B-org', 'FIFA'),
 ('O', 'is'),
 ('O', 'the'),
 ('O', 'governing'),
 ('O', 'body'),
 ('O', 'involved'),
 ('O', '.')]