In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

#most common entity per word
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

#model building - tree classification
from sklearn.ensemble import RandomForestClassifier

#labeling
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline #Sequentially apply a list of transforms

# Alternative: read a local copy of the file instead of downloading it.
#data = pd.read_csv("ner_dataset.csv", encoding="latin1")

# Download the token-level NER dataset from GitHub into a DataFrame.
url = "https://raw.githubusercontent.com/deanhoperobertson/Masters-/master/Thesis/ner_dataset.csv"
data = pd.read_csv(url)

In [21]:
# Forward-fill missing cells with the value from the row above
# (presumably "Sentence #" is only set on the first token of a sentence - confirm).
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data = data.ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [0]:
# Vocabulary: every distinct token in the corpus, followed by a padding marker.
words = [*set(data["Word"].values), "ENDPAD"]

In [24]:
# vocabulary size (distinct tokens plus the "ENDPAD" marker)
n_words = len(words)
n_words

35179

So we have 47959 sentences containing 35178 different words with 17 different tags. 

In [0]:
#define a class to retrieve sentences and labels

class SentenceGetter(object):
    """Step through the token table one sentence at a time.

    Sentences are looked up through the "Sentence #" column, whose values are
    matched against labels of the form "Sentence: 1", "Sentence: 2", ...
    """

    def __init__(self, data):
        self.n_sent = 1       # number of the next sentence to return (1-based)
        self.data = data
        self.empty = False    # becomes True once a lookup finds no rows

    def get_next(self):
        """Return ``(words, pos_tags, tags)`` for the next sentence.

        Each element is a plain list taken from the "Word", "POS" and "Tag"
        columns. When no row carries the current sentence label, ``self.empty``
        is set and ``(None, None, None)`` is returned.

        Raises:
            KeyError: if one of the expected columns is missing from the data.
        """
        label = "Sentence: {}".format(self.n_sent)
        s = self.data[self.data["Sentence #"] == label]
        # Filtering on an unknown label yields an empty frame instead of
        # raising, so exhaustion has to be detected explicitly.
        if s.empty:
            self.empty = True
            return None, None, None
        self.n_sent += 1
        return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()
        


In [0]:
# Sentence iterator over the full token table.
getter = SentenceGetter(data)

In [0]:
# Fetch the first sentence: its words, POS tags and entity tags.
sent, pos, tag = getter.get_next()

In [28]:
# Show the three parallel sequences of the sentence, one per line.
for sequence in (sent, pos, tag):
    print(sequence)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


# Base-line Model 

- most common entity tag per word

In [0]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorises the most frequent tag of every word."""

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.

        Stores the distinct tags in first-seen order in ``self.tags`` and the
        most frequent tag of each word in ``self.memory`` (on a tie, the tag
        seen first for that word wins). Returns ``self`` so the call can be
        chained, as scikit-learn estimators are expected to.
        '''
        counts = {}   # word -> {tag: number of occurrences}
        seen = {}     # dict used as an insertion-ordered set of tags
        for x, t in zip(X, y):
            seen.setdefault(t, None)
            per_word = counts.setdefault(x, {})
            per_word[t] = per_word.get(t, 0) + 1
        self.tags = list(seen)
        self.memory = {word: max(d, key=d.get) for word, d in counts.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [0]:
# Fit the memory baseline on the single example sentence.
tagger = MemoryTagger()
tagger.fit(sent, tag);

In [31]:
# Predict on the same sentence the tagger was fitted on.
print(tagger.predict(sent))

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [32]:
# Distinct tags seen during fit, in first-seen order.
tagger.tags

['O', 'B-geo', 'B-gpe']

In [0]:
#create parallel lists with all the words and their tags
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

In [0]:
# 5-fold cross-validated predictions of the memory baseline.
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)

In [36]:
# Per-tag precision / recall / F1 of the memory baseline.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

       B-art       0.28      0.07      0.11       402
       B-eve       0.52      0.25      0.33       308
       B-geo       0.78      0.84      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.41      0.28      0.33       201
       B-org       0.65      0.49      0.56     20143
       B-per       0.77      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.02       297
       I-eve       0.35      0.12      0.18       253
       I-geo       0.72      0.59      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.74      0.64      0.69     17251
       I-tim       0.56      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

   micro avg       0.95   

## **Simple Machine Learning (Classification Trees)**

In [0]:
#feature engineering

def feature_map(word):
    '''Build the hand-crafted feature vector for a single token.

    The vector holds, in order: title-case flag, lower-case flag, upper-case
    flag, character count, all-digits flag and all-letters flag.
    '''
    casing = [word.istitle(), word.islower(), word.isupper()]
    length = [len(word)]
    content = [word.isdigit(), word.isalpha()]
    return np.array(casing + length + content)

In [0]:
#build one feature vector per token (this rebinds `words`, which held the raw tokens before)
words = list(map(feature_map, data["Word"].values.tolist()))


In [0]:
# 5-fold cross-validated predictions of a random forest on the word-shape
# features. The seed is fixed so that re-running the cell reproduces the result.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=42),
                         X=words, y=tags, cv=5)

In [47]:
# Per-tag precision / recall / F1 of the random forest on word-shape features only.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.80      0.40     37644
       B-gpe       0.25      0.03      0.05     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

   micro avg       0.87   

This is a terrible score and indicates that this model has underperformed - this is expected, seeing as the random forest classification algorithm does not have much information to use.

In [0]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn rows of the token table into numeric feature vectors.

    Every token is described by six surface features of the word, the encoded
    memory-tagger prediction and encoded POS tag of the token itself, and the
    same two codes for the following and the preceding token.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and both label encoders.

        X must provide the columns "Word", "POS" and "Tag". The tags are read
        from X["Tag"]; ``y`` is accepted for pipeline compatibility and is
        not used.
        """
        words = X["Word"].values.tolist()
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self

    def transform(self, X, y=None):
        """Return a list with one 12-element feature array per row of X.

        Layout of each array: istitle, islower, isupper, len, isdigit, isalpha,
        memory-tag code, POS code, next memory-tag code, previous memory-tag
        code, next POS code, previous POS code. A POS tag not seen during fit
        is encoded as -1. At the start and end of X, and directly after a "."
        token, the missing neighbour is encoded as tag 'O' with POS ".".
        """
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        n = len(words)
        if n == 0:
            return []

        # Encode every known label once. Calling LabelEncoder.transform per
        # token and scanning the raw training POS list for membership made the
        # transform cost grow with (tokens x training rows).
        tag_classes = self.tag_encoder.classes_
        tag_code = dict(zip(tag_classes, self.tag_encoder.transform(tag_classes)))
        pos_classes = self.pos_encoder.classes_
        pos_code = dict(zip(pos_classes, self.pos_encoder.transform(pos_classes)))

        def encode_tag(t):
            if t in tag_code:
                return tag_code[t]
            # Unseen label: defer to the encoder so it raises its usual error.
            return self.tag_encoder.transform([t])[0]

        def pos_default(p):
            return pos_code.get(p, -1)

        # One batched prediction instead of three single-word calls per token.
        mem_codes = [encode_tag(t) for t in self.memory_tagger.predict(words)]
        pos_codes = [pos_default(p) for p in pos]
        o_code = encode_tag('O')
        dot_code = pos_default(".")

        out = []
        last = n - 1
        for i, w in enumerate(words):
            if i < last:
                wp, posp = mem_codes[i + 1], pos_codes[i + 1]
            else:
                wp, posp = o_code, dot_code
            if i > 0 and words[i - 1] != ".":
                wm, posm = mem_codes[i - 1], pos_codes[i - 1]
            else:
                wm, posm = o_code, dot_code
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 mem_codes[i], pos_codes[i], wp, wm, posp, posm]))
        return out

In [0]:
# 5-fold cross-validated predictions of the feature transformer + random forest
# pipeline. The seed is fixed so that re-running the cell reproduces the result.
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()), 
                                   ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3,
                                                                  random_state=42))]),
                         X=data, y=tags, cv=5)

In [52]:
# Per-tag precision / recall / F1 of the context-feature pipeline.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

       B-art       0.19      0.09      0.12       402
       B-eve       0.38      0.27      0.32       308
       B-geo       0.83      0.86      0.84     37644
       B-gpe       0.98      0.93      0.95     15870
       B-nat       0.21      0.22      0.22       201
       B-org       0.73      0.64      0.68     20143
       B-per       0.82      0.75      0.78     16990
       B-tim       0.89      0.80      0.84     20333
       I-art       0.04      0.02      0.03       297
       I-eve       0.30      0.15      0.20       253
       I-geo       0.76      0.67      0.71      7414
       I-gpe       0.74      0.45      0.56       198
       I-nat       0.44      0.22      0.29        51
       I-org       0.73      0.67      0.70     16784
       I-per       0.85      0.75      0.80     17251
       I-tim       0.81      0.53      0.64      6528
           O       0.98      0.99      0.99    887908

   micro avg       0.96   