## BaseLine Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, get_sentence, is_number, extract_words, partial_tags


#most common entity per word
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report,f1_score

In [3]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [4]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])           

In [5]:
tags = partial_tags(tags)

### Manual Tockenization

In [6]:
words=list(words)
n_words = len(words)
n_tags = len(set(tags))

labels=list(set(tags))
labels.remove('O')

print("Number of words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of words in the dataset:  250056
Number of tags in the dataset:  5


In [7]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 218128


In [8]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels LOC (location) is identified by the index: {}".format(tag2idx["LOC"]))

The labels LOC (location) is identified by the index: 250026


In [9]:
def tokenize(word_list):
    new_list= []
    for word in word_list:
        if is_number(word2idx[word]):
            new_list.append(word2idx[word])
    else:
        None
    return(new_list)

In [10]:
#training set
train_words, train_tags = extract_words(train)
train_tags=partial_tags(train_tags)

#tokenize words into tokens
tr_words = tokenize(train_words)

### Train baseline model

In [11]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)
    
    def predict(self, X, y=None):
        '''
        Predict the the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [12]:
#fit model to NER training set 
tagger = MemoryTagger()
tagger.fit(tr_words, train_tags)

### Predict on Training Data

In [13]:
#predict the first sentence using the base model
print(get_sentence(train,1))
print(tagger.predict(tokenize(get_sentence(train,1))))

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']


In [14]:
#run model to get train prediction
pred = list(cross_val_predict(estimator=MemoryTagger(), X=tr_words, y=train_tags, cv=5))

In [18]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=train_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=train_tags,average='weighted',labels=labels)
print(report)
print("F1 Score:",f1)

              precision    recall  f1-score   support

        MISC       0.87      0.72      0.78      4593
         LOC       0.85      0.77      0.81      8297
         ORG       0.82      0.55      0.66     10025
         PER       0.95      0.63      0.76     11128

   micro avg       0.87      0.65      0.75     34043
   macro avg       0.87      0.67      0.75     34043
weighted avg       0.88      0.65      0.74     34043

F1 Score: 0.7445045871496001


### Predict on Test Data

In [19]:
#tokenize the test data set
test_words, test_tags = extract_words(test)
test_tags=partial_tags(test_tags)

#tokenize words into tokens
te_words = tokenize(test_words)

In [20]:
#run model to get train prediction
pred = cross_val_predict(estimator=MemoryTagger(), X=te_words, y=test_tags, cv=5)

In [21]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=test_tags,labels=labels)
f1= f1_score(y_pred=pred, y_true=test_tags,average='micro',labels=labels)
print(report)
print(f1)

              precision    recall  f1-score   support

        MISC       0.89      0.54      0.67       918
         LOC       0.84      0.64      0.72      1925
         ORG       0.77      0.38      0.51      2496
         PER       0.94      0.30      0.46      2773

   micro avg       0.84      0.43      0.57      8112
   macro avg       0.86      0.47      0.59      8112
weighted avg       0.86      0.43      0.56      8112

0.5725688222837596
