## BaseLine Model

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Custom models
from prepro import readfile, readstring, get_sentence, is_number, extract_words


#most common entity per word
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [116]:
#import data from my github repo
train = readfile("train.txt")
corpus = train.copy()
test = readfile("test.txt")

#create corpus
corpus.extend(test)

In [117]:
words = []
tags = []
for sentence in corpus:
    for word in sentence:
        words.append(word[0])
        tags.append(word[1])           

### Manual Tockenization

In [118]:
words=list(words)
n_words = len(set(words))
n_tags = len(set(tags))

print("Number of words in the dataset: ", n_words)
print("Number of tags in the dataset: ", n_tags)

Number of words in the dataset:  27316
Number of tags in the dataset:  9


In [119]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding
# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}
print("The word 'rejects' is identified by the index: {}".format(word2idx["rejects"]))

The word 'rejects' is identified by the index: 218128


In [120]:
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
print("The labels B-LOC (location) is identified by the index: {}".format(tag2idx["B-LOC"]))

The labels B-LOC (location) is identified by the index: 250026


In [121]:
def tokenize(word_list):
    new_list= []
    for word in word_list:
        if is_number(word2idx[word]):
            new_list.append(word2idx[word])
    else:
        None
    return(new_list)

In [122]:
#training set
train_words, train_tags = extract_words(train)

#tokenize words into tokens
tr_words = tokenize(train_words)

### Train baseline model

In [123]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)
    
    def predict(self, X, y=None):
        '''
        Predict the the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [124]:
#fit model to NER training set 
tagger = MemoryTagger()
tagger.fit(tr_words, train_tags)

### Predict on Training Data

In [125]:
#predict the first sentence using the base model
print(get_sentence(train,1))
print(tagger.predict(tokenize(get_sentence(train,1))))

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [126]:
#run model to get train prediction
pred = cross_val_predict(estimator=MemoryTagger(), X=tr_words, y=train_tags, cv=5)

In [127]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=train_tags)
print(report)

             precision    recall  f1-score   support

      B-LOC       0.85      0.79      0.82      7140
     B-MISC       0.85      0.73      0.79      3438
      B-ORG       0.79      0.51      0.62      6321
      B-PER       0.87      0.58      0.70      6600
      I-LOC       0.71      0.59      0.65      1157
     I-MISC       0.69      0.52      0.59      1155
      I-ORG       0.72      0.48      0.58      3704
      I-PER       0.72      0.46      0.56      4528
          O       0.95      1.00      0.97    169578

avg / total       0.92      0.93      0.92    203621



### Predict on Test Data

In [128]:
#tokenize the test data set
test_words, test_tags = extract_words(test)

#tokenize words into tokens
te_words = tokenize(test_words)

In [129]:
#run model to get train prediction
pred = cross_val_predict(estimator=MemoryTagger(), X=te_words, y=test_tags, cv=5)

In [130]:
#generate report on entire model
report = classification_report(y_pred=pred, y_true=test_tags)
print(report)

             precision    recall  f1-score   support

      B-LOC       0.83      0.66      0.74      1668
     B-MISC       0.87      0.53      0.66       702
      B-ORG       0.76      0.35      0.48      1661
      B-PER       0.84      0.28      0.42      1617
      I-LOC       0.65      0.44      0.53       257
     I-MISC       0.78      0.51      0.62       216
      I-ORG       0.62      0.33      0.43       835
      I-PER       0.70      0.21      0.32      1156
          O       0.90      1.00      0.95     38323

avg / total       0.88      0.89      0.87     46435

