# Window NER

Today we gonna work on simple window NER that no one uses... but it's a good starting point
Later on, once you learned LSTM, I will teach a better one for NER

# 1.Load data
Load the famous CoNll-2002 Shared Task

CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition <br>
https://www.clips.uantwerpen.be/conll2002/ner/

In [1]:
# !pip install nltk #or do it in your terminal

In [1]:
import nltk
nltk.__version__

'3.7'

In [2]:
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

nltk.download('conll2002')  #this will download the dataset, and put it somewhere in your pc

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Guntsv\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [3]:
corpus = nltk.corpus.conll2002.iob_sents()

In [4]:
data = []
for cor in corpus:
    #extract only the first and third guy
    #one list containing the first, and one list containing the third
    #so we can use it to train
    # ['Sao',"Paulo','(','Brasil',....], ['B-IOC','I-LOC','B-IOC','O',........]
    # print(cor) #B-LOC = beginning of a location, I-LOC = inside of a location entity, 0 means non-entity
    sent, _, tag = list(zip(*cor))
    data.append([sent,tag])

    # break

In [5]:
data[0]

[('Sao',
  'Paulo',
  '(',
  'Brasil',
  ')',
  ',',
  '23',
  'may',
  '(',
  'EFECOM',
  ')',
  '.'),
 ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]

In [6]:
len(data) #35k sentence

35651

## 2. Tokenization

If you look carefully, we don't need to!!! Yay.... things are already chopped

## 3. Numericalization

In [7]:
flatten = lambda l: [item for sublist in l for item in sublist]

#I want to get all unique vocabs
sents, tags = list(zip(*data))
vocab = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

#why we don't combine vocab and tagset into one single list

In [9]:
vocab[89:98]

['HEAT',
 'aarzelt',
 'filmcomponist',
 'estipulados',
 'versie',
 'accedieron',
 'verpatsen',
 'V-plan',
 'Financiewezen']

In [10]:
len(vocab)

65459

In [11]:
tagset

['B-LOC', 'I-ORG', 'B-PER', 'B-ORG', 'I-LOC', 'I-MISC', 'O', 'I-PER', 'B-MISC']

In [12]:
#create word2index library
word2index =  {'<UNK>': 0, '<DUMMY>': 1} #DUMMY facilitates me moving the windows
#loop each vocab
for v in vocab:
    #if that vocab does not exist yet in the word2index
    if word2index.get(v) is None:
        #the index of this vocab is basically the current len of word2index
        word2index[v] = len(word2index)
#create the index2word
index2word = {v:k for k,v in word2index.items()}

tag2index = {}
#do this the same for tagset 
#loop each vocab
for t in tagset:
    #if that tag does not exist yet in the tag2index
    if tag2index.get(t) is None:
        #the index of this tag is basically the current len of tag2index
        tag2index[t] = len(tag2index)
#create the index2word
index2tag = {v:k for k,v in tag2index.items()}

In [13]:
tag2index

{'B-LOC': 0,
 'I-ORG': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'I-LOC': 4,
 'I-MISC': 5,
 'O': 6,
 'I-PER': 7,
 'B-MISC': 8}

In [14]:
index2tag

{0: 'B-LOC',
 1: 'I-ORG',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'I-LOC',
 5: 'I-MISC',
 6: 'O',
 7: 'I-PER',
 8: 'B-MISC'}

## 4. Prepare window data

<img src="figures/ner_win.png" width="400">

E.g.,   Chaky   is  at  AIT,
        B-PER   0   0   B-LOC

Here I will have four sample of data:

E.g., windows = [[ '<DUMMY>', '<DUMMY>', 'Chaky', 'is', 'at'],'B-PER'], [], []]

In [15]:
for sample in data:
    print(sample[0])
    break

('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.')


In [16]:
ws = 2
windows = []
for sample in data:
    dummy = ['<DUMMY>'] * ws
    text = sample[0]
    fulltext = dummy + list(text) + dummy
    window = list(nltk.ngrams(fulltext,ws*2+1))
    # print(window)
    windows.extend([[list(window[i]),sample[1][i]] for i in range(len(sample[0]))])
    # break

In [17]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [18]:
len(windows)

678377

In [19]:
windows = windows[:1000]

In [43]:
import random
random.shuffle(windows)

train = windows[:int(len(windows)*0.9)]
test_data = windows[int(len(windows)*0.9):]

In [44]:
len(train), len(test_data)

(900, 100)

## 4. Model

<img src="figures/ner_model.png" width="600">

In [22]:
import numpy as np
x = np.array([[1,2,3],[4,5,6],[7,8,9]])
x

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [23]:
y = x.reshape(-1,3*3)
y

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

class WinNER(nn.Module):

    def __init__(self, voc_size, emb_size, hidden_size, window_size, out_size):
        super(WinNER,self).__init__()
        self.embbedding = nn.Embedding(voc_size, emb_size)
        self.hidden1 = nn.Linear(window_size*emb_size, hidden_size)
        self.hidden2 = nn.Linear(hidden_size, out_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        #inputs = (batch_size, window_size*2+1)
        input_embed = self.embbedding(inputs)
        # print(input_embed.shape)
        #input_embed = (batch_size, window_Size*2+1,emb_size)
        concats = input_embed.reshape(-1,input_embed.shape[1]*input_embed.shape[2])
        #concat everything ===> (batch_size, window_size *2 + 1*emb_size) ===> 5d
        h1 = self.dropout(self.relu(self.hidden1(concats))) #this is basically a liner layer of some hidden size
        #after_h = (batch_size, hidden_size), e.g., 8
        h2 = self.dropout(self.relu(self.hidden2(h1))) 
        #apply relu 
        #apply dropout
        #basiaclly, h2 -> relu -> droput where h2 project into one number
        # return what is return by h2
        return h2 

## Test your model

In [25]:
batch_size = 2
window_size = ws
inputs = torch.randint(0,len(vocab),(batch_size,5))
inputs

tensor([[28436, 37629, 26446, 41557, 16511],
        [12011, 45690, 46365, 48932,  9630]])

In [26]:
voc_size = len(vocab)
emb_size = 4
hidden_size = 8
window_size =  ws * 2 + 1

out_size = len(tagset)
model = WinNER(voc_size,emb_size,hidden_size,window_size,out_size)
model

WinNER(
  (embbedding): Embedding(65459, 4)
  (hidden1): Linear(in_features=20, out_features=8, bias=True)
  (hidden2): Linear(in_features=8, out_features=9, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [27]:
NER = model(inputs)
NER[0]

tensor([1.5596, 0.0000, 1.1344, 0.0000, 0.4166, 0.0000, 0.0000, 1.1383, 0.0000],
       grad_fn=<SelectBackward0>)

# 5. Training

In [28]:
voc_size = len(vocab)
emb_size = 4
hidden_size = 8
window_size =  ws * 2 + 1
out_size = len(tagset)
num_epochs = 5
batch_size = 2

model = WinNER(voc_size,emb_size,hidden_size,window_size,out_size)
model

WinNER(
  (embbedding): Embedding(65459, 4)
  (hidden1): Linear(in_features=20, out_features=8, bias=True)
  (hidden2): Linear(in_features=8, out_features=9, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [30]:
train[0]

[['potencial', 'que', 'tiene', 'Brasil', 'y'], 'O']

In [31]:
def getBatch(batch_size, train):
    random.shuffle(train)
    s = 0
    e = batch_size
    
    while e < len(train):
        batch = train[s:e]
        temp  = e
        e     = e + batch_size
        s     = temp
        yield batch   #what is difference between yield and return (yield is MUCH more efficient than return)
    
    if e > len(train):
        batch = train[s:]
        yield batch

In [32]:
#Actuualy function to convert out batch to tensor
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def prepare_tag(tag,tag2index):
    return torch.LongTensor([tag2index[tag]])

In [33]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [34]:
import numpy as np
import time
num_epochs = 5
start_time = time.time()
#mkae sure that mode luses dropout and any normalization
model.train()
for epoch in range(num_epochs):#loop each epoch
    for i,batch in enumerate(getBatch(batch_size,train)): #loop each batch
        x,y = list(zip(*batch))
        inputs = torch.cat([prepare_sequence(sent, word2index).reshape(1,-1) for sent in x])
        #(batch_size, 5)
        targets = torch.cat([prepare_tag(tag,tag2index) for tag in y])
        #(batch_size)

        preds = model(inputs)#predict 
        loss = criterion(preds, targets)#get the loss
        model.zero_grad() #zero grad
        loss.backward() #backpropagate
        optimizer.step() #update parameters
        
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    #print epoch loss
    # if (epoch + 1) % 1000 == 0:
    print(f'Epoch {epoch+1} | Batch: {i:3.0f} | Loss {loss:.5f} | Time : {epoch_mins}m {epoch_secs}s')

    # break

Epoch 1 | Batch: 448 | Loss 1.57337 | Time : 0m 1s
Epoch 2 | Batch: 448 | Loss 1.64677 | Time : 0m 3s
Epoch 3 | Batch: 448 | Loss 2.20380 | Time : 0m 4s
Epoch 4 | Batch: 448 | Loss 2.19722 | Time : 0m 6s
Epoch 5 | Batch: 448 | Loss 0.44150 | Time : 0m 7s


# 6. Testing

In [35]:
for_f1_score = []

In [45]:
accuracy = 0

model.eval()
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)
    #input_ = [[18381, 33735, 59988, 48073, 33735]]
    #input_ : (1, window_size * 2 + 1)
    
    preds = model(input_)
    #preds : (1, label_size, i.e., number of tags)

    i = model(input_).max(1)[1]  #max on first dimension, then take the index which is the 1th index element returned by max
    pred = index2tag[i.item()]    
    
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

print(accuracy/len(test_data) * 100)

82.0


This high score is because most of labels are 'O' tag. So we need to measure f1 score.

### f1-score

In [46]:
y_pred, y_test = list(zip(*for_f1_score))

In [47]:
set(y_pred)

{'O'}

In [48]:
set(y_test)

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [49]:
sorted_labels = sorted(
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)

In [50]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [51]:
from sklearn import metrics
     
print(metrics.classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

       B-LOC      0.000     0.000     0.000         9
       I-LOC      0.000     0.000     0.000         5
      B-MISC      0.000     0.000     0.000         1
      I-MISC      0.000     0.000     0.000         6
       B-ORG      0.000     0.000     0.000        11
       I-ORG      0.000     0.000     0.000         3
       B-PER      0.000     0.000     0.000         2
       I-PER      0.000     0.000     0.000         2

   micro avg      0.000     0.000     0.000        39
   macro avg      0.000     0.000     0.000        39
weighted avg      0.000     0.000     0.000        39



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
