# Window NER

Today we gonna work on simple window NER that no one uses....but it's a good starting point.

Later on, once you learned LSTM, I will teach a better one for NER.

## 1. Load data

Load the famous CoNLL-2002 Shared Task

In [1]:
# !pip install nltk  #or do it in your terminal

In [2]:
import nltk
nltk.__version__

'3.8'

In [3]:
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

nltk.download('conll2002')  #this will download the dataset, and put it somewhere in your pc

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/chaklam/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [4]:
corpus = nltk.corpus.conll2002.iob_sents()

In [5]:
data = []
for cor in corpus:
    #extract only the first and third guy
    #one list containing the first, and one list containing the third
    #so we can use it to train
    #['Sao', 'Paulo', '(', 'Brasil'.....)],  ['B-LOC', 'I-LOC', 'B-LOC', '0']
    sent, _, tag = list(zip(*cor))
    data.append([sent, tag])

In [6]:
data[9999]

[('ERC',
  'CONVOCA',
  'CONFERENCIA',
  'PRESENTARSE',
  'COMO',
  'PARTIDO',
  'DE',
  'GOBIERNO',
  'Barcelona',
  '.'),
 ('B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-LOC', 'O')]

In [7]:
len(data) #35k sentences.....

35651

## 2. Tokenization

If you look carefully, we don't need to!! Yay...things are already chopped.

## 3. Numericalization

In [8]:
flatten = lambda l: [item for sublist in l for item in sublist]

#I want to get all unique vocabs....
sents, tags = list(zip(*data))
vocab  = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

#why we don't combine vocab and tagset into one single list.....

In [9]:
vocab[89:98]

['Pera',
 'aplicará',
 'FOS',
 'Mattoso',
 'Mercury',
 'sorprendidos',
 'ruikt',
 'Noord-Duitse',
 'avanzadas']

In [10]:
len(vocab)

65459

In [11]:
tagset

['O', 'B-ORG', 'I-PER', 'B-PER', 'I-MISC', 'B-LOC', 'I-ORG', 'B-MISC', 'I-LOC']

In [12]:
#create word2index library
word2index={'<UNK>': 0, '<DUMMY>': 1}  #DUMMY facilitates me moving the windows.....

#loop each vocab
for v in vocab:
    #if that vocab does not exist yet in the word2index
    if word2index.get(v) is None:
        #the index of this vocab is basically the current len of word2indx
        word2index[v] = len(word2index)
#create the index2word
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
#do this the same for tagset
#loop each vocab
for t in tagset:
    #if that vocab does not exist yet in the word2index
    if tag2index.get(t) is None:
        #the index of this vocab is basically the current len of word2indx
        tag2index[t] = len(tag2index)
#create the index2word
index2tag = {v:k for k, v in tag2index.items()}

In [13]:
tag2index

{'O': 0,
 'B-ORG': 1,
 'I-PER': 2,
 'B-PER': 3,
 'I-MISC': 4,
 'B-LOC': 5,
 'I-ORG': 6,
 'B-MISC': 7,
 'I-LOC': 8}

In [14]:
index2tag

{0: 'O',
 1: 'B-ORG',
 2: 'I-PER',
 3: 'B-PER',
 4: 'I-MISC',
 5: 'B-LOC',
 6: 'I-ORG',
 7: 'B-MISC',
 8: 'I-LOC'}

## 4. Prepare window data

E.g., Chaky   is at AIT.   
       B-PER   0 0  B-LOC

Here I will four samples of data:

E.g., windows = [['<DUMMY>', '<DUMMY>', 'Chaky', 'is', 'at'], 'B-PER'], [], []]

In [15]:
for sample in data:
    print(sample[1])
    break

('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')


In [16]:
ws = 2
windows = []

for sample in data:
    dummy = ['<DUMMY>'] * ws
    text  = sample[0]
    fulltext = dummy + list(text) + dummy
    window = list(nltk.ngrams(fulltext, ws * 2 + 1))
    
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])   

In [17]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [18]:
len(windows)

678377

In [19]:
windows = windows[:50]

In [20]:
import random
random.shuffle(windows)

train = windows[:int(len(windows) * 0.9)]
test  = windows[int(len(windows) * 0.9):]

In [21]:
len(train), len(test)

(45, 5)

## 4. Model

<img src="../figures/ner_model.png" width="600">


### How to concat stuff...

In [22]:
import numpy as np
x = np.array([ [1, 2, 3], [4, 5, 6], [7, 8, 9] ])
x.shape

(3, 3)

In [23]:
x

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [24]:
y = x.reshape(-1, 3 * 3)
y

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

## Model

In [25]:
import torch
import torch.nn as nn

class WinNER(nn.Module):
    
    def __init__(self, voc_size, emb_size, hid_size, window_size, output_size):
        super(WinNER, self).__init__()
        self.embed   = nn.Embedding(voc_size, emb_size) #embedding the inputs
        self.h1      = nn.Linear(window_size * emb_size, hid_size)
        self.h2      = nn.Linear(hid_size, output_size)
        self.relu    = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, inputs):
        #inputs = (batch_size, window_size * 2 + 1)
        
        input_embed = self.embed(inputs)
        #input_embed = (batch_size, window_size * 2 + 1, emb_size)
                
        concats  = input_embed.reshape(-1, input_embed.shape[1] * input_embed.shape[2])
        #concats = (batch_size, window_size * 2 + 1 * emb_size)  ===> 5d, e.g., 20
        
        h       = self.dropout(self.relu(self.h1(concats)))
        h2      = self.dropout(self.relu(self.h2(h)))
        
        return h2

### Test your model

In [26]:
batch_size = 2
inputs = torch.randint(0, len(vocab), (batch_size, 5))
inputs

tensor([[59071, 23555,   400, 58441, 18270],
        [60154, 63074, 41766, 48850, 48017]])

In [27]:
voc_size = len(vocab)
emb_size = 4
hid_size = 8
window_size  = ws * 2 + 1
output_size = len(tagset)
model = WinNER(voc_size, emb_size, hid_size, window_size, output_size)

In [28]:
something = model(inputs)
something[0]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2905, 0.6016, 0.6168],
       grad_fn=<SelectBackward0>)

## 5. Training

In [29]:
voc_size = len(vocab)
emb_size = 4
hid_size = 8
window_size  = ws * 2 + 1
num_epochs   = 5
batch_size   = 2
output_size = len(tagset)

model = WinNER(voc_size, emb_size, hid_size, window_size, output_size)

In [30]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [31]:
train[0]

[['en', 'servicio', 'tres', 'millones', 'de'], 'O']

In [32]:
def getBatch(batch_size, train):
    random.shuffle(train)
    s = 0
    e = batch_size
    
    while e < len(train):
        batch = train[s:e]
        temp  = e
        e     = e + batch_size
        s     = temp
        yield batch   #what is difference between yield and return (yield is MUCH more efficient than return)
    
    if e > len(train):
        batch = train[s:]
        yield batch

In [33]:
#utility function to convert out batch to tensor
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def prepare_tag(tag,tag2index):
    return torch.LongTensor([tag2index[tag]])

In [35]:
import numpy as np

#make sure that model uses dropout and any normalization
model.train()

#loop each epoch
for epoch in range(num_epochs):
    #loop each batch
    for i, batch in enumerate(getBatch(batch_size, train)):
        
        x, y = list(zip(*batch))
        
        inputs  = torch.cat([prepare_sequence(sent, word2index).reshape(1, -1) for sent in x])
        #(batch_size, 5)
        
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        #(batch_size)
    
        
        #predict
        preds = model(inputs)
        #(batch_size, len(tag_size))
                        
        #get the loss
        loss = criterion(preds, targets)
        
        #zero grad
        model.zero_grad()

        #backpropagate
        loss.backward()
        
        #update parameters
        optimizer.step()
        
    
    print(f"Epoch: {epoch + 1} | Batch: {i:5.0f} | loss: {loss.item()}")

Epoch: 1 | Batch:    22 | loss: 1.9488742351531982
Epoch: 2 | Batch:    22 | loss: 2.248173475265503
Epoch: 3 | Batch:    22 | loss: 2.1972246170043945
Epoch: 4 | Batch:    22 | loss: 1.7686676979064941
Epoch: 5 | Batch:    22 | loss: 1.6203813552856445
