# Window NER

Today we are going to work on a simple window-based NER model. No one uses it in practice, but it is a good starting point.

Later on, once you have learned about LSTMs, I will teach a better model for NER.

## 1. Load data

Load the famous CoNLL-2002 Shared Task dataset, which can be downloaded through NLTK.

In [None]:
# !pip install nltk  #or do it in your terminal

In [1]:
import nltk
nltk.__version__

'3.8'

In [2]:
# Uncomment and adapt the proxy settings below if you are behind an HTTP proxy.
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

# Downloads the CoNLL-2002 corpus into the local nltk_data directory
# (a no-op when the package is already up to date).
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/chaklam/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [3]:
# All corpus sentences in IOB form; each sentence is a sequence of 3-element
# entries that the next cell splits into parallel columns.
corpus = nltk.corpus.conll2002.iob_sents()

In [6]:
# Every corpus sentence is a sequence of 3-element entries. Transposing it with
# zip(*sentence) gives one tuple per column; we keep the first column (the words)
# and the third column (the IOB tags) and drop the middle one, so each item is
# [('Sao', 'Paulo', '(', 'Brasil', ...), ('B-LOC', 'I-LOC', 'O', 'B-LOC', ...)]
data = [
    [words, iob_tags]
    for words, _, iob_tags in (zip(*sentence) for sentence in corpus)
]

In [9]:
data[9999]

[('ERC',
  'CONVOCA',
  'CONFERENCIA',
  'PRESENTARSE',
  'COMO',
  'PARTIDO',
  'DE',
  'GOBIERNO',
  'Barcelona',
  '.'),
 ('B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-LOC', 'O')]

In [8]:
len(data) #35k sentences.....

35651

## 2. Tokenization

If you look carefully, we don't need to! The sentences are already split into tokens.

## 3. Numericalization

In [10]:
def flatten(l):
    """Return one flat list holding the items of every sub-sequence of ``l``, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Transpose the [sentence, tags] pairs into two parallel tuples, then reduce
# each one to its unique entries: all distinct words and all distinct tags.
sents, tags = zip(*data)
vocab, tagset = (list(set(flatten(group))) for group in (sents, tags))

#why don't we combine vocab and tagset into one single list?

In [11]:
vocab[89:98]

['Renaat',
 'profundidad',
 'dusver',
 'chiefs',
 'aanvatten',
 'fakkeldrager',
 'variedades',
 'completamos',
 'ES']

In [12]:
len(vocab)

65459

In [13]:
tagset

['O', 'B-ORG', 'B-MISC', 'I-MISC', 'B-PER', 'I-ORG', 'B-LOC', 'I-LOC', 'I-PER']

In [14]:
# Word lookup table. Indices 0 and 1 are reserved for the special tokens:
# '<UNK>' (presumably for out-of-vocabulary words) and '<DUMMY>', the padding
# token used when a window runs past the edge of a sentence.
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for word in vocab:
    # setdefault only inserts unseen words, so each new word takes the next free index
    word2index.setdefault(word, len(word2index))
# reverse lookup: index -> word
index2word = dict(zip(word2index.values(), word2index.keys()))

# Same construction for the tags, without any reserved entries.
tag2index = {}
for label in tagset:
    tag2index.setdefault(label, len(tag2index))
# reverse lookup: index -> tag
index2tag = dict(zip(tag2index.values(), tag2index.keys()))

In [15]:
tag2index

{'O': 0,
 'B-ORG': 1,
 'B-MISC': 2,
 'I-MISC': 3,
 'B-PER': 4,
 'I-ORG': 5,
 'B-LOC': 6,
 'I-LOC': 7,
 'I-PER': 8}

In [16]:
index2tag

{0: 'O',
 1: 'B-ORG',
 2: 'B-MISC',
 3: 'I-MISC',
 4: 'B-PER',
 5: 'I-ORG',
 6: 'B-LOC',
 7: 'I-LOC',
 8: 'I-PER'}

## 4. Prepare window data

E.g., Chaky   is  at  AIT.   
      B-PER   O   O   B-LOC

Here I will get four samples of data, one window per token:

E.g., windows = [[['<DUMMY>', '<DUMMY>', 'Chaky', 'is', 'at'], 'B-PER'], [...], [...], [...]]

In [23]:
# Peek at the tag sequence of the first sentence only.
for sample in data[:1]:
    print(sample[1])

('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')


In [24]:
ws = 2  # number of context tokens on each side of the centre word
pad = ['<DUMMY>'] * ws  # padding so the first and last words also get a full window
windows = []

for tokens, labels in data:
    padded = pad + list(tokens) + pad
    # sliding windows of ws * 2 + 1 tokens: exactly one per original token
    contexts = nltk.ngrams(padded, ws * 2 + 1)
    # pair every window with the tag of its centre token
    windows.extend([list(context), label] for context, label in zip(contexts, labels))

In [25]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [26]:
len(windows)

678377

In [27]:
# Keep only the first 50 windows (presumably to keep this demo small).
# NOTE: this happens before the shuffle in the next cell, so the subset comes
# from the very first sentences of the corpus.
windows = windows[:50]

In [28]:
import random

# Shuffle in place (no seed is set, so the split differs between runs),
# then use the first 90% for training and hold out the remaining 10% for testing.
random.shuffle(windows)
split_at = int(len(windows) * 0.9)
train, test = windows[:split_at], windows[split_at:]

In [29]:
len(train), len(test)

(45, 5)

## 5. Model

<img src="../figures/ner_model.png" width="600">


### How to concat stuff...

In [31]:
import numpy as np

# A small 3x3 matrix holding the values 1..9 row by row.
x = np.arange(1, 10).reshape(3, 3)
x.shape

(3, 3)

In [32]:
x

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [33]:
# -1 lets NumPy infer the number of rows: the 9 values of x end up in a single
# row of length 9. This is the "concat everything" trick the model will need.
y = x.reshape(-1, 3 * 3)
y

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

## Model

In [34]:
import torch
import torch.nn as nn

class WinNER(nn.Module):
    """Window-based NER classifier (work in progress).

    The constructor builds every layer of the planned network: an embedding
    table, two linear layers, a ReLU and a dropout. ``forward`` currently stops
    after the embedding lookup; the remaining steps are outlined as a TODO.

    Args:
        voc_size: number of rows in the embedding table.
        emb_size: dimensionality of each word embedding.
        hid_size: number of units in the hidden linear layer.
        ws: number of context tokens on EACH side of the centre word; the
            full window length ``2 * ws + 1`` is derived here.
        output_size: number of output units of the second linear layer.
    """

    def __init__(self, voc_size, emb_size, hid_size, ws, output_size):
        super().__init__()
        window_len = 2 * ws + 1  # ws tokens on each side plus the centre token
        self.embed = nn.Embedding(num_embeddings=voc_size, embedding_dim=emb_size)
        # the first linear layer consumes all window embeddings laid side by side
        self.h1 = nn.Linear(in_features=window_len * emb_size, out_features=hid_size)
        self.h2 = nn.Linear(in_features=hid_size, out_features=output_size)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=0.5, inplace=True)

    def forward(self, inputs):
        """Return the embeddings of ``inputs``.

        ``inputs`` is expected to hold word indices shaped
        (batch_size, window_size * 2 + 1); the result adds a trailing
        ``emb_size`` dimension.
        """
        # TODO - remaining steps of the planned forward pass:
        #   1. concat the window embeddings
        #        ==> (batch_size, (window_size * 2 + 1) * emb_size), e.g. 5 * 4 = 20
        #   2. after_h = self.h1(concats)  ==> (batch_size, hidden_size), e.g. 8
        #   3. apply relu, then dropout
        #   4. apply h2 -> relu -> dropout to project onto the outputs
        return self.embed(inputs)

In [41]:
batch_size = 2
# Random word indices standing in for `batch_size` windows of 5 tokens each.
inputs = torch.randint(low=0, high=len(vocab), size=(batch_size, 5))
inputs

tensor([[ 8044, 55893, 49664,   246, 13387],
        [ 4212, 51817, 64399, 34396, 46947]])

In [42]:
# The embedding table needs one row per index in word2index, i.e. the vocabulary
# plus the '<UNK>' and '<DUMMY>' specials. len(vocab) alone would leave the two
# highest word indices outside the table.
voc_size = len(word2index)
emb_size = 4
hid_size = 8
# `ws` is passed unchanged as the one-sided context size: WinNER derives the
# full window length (ws * 2 + 1) itself. Widening it here as well would size
# the first linear layer for 11 tokens instead of 5, and would keep growing
# `ws` every time this cell is re-run.
output_size = len(tagset)
model = WinNER(voc_size, emb_size, hid_size, ws, output_size)

In [44]:
# forward() currently returns only the embeddings, so the result has one
# emb_size-long vector per token of every window in the batch.
something = model(inputs)
something.shape

torch.Size([2, 5, 4])