# 4. Word Window Classification and Neural Networks
- Implementation word window classification
- https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/04.Window-Classifier-for-NER.ipynb

## 1. Import

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

In [2]:
from sklearn_crfsuite import metrics

In [3]:
random.seed(1024)

In [4]:
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.2.4


In [5]:
use_cuda = torch.cuda.is_available()
gpus = [0]

In [6]:
torch.cuda.set_device(gpus[0])

RuntimeError: cuda runtime error (35) : CUDA driver version is insufficient for CUDA runtime version at torch/csrc/cuda/Module.cpp:32

In [10]:
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

## 2. Setting Functions

In [11]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [12]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

def prepare_tag(tag,tag2index):
    return Variable(LongTensor([tag2index[tag]]))

## 3. Data load and Preprocessing

In [13]:
corpus = nltk.corpus.conll2002.iob_sents()
corpus # 각 list안에는 한 문장이 들어가는데,
       # 각 문장의 단어들이 (단어, chunck, tag)로 형성되어 있다.

[[('Sao', 'NC', 'B-LOC'), ('Paulo', 'VMI', 'I-LOC'), ('(', 'Fpa', 'O'), ('Brasil', 'NC', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('23', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFECOM', 'NP', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')], [('-', 'Fg', 'O')], ...]

In [14]:
data = []
for cor in corpus:
    sent, _, tag = list(zip(*cor))  # zip(*iterable) 여기서 *는 묶는 방식을 바꾼다.
                                    # 일반 zip : [(a1 b1), (a2, b2)], *추가 : [(a1,a2), (b1,b2)]
                                    # '_'은 불러오기는 하지만 따로 변수에 저장하지는 않겠다는 뜻
    data.append([sent, tag])

In [15]:
# what the cor looks like?
print(list(zip(corpus[0]))) # [(단어1, chunck1, tag1), (단어2, chunck2, tag2), ....]
print('')
print(list(zip(*corpus[0]))) # [(단어1, 단어2 ....), (chunck1, chunck2, ....), (tag1, tag2, ...)]

[(('Sao', 'NC', 'B-LOC'),), (('Paulo', 'VMI', 'I-LOC'),), (('(', 'Fpa', 'O'),), (('Brasil', 'NC', 'B-LOC'),), ((')', 'Fpt', 'O'),), ((',', 'Fc', 'O'),), (('23', 'Z', 'O'),), (('may', 'NC', 'O'),), (('(', 'Fpa', 'O'),), (('EFECOM', 'NP', 'B-ORG'),), ((')', 'Fpt', 'O'),), (('.', 'Fp', 'O'),)]

[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('NC', 'VMI', 'Fpa', 'NC', 'Fpt', 'Fc', 'Z', 'NC', 'Fpa', 'NP', 'Fpt', 'Fp'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


In [16]:
print(len(data))
print(data[0])

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


## 4. Build vocab

In [17]:
sents, tags = list(zip(*data))
print(sents[0])
print('')
print(tag[0])

('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.')

O


In [18]:
vocab = list(set(flatten(sents))) # 단어 list를 만들고 set를 통해 중복되는 단어 제거
tagset = list(set(flatten(tags))) # 위와 같음

In [19]:
word2index = {'<UNK>' : 0, '<DUMMY>' : 1}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
for tag in tagset:
    if tag2index.get(tag) is None:
        tag2index[tag] = len(tag2index)
index2tag = {v:k for k, v in tag2index.items()}

## 5. Prepare data

In [20]:
window_size = 2
windows = []

In [21]:
dummy = ['<DUMMY>'] * window_size
for sample in data:
    window = list(nltk.ngrams(dummy + list(sample[0]) + dummy, window_size * 2 + 1))
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [22]:
len(windows)

678377

In [23]:
random.shuffle(windows)

train_data = windows[:int(len(windows) * 0.9)]
test_data = windows[int(len(windows) * 0.9):]

## 6. Modeling

In [24]:
class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim = 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, inputs, is_training = False):
        embeds = self.embed(inputs) # B x W x D
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2)) # B x (W*D)
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
        out = self.softmax(self.o_layer(h1))
        return out

In [25]:
BATCH_SIZE = 128
EMBEDDING_SIZE = 50 # x (WINDOW_SIZE*2+1) = 250
HIDDEN_SIZE = 300
EPOCH = 3
LEARNING_RATE = 0.001

## 7. Training

In [26]:
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, window_size, HIDDEN_SIZE, len(tag2index))
if use_cuda:
    model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [27]:
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x, y = list(zip(*batch))
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        model.zero_grad()
        preds = model(inputs, is_training = True)
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist())
        loss.backward()
        optimizer.step()
        
        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/3] mean_loss : 2.08
[0/3] mean_loss : 0.47
[0/3] mean_loss : 0.38
[0/3] mean_loss : 0.32
[0/3] mean_loss : 0.29
[1/3] mean_loss : 0.20
[1/3] mean_loss : 0.22
[1/3] mean_loss : 0.21
[1/3] mean_loss : 0.20
[1/3] mean_loss : 0.19
[2/3] mean_loss : 0.18
[2/3] mean_loss : 0.15
[2/3] mean_loss : 0.14
[2/3] mean_loss : 0.14
[2/3] mean_loss : 0.14


![train_result](image/word_window_classification_train_result.png)
- train은 GPU가 장착되어 있는 컴퓨터에서 대신 시행했다.

## 8. Test

In [42]:
for_f1_score = []

In [None]:
accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)
    
    i = model(input_).max(1)[1]
    pred = index2tag[i.data.tolist()]
    for_f1_socre.append([pred, y])
    if pred == y:
        accuracy += 1
        
print(accuracy/len(test_data) * 100)