# Master Pytorch 6 : NLP - BoW Classifier
- 로지스틱 회귀(Logistic Regression) 기반 Bag-of-Words 분류기 만들기
- BoW 표현을 레이블에 대한 로그 확률로 매핑

In [2]:
import pandas as pd

# Toy two-word vocabulary: the row index of each word is its position in
# the bag-of-words count vector described below.
data = {
    '단어': ['hello', 'world'],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,단어
0,hello
1,world


- 두 단어 hello, world가 각각 0과 1의 색인을 가진다.
- 위 사전을 이용하면 문장은 [count(hello), count(world)] 형태의 벡터로 매핑된다.
> "hello hello hello hello" = [4, 0]<br>
> "hello world world hello" = [2, 2]



## data 준비

In [106]:
# Training corpus: each entry is (whitespace-tokenised sentence, language label).
data = [
    (text.split(), language)
    for text, language in (
        ("me gusta comer en la cafeteria", 'SPANISH'),
        ("Give it to me", 'ENGLISH'),
        ("No creo que sea una buena idea", 'SPANISH'),
        ('No it is not a good idea to get lost at sea', 'ENGLISH'),
    )
]

# Held-out sentences in the same (tokens, label) format.
test_data = [
    (text.split(), language)
    for text, language in (
        ('Yo creo que si', 'SPANISH'),
        ('it is lost on me', 'ENGLISH'),
    )
]

## Word to ix
- 각 단어를 고유한 숫자로 매핑

In [107]:
# Give every distinct token the next free integer id, in first-seen order.
# Both the training and the test sentences are scanned, so test-only words
# also receive an id.
word_to_ix = {}
for sent, lan in data + test_data:
    for word in sent:
        # setdefault only inserts when the word is new; the candidate id
        # (current dict size) is computed before the insertion happens.
        word_to_ix.setdefault(word, len(word_to_ix))

print(word_to_ix)
vocab_size = len(word_to_ix)
labels_n = 2  # passed as the classifier's output size (SPANISH / ENGLISH)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [108]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [109]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single linear layer followed by log-softmax.

    Args:
        input_size: Length of the incoming BoW vector (the vocabulary size).
        output_size: Number of labels to score.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Affine map from word counts to one raw score per label.
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, bow_vec):
        """Return per-label log-probabilities for ``bow_vec``.

        The softmax is normalised along ``dim=1``, so ``bow_vec`` must be
        2-D with the label scores on the second axis.
        """
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=1)
    
def make_bow_vector(sentence, word_to_ix):
    """Count the words of ``sentence`` into a bag-of-words row vector.

    Args:
        sentence: Iterable of tokens; every token must be a key of
            ``word_to_ix``.
        word_to_ix: Mapping from token to its column index.

    Returns:
        A float tensor of shape ``(1, len(word_to_ix))`` whose entry at a
        word's index is the number of times that word occurs.

    Raises:
        KeyError: If a token is missing from ``word_to_ix``.
    """
    bow = torch.zeros(len(word_to_ix))
    for index in map(word_to_ix.__getitem__, sentence):
        bow[index] += 1

    # Add a leading batch axis: shape [V] becomes [1, V] (e.g. [1, 26]).
    return bow.view(1, -1)

def make_target(label, label_to_ix):
    """Wrap the class index of ``label`` in a one-element int64 tensor.

    Args:
        label: Key to look up in ``label_to_ix``.
        label_to_ix: Mapping from label to its integer class index.

    Returns:
        A tensor of shape ``(1,)`` and dtype ``torch.long`` holding the
        index, i.e. a target for a batch of one example.

    Raises:
        KeyError: If ``label`` is not a key of ``label_to_ix``.
    """
    # torch.tensor with an explicit dtype replaces the legacy typed
    # constructor torch.LongTensor(...).
    return torch.tensor([label_to_ix[label]], dtype=torch.long)

# Build the classifier: one input per vocabulary word, one output per label.
model = BoWClassifier(vocab_size, labels_n)

# Module summary followed by an empty separator line.
print(model, '', sep='\n')

# Dump every learnable tensor of the freshly initialised model.
for p in model.parameters():
    print(p)

BoWClassifier(
  (linear): Linear(in_features=26, out_features=2, bias=True)
)

Parameter containing:
tensor([[-0.0122,  0.1313,  0.1545,  0.1909,  0.1013,  0.0814,  0.1573, -0.1030,
         -0.0226,  0.1394, -0.1690,  0.0458, -0.0189,  0.1291, -0.0462,  0.0343,
          0.1948, -0.1006,  0.1232,  0.1361, -0.0965,  0.0749, -0.0406,  0.0141,
         -0.0104, -0.1298],
        [ 0.0434,  0.0142,  0.0841, -0.1256, -0.1771,  0.0910,  0.0271, -0.0692,
          0.0730,  0.1447,  0.1214, -0.1635,  0.0304,  0.1904,  0.0600,  0.1613,
          0.1901,  0.0935,  0.0770, -0.1552, -0.1486, -0.0907,  0.0387,  0.1441,
          0.0006,  0.0938]], requires_grad=True)
Parameter containing:
tensor([0.1596, 0.0765], requires_grad=True)


In [111]:
# Run the first training example through the model and print its log-probabilities.
with torch.no_grad(): # no gradient tracking (no training) - only inspect the output
    sample = data[0]
    bow_vector = make_bow_vector(sample[0], word_to_ix)
    log_probs = model(bow_vector)
    print(log_probs)

tensor([[-0.3710, -1.1713]])


In [112]:
# Map each language label to the integer class index used as the target.
label_to_ix = {'SPANISH' : 0, 'ENGLISH' : 1}

with torch.no_grad(): # inspect the model's outputs on the test data
    for sent, label in test_data:
        bow_vec = make_bow_vector(sent, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.7651, -0.6260]])
tensor([[-0.7234, -0.6638]])


In [113]:
# Print the column of the first parameter tensor that belongs to each word:
# one value per output label.
for word in ('creo', 'is'):
    print(next(model.parameters())[:, word_to_ix[word]])

tensor([-0.1690,  0.1214], grad_fn=<SelectBackward>)
tensor([0.1948, 0.1901], grad_fn=<SelectBackward>)


In [114]:
# NLLLoss expects log-probabilities, which is what the model's log-softmax emits.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

# NOTE(review): batch_size and iter_n are not read anywhere in this cell;
# they are kept in case another cell relies on them.
batch_size = 1
epoch_n = 30
iter_n = 100

for epoch in range(epoch_n):
    loss_avg = 0
    for sent, label in data:

        # Gradients accumulate across backward() calls, so reset them for
        # every example.
        model.zero_grad()

        bow_vec = make_bow_vector(sent, word_to_ix)
        target = make_target(label, label_to_ix)

        log_probs = model(bow_vec)

        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float, so the running sum does not
        # hold on to the autograd graph.
        loss_avg += loss.item()

    # Mean per-example loss of this epoch; once the loop finishes it holds
    # the value for the final epoch. max(..., 1) guards an empty dataset.
    loss_avg /= max(len(data), 1)

In [115]:
# Print the same per-word weight columns again, now that the training cell
# has updated the parameters.
for word in ('creo', 'is'):
    print(next(model.parameters())[:, word_to_ix[word]])

tensor([ 0.1871, -0.2347], grad_fn=<SelectBackward>)
tensor([-0.0130,  0.3978], grad_fn=<SelectBackward>)
