# Master Pytorch 6 : NLP - BoW Classifier
- 로지스틱 회귀(Logistic Regression) 기반 Bag-of-Words 분류기 만들기
- BoW 표현을 레이블에 대한 로그 확률로 매핑

In [1]:
import pandas as pd

# Two-word toy vocabulary; the DataFrame's row index (0, 1) serves as each
# word's index in the bag-of-words example below.
words = ['hello', 'world']
data = {'단어': words}
df = pd.DataFrame(data)
df  # bare expression so the notebook renders the table

Unnamed: 0,단어
0,hello
1,world


- 각각 0과 1의 색인을 가진 두 단어(hello, world)가 있다.
- 위 사전을 이용하면 각 문장은 [count(hello), count(world)] 형태의 벡터로 매핑된다.
>"hello hello hello hello" = [4,0]<br>
"hello world world hello" = [2,2]



## data 준비

In [2]:
# Raw (sentence, language) pairs; each sentence is whitespace-tokenised below
# so that every example becomes a (list_of_words, label) tuple.
_TRAIN_SENTENCES = [
    ("me gusta comer en la cafeteria", 'SPANISH'),
    ("Give it to me", 'ENGLISH'),
    ("No creo que sea una buena idea", 'SPANISH'),
    ('No it is not a good idea to get lost at sea', 'ENGLISH'),
]

_TEST_SENTENCES = [
    ('Yo creo que si', 'SPANISH'),
    ('it is lost on me', 'ENGLISH'),
]

data = [(text.split(), language) for text, language in _TRAIN_SENTENCES]

test_data = [(text.split(), language) for text, language in _TEST_SENTENCES]

## Word to ix
- 각 단어를 고유한 숫자로 매핑

In [3]:
# Assign every distinct word a unique integer id, in order of first appearance.
# Test sentences are included so that their words also have an index.
word_to_ix = {}
for tokens, _language in data + test_data:
    for token in tokens:
        # setdefault only inserts when the token is new, so the id equals the
        # dictionary size at the moment of first sight.
        word_to_ix.setdefault(token, len(word_to_ix))

print(word_to_ix)
vocab_size = len(word_to_ix)
labels_n = 2

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
class BoWClassifier(nn.Module):
    """Logistic-regression classifier over bag-of-words count vectors.

    A single affine layer maps an ``input_size``-wide vector to
    ``output_size`` scores, which ``forward`` converts to log-probabilities.
    """

    def __init__(self, input_size, output_size):
        """Build the affine layer.

        Args:
            input_size: Number of input features (the vocabulary size).
            output_size: Number of output scores (the number of labels).
        """
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, bow_vec):
        """Return the log-softmax (taken over dim 1) of the affine scores.

        Args:
            bow_vec: Input tensor passed to the linear layer; it needs a
                dim 1 because the softmax is computed along that axis.
        """
        return F.log_softmax(self.linear(bow_vec), dim=1)
    
def make_bow_vector(sentence, word_to_ix):
    """Count word occurrences in ``sentence`` as a bag-of-words row vector.

    Args:
        sentence: Iterable of words; each must be a key of ``word_to_ix``.
        word_to_ix: Mapping from word to its column index.

    Returns:
        A float tensor of shape ``(1, len(word_to_ix))`` whose column ``i``
        holds the number of times the word with index ``i`` occurs.

    Raises:
        KeyError: If a word is missing from ``word_to_ix``.
    """
    counts = torch.zeros(len(word_to_ix))
    for token in sentence:
        column = word_to_ix[token]
        counts[column] += 1

    # Add a leading batch dimension: the result is [1, vocab], not [vocab].
    return counts.unsqueeze(0)

def make_target(label, label_to_ix):
    """Wrap the integer id of ``label`` in a one-element long tensor.

    Args:
        label: Key to look up in ``label_to_ix``.
        label_to_ix: Mapping from label to its integer class index.

    Returns:
        A ``torch.LongTensor`` of shape ``(1,)`` holding the class index.
    """
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])

# Instantiate the classifier and dump its structure followed by every
# (randomly initialised) parameter tensor.
model = BoWClassifier(vocab_size, labels_n)
print(model, '', sep='\n')
for param in model.parameters():
    print(param)

BoWClassifier(
  (linear): Linear(in_features=26, out_features=2, bias=True)
)

Parameter containing:
tensor([[ 0.1035,  0.0093, -0.0486, -0.0721,  0.0149,  0.1658,  0.0544, -0.0806,
         -0.1053, -0.0785, -0.1955,  0.1215,  0.1145,  0.1684,  0.0007,  0.0694,
          0.1832, -0.0012,  0.1012, -0.0823, -0.1031,  0.0987,  0.0949, -0.0051,
          0.1895, -0.0622],
        [-0.1550,  0.1548, -0.0475, -0.0765, -0.0176, -0.1746,  0.1676, -0.0587,
         -0.0842, -0.1545,  0.0199, -0.0701,  0.0995, -0.1513,  0.0943, -0.1480,
         -0.1725, -0.1530, -0.1153,  0.1657, -0.1579,  0.1926,  0.0329,  0.1762,
          0.1112, -0.0442]], requires_grad=True)
Parameter containing:
tensor([-0.1557,  0.1500], requires_grad=True)


In [6]:
# Forward pass on the first training sentence, just to inspect the output.
first_tokens, _first_label = data[0]
with torch.no_grad():  # no gradient tracking: we only look at the result
    log_probs = model(make_bow_vector(first_tokens, word_to_ix))
    print(log_probs)

tensor([[-0.6056, -0.7891]])


In [7]:
# Class index assigned to each language label.
label_to_ix = {'SPANISH' : 0, 'ENGLISH' : 1}

# Print the model's log-probabilities for each test sentence.
with torch.no_grad():
    for tokens, _label in test_data:
        print(model(make_bow_vector(tokens, word_to_ix)))

tensor([[-0.9326, -0.5001]])
tensor([[-0.6095, -0.7844]])


In [8]:
# The first parameter yielded by the model is the linear layer's weight
# matrix; print the column belonging to each of these words.
weight = next(model.parameters())
for probe_word in ('creo', 'is'):
    print(weight[:, word_to_ix[probe_word]])

tensor([-0.1955,  0.0199], grad_fn=<SelectBackward>)
tensor([ 0.1832, -0.1725], grad_fn=<SelectBackward>)


In [9]:
# Train with negative log-likelihood loss (the model already outputs
# log-probabilities) and plain SGD, one sentence per update.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

batch_size = 1   # not read by the loop below; kept for other cells
epoch_n = 300
iter_n = 1000    # not read by the loop below; kept for other cells

report_every = 50  # print the average loss once every this many epochs

for epoch in range(epoch_n):
    loss_sum = 0.0
    for sent, label in data:

        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()

        bow_vec = make_bow_vector(sent, word_to_ix)
        target = make_target(label, label_to_ix)

        log_probs = model(bow_vec)

        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float, so nothing from the graph is kept.
        loss_sum += loss.item()

    # Mean per-sentence loss for this epoch (guarded against empty data).
    loss_avg = loss_sum / max(len(data), 1)
    if (epoch + 1) % report_every == 0:
        print('epoch {:3d}/{} - avg loss {:.4f}'.format(epoch + 1, epoch_n, loss_avg))

In [10]:
# Print the weight-matrix column for each of these words again, for
# comparison with the values shown before training.
weight = next(model.parameters())
for probe_word in ('creo', 'is'):
    print(weight[:, word_to_ix[probe_word]])

tensor([ 0.3299, -0.5055], grad_fn=<SelectBackward>)
tensor([-0.1390,  0.1497], grad_fn=<SelectBackward>)


In [11]:
# Class index assigned to each language label.
label_to_ix = {'SPANISH' : 0, 'ENGLISH' : 1}

# Print each test sentence together with the model's log-probabilities.
with torch.no_grad():
    for tokens, _label in test_data:
        print(tokens, model(make_bow_vector(tokens, word_to_ix)))

['Yo', 'creo', 'que', 'si'] tensor([[-0.1436, -2.0118]])
['it', 'is', 'lost', 'on', 'me'] tensor([[-3.1194, -0.0452]])


- 학습 전후의 test data 출력을 비교하면, 각 문장에서 정답 언어에 해당하는 로그 확률(log prob)이 증가한 것을 알 수 있다.