In [1]:
import torch
import torch.nn as nn

In [2]:
train_data = "you need to know how to code"

word_set = set(train_data.split())
vocab = {tkn: i + 2 for i, tkn in enumerate(word_set)}
vocab["<unk>"] = 0
vocab["<pad>"] = 1

- num_embeddings: 임베딩 할 단어 수
- embedding_dim: 임베딩 벡터 차원
- padding_idx: 패딩을 위한 토큰 인덱스

In [3]:
embedding_layer = nn.Embedding(
    num_embeddings=len(vocab), embedding_dim=3, padding_idx=1
)

In [4]:
embedding_layer.weight

Parameter containing:
tensor([[ 1.2850,  0.6503,  0.4057],
        [ 0.0000,  0.0000,  0.0000],
        [-1.0173, -3.1059, -0.9376],
        [ 1.2750,  1.4486, -0.6100],
        [-1.2053, -0.0387, -1.4802],
        [ 2.3452,  0.7615,  0.1973],
        [ 2.2656,  0.1313, -1.1834],
        [-0.8589,  0.4082, -0.1675]], requires_grad=True)

# 사전 훈련된 워드 임베딩

훈련 데이터가 적다면 nn.Embedding으로 해당 문제에 충분히 특화된 임베딩 벡터를 만드는 게 쉽지 않다. 이 경우 문제에 특화된 것은 아니지만 보다 일반적이고 보다 많은 훈련 데이터로 이미 Word2Vec이나 GloVE 등으로 학습된 임베딩 벡터를 사용하는 게 성능에 좋을 수 있다.

## 사전 훈련된 임베딩을 사용하지 않는 경우

In [5]:
from collections import Counter

import gensim
import numpy as np

In [6]:
sentences = [
    "nice great best amazing",
    "stop lies",
    "pitiful nerd",
    "excellent work",
    "supreme quality",
    "bad",
    "highly respectable",
]
y_train = [1, 0, 0, 1, 1, 0, 1]

In [7]:
tokenized_sentences = [sent.split() for sent in sentences]
tokenized_sentences

[['nice', 'great', 'best', 'amazing'],
 ['stop', 'lies'],
 ['pitiful', 'nerd'],
 ['excellent', 'work'],
 ['supreme', 'quality'],
 ['bad'],
 ['highly', 'respectable']]

In [8]:
word_list = []
for sent in tokenized_sentences:
    for word in sent:
        word_list.append(word)

word_counts = Counter(word_list)
len(word_counts), word_counts

(15,
 Counter({'nice': 1,
          'great': 1,
          'best': 1,
          'amazing': 1,
          'stop': 1,
          'lies': 1,
          'pitiful': 1,
          'nerd': 1,
          'excellent': 1,
          'work': 1,
          'supreme': 1,
          'quality': 1,
          'bad': 1,
          'highly': 1,
          'respectable': 1}))

In [9]:
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab

['nice',
 'great',
 'best',
 'amazing',
 'stop',
 'lies',
 'pitiful',
 'nerd',
 'excellent',
 'work',
 'supreme',
 'quality',
 'bad',
 'highly',
 'respectable']

In [10]:
word_to_index = {}
word_to_index["<PAD>"] = 0
word_to_index["<UNK>"] = 1

for index, word in enumerate(vocab):
    word_to_index[word] = index + 2

vocab_size = len(word_to_index)
vocab_size

17

In [11]:
word_to_index

{'<PAD>': 0,
 '<UNK>': 1,
 'nice': 2,
 'great': 3,
 'best': 4,
 'amazing': 5,
 'stop': 6,
 'lies': 7,
 'pitiful': 8,
 'nerd': 9,
 'excellent': 10,
 'work': 11,
 'supreme': 12,
 'quality': 13,
 'bad': 14,
 'highly': 15,
 'respectable': 16}

In [12]:
def texts_to_sequences(tokenized_X_data, word_to_index):
    encoded_X_data = []
    for sent in tokenized_X_data:
        index_sequences = []
        for word in sent:
            try:
                index_sequences.append(word_to_index[word])
            except KeyError:
                index_sequences.append(word_to_index["<UNK>"])
        encoded_X_data.append(index_sequences)
    return encoded_X_data

In [13]:
X_encoded = texts_to_sequences(tokenized_sentences, word_to_index)
X_encoded

[[2, 3, 4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14], [15, 16]]

In [14]:
max_len = max(len(l) for l in X_encoded)
max_len

4

In [15]:
def pad_sequences(sentences, max_len):
    features = np.zeros((len(sentences), max_len), dtype=int)
    for index, sentence in enumerate(sentences):
        if len(sentence) != 0:
            features[index, : len(sentence)] = np.array(sentence)[:max_len]
    return features

In [16]:
X_train = pad_sequences(X_encoded, max_len=max_len)
y_train = np.array(y_train)

In [17]:
X_train

array([[ 2,  3,  4,  5],
       [ 6,  7,  0,  0],
       [ 8,  9,  0,  0],
       [10, 11,  0,  0],
       [12, 13,  0,  0],
       [14,  0,  0,  0],
       [15, 16,  0,  0]])

In [18]:
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [19]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(embedding_dim * max_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        flattened = self.flatten(embedded)
        output = self.fc(flattened)
        return self.sigmoid(output)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [21]:
embedding_dim = 100
simple_model = SimpleModel(vocab_size, embedding_dim).to(device)

출력츠엥 로지스틱 회귀를 이용한 이진 분류를 푸는 모델이므로 바이너리 크로스엔트로피 함수를 사용한다.

In [22]:
criterion = nn.BCELoss()
optimizer = Adam(simple_model.parameters())

In [23]:
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float32)
)
train_dataloader = DataLoader(train_dataset, batch_size=2)

In [24]:
len(train_dataloader)

4

In [25]:
for epoch in range(10):
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = simple_model(inputs).view(-1)
        loss = criterion(outputs, targets)
        loss.backward()

        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.6253615021705627
Epoch 2, Loss: 0.48311108350753784
Epoch 3, Loss: 0.3619559705257416
Epoch 4, Loss: 0.27890700101852417
Epoch 5, Loss: 0.226105198264122
Epoch 6, Loss: 0.1928638219833374
Epoch 7, Loss: 0.17089177668094635
Epoch 8, Loss: 0.15466190874576569
Epoch 9, Loss: 0.1408553272485733
Epoch 10, Loss: 0.127839133143425


## 사전 훈련된 임베딩

In [28]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    "../model/GoogleNews-vectors-negative300.bin.gz", binary=True
)

In [29]:
embedding_matrix = np.zeros((vocab_size, 300))
embedding_matrix.shape

(17, 300)

In [30]:
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None

단어 집합으로부터 단어를 한개씩 호출해 word2vec_model에서 단어의 임베딩 벡터값을 가져온다.

In [31]:
for word, i in word_to_index.items():
    if i > 2:
        temp = get_vector(word)
        if temp is not None:
            embedding_matrix[i] = temp

In [32]:
class PretrainedEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32)
        )
        self.embedding.weight.requires_grad = True
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(embedding_dim * max_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        flattened = self.flatten(embedded)
        output = self.fc(flattened)
        return self.sigmoid(output)

In [33]:
pretrained_embedding_model = PretrainedEmbeddingModel(vocab_size, 300).to(device)

In [34]:
criterion = nn.BCELoss()
optimizer = Adam(pretrained_embedding_model.parameters())

In [35]:
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float32)
)
train_dataloader = DataLoader(train_dataset, batch_size=2)

In [36]:
for epoch in range(10):
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = pretrained_embedding_model(inputs).view(-1)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.6374518275260925
Epoch 2, Loss: 0.5786298513412476
Epoch 3, Loss: 0.5203221440315247
Epoch 4, Loss: 0.46565109491348267
Epoch 5, Loss: 0.4154130816459656
Epoch 6, Loss: 0.369762122631073
Epoch 7, Loss: 0.3286037743091583
Epoch 8, Loss: 0.29172682762145996
Epoch 9, Loss: 0.2588597238063812
Epoch 10, Loss: 0.22969816625118256
