# Sequence Classification


- 23.12 update
  - torchtext에서 더이상 legacy 문법을 지원하지 않음에 따라 새 버전(0.16.0)에 맞는 문법으로 변경함
  - 기존의 legacy 문법에서 black-box 처럼 사용해왔던 함수들을 새 버전에서는 더 자세하게 세팅할 수 있도록 변경되었음
  - 그에 따라 코드의 양이 증가하였으나 동작하는 바는 동일함
  - pytorch에서 공식적으로 제공하는 문서를 참고하였음
    - https://pytorch.org/text/stable/datasets.html
    - https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb

### Install requirement package

In [1]:
!pip install 'portalocker>=2.0.0'

Collecting portalocker>=2.0.0
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


### Import

In [2]:
import os
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.vocab import vocab
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter

from google.colab import drive
drive.mount('/content/drive')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

# parameters
batch_size = 64
learning_rate = 0.001
training_epochs = 1

Mounted at /content/drive
cuda


### 1 Dataset

In [3]:
train_iter, test_iter = IMDB(split=('train', 'test'))

tokenizer = get_tokenizer('basic_english')
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))
vocab = vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
vocab_size = len(vocab)

print("The length of the new vocab is", vocab_size)
new_stoi = vocab.get_stoi()
print("The index of '<BOS>' is", new_stoi['<BOS>'])
new_itos = vocab.get_itos()
print("The token at index 2 is", new_itos[2])

text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] if token in new_itos else vocab['<unk>'] for token in tokenizer(x)] + [vocab['<EOS>']]
label_transform = lambda x: 0 if x == 1 else 1

# Print out the output of text_transform
print("input to the text_transform:", "here is an example")
print("output of the text_transform:", text_transform("here is an example"))

The length of the new vocab is 20439
The index of '<BOS>' is 1
The token at index 2 is <EOS>
input to the text_transform: here is an example
output of the text_transform: [1, 972, 55, 198, 3456, 2]


In [4]:
def collate_batch(batch):
   label_list, text_list = [], []
   for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
   return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)


def batch_sampler():
    indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_list)]
    random.shuffle(indices)
    pooled_indices = []
    # create pool of indices with similar lengths
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

    pooled_indices = [x[0] for x in pooled_indices]
    # yield indices for current batch
    for i in range(0, len(pooled_indices), batch_size):
        yield pooled_indices[i:i + batch_size]

In [5]:
data_list = list(train_iter)
random.shuffle(data_list)
train_list = data_list[:int(len(data_list)*0.8)]
valid_list = data_list[int(len(data_list)*0.8):]
test_list = list(test_iter)

train_loader = DataLoader(train_list, batch_sampler=batch_sampler(),
                               collate_fn=collate_batch)
valid_loader = DataLoader(valid_list, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_batch)
test_loader = DataLoader(test_list, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_batch)

n_classes = 2 # Positive, Negative Class가 두 개

print("[TrainSet]: %d [ValSet]: %d [TestSet]: %d [Vocab]: %d [Classes] %d"
      % (len(train_list),len(valid_list), len(test_list), vocab_size, n_classes))

[TrainSet]: 20000 [ValSet]: 5000 [TestSet]: 25000 [Vocab]: 20439 [Classes] 2


### 2 RNN model
* Layer 설계
  + Layer 1
    - Embedding Layer
    - Input size = n_Vocabs = 46159
    - Output size = Embedding size
  + Layer 2
    - GRU Layer
    - Input size = Embedding size
    - Output size = Hidden size
    - Dropout = 0.2
  + Layer 3
    - Linear Layer
    - Input size = Hidden size
    - Output size = n_Classes = 2

In [6]:
class BasicGRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicGRU, self).__init__()
        self.n_layers = n_layers # 일반적으로는 2

        #n_vocab : Vocab 안에 있는 단어의 개수, embed_dim : 임베딩 된 단어 텐서가 갖는 차원 값(dimension)
        self.embed = nn.Embedding(n_vocab, embed_dim)

        # hidden vector의 dimension과 dropout 정의
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)

        #앞에서 정의한 하이퍼 파라미터를 넣어 GRU 정의
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)

        #Input: GRU의 hidden vector(context), Output : Class probability vector
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        # Input data: 한 batch 내 모든 영화 평가 데이터

        x = self.embed(x)# 영화 평 임베딩
        x, _ = self.gru(x)  # [i, b, h] 출력값 :  (batch_size, 입력 x의 길이, hidden_dim)

        # h_t : Batch 내 모든 sequential hidden state vector의 제일 마지막 토큰을 내포한 (batch_size, 1, hidden_dim)형태의 텐서 추출
        # 다른 의미로 영화 리뷰 배열들을 압축한 hidden state vector
        h_t = x[:,-1,:]

        self.dropout(h_t)# dropout 설정 후,

        # linear layer의 입력으로 주고, 각 클래스 별 결과 logit을 생성.
        out = self.out(h_t)  # [b, h] -> [b, o]
        return out

### 3 Train

In [None]:
# contruct model
model = BasicGRU(1, 256, vocab_size, 128, n_classes, 0.5).to(device)

# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train
for epoch in range(training_epochs):
    avg_cost = 0
    for label, text in train_loader:
        print(label)
        label = label.to(device)
        text = text.transpose(1,0)
        text = text.to(device)
        optimizer.zero_grad()
        hypothesis = model(text)
        print(hypothesis)
        cost = criterion(hypothesis, label)
        cost.backward()
        print(cost)
        optimizer.step()
        avg_cost += float(cost / batch_size)
    print('[Epoch: {:>4}] cost = {:>.9f}'.format(epoch + 1, avg_cost))
print('Learning Finished!')

# model save
torch.save(model.state_dict(), '/content/drive/MyDrive/model_s1.pt')

[Epoch:    1] cost = 0.000000000
[Epoch:    2] cost = 0.000000000
[Epoch:    3] cost = 0.000000000
[Epoch:    4] cost = 0.000000000
[Epoch:    5] cost = 0.000000000
Learning Finished!


In [None]:
# model load
model_new = BasicGRU(1, 256, vocab_size, 128, n_classes, 0.5).to(device)
model_new.load_state_dict(torch.load('/content/drive/MyDrive/model_s1.pt'))

corrects = 0
for label, text in valid_loader:
  label = label.to(device)
  text = text.transpose(1,0)
  text = text.to(device)
  hypothesis = model_new(text)
  corrects += (hypothesis.max(1)[1].view(label.size()).data == label.data).sum()

print('accuracy = ', corrects/len(valid_list)*100.0)

accuracy =  tensor(73.4400, device='cuda:0')


### 4 Assignment
##### a) 아래 예제 코드를 이용해 텍스트 입력의 숫자 변환 과정을 체크한다
##### b) testset의 임의 입력을 학습 완료된 모델에 입력해보고, 결과가 어떠한지 체크한다.

In [10]:
# 'text_transform'이라는 함수가 텍스트를 정수로 변환한다고 가정하겠습니다.
input_text = "movie good"
print("Original text:", input_text)
transformed_text = text_transform(input_text)
print("Transformed text (as numbers):", transformed_text)
# 이전에 정의한 함수를 사용하여 입력 텍스트를 적절한 형식(숫자)으로 변환합니다.
input_text = "movie good"
numerical_input = text_transform(input_text)
numerical_input = torch.tensor(numerical_input)  # PyTorch Tensor로 변환
numerical_input = numerical_input.unsqueeze(0).to(device)  # 배치 차원 추가하고 device로 보내기

# 추론 전에 모델이 eval 모드에 있는지 확인해야 합니다.
model_new = torch.load('/content/drive/MyDrive/model_s1.pt')
model_new.eval()  # 모델을 평가 모드로 설정합니다.

# 입력을 모델을 통해 전달합니다.
with torch.no_grad():  # 이 블록에서 그래디언트가 계산되지 않도록 합니다.
    output = model_new(numerical_input)

print("Model's Logits Output:", output)

# 로짓에 argmax를 적용하여 예측된 클래스를 얻습니다.
predicted_class = torch.argmax(output, axis=1)
print("Predicted class index:", predicted_class.item())


Original text: movie good
Transformed text (as numbers): [0, 1]


AttributeError: ignored