# 1. Preparations

### 1-1. Import Libraries
- 데이터셋 다운로드와 전처리를 쉽게 하는 torchtext 라이브러리를 import 합니다.


In [1]:
import os
import random
import time
import sys
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.legacy import data, datasets

import random
import time
import spacy
import numpy as np
from torch import Tensor

### 1-2. Load data
- Field 를 정의합니다.
- IMDB 데이터를 다운받습니다.
- Train, Valid, Test 데이터셋으로 split 합니다.

In [2]:
TEXT = data.Field(include_lengths=True)

LABEL = data.LabelField(dtype = torch.float) 

In [3]:
# Download IMDB data
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
# Set the random seed
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
# Split train and valid data
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [6]:
print('Number of training examples: {}'.format(len(train_data)))
print('Number of validation examples: {}'.format(len(valid_data)))
print('Number of testing examples: {}'.format(len(test_data)))

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


### 1-3. Cuda Setup
- GPU 사용을 위한 Cuda 설정
- Colab 페이지 상단 메뉴>수정>노트설정에서 GPU 사용 설정이 선행되어야 합니다.


In [7]:
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [8]:
!nvidia-smi

Wed Nov 10 16:14:10 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 166...  Off  | 00000000:0A:00.0  On |                  N/A |
|  0%   50C    P5    13W / 125W |    499MiB /  5941MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## 2. Preprocess data
- Vocab (단어장) 을 만듭니다.
- Iterator 를 만듭니다. (Iterator 를 통해 batch training 을 위한 batching 과 padding, 그리고 데이터 내 단어들의 인덱스 변환이 이루어집니다.)  

In [9]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_                 
                 )
LABEL.build_vocab(train_data)

In [10]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [11]:
# Batching - construct iterator
BATCH_SIZE = 4
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    sort_within_batch = True,
    device = device)

In [12]:
for batch in test_iterator:
    break

In [13]:
len(batch.text[0])

8

In [14]:

for batch in train_iterator:
    break

In [15]:
batch.text[0]

tensor([[   49, 11413,   129,  1489],
        [   23,    43, 17376,   101],
        [    7,    12, 14788,     3],
        [   41,   200,   644,   642],
        [    3,    26,     7,   392],
        [  879,    27,    20,     5],
        [  295,     2,     2,  1972],
        [   35, 17332,     0,  4897],
        [   40,     0,     4,     4],
        [   33,     5,   106,  4976],
        [ 3360,     2, 14798, 24872],
        [   16,     0,  6369,   605],
        [    0,     0,   214,    12],
        [   20, 13247,    27,   268],
        [    0,    34,    24,   327],
        [   13,     2,   414,    41],
        [   93,   706,  2589,  1940],
        [  134,  1076,     5,   191],
        [    7,    10,     0,  1168],
        [  397,     9,     8,    10],
        [13450,    98,    11,     9],
        [    9,  1237,  6735,    90],
        [ 2060,     3,  5064,   207],
        [    6,   162,  3219,     0],
        [   66,     0,     2,     2],
        [   94,    18,     0,     0],
        [   

## 3. Build Model
- Embedding layer, Transformer layer, Fully-connected layer 로 이루어진 모델을 만듭니다.
- Classification task 에 활용하기 위해 기존 Seq2Seq Transformer 를 변형하여, Transformer Encoder 만을 활용합니다.
- Positional Encoding 식
> $PE_(pos,2i) =sin(pos/10000^{2i/d_{model}})$  
> $PE_(pos,2i+1) =cos(pos/10000^{2i/d_{model}})$








In [16]:
class PositionalEnc(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [17]:
class TransformerNet(nn.Module):    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_heads, n_layers, dropout, pad_idx):
        super().__init__()

        # Define parameters
        self.hidden_him = hidden_dim
        self.n_layers = n_layers

        # Define Layers
        # TO-DO Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        self.pe =  PositionalEnc(embedding_dim)
      
        # Encoder layer
        enc_layer = nn.TransformerEncoderLayer(embedding_dim, n_heads, hidden_dim, dropout=dropout)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
      
        # TO-DO Fully connected layer and Dropout layer
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text = [sent len, batch size]

        # TO-DO Transformer 가 출력한 output 의 평균값을 Dropout, FC layer 을 통과하여 반환
        # hint
        # embedded : [sent len, batch size, emb dim]
        # trans_out : [sent len, batch_size, emb_dim]
        # pooled : transformer 출력의 평균
        # final : Dropout -> FC 통과한 logits

        embedded =  self.embedding(text)
        pos_encoded = self.pe(embedded)
        trans_out = self.encoder(pos_encoded)       
        pooled = trans_out.mean(0) 
        final = self.fc(self.dropout(pooled))

        return final

In [18]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_HEADS = 2  #embedding_dim must be divisible by n_heads
N_LAYERS = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = TransformerNet(input_dim=INPUT_DIM,      
                       embedding_dim=EMBEDDING_DIM, 
                       hidden_dim=HIDDEN_DIM, 
                       output_dim=OUTPUT_DIM, 
                       n_heads=N_HEADS,
                       n_layers=N_LAYERS, 
                       dropout=DROPOUT,
                       pad_idx=PAD_IDX)

 

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print('The model has {:,} trainable parameters'.format(count_parameters(model)))

# load pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings);

The model has 2,566,929 trainable parameters


## 4. Train model

In [20]:
criterion = nn.BCEWithLogitsLoss()
model= model.to(device)
criterion = criterion.to(device)

In [21]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [22]:
def train(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    optimizer = optim.Adam(model.parameters())

    model.train()
    
    for batch in iterator:
        
        # TO-DO
        # General Training Scheme
        optimizer.zero_grad()    
        prediction = model(batch.text[0]).squeeze(1)
        loss = criterion(prediction, batch.label)
        acc = binary_accuracy(prediction, batch.label)

        loss.backward()
        optimizer.step()
      
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            # TO-DO
            # General Evaluation Scheme
            prediction = model(batch.text[0]).squeeze(1)
            loss = criterion(prediction, batch.label)
            acc = binary_accuracy(prediction, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


### *Do Training!*

In [24]:
N_EPOCHS = 7

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss: # For early stopping
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'trans-model.pt') # Save the model
    
    print('Epoch: {:02}'.format(epoch+1))
    print('\tTrain Loss: {:.3f} | Train Acc: {:.2f}%'.format(train_loss, train_acc*100))
    print('\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%'.format(valid_loss, valid_acc*100))

Epoch: 01
	Train Loss: 0.500 | Train Acc: 74.29%
	 Val. Loss: 0.402 |  Val. Acc: 83.20%
Epoch: 02
	Train Loss: 0.311 | Train Acc: 87.32%
	 Val. Loss: 0.491 |  Val. Acc: 83.91%
Epoch: 03
	Train Loss: 0.243 | Train Acc: 90.39%
	 Val. Loss: 0.449 |  Val. Acc: 84.67%
Epoch: 04
	Train Loss: 0.200 | Train Acc: 92.17%
	 Val. Loss: 0.588 |  Val. Acc: 83.33%
Epoch: 05
	Train Loss: 0.176 | Train Acc: 93.06%
	 Val. Loss: 0.509 |  Val. Acc: 84.19%
Epoch: 06
	Train Loss: 0.152 | Train Acc: 94.39%
	 Val. Loss: 0.673 |  Val. Acc: 82.75%
Epoch: 07
	Train Loss: 0.134 | Train Acc: 95.06%
	 Val. Loss: 0.603 |  Val. Acc: 84.60%


In [25]:
model.load_state_dict(torch.load('trans-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)


print('Test Loss: {:.3f} | Test Acc: {:.2f}%'.format(test_loss, test_acc*100))

Test Loss: 0.387 | Test Acc: 83.74%


## 5. Test model
우리가 직접 예문을 작성해서 트레인된 모델에서 예문을 어떻게 평가하는지 확인합니다.



In [27]:
# 토크나이저로 spacy 를 사용합니다.
nlp = spacy.load('en_core_web_sm')

# 사용자가 입력한 sentence 를 훈련된 모델에 넣었을때의 결과값을 확인합니다.
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # Tokenization
    print(tokenized)
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]   # 위에서 만든 vocab 에 부여된 index 로 indexing
    print(indexed)
    tensor = torch.LongTensor(indexed).to(device)   # indexing 된 sequence 를 torch tensor 형태로 만들어줌.
    print(tensor.shape)
    tensor = tensor.unsqueeze(1)   # 입력 텐서에 batch 차원을 만들어줌.
    prediction = torch.sigmoid(model(tensor))  # 모델에 입력한 후 확률값 도출을 위한 sigmoid 적용 
    return prediction.item() # prediction 값 출력

In [28]:
predict_sentiment(model, "This film is terrible") #아주 낮은 값의 확률이 도출되는 것을 확인할 수 있습니다.(부정)

['This', 'film', 'is', 'terrible']
[49, 23, 7, 538]
torch.Size([4])


0.0014805947430431843

In [29]:
predict_sentiment(model, "This film is great") #아주 높은 값의 확률이 도출되는 것을 확인할 수 있습니다. (긍정)

['This', 'film', 'is', 'great']
[49, 23, 7, 97]
torch.Size([4])


0.9992258548736572

# Homework (Part2-2) 
## LSTM + Self-attention Model for Text Classification
- Attention mechanism 을 통해 마지막 순서와 각 순서와의 관계 고려하기.
- (1) Embedding layer
- (2) LSTM layer
- (3) Self-attention layer: LSTM 의 마지막 representation 을 추출하여 Self-Attention 으로 representation 업데이트
- (4) Dropout layer
- (5) Fully-connected layer

In [31]:
class Self_Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.scale = 1. / math.sqrt(hidden_dim)

    def forward(self, query, key, value): 
        # query == hidden: = [N, H]
        # key/value == output: [L, N, H]

        ### Key 텐서의 크기를 재배열 ###
        query = query.unsqueeze(1) # [N, 1, H]
        key = key.permute(1,2,0)  # [N, H, L]  

        # bmm: batch matrix-matrix multiplication
        # Query 값과 key 의 각 토큰값 간의 Dot-product
        attention_weight = torch.bmm(query, key)  # [N, 1, L] 

        # Scale and normalize
        attention_weight = F.softmax(attention_weight.mul_(self.scale), dim=2) # [N, 1, L]

        # Attention weight 과 value 곱해서 attention output 구하기
        # Value 텐서의 크기를 재배열
        value =  value.transpose(0,1)            # [N, L, H]
        attention_output = torch.bmm(attention_weight, value)   # [N, 1, H]
        attention_output = attention_output.squeeze(1) # [N, H]

        return attention_output, attention_weight.squeeze(1)

In [57]:
# Make Custom GRU + Self-attention Model

class CustomModel(nn.Module): 
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        # Define parameters
        self.hidden_him = hidden_dim
        self.n_layers = n_layers

        # Define Embedding Layers
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        # Define Layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout = dropout)
        
        # Attention layer
        self.attention = Self_Attention(hidden_dim)
 
        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        query, (key, value) = self.lstm(embedded)
        post_att = self.attention(query=query[-1,:,:], key=key, value=value)
        output = self.fc(self.dropout(post_att[0]))
        '''
        Homework (2-2)
        Apply GRU and Dropout, then Self-Attention
        Use the last hidden representation of the sequence
        hint: 
        1) self.lstm(XX) Remind that lstm layer returns a tuple
        2) self.attention(query=XX, key=XX, value=XX) 
            * Use the last representation of LSTM outputs as query
            * query tensor shape: [N, H]
        3) self.dropout(XX)
        4) self.fc(XX)
        '''
        
        return output


In [58]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CustomModel(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS,
            DROPOUT, 
            PAD_IDX)   


In [59]:
# CustomModel 의 파라미터 수 확인
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)  # Count number of elements of all parameters

print('The model has {:,} trainable parameters'.format(count_parameters(model))) # 2,750,185

The model has 2,750,185 trainable parameters


In [60]:
# load pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors
print(type(pretrained_embeddings))
model.embedding.weight.data.copy_(pretrained_embeddings);

<class 'torch.Tensor'>


In [61]:
criterion = nn.BCEWithLogitsLoss()  # 손실함수 정의 
model = model.to(device)  #모델을 GPU 로 이동
criterion = criterion.to(device)

In [62]:
N_EPOCHS = 5

best_valid_loss = float('inf') # Represents infinity

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss: # For early stopping
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'custom_model.pt')
    else:
      break
    
    print('Epoch: {:02}'.format(epoch+1))
    print('\tTrain Loss: {:.3f} | Train Acc: {:.2f}%'.format(train_loss, train_acc*100))
    print('\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%'.format(valid_loss, valid_acc*100))

Epoch: 01
	Train Loss: 0.514 | Train Acc: 72.00%
	 Val. Loss: 0.326 |  Val. Acc: 86.47%


In [63]:
model.load_state_dict(torch.load('custom_model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print('Test Loss: {:.3f} | Test Acc: {:.2f}%'.format(test_loss, test_acc*100))

Test Loss: 0.324 | Test Acc: 86.41%
