In [4]:
# 모델
import torch.nn as nn
class GRU(nn.Module):
    def __init__(
        self, n_layers, hidden_dim, n_vocab, 
        embed_dim, n_classes, dropout_p=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)
        h_t = x[:,-1,:]
        self.dropout(h_t)
        logit = self.out(h_t)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

#### Legacy Code  
!pip install --upgrade torchtext==0.6.0

In [2]:
import torch

from torchtext import data
from torchtext import data, datasets

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(trainset, min_freq=5)
LABEL.build_vocab(trainset)

BATCH_SIZE = 64
trainset, valset = trainset.split(split_ratio=0.8)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (trainset, valset, testset), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False)

In [3]:
n_classes = 2
vocab_size = len(TEXT.vocab)
lr = 0.001

model = GRU(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [4]:
import torch.nn.functional as F

def train(model, optimizer, train_iter):
    model.train()
    for batch in train_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1) # <unk>:0 인 token 값 제거
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

# 모델 평가 함수를 만듭니다.
def evaluate(model, valid_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in valid_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1) # <unk>:0 인 token 값 제거
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(valid_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

best_model = None
best_val_loss = None
EPOCHS = 10

for epoch in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)
    print(f"[Epoch: {epoch}] valid loss: {val_loss:.2f} | "
          f"valid accuracy: {val_accuracy:.2f}")

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        best_model = model
        best_val_loss = val_loss

[Epoch: 1] valid loss: 0.69 | valid accuracy: 49.28
[Epoch: 2] valid loss: 0.70 | valid accuracy: 50.02
[Epoch: 3] valid loss: 0.75 | valid accuracy: 49.38
[Epoch: 4] valid loss: 0.77 | valid accuracy: 50.26
[Epoch: 5] valid loss: 0.39 | valid accuracy: 83.06
[Epoch: 6] valid loss: 0.32 | valid accuracy: 86.38
[Epoch: 7] valid loss: 0.33 | valid accuracy: 86.48
[Epoch: 8] valid loss: 0.36 | valid accuracy: 86.68
[Epoch: 9] valid loss: 0.39 | valid accuracy: 86.24
[Epoch: 10] valid loss: 0.41 | valid accuracy: 86.10


In [5]:
test_loss, test_acc = evaluate(model, test_iter)
print(f'test loss: {test_loss:.2f} | test accuracy: {test_acc:.2f}')

test loss: 0.36 | test accuracy: 86.49


#### New Code  
!pip install --upgrade torchtext==0.14.0

In [1]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split
from torch.utils.data import DataLoader

from torchtext.datasets import IMDB
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

tokenizer = get_tokenizer('basic_english')
train_iter, test_iter = IMDB(root='.data', split=('train', 'test'))

def train_valid_split(train_iterator, split_ratio=0.8, seed=42):
    train_count = int(split_ratio * len(train_iterator))
    valid_count = len(train_iterator) - train_count
    generator = torch.Generator().manual_seed(seed)
    train_set, valid_set = random_split(
        train_iterator, lengths=[train_count, valid_count],
        generator=generator)
    return train_set, valid_set

train_iter = to_map_style_dataset(train_iter)
test_iter = to_map_style_dataset(test_iter)

train_set, val_set = train_valid_split(train_iter)

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_iter),
    min_freq=5,
    specials=['<unk>'],)
vocab.set_default_index(vocab['<unk>'])

def collate_batch(batch):
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x)

    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_tensor = pad_sequence(text_list, padding_value=1, batch_first=True)
    return text_tensor, label_list

train_dataloader = DataLoader(
    train_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(
    val_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(
    test_iter, batch_size=64, shuffle=True, collate_fn=collate_batch)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
n_classes = 2
vocab_size = len(vocab)
lr = 0.001

model = GRU(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [6]:
import torch.nn.functional as F

def train(model, optimizer, train_iter):
    model.train()
    for x, y in train_iter:
        x, y = x.to(DEVICE), y.to(DEVICE)
        y.sub_(1) # <unk>:0 인 token 값 제거
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

def evaluate(model, valid_iter, total_valid_set_len):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for x, y in valid_iter:
        x, y = x.to(DEVICE), y.to(DEVICE)
        y.sub_(1) # <unk>:0 인 token 값 제거
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    
    size = total_valid_set_len
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

best_model = None
best_val_loss = None
EPOCHS = 10

for epoch in range(1, EPOCHS+1):
    train(model, optimizer, train_dataloader)
    with torch.no_grad():
        val_loss, val_accuracy = evaluate(model, val_dataloader, len(val_set))
    print(f"[Epoch: {epoch}] valid loss: {val_loss:.2f} | "
          f"valid accuracy: {val_accuracy:.2f}")
    if not best_val_loss or val_loss < best_val_loss:
        best_model = model
        best_val_loss = val_loss

[Epoch: 1] valid loss: 0.70 | valid accuracy: 49.62
[Epoch: 2] valid loss: 0.66 | valid accuracy: 62.36
[Epoch: 3] valid loss: 0.37 | valid accuracy: 84.74
[Epoch: 4] valid loss: 0.31 | valid accuracy: 87.56
[Epoch: 5] valid loss: 0.38 | valid accuracy: 87.36
[Epoch: 6] valid loss: 0.36 | valid accuracy: 88.36
[Epoch: 7] valid loss: 0.41 | valid accuracy: 87.66
[Epoch: 8] valid loss: 0.46 | valid accuracy: 87.86
[Epoch: 9] valid loss: 0.48 | valid accuracy: 88.82
[Epoch: 10] valid loss: 0.48 | valid accuracy: 88.70


In [7]:
test_loss, test_acc = evaluate(model, test_dataloader, len(test_iter))
print(f'test loss: {test_loss:.2f} | test accuracy: {test_acc:.2f}')

test loss: 0.53 | test accuracy: 87.60
