In [1]:
import torch
import torchtext
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import time

In [2]:
start = time.time()
TEXT = torchtext.data.Field(lower=True, batch_first=False, fix_length=200)
LABEL = torchtext.data.Field(sequential=False)

In [3]:
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL)


downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:41<00:00, 2.05MB/s]


In [4]:
print(vars(train_data.examples[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

In [5]:
import string

for example in train_data.examples:
    text = [x.lower() for x in vars(example)["text"]]
    text = [x.replace("<br", "") for x in text]
    text = ["".join(c for c in s if c not in string.punctuation) for s in text]
    text = [s for s in text if s]
    vars(example)["text"] = text

In [6]:
print(vars(train_data.examples[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'highs', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', 'im', 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', 'to',

In [7]:
import random 
train_data, valid_data = train_data.split(random_state=random.seed(0), split_ratio=0.8)

In [8]:
print("Number of training example: ", len(train_data))
print("Number of validating example: ", len(valid_data))
print("Number of testing examples: ",len(test_data))

Number of training example:  20000
Number of validating example:  5000
Number of testing examples:  25000


In [10]:
TEXT.build_vocab(train_data, max_size=10000, min_freq=10, vectors=None)
LABEL.build_vocab(train_data)

print("TEXT tokens Vocabulary size: ", len(TEXT.vocab))
print("Label tokens size: ", len(LABEL.vocab))
print(LABEL.vocab.stoi)

TEXT tokens Vocabulary size:  10002
Label tokens size:  3
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x0000025D357F7090>>, {'<unk>': 0, 'pos': 1, 'neg': 2})


In [14]:
BATCH_SIZE = [64 for _ in range(100)]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embeding_dim = 100
hidden_size = 300

train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_sizes=BATCH_SIZE, device=device
)

In [15]:
# 10번 셀
class RNNCell_Encoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super().__init__()
        self.rnn = nn.RNNCell(input_dim, hidden_size)

    def forward(self, inputs):
        bz = inputs.shape[1]
        ht = torch.zeros(bz, hidden_size, device=device)
        for word in inputs:
            ht = self.rnn(word, ht)
        return ht

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.em = nn.Embedding(len(TEXT.vocab.stoi), embeding_dim)
        self.rnn = RNNCell_Encoder(embeding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)
    
    def forward(self, x):
        x = self.em(x)
        x = self.rnn(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [16]:
model = Net()
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [17]:
# 12번 셀
def training(epoch, model, trainloader, validloader):
    correct = 0
    total = 0
    running_loss = 0
    
    model.train()
    for b in trainloader:
        x, y = b.text, b.label
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            predicted = torch.argmax(y_pred, dim=1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()
    
    epoch_loss = running_loss / len(trainloader)
    epoch_acc = correct / total
    
    valid_correct = 0
    valid_total = 0
    valid_running_loss = 0
    
    model.eval()
    with torch.no_grad():
        for b in validloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            predicted = torch.argmax(y_pred, dim=1)
            valid_correct += (predicted == y).sum().item()
            valid_total += y.size(0)
            valid_running_loss += loss.item()
    
    epoch_valid_loss = valid_running_loss / len(validloader)
    epoch_valid_acc = valid_correct / valid_total
    
    print(
        f'epoch: {epoch}',
        f'train loss: {epoch_loss:.4f}',
        f'train acc: {epoch_acc:.4f}',
        f'valid loss: {epoch_valid_loss:.4f}',
        f'valid acc: {epoch_valid_acc:.4f}'
        )
    return epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc

In [18]:
epochs = 5
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

start = time.time()
for epoch in range(epochs):
    loss, acc, v_loss, v_acc = training(epoch, model, train_iter, valid_iter)
    train_loss.append(loss)
    train_acc.append(acc)
    valid_loss.append(v_loss)
    valid_acc.append(v_acc)

end = time.time()
print(end-start)


epoch: 0 train loss: 0.7093 train acc: 0.4948 valid loss: 0.6993 valid acc: 0.5054
epoch: 1 train loss: 0.6966 train acc: 0.5063 valid loss: 0.6981 valid acc: 0.4922
epoch: 2 train loss: 0.6910 train acc: 0.5291 valid loss: 0.7079 valid acc: 0.4928
epoch: 3 train loss: 0.6947 train acc: 0.5113 valid loss: 0.6994 valid acc: 0.4946
epoch: 4 train loss: 0.6913 train acc: 0.5170 valid loss: 0.6983 valid acc: 0.5032
359.05937576293945


In [19]:
# 14번 셀
def evaluate(epoch, model, testloader):
    test_correct = 0
    test_total = 0
    test_running_loss = 0

    model.eval()
    with torch.no_grad():
        for b in testloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            predicted = torch.argmax(y_pred, dim=1)
            test_correct += (predicted == y).sum().item()
            test_total += y.size(0)
            test_running_loss += loss.item()

    epoch_test_loss = test_running_loss / len(testloader)
    epoch_test_acc = test_correct / test_total

    print(
        f"epoch: {epoch}",
        f"test loss: {epoch_test_loss:.4f}",
        f"test acc: {epoch_test_acc:.4f}",
    )
    return epoch_test_loss, epoch_test_acc

In [20]:
# 15번 셀
start = time.time()
epochs = 5
test_loss = []
test_acc = []

for epoch in range(epoch):
    loss, acc = evaluate(epoch, model, test_iter)
    test_loss.append(loss)
    test_acc.append(acc)

end = time.time()
print(end - start)

epoch: 0 test loss: 0.6985 test acc: 0.5118
epoch: 1 test loss: 0.6985 test acc: 0.5118
epoch: 2 test loss: 0.6985 test acc: 0.5118
epoch: 3 test loss: 0.6985 test acc: 0.5118
92.06058669090271
