In [1]:
!pip install torchdata
!pip install portalocker>=2.0.0
!pip install torch==2.3.1

Collecting torchdata
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchdata)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x8

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import torch.utils.data as data
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import PennTreebank



In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [4]:
train_iter, valid_iter, test_iter = PennTreebank()

def yield_tokens(text_iter):
    tokenizer = get_tokenizer('basic_english')
    for line in text_iter:
        tokens = tokenizer(line)
        yield tokens

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [5]:
class LMDataset(Dataset):
    def __init__(self, data_iter, vocab, sequence_length):
        super(LMDataset, self).__init__()

        self.vocab = vocab
        self.sequence_length = sequence_length

        data = self.data_process(data_iter)
        data = self.split_by_sequece_len(data)
        self.data, self.targets = self.label_data(data).values()

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

    def __len__(self):
        return len(self.data)

    def data_process(self, text_iter):
        tokenizer = get_tokenizer('basic_english')
        data = [torch.tensor([self.vocab[token] for token in tokenizer(line)], dtype=torch.long) for line in text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    def split_by_sequece_len(self, data):
        nsequence = data.size(0) // self.sequence_length
        data = data.narrow(0, 0, nsequence * self.sequence_length)
        data = data.reshape(-1, self.sequence_length)
        return data

    def label_data(self, data):
        return {'data': data[:, :-1], 'targets': data[:, 1:]}

In [6]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(RNNLM, self).__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        inputs = F.one_hot(inputs, len(vocab)).type(torch.float32)
        return self.fc(self.rnn(inputs)[0])

In [7]:
SEQ_LEN = 32
BATCH_SIZE = 1024
VOCAB_SIZE = len(vocab)
HIDDEN_SIZE = 256

In [8]:
train_set = LMDataset(train_iter, vocab, SEQ_LEN)
test_set = LMDataset(test_iter, vocab, SEQ_LEN)

In [9]:
train_loader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True
)
test_loader = DataLoader(
    test_set,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [None]:
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

torch.Size([1024, 31]) torch.Size([1024, 31])


In [10]:
def train_run_epoch(model, device, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device).t(), target.to(device).t()

        output = model(data)
        target = F.one_hot(target, len(vocab)).type(torch.float32)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    return avg_loss

In [22]:
def test_run_epoch(model, device, test_loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device).t(), target.to(device).t()
            data = data.transpose(0, 1)

            output = model(data)
            target_onehot = F.one_hot(target, len(vocab)).type(torch.float32).transpose(0, 1)
            loss = criterion(output, target_onehot)

            pred = output.argmax(dim=2)

            correct += pred.eq(target.t()).sum().item()

            total_loss += loss.item()

        accuracy = correct / (len(test_set)*SEQ_LEN)
        avg_loss = total_loss / len(test_loader)

    return accuracy, avg_loss

In [18]:
def train(model, device, train_loder, test_loader, optimizer, criterion, num_epochs):
    train_avg_losses = []
    test_avg_losses = []
    accuracies = []

    for epoch in range(1, num_epochs+1):
        print(f"Epoch [{epoch}/{num_epochs}]")

        train_avg_loss = train_run_epoch(model, device, train_loader, optimizer, criterion)
        train_avg_losses.append(train_avg_loss)
        print(f"Average Train Loss = {train_avg_loss:.20f}")

        accuracy, test_avg_loss = test_run_epoch(model, device, test_loader, criterion)
        test_avg_losses.append(test_avg_loss)
        accuracies.append(accuracy)
        print(f"Average Test Loss = {test_avg_loss:.20f}")
        print(f"Test Accuracy = {accuracy:.2f}")
        print()

    return train_avg_losses, test_avg_losses, accuracies

In [14]:
num_epochs = 100
learning_rate = 1

rnnlm = RNNLM(VOCAB_SIZE, HIDDEN_SIZE).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(rnnlm.parameters(), lr=learning_rate)

In [23]:
train_avg_losses, test_avg_losses, accuracies = train(rnnlm, DEVICE, train_loader, test_loader, optimizer, criterion, num_epochs)

Epoch [1/100]
Average Train Loss = 0.69425559095267597165
Average Test Loss = 0.01071708028515180010
Test Accuracy = 0.00

Epoch [2/100]
Average Train Loss = 0.69415141385177081546
Average Test Loss = 0.01071458092580238920
Test Accuracy = 0.00

Epoch [3/100]
Average Train Loss = 0.69404493220921215091
Average Test Loss = 0.01071204276134570497
Test Accuracy = 0.00

Epoch [4/100]
Average Train Loss = 0.69393574138139857421
Average Test Loss = 0.01070945834120114702
Test Accuracy = 0.01

Epoch [5/100]
Average Train Loss = 0.69382270168641513308
Average Test Loss = 0.01070681835214296919
Test Accuracy = 0.01

Epoch [6/100]
Average Train Loss = 0.69370612759014660398
Average Test Loss = 0.01070411596447229385
Test Accuracy = 0.01

Epoch [7/100]
Average Train Loss = 0.69358439229685686644
Average Test Loss = 0.01070134062319993973
Test Accuracy = 0.01

Epoch [8/100]
Average Train Loss = 0.69345749143896429079
Average Test Loss = 0.01069848549862702745
Test Accuracy = 0.02

Epoch [9/100]
Av

In [None]:
def interpret(data):
    vocab_list = vocab.get_itos()
    return [vocab_list[x] for x in data]