In [1]:
%pip install torch matplotlib numpy tqdm

Collecting torch
  Using cached torch-2.4.1-cp312-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting matplotlib
  Downloading matplotlib-3.9.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting numpy
  Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch)
  Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)


In [2]:
import torch
from torch import nn
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [18]:
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from torch.utils.data import Dataset
from zipfile import ZipFile

class PTB(Dataset):
    def __init__(self, root_dir, which):
        """
        Arguments:
            root_dir (string): Directory containing the data files.
            which (string): "train", "test", or "val" dataset
        """
        d = {"train": 1, "test": 5, "val": 7}
        with ZipFile(root_dir, 'r') as zip_ref:
            file_name = zip_ref.namelist()[d[which]]
            with zip_ref.open(file_name) as file:
                content = file.read().decode('utf-8')
                words = content.split()
                self.index_to_word = {idx: word for idx, word in enumerate(set(words))}
                self.word_to_index = {word: idx for idx, word in enumerate(set(words))}
                self.data = np.array([self.word_to_index[word] for word in words], dtype=np.int32)
        self.sequence_length = 24

    def __len__(self):
        return len(self.data) - 1

    def __getitem__(self, idx):
        # deal with slices
        if isinstance(idx, slice):
          return [self[i] for i in range(*idx.indices(len(self)))]

        if torch.is_tensor(idx):
            idx = idx.item()

        idx = idx % (len(self.data) - self.sequence_length)
        sequence = self.data[idx:idx + self.sequence_length]
        target = self.data[idx + 1:idx + self.sequence_length + 1]
        return sequence, target


train_set = PTB(root_dir='./ptb_data.zip', which='train')
test_set = PTB(root_dir='./ptb_data.zip', which='test')
val_set = PTB(root_dir='./ptb_data.zip', which='val')

x = train_set[:1]
x

[(array([9983, 2140,  150, 3327, 3300, 3109, 1683, 2266, 8868, 7117, 4775,
         1982, 7780, 9959, 9064, 9411,  545, 4534, 8760, 5053, 7647, 4453,
         9199, 7319], dtype=int32),
  array([2140,  150, 3327, 3300, 3109, 1683, 2266, 8868, 7117, 4775, 1982,
         7780, 9959, 9064, 9411,  545, 4534, 8760, 5053, 7647, 4453, 9199,
         7319,   66], dtype=int32))]

In [5]:
from torch.utils.data import DataLoader

batch_size = 20

train_loader = DataLoader(train_set, batch_size=batch_size)#, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size)#, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size)#, shuffle=True)

In [6]:
for text, pred in train_loader:
  print(text.shape, pred.shape)
  break

torch.Size([20, 24]) torch.Size([20, 24])


In [8]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        out, _ = self.lstm(embeds)
        out = self.linear(out)
        return out

net = LSTM(64, 200, len(train_set.index_to_word))
net.to(device)
print(net)

LSTM(
  (embedding): Embedding(9999, 64)
  (lstm): LSTM(64, 200, num_layers=2, batch_first=True)
  (linear): Linear(in_features=200, out_features=9999, bias=True)
)


In [94]:
# from torch.nn import functional as F

# class LSTM(nn.Module):
#     def __init__(self, hidden_dim, vocab_size):
#         super().__init__()
#         self.lstm = nn.LSTM(vocab_size, hidden_dim, num_layers=2)
#         self.linear = nn.Linear(hidden_dim, vocab_size)

#     def forward(self, x):
#         out, _ = self.lstm(F.one_hot(x.long(), num_classes=len(train_set.index_to_word)).float())
#         out = self.linear(out.view(len(out), -1))
#         return out

# net = LSTM(200, len(train_set.index_to_word))
# net.to("cuda")

LSTM(
  (lstm): LSTM(9999, 200, num_layers=2)
  (linear): Linear(in_features=200, out_features=9999, bias=True)
)

In [9]:
for text, pred in train_loader:
  text = text.to(device)
  pred = pred.to(device)
  print(text.shape, pred.shape)
  outputs = net(text)
  print(outputs.shape)
  break

torch.Size([20, 24]) torch.Size([20, 24])
torch.Size([20, 24, 9999])


In [16]:
from tqdm.notebook import tqdm

def train(model, train_loader, val_loader, loss_fn, optimizer, epochs=10):
  try:
    # for epoch in tqdm(range(epochs)):
    for epoch in range(epochs):
      model.train()
      print(f"Epoch: {epoch}")
      total_loss = 0
      count = 0
      for text, pred in train_loader:
        text = text.to(device)
        pred = pred.to(device)
        optimizer.zero_grad()
        outputs = net(text)
        # loss = loss_fn(outputs, pred.long())
        loss = loss_fn(outputs.view(-1, outputs.size(-1)), pred.view(-1).long())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        count += 1
      print(f"Train Loss: {total_loss / count}")
      perplexity = torch.exp(torch.tensor(total_loss / len(train_loader)))
      print(f"Perplexity: {perplexity}")
      model.eval()
      with torch.no_grad():
        total_loss = 0
        count = 0
        for text, pred in val_loader:
          text = text.to(device)
          pred = pred.to(device)
          outputs = net(text)
          # loss = loss_fn(outputs, pred.long())
          loss = loss_fn(outputs.view(-1, outputs.size(-1)), pred.view(-1).long())
          total_loss += loss.item()
          count += 1
        print(f"Val Loss: {total_loss / count}")
        perplexity = torch.exp(torch.tensor(total_loss / batch_size))
        print(f"Perplexity: {perplexity}")
  except KeyboardInterrupt:
    print("Exiting...")

def test(model, test_loader, loss_fn):
  model.eval()
  total_loss = 0
  with torch.no_grad():
    for text, pred in test_loader:
      text = text.to(device)
      pred = pred.to(device)
      outputs = net(text)
      total_loss += loss_fn(outputs.view(-1, outputs.size(-1)), pred.view(-1).long())
    perplexity = torch.exp(torch.tensor(total_loss / batch_size))
    print(f"Perplexity: {perplexity}")

In [13]:
len(val_loader)

3520

In [17]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=1)
net_acc, net_test_acc = train(net, train_loader, val_loader, loss_fn, optimizer, epochs=10)

Epoch: 0
Exiting...


TypeError: cannot unpack non-iterable NoneType object

In [81]:
test(net, test_loader)

Perplexity: 255238.828125
