In [1]:
# Make the project virtualenv's packages importable from this kernel.
# NOTE(review): this is a machine-specific absolute path; running the notebook
# from a kernel installed in the venv would make this cell unnecessary.
import sys

VENV_SITE_PACKAGES = '/Users/tunadorable/local-repos/next-concept-predictor/v1/venv/lib/python3.11/site-packages'

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(VENV_SITE_PACKAGES)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# Define the model
class SimpleTransformerModel(nn.Module):
    """Encode a batch of token-id sequences into one vector per sequence.

    Pipeline: embedding lookup -> stack of ``nn.TransformerEncoderLayer`` ->
    mean over dimension 1 -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width; also passed to the encoder as ``d_model``.
        num_heads: Attention heads per encoder layer (``nhead``).
        num_layers: Number of stacked encoder layers.
        num_classes: Output width of the final linear layer.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True makes the encoder treat dimension 0 as the batch.
        encoder_block = nn.TransformerEncoderLayer(embed_dim, num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_block, num_layers=num_layers)
        self.output = nn.Linear(in_features=embed_dim, out_features=num_classes)

    def forward(self, x):
        """Return the projected, sequence-averaged encoding of ``x``.

        ``x`` is fed to the embedding layer, so it must hold integer token ids.
        """
        encoded = self.transformer(self.embedding(x))
        # Collapse dimension 1 (the sequence positions) by averaging.
        pooled = encoded.mean(dim=1)
        return self.output(pooled)

In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import io

# Load the dataset
def load_dataset(file_path):
    """Read the file at ``file_path`` as UTF-8 text and return its full contents."""
    with open(file_path, mode='r', encoding='utf8') as handle:
        contents = handle.read()
    return contents

# Tokenization and Vocabulary Building
def yield_tokens(data):
    """Lazily yield the tokenizer's output for each newline-separated line of ``data``.

    Uses the notebook-level ``tokenizer`` callable; every line produced by
    ``data.split('\\n')`` (including empty ones) is passed to it in order.
    """
    lines = data.split('\n')
    yield from (tokenizer(text_line) for text_line in lines)

# torchtext's 'basic_english' tokenizer; yield_tokens() looks this name up
# when it is iterated, so it must exist before the vocab is built.
tokenizer = get_tokenizer('basic_english')
data = load_dataset('tinyshakespeare.txt')

# Build the vocabulary from the per-line token stream, registering three
# special tokens alongside the corpus tokens.
special_tokens = ["<unk>", "<dlevel>", "<ulevel>"]
vocab = build_vocab_from_iterator(yield_tokens(data), specials=special_tokens)

# Use the index of "<unk>" as the vocab's default index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 12238


In [6]:
# Hyperparameters passed to SimpleTransformerModel
EMBED_DIM = 128            # embedding width
NUM_CLASSES = EMBED_DIM    # output is a single vector as wide as the embeddings
NUM_HEADS = 4              # attention heads per encoder layer
NUM_LAYERS = 2             # number of stacked encoder layers
VOCAB_SIZE = len(vocab)    # size of the vocabulary built from the corpus

In [7]:
# Instantiate the network from the notebook's hyperparameter constants
model = SimpleTransformerModel(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES,
)

# Adam over every parameter of the model
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Placeholder for the data loader
# This is where you will later modify how the data is loaded based on the model's output

# Placeholder for the training loop
# This is where you can experiment with the interaction between the data loader and the training loop

# Example training loop structure
# for epoch in range(num_epochs):
#     for batch in data_loader:
#         # Your training code here

In [8]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class ShakespeareDataset(Dataset):
    """Sliding-window next-token dataset over a tokenized text.

    Sample ``i`` is the pair ``(data[i:i+seq_len], data[i+1:i+seq_len+1])``:
    a window of ``seq_len`` token ids and the same window shifted one token
    to the right.

    Args:
        text: Raw text; tokenized with the notebook-level ``tokenizer`` and
            mapped to ids with the notebook-level ``vocab``.
        seq_len: Number of tokens in each input/target window.
    """

    def __init__(self, text, seq_len=90):
        # Relies on the module-level `tokenizer` and `vocab` objects.
        self.data = [vocab[token] for token in tokenizer(text)]
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a text shorter than seq_len has no complete window,
        # and a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Raising IndexError is what terminates plain iteration over the
        # dataset (`for sample in dataset`); without it the loop never ends
        # and out-of-range indices silently yield truncated tensors.
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of size {size}")
        stop = idx + self.seq_len
        return (
            torch.tensor(self.data[idx:stop], dtype=torch.long),
            torch.tensor(self.data[idx + 1:stop + 1], dtype=torch.long),
        )

# Build the windowed dataset and a shuffling loader that yields one window per batch
seq_len = 90  # tokens per input/target window
dataset = ShakespeareDataset(text=data, seq_len=seq_len)
train_loader = DataLoader(dataset, shuffle=True, batch_size=1)