In [1]:
import sys
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab, vocab, GloVe
import random

### Pull in my modules

In [2]:
# import my modules 
from path import get_model_folder_path
sys.path.append(get_model_folder_path())

from model import TransformerModel
from datasets import GloVeDataset


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
tokenizer = get_tokenizer('basic_english')

### Create Dataset

Create the dataset from the torch Glove embeddings

In [5]:
glove_vectors = GloVe(name='6B', dim=300)
embeddings = nn.Embedding.from_pretrained(glove_vectors.vectors)

In [6]:
print("Vocab Size:",glove_vectors.vectors.shape[0])
print("Embedded Dimensions:", glove_vectors.vectors.shape[1])

Vocab Size: 400000
Embedded Dimensions: 300


### Define vocab from vectors

In [7]:
def build_vocab(in_data, tokenizer):
  counter = Counter()
  for string in in_data:
    counter.update(tokenizer(string))

  return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

def data_process(in_data, tokenizer, vocab: Vocab):
  raw_iter = iter(in_data)
  data = []
  for raw in raw_iter:
    tensor = torch.tensor([vocab[token] for token in tokenizer(raw)], dtype=torch.long)
    data.append(tensor)
    
  return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [8]:
vocab = build_vocab(
  in_data=glove_vectors.itos,
  tokenizer=tokenizer)

processed_data = 



NameError: name 'glo' is not defined

### Instantiate Model
Using the glove vocab, make the model

In [None]:
model = TransformerModel(
  vocab_size=glove_vectors.vectors.shape[0], 
  embed_dim=glove_vectors.vectors.shape[1], 
  num_heads=6, 
  hidden_dim=512, 
  num_layers=6,
  dropout=0.2, 
  pretrained_embedding=embeddings
).to(device)

In [None]:

def generate_text(model, tokenizer, vocab, starting_text, max_len=50):
    # Tokenize the starting text
    tokens = tokenizer(starting_text.lower())
    print("tokens:", tokens)
    # Convert tokens to tensor and add batch dimension
    tokens_tensor = torch.tensor([vocab[token] for token in tokens]).unsqueeze(0)
    print("tokens tensor:", tokens_tensor)
    # Set model to evaluation mode
    model.eval()
    with torch.no_grad():
        for i in range(max_len):
            # Generate output sequence
            outputs = model(tokens_tensor)
            # Get last predicted token (ignoring the batch dimension)
            predicted_token = outputs.argmax(dim=-1)[:,-1].item()
            # Check for end-of-sequence token
            if predicted_token == vocab['<eos>']:
                break
            # Append predicted token to input sequence
            tokens.append(tokenizer.vocab.itos[predicted_token])
            tokens_tensor = torch.tensor([vocab[token] for token in tokens]).unsqueeze(0)

    # Convert output tokens back to text
    generated_text = tokenizer.decode(tokens)
    # Return generated text
    return generated_text


In [None]:
starting_text = "The quick brown fox"
generated_text = generate_text(model, tokenizer, vocab, starting_text)
print("Generated Text:", generated_text)

tokens: ['the', 'quick', 'brown', 'fox']


RuntimeError: Token the not found and default index is not set