In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
#import tiktoken  # Use TikToken for fast tokenization

In [None]:
# prompt: generate a 1-million-character string over the alphabet ABCDE. A and C can occur anywhere, but B can occur only after AA, D only after CCCC, and E only after AC.

import random

def generate_string(length, rng=None):
  """
  Generates a random string over the alphabet ABCDE of the specified length.

  Constraints enforced on every generated character:
  - A and C can occur anywhere.
  - B occurs only immediately after AA.
  - D occurs only immediately after CCCC.
  - E occurs only immediately after AC.

  Args:
      length: Number of characters to generate; values <= 0 yield "".
      rng: Optional object exposing ``choice(seq)`` (for example
          ``random.Random(seed)``) for reproducible output. Defaults to the
          global ``random`` module.

  Returns:
      The generated string.
  """
  chooser = random if rng is None else rng

  # Collect characters in a list and join once at the end: repeated `str +=`
  # can degrade to quadratic time on long outputs.
  chars = []
  for _ in range(length):
    valid_chars = ['A', 'C']
    # Slices shorter than the pattern simply compare unequal, so no explicit
    # length checks are needed at the start of the string.
    if chars[-2:] == ['A', 'A']:
      valid_chars.append("B")
    if chars[-4:] == ['C', 'C', 'C', 'C']:
      valid_chars.append("D")
    if chars[-2:] == ['A', 'C']:
      valid_chars.append("E")

    chars.append(chooser.choice(valid_chars))

  return "".join(chars)

# Seed Python's RNG so the synthetic corpus is identical on every
# Restart & Run All (generate_string draws from the global `random` module).
random.seed(0)

# Generate a 1 million character string
text_data = generate_string(1000000)
print(len(text_data))


1000000


In [None]:
import collections

# Tally how often each symbol appears in the generated corpus.
distribution = collections.Counter()
distribution.update(text_data)

# Leave the counter as the cell's last expression so the notebook displays it.
distribution


Counter({'A': 432500, 'B': 61860, 'C': 432438, 'E': 61240, 'D': 11962})

In [None]:
class RNNLM(nn.Module):
    """GRU language model: embedding -> GRU -> linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_dim: Size of each token embedding.
        hidden_size: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability between stacked GRU layers. It is only
            forwarded to ``nn.GRU`` when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.GRU applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and just emits a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.GRU(
            embed_dim,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model over a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, batch-first (``batch_first=True``).
            hidden: Previous GRU hidden state, or ``None`` to let the GRU
                start from its default initial state.

        Returns:
            Tuple ``(logits, hidden)``: per-position vocabulary logits and the
            GRU hidden state after the last position.
        """
        x = self.embedding(x)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden


In [None]:

# Character -> integer id lookup used to encode the corpus.
# NOTE(review): ids follow set iteration order, which is not sorted and may
# differ between interpreter sessions — confirm before reusing saved checkpoints.
tokenizer = {symbol: index for index, symbol in enumerate(set(text_data))}
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a sequence of symbols.

    Item ``i`` is a pair ``(x, y)``: ``x`` holds the token ids at positions
    ``i .. i + seq_len - 1`` and ``y`` is the same window shifted one position
    to the right (the next-token targets).

    Args:
        tokens: Iterable of symbols to encode (for example a string).
        seq_len: Number of tokens in each window.
        vocab: Optional symbol -> id mapping. Defaults to the module-level
            ``tokenizer``.
    """

    def __init__(self, tokens, seq_len, vocab=None):
        mapping = tokenizer if vocab is None else vocab
        self.tokens = [mapping[x] for x in tokens]
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a negative value would make len() raise ValueError
        # when the input is shorter than one window plus its target.
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        size = len(self)
        if idx < 0:
            idx += size
        # Without this check, out-of-range indices silently return truncated
        # windows because list slicing never raises.
        if not 0 <= idx < size:
            raise IndexError("TextDataset index out of range")
        x = torch.tensor(self.tokens[idx:idx+self.seq_len], dtype=torch.long)
        y = torch.tensor(self.tokens[idx+1:idx+self.seq_len+1], dtype=torch.long)
        return x, y

In [None]:
# Hyperparameters

# Batching
BATCH_SIZE = 64
SEQ_LEN = 128

# Model architecture
EMBED_DIM = 16
HIDDEN_SIZE = 32
NUM_LAYERS = 1
DROPOUT = 0.3
VOCAB_SIZE = len(set(text_data))  # one id per distinct character in the corpus

# Optimisation
LR = 3e-4
EPOCHS = 3

# Prefer the GPU whenever PyTorch reports one as available.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [None]:
# Wrap the corpus in windows of SEQ_LEN tokens and serve them as shuffled
# mini-batches using four loader worker processes.
dataset = TextDataset(tokens=text_data, seq_len=SEQ_LEN)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)



In [None]:
# Build the network and move it to the target device, then create the
# optimiser over its parameters and the token-level loss.
model = RNNLM(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model = model.to(DEVICE)
optimizer = optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()




In [None]:
# Training loop
def train():
    """Train the global ``model`` for ``EPOCHS`` passes over ``dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``dataloader``, ``EPOCHS`` and ``DEVICE``. Prints the loss every 100
    batches and writes ``rnnlm_epoch{N}.pth`` (the model state dict) after
    each epoch.
    """
    model.train()
    for epoch in range(EPOCHS):
        for batch_idx, (x, y) in enumerate(dataloader):
            x, y = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            # Start every batch from a fresh hidden state. The DataLoader
            # shuffles windows, so consecutive batches are unrelated text and
            # a carried-over state would be meaningless context; it would also
            # raise on a final batch smaller than BATCH_SIZE.
            prediction, _ = model(x, None)
            # Flatten (batch, seq) so CrossEntropyLoss sees one row per token.
            loss = criterion(prediction.reshape(-1, prediction.size(-1)), y.reshape(-1))
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f'Epoch [{epoch+1}/{EPOCHS}], Step [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}')

        # Save model checkpoint
        torch.save(model.state_dict(), f'rnnlm_epoch{epoch+1}.pth')


In [None]:
# Run the full training schedule; a checkpoint file is written after every epoch.
train()

Epoch [1/3], Step [0/15623], Loss: 1.6406
Epoch [1/3], Step [100/15623], Loss: 1.1744
Epoch [1/3], Step [200/15623], Loss: 1.0703
Epoch [1/3], Step [300/15623], Loss: 1.0003
Epoch [1/3], Step [400/15623], Loss: 0.9514
Epoch [1/3], Step [500/15623], Loss: 0.9310
Epoch [1/3], Step [600/15623], Loss: 0.9149
Epoch [1/3], Step [700/15623], Loss: 0.8986
Epoch [1/3], Step [800/15623], Loss: 0.8908
Epoch [1/3], Step [900/15623], Loss: 0.8829
Epoch [1/3], Step [1000/15623], Loss: 0.8792
Epoch [1/3], Step [1100/15623], Loss: 0.8750
Epoch [1/3], Step [1200/15623], Loss: 0.8756
Epoch [1/3], Step [1300/15623], Loss: 0.8690
Epoch [1/3], Step [1400/15623], Loss: 0.8669
Epoch [1/3], Step [1500/15623], Loss: 0.8716
Epoch [1/3], Step [1600/15623], Loss: 0.8700
Epoch [1/3], Step [1700/15623], Loss: 0.8714
Epoch [1/3], Step [1800/15623], Loss: 0.8699
Epoch [1/3], Step [1900/15623], Loss: 0.8695
Epoch [1/3], Step [2000/15623], Loss: 0.8651
Epoch [1/3], Step [2100/15623], Loss: 0.8636
Epoch [1/3], Step [220

In [None]:
# prompt: Load model from rnnlm_epoch1.pth

# Load the model's state dictionary.
# map_location remaps the saved tensors onto the current DEVICE, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load('rnnlm_epoch1.pth', map_location=DEVICE))

# Set the model to evaluation mode
model.eval()

print("Model loaded successfully from rnnlm_epoch1.pth")



In [None]:
# Rebuild the character -> id lookup. The dataset was encoded with this same
# construction, so the ids here must agree with the ones used for training.
tokenizer = dict((ch, idx) for idx, ch in enumerate(set(text_data)))

In [None]:
# prompt: Generate 10 next tokens from the context string 'AAAA'

def generate_next_tokens(model, tokenizer, context_string, num_tokens=10, verbose=True):
  """
  Samples a continuation of ``context_string`` from the trained RNNLM model.

  The context is fed through the model exactly once; the output at its last
  position gives the distribution of the first generated token. Each sampled
  token is then fed back in to obtain the next distribution.

  Args:
      model: The trained RNNLM model, called as ``model(token_ids, hidden)``
          and returning ``(logits, hidden)``. The caller is expected to have
          put it in evaluation mode.
      tokenizer: Mapping from character to integer token id.
      context_string: The non-empty context string (e.g., "AAAA").
      num_tokens: The number of tokens to generate.
      verbose: When True (the default), print the probability of every token
          and the running string at each step.

  Returns:
      A string containing the generated tokens (without the context).

  Raises:
      ValueError: If ``context_string`` is empty.
      KeyError: If ``context_string`` contains a character missing from
          ``tokenizer``.
  """
  if not context_string:
    raise ValueError("context_string must contain at least one character")

  itos = {i: x for x, i in tokenizer.items()}

  # Run on whatever device the model lives on instead of relying on a global.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  # Convert the context string to a (1, len(context)) batch of token ids
  context_tokens = [tokenizer[char] for char in context_string]
  context_tensor = torch.tensor([context_tokens], dtype=torch.long, device=device)

  generated_tokens = []

  with torch.no_grad():
    # Consume the whole context in a single pass. The output at the last
    # position already predicts the first new token, so the last context
    # token must not be fed to the model a second time.
    output, hidden = model(context_tensor, None)

    for step in range(num_tokens):
      probabilities = torch.softmax(output[0, -1, :], dim=0)
      if verbose:
        # print probability vector with label from tokenizer
        for i, prob in enumerate(probabilities):
          print(f"{itos[i]}: {prob.item():.4f}")

      next_token_idx = torch.multinomial(probabilities, num_samples=1).item()
      generated_tokens.append(itos[next_token_idx])

      if verbose:
        print(f"I got {itos[next_token_idx]}. Now string is {context_string + ''.join(generated_tokens)}")

      # Feed the sampled token back in only if another prediction is needed.
      if step + 1 < num_tokens:
        next_input = torch.tensor([[next_token_idx]], dtype=torch.long, device=device)
        output, hidden = model(next_input, hidden)

  return "".join(generated_tokens)

# Example usage: sample a continuation of "AAAA" and report both strings.
context_string = "AAAA"
next_tokens = generate_next_tokens(model, tokenizer, context_string=context_string)
print(f"Context: '{context_string}'", f"Next tokens: '{next_tokens}'", sep="\n")


B: 0.6624
C: 0.1674
D: 0.0001
A: 0.1700
E: 0.0001
I got B. Now string is AAAAB
B: 0.0001
C: 0.4974
D: 0.0000
A: 0.5020
E: 0.0004
I got A. Now string is AAAABA
B: 0.0048
C: 0.5080
D: 0.0000
A: 0.4871
E: 0.0001
I got A. Now string is AAAABAA
B: 0.5574
C: 0.2227
D: 0.0001
A: 0.2198
E: 0.0000
I got B. Now string is AAAABAAB
B: 0.0000
C: 0.5003
D: 0.0000
A: 0.4994
E: 0.0003
I got C. Now string is AAAABAABC
B: 0.0000
C: 0.4852
D: 0.0002
A: 0.5125
E: 0.0021
I got C. Now string is AAAABAABCC
B: 0.0000
C: 0.4797
D: 0.0011
A: 0.5173
E: 0.0019
I got C. Now string is AAAABAABCCC
B: 0.0000
C: 0.4827
D: 0.0040
A: 0.5121
E: 0.0012
I got C. Now string is AAAABAABCCCC
B: 0.0001
C: 0.3063
D: 0.3752
A: 0.3169
E: 0.0016
I got D. Now string is AAAABAABCCCCD
B: 0.0000
C: 0.5079
D: 0.0000
A: 0.4920
E: 0.0001
I got C. Now string is AAAABAABCCCCDC
Context: 'AAAA'
Next tokens: 'BAABCCCCDC'


In [None]:
# Display the character -> id mapping to see which id each symbol received.
tokenizer

In [None]:
# Preview the first 100 characters of the generated corpus.
text_data[:100]