In [2]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm

In [3]:
class Dictionary(object):
    def __init__(self):
        # Maps words to indices
        self.word2idx = {}
        # Maps indices back to words
        self.idx2word = {}
        # Keeps track of the next available index
        self.idx = 0

    def add_word(self, word):
        # Add the word if it's not already in the dictionary
        if word not in self.word2idx:
            self.word2idx[word] = self.idx     # Assign current index to word
            self.idx2word[self.idx] = word     # Store word at that index
            self.idx += 1                      # Increment index for next word
            
    def __len__(self):
        # Return total number of unique words in the dictionary
        return len(self.word2idx)


In [4]:
class TextProcess(object):
    
    def __init__(self):
        # Create a Dictionary instance to map each unique word to an index
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # ---- First Pass: Build the vocabulary ----
        with open(path, 'r') as f:
            tokens = 0  # Keeps track of total number of words (tokens) in the file
            for line in f:
                # Split each line into words and add a special <eos> token for end of sentence
                words = line.split() + ['<eos>']
                tokens += len(words)  # Count words in this line
                # Add each word to the dictionary (assign index if not already added)
                for word in words: 
                    self.dictionary.add_word(word)

        # ---- Create a tensor to store all tokens as indices ----
        # We now know the total number of tokens, so create a LongTensor of that size
        rep_tensor = torch.LongTensor(tokens)
        index = 0  # Pointer to current position in rep_tensor

        # ---- Second Pass: Convert words to their numeric indices ----
        with open(path, 'r') as f:
            for line in f:
                # Again split words and add end-of-sentence token
                words = line.split() + ['<eos>']
                for word in words:
                    # Map each word to its index using the dictionary and store it in the tensor
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1  # Move to next position

        # ---- Prepare batches ----
        # Total number of complete batches we can make (ignore leftover tokens)
        num_batches = rep_tensor.shape[0] // batch_size
        # Trim off any extra tokens that don't fit into a complete batch
        rep_tensor = rep_tensor[:num_batches * batch_size]
        # Reshape the tensor into (batch_size, num_batches_per_batch)
        # Each column represents a time step, and each row a batch sample
        rep_tensor = rep_tensor.view(batch_size, -1)

        # Return the processed data tensor
        return rep_tensor


In [5]:
embed_size = 300      # Each word will be converted into a 300-dimensional vector (richer word embedding)
hidden_size = 1024    # Number of units (neurons) in each LSTM layer, controls the capacity of the model
num_layers = 2        # Number of stacked LSTM layers (more layers to learn complex patterns)
num_epochs = 40       # How many times the model will see the entire dataset during training (longer training)
batch_size = 32       # Number of sequences (samples) processed together in one training step (larger batch for stability)
timesteps = 50        # Number of words (time steps) in each sequence input to the LSTM (longer context)
learning_rate = 0.001 # Speed at which the model will update its weights (slower learning rate for smoother training)

In [6]:
corpus = TextProcess()

In [7]:
rep_tensor = corpus.get_data('alice.txt', batch_size)

In [8]:
#rep_tensor is the tensor that contains the index of all the words. Each row contains 1659 words by default 
print(rep_tensor.shape)

torch.Size([32, 927])


In [9]:
# Get the total number of unique words in the text file
vocab_size = len(corpus.dictionary)  

# Print the vocabulary size (how many distinct words are in alice.txt)
print(vocab_size)

5290


In [10]:
# Calculate how many complete batches of sequences we can get
# rep_tensor.shape[1] -> total number of tokens per batch row
# timesteps -> number of words (time steps) in each sequence for the LSTM
num_batches = rep_tensor.shape[1] // timesteps

# Print the total number of batches we can form
print(num_batches)

18


In [11]:
class TextGenerator(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        # Embedding layer: converts word indices into dense vectors of size embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        # LSTM layer: processes the sequence of embeddings
        # batch_first=True -> input/output has shape (batch_size, timesteps, features)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        # Linear layer: maps the LSTM's hidden output to vocabulary size (for word prediction)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # x: input word indices of shape (batch_size, timesteps)
        # h: previous hidden and cell states (h0, c0)

        # Convert word indices to embeddings
        x = self.embed(x)

        # Pass embeddings through LSTM; get output sequence and new states
        out, (h, c) = self.lstm(x, h)

        # Reshape output so that each time step in the batch is treated as a separate sample
        # Shape: (batch_size * timesteps, hidden_size)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))

        # Map LSTM outputs to vocabulary scores (logits for each word in the vocab)
        out = self.linear(out)

        # Return predicted scores for all words + new hidden and cell states
        return out, (h, c)

In [12]:
# Create an instance of the TextGenerator model
# Pass vocabulary size, embedding dimension, hidden layer size, and number of LSTM layers
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [13]:
# Define the loss function as Cross Entropy Loss
# This is commonly used for multi-class classification tasks like predicting the next word
loss_fn = nn.CrossEntropyLoss()

# Define the optimizer as Adam, which updates model parameters during training
# Uses the specified learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
def detach(states):
    """
If we have a tensor z,'z.detach()' returns a tensor that shares the same storage
as 'z', but with the computation history forgotten. It doesn't know anything
about how it was computed. In other words, we have broken the tensor z away from its past history
Here, we want to perform truncated Backpropagation
TBPTT splits the 1,000-long sequence into 50 sequences (say) each of length 20 and treats each sequence of length 20 as 
a separate training case. This is a sensible approach that can work well in practice, but it is blind to temporal 
dependencies that span more than 20 timesteps.
    """
    return [state.detach() for state in states] 

In [15]:
for epoch in range(num_epochs):
    # Initialize hidden and cell states for the LSTM with zeros at the start of each epoch
    states = (torch.zeros(num_layers, batch_size, hidden_size),
              torch.zeros(num_layers, batch_size, hidden_size))

    # Loop over the data in chunks of 'timesteps' length
    for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
        # Prepare input batch: sequences of length 'timesteps'
        inputs = rep_tensor[:, i:i+timesteps]
        # Prepare target batch: next words following inputs (shifted by 1)
        targets = rep_tensor[:, (i+1):(i+1)+timesteps]
        
        # Forward pass: get model outputs and updated states
        outputs, _ = model(inputs, states)
        
        # Compute loss comparing predicted outputs with true targets
        # targets.reshape(-1) flattens targets to match outputs shape
        loss = loss_fn(outputs, targets.reshape(-1))

        # Zero out gradients from the previous step
        model.zero_grad()
        # Backpropagate the loss to compute gradients
        loss.backward()
        
        # Gradient clipping: limit gradient values between -0.5 and 0.5 to prevent exploding gradients
        clip_grad_norm(model.parameters(), 0.5)
        
        # Update model parameters using optimizer
        optimizer.step()
              
        step = (i + 1) // timesteps
        # Print loss every 100 steps for monitoring training progress
        if step % 100 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

  clip_grad_norm(model.parameters(), 0.5)


Epoch [1/40], Loss: 8.5764
Epoch [2/40], Loss: 6.6756
Epoch [3/40], Loss: 6.3445
Epoch [4/40], Loss: 6.1226
Epoch [5/40], Loss: 5.8413
Epoch [6/40], Loss: 5.5317
Epoch [7/40], Loss: 5.2067
Epoch [8/40], Loss: 4.8968
Epoch [9/40], Loss: 4.7172
Epoch [10/40], Loss: 4.4067
Epoch [11/40], Loss: 4.0655
Epoch [12/40], Loss: 3.7729
Epoch [13/40], Loss: 3.4590
Epoch [14/40], Loss: 3.2306
Epoch [15/40], Loss: 2.9017
Epoch [16/40], Loss: 2.5615
Epoch [17/40], Loss: 2.2984
Epoch [18/40], Loss: 2.0699
Epoch [19/40], Loss: 1.8987
Epoch [20/40], Loss: 1.6491
Epoch [21/40], Loss: 1.3611
Epoch [22/40], Loss: 1.1665
Epoch [23/40], Loss: 0.9983
Epoch [24/40], Loss: 0.7911
Epoch [25/40], Loss: 0.6174
Epoch [26/40], Loss: 0.5008
Epoch [27/40], Loss: 0.3859
Epoch [28/40], Loss: 0.2998
Epoch [29/40], Loss: 0.2144
Epoch [30/40], Loss: 0.1576
Epoch [31/40], Loss: 0.1210
Epoch [32/40], Loss: 0.1030
Epoch [33/40], Loss: 0.0756
Epoch [34/40], Loss: 0.0645
Epoch [35/40], Loss: 0.0568
Epoch [36/40], Loss: 0.0536
E

In [16]:
# Test the model and generate text without computing gradients
with torch.no_grad():
    # Open a file to save the generated text
    with open('results.txt', 'w') as f:
        # Initialize hidden and cell states with zeros (for batch size = 1)
        state = (torch.zeros(num_layers, 1, hidden_size),
                 torch.zeros(num_layers, 1, hidden_size))
        
        # Randomly pick one word index as the starting input, shape: (1,1)
        input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)

        # Generate 500 words one by one
        for i in range(500):
            # Forward pass: get model output and updated state for the current input
            output, _ = model(input, state)
            print(output.shape)  # (batch_size*timesteps, vocab_size), here batch=1, timesteps=1 → (1, vocab_size)

            # Convert logits to probabilities by applying exponential (softmax is usually better, but exp works here)
            prob = output.exp()

            # Sample a word index from the probability distribution
            word_id = torch.multinomial(prob, num_samples=1).item()
            print(word_id)  # Print sampled word id
            
            # Update the input tensor with the sampled word id for the next iteration
            input.fill_(word_id)

            # Convert the sampled word id back to the actual word
            word = corpus.dictionary.idx2word[word_id]
            # Replace <eos> token with newline for better readability
            word = '\n' if word == '<eos>' else word + ' '

            # Write the generated word to the file
            f.write(word)
            
            # Print progress every 100 words generated
            if (i + 1) % 100 == 0:
                print('Sampled [{}/{}] words and saved to {}'.format(i + 1, 500, 'results.txt'))

torch.Size([1, 5290])
3006
torch.Size([1, 5290])
5
torch.Size([1, 5290])
3582
torch.Size([1, 5290])
80
torch.Size([1, 5290])
1054
torch.Size([1, 5290])
512
torch.Size([1, 5290])
38
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
1927
torch.Size([1, 5290])
4025
torch.Size([1, 5290])
2156
torch.Size([1, 5290])
2939
torch.Size([1, 5290])
990
torch.Size([1, 5290])
3231
torch.Size([1, 5290])
144
torch.Size([1, 5290])
1386
torch.Size([1, 5290])
117
torch.Size([1, 5290])
3
torch.Size([1, 5290])
2647
torch.Size([1, 5290])
117
torch.Size([1, 5290])
3
torch.Size([1, 5290])
3244
torch.Size([1, 5290])
2326
torch.Size([1, 5290])
2138
torch.Size([1, 5290])
3769
torch.Size([1, 5290])
66
torch.Size([1, 5290])
44
torch.Size([1, 5290])
2032
torch.Size([1, 5290])
13
torch.Size([1, 5290])
1234
torch.Size([1, 5290])
5
torch.Size([1, 5290])
439
torch.Size([1, 5290])
569
torch.Size([1, 5290])
2162
torch.Size([1, 5290])
4266
torch.Size([1, 5290])
248
torch.Size([1, 5290])
2359
torch.Size