# [Word Embeddings](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#an-example-n-gram-language-modeling)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
vocab_list = ["I", "love", "eating", "and", "sleeping"]
word_2_idx = {
    x: i for i, x in enumerate(vocab_list)
}

print(word_2_idx)

{'I': 0, 'love': 1, 'eating': 2, 'and': 3, 'sleeping': 4}


In [3]:
# The indices of the words represent the values in the lookup table
word_to_idx = {"I": 0, "love": 1, "eating": 2, "and": 3, "sleeping": 4}
embeddings = nn.Embedding(num_embeddings=5, embedding_dim=7) # 5 words in vocab, embedding size is 7
word_index = torch.tensor([word_to_idx['love']])
print(word_index)

tensor([1])


In [4]:
love = embeddings(word_index)
print(love)
print(love.shape)

tensor([[ 1.8205,  0.9894, -1.1814,  0.0540,  0.1847, -0.6044,  0.0840]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([1, 7])


In [5]:
all_ind = torch.tensor([w for w in range(5)], dtype=torch.long)
all_words = embeddings(all_ind)
print(all_words)
print(all_words.shape)

tensor([[-0.5853, -0.6189, -0.2841, -0.3253, -0.2754,  0.3738, -0.3552],
        [ 1.8205,  0.9894, -1.1814,  0.0540,  0.1847, -0.6044,  0.0840],
        [-1.4661,  2.1970, -1.0759,  0.8244,  0.8581,  0.0240,  1.0185],
        [-0.6483, -1.3046,  0.5031, -1.8648, -1.2970, -0.4997, -0.3255],
        [ 0.4510, -0.0352, -0.1473,  0.4301,  0.0471, -0.1816, -0.8646]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([5, 7])


## Download Dataset

In [2]:
import os
import requests

url = "https://raw.githubusercontent.com/fawazsammani/The-Complete-Neural-Networks-Bootcamp-Theory-Applications/refs/heads/master/alice.txt"
response = requests.get(url)

os.makedirs("data", exist_ok=True)

# Save the content to a file
with open("data/alice.txt", "w", encoding="utf-8") as file:
    file.write(response.text)

print("File saved successfully.")

File saved successfully.


In [8]:
# Imports required libraries for handling tensors, implementing datasets, and data loaders
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Defines the file path to the text data
file_path = "data/alice.txt"


# Step 1: Reads the raw text data from a file
def read_text_file(path):
    # Opens file in read mode with UTF-8 encoding and loads entire content as a string
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    return text


# Step 2: Tokenizes the text, builds a vocabulary, and encodes words as indices
def build_vocab_and_encode(text):
    # Tokenizes the text using spaces and appends an end-of-sequence token
    tokens = text.split() + ['<eos>']
    # Creates a dictionary mapping each unique word to a unique index
    word2idx = {word: i for i, word in enumerate(set(tokens))}
    # Encodes the entire text as a tensor of word indices
    encoded_text = torch.tensor([word2idx[word] for word in tokens], dtype=torch.long)
    return encoded_text, len(word2idx)


# Step 3: Defines a custom dataset class suitable for language modeling tasks
class TextDataset(Dataset):
    """
    Dataset class for text data.

    Generates input-output sequences based on the text data for training and validation.
    """

    def __init__(self, encoded_text, seq_length, train=True, train_split=0.8):
        """
        Initializes the dataset with encoded text and splits it into training/validation datasets.

        Args:
            encoded_text (Tensor): Tensor representation of the text.
            seq_length (int): Number of words per sequence sample.
            train (bool): True for training set, False for validation set.
            train_split (float): Proportion of data for training (default 80%).
        """
        self.seq_length = seq_length

        # Splits the text data into training and validation based on train_split
        split_idx = int(len(encoded_text) * train_split)
        if train:
            # Uses the first portion for training
            self.data = encoded_text[:split_idx]
        else:
            # Uses the last portion for validation
            self.data = encoded_text[split_idx:]

    def __len__(self):
        """
        Returns the number of sequences available in the dataset.

        Returns:
            int: Total number of sequences minus incomplete/last sequence.
        """
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        """
        Retrieves a sample sequence and its corresponding target sequence.

        Args:
            idx (int): Starting word index for the sequence.

        Returns:
            tuple: Input sequence and the target sequence tensors.
        """
        # Extracts the input sequence based on seq_length
        sequence = self.data[idx: idx + self.seq_length]
        # Extracts the target sequence by shifting the input by one word
        target = self.data[idx + 1: idx + self.seq_length + 1]
        return sequence, target


# Step 4: Loads the text data, tokenizes and encodes it into sequences
text = read_text_file(file_path)  # Loads raw text from file
encoded_text, vocab_size = build_vocab_and_encode(text)  # Encodes text and gets vocab size

# Sets the parameters for sequence length and batch size for the data loaders
seq_length = 30  # Number of words in each input sequence
batch_size = 20  # Number of sequences per batch

# Creates training and validation datasets and respective data loaders
train_dataset = TextDataset(encoded_text, seq_length, train=True, train_split=0.8)  # Training dataset
val_dataset = TextDataset(encoded_text, seq_length, train=False, train_split=0.8)  # Validation dataset

# Wraps the datasets in DataLoader objects for mini-batch processing
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Step 5: Tests loading of batches from the training and validation data loaders
print("Training DataLoader Example:")
# Iterates through the training data loader
for i, (sequence, target) in enumerate(train_loader):
    # Prints the shape of input and target batch tensors
    print(f"Sequence Shape: {sequence.shape}, Target Shape: {target.shape}")
    # Prints the first input sequence and corresponding target sequence sample
    print("Sequence Example:", sequence[0])  # First input sequence
    print("Target Example:", target[0])  # Corresponding target
    if i == 1:  # Stops after processing two batches
        break

print("\nValidation DataLoader Example:")
# Iterates through the validation data loader
for i, (sequence, target) in enumerate(val_loader):
    # Prints the shape of input and target batch tensors
    print(f"Sequence Shape: {sequence.shape}, Target Shape: {target.shape}")
    # Prints the first input sequence and corresponding target sequence sample
    print("Sequence Example:", sequence[0])  # First input sequence
    print("Target Example:", target[0])  # Corresponding target
    if i == 1:  # Stops after processing two batches
        break

Training DataLoader Example:
Sequence Shape: torch.Size([20, 30]), Target Shape: torch.Size([20, 30])
Sequence Example: tensor([4270,  318, 3928,  361, 3256, 4081, 1172, 2197, 4081, 1745, 1710, 3333,
        1948, 4918, 2477, 2886, 4828, 2168, 3333, 2748, 4918,  525, 5281, 3936,
        3955, 2135, 2838, 2886, 3109, 1974])
Target Example: tensor([ 318, 3928,  361, 3256, 4081, 1172, 2197, 4081, 1745, 1710, 3333, 1948,
        4918, 2477, 2886, 4828, 2168, 3333, 2748, 4918,  525, 5281, 3936, 3955,
        2135, 2838, 2886, 3109, 1974, 3256])
Sequence Shape: torch.Size([20, 30]), Target Shape: torch.Size([20, 30])
Sequence Example: tensor([ 382, 3654,  338, 2330, 4081,  621, 1974, 1710, 4750,  706, 4751, 2866,
        1974, 2863, 4081, 4483, 4751, 2866, 1974,  980,   47, 4884, 2935, 3336,
        2583, 1067, 2696,   49, 4126, 3936])
Target Example: tensor([3654,  338, 2330, 4081,  621, 1974, 1710, 4750,  706, 4751, 2866, 1974,
        2863, 4081, 4483, 4751, 2866, 1974,  980,   47, 4884, 

In [7]:
# Training function to process sequences, compute loss, optimize weights, and track accuracy
def train_model(model, train_loader, criterion, optimizer, device, seq_length, vocab_size):
    """
    Training loop for the LSTM model to process the training data and optimize parameters.
    
    Args:
        model (nn.Module): The LSTM model instance to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        criterion (nn.Module): Loss function for calculating training loss.
        optimizer (torch.optim.Optimizer): Optimizer for adjusting model weights.
        device (torch.device): Device to place tensors on for computation.
        seq_length (int): Sequence length of word indices in the dataset.
        vocab_size (int): Size of the vocabulary.
        
    Returns:
        tuple: Average training loss and training accuracy as a percentage.
    """
    model.train()  # Set the model to training mode, enabling parameter updates (e.g., dropout and batch norm act accordingly).
    total_loss = 0  # Initialize total_loss to keep track of cumulative loss for the epoch.
    total_correct = 0  # Initialize total_correct to count the total number of correct predictions.
    total_elements = 0  # Initialize total_elements to store the total number of elements processed during the epoch (for accuracy calculation).

    for sequences, targets in train_loader:
        sequences, targets = sequences.to(device), targets.to(
            device)  # Transfer input (`sequences`) and target (`targets`) tensors to the specified device (CPU or GPU) for computation.
        batch_size = sequences.size(0)  # Get the batch size (number of sequences in the current batch).

        # Initialize hidden state
        # Initialize a zero-value hidden state for each batch to maintain LSTM statefulness
        hidden = initialize_hidden(batch_size, hidden_size, num_layers,
                                   device)  # Initialize a zero-value hidden state and cell state for the LSTM, ensuring statefulness for each batch.

        # Forward pass
        outputs, hidden = model(sequences, hidden)  # Perform a forward pass through the LSTM model.
        # Reshape outputs (logits with shape: [batch_size, seq_length, vocab_size]) to a 2D tensor: [batch_size * seq_length, vocab_size].
        # Reshape targets (true labels with shape: [batch_size, seq_length]) to match the outputs as a 1D tensor: [batch_size * seq_length].
        loss = criterion(outputs.view(-1, vocab_size),
                         # Compute the cross-entropy loss between predicted logits and target labels.
                         targets.view(
                             -1))  # Both tensors are reshaped to align in (N, C) and (N) format for CrossEntropyLoss.
        # CrossEntropyLoss expects logits of shape (N, C) and targets of shape (N), where N=batch_size*seq_length

        # Calculate accuracy
        # Get the predicted word index with the highest probability using argmax
        predictions = torch.argmax(outputs,
                                   dim=-1)  # Take the index of the highest probability (predicted word) for each time step.
        correct = (predictions.view(-1) == targets.view(
            -1)).sum().item()  # Count how many predictions match the target values.
        total_correct += correct
        total_elements += targets.numel()

        # Backward pass and optimization
        optimizer.zero_grad()  # Reset gradients from the previous iteration to prepare for a new update cycle.
        loss.backward()  # Backpropagate the loss to compute gradients for model parameters.
        optimizer.step()  # Update model parameters (e.g., weights, biases) using the computed gradients.

        total_loss += loss.item()  # Accumulate loss

    avg_loss = total_loss / len(train_loader)  # Compute the average loss across all batches in the training epoch.
    accuracy = total_correct / total_elements * 100  # Calculate accuracy as a percentage using correct predictions and total elements.
    return avg_loss, accuracy  # Return the average training loss and accuracy for the current epoch.

In [None]:

# Validation function to evaluate the model on unseen data and compute loss/accuracy
def validate_model(model, val_loader, criterion, device, seq_length, vocab_size):
    """
    Validation loop for the LSTM model to evaluate performance on the validation set.
    
    Args:
        model (nn.Module): The LSTM model instance to validate.
        val_loader (DataLoader): DataLoader for the validation dataset.
        criterion (nn.Module): Loss function for calculating validation loss.
        device (torch.device): Device to place tensors on for computation.
        seq_length (int): Sequence length of word indices in the dataset.
        vocab_size (int): Size of the vocabulary.
        
    Returns:
        tuple: Average validation loss and validation accuracy as a percentage.
    """
    model.eval()  # Set the model to evaluation mode to ensure dropout layers (if any) behave deterministically.

    # Initialize accumulators for loss, correct predictions, and total processed elements.
    total_loss = 0  # Total validation loss across all batches.
    total_correct = 0  # Number of correct predictions made so far.
    total_elements = 0  # Total number of words processed in the validation set.

    # Disable gradient computations to optimize memory usage and speed up validation.
    with torch.no_grad():
        # Iterate over the validation dataset batch by batch
        for sequences, targets in val_loader:
            # Move input (sequences) and target (targets) tensors to the designated device (GPU/CPU)
            sequences, targets = sequences.to(device), targets.to(device)

            # Extract batch size from the current batch
            batch_size = sequences.size(0)

            # Initialize the hidden and cell states of the LSTM to zeros for validation
            hidden = initialize_hidden(batch_size, hidden_size, num_layers, device)

            # Forward pass: get the model's predictions for the input sequences
            outputs, hidden = model(sequences, hidden)

            # Compute the loss by comparing the predicted logits and true target indices
            # Reshape outputs and targets to align with CrossEntropyLoss requirements
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

            # Calculate validation accuracy by comparing predicted indices with the ground truth
            predictions = torch.argmax(outputs, dim=-1)  # Get the word with the highest probability at each step
            # Determine the number of correct predictions
            correct = (predictions.view(-1) == targets.view(-1)).sum().item()

            # Aggregate the correct predictions and total elements processed for accuracy computation
            total_correct += correct
            total_elements += targets.numel()

            # Accumulate the total loss for the validation set
            total_loss += loss.item()

            # Calculate the average validation loss across all batches
    avg_loss = total_loss / len(val_loader)

    # Calculate the validation accuracy as a percentage
    accuracy = total_correct / total_elements * 100

    # Return the average validation loss and accuracy
    return avg_loss, accuracy

In [None]:
# LSTM-based language model
# This model takes input sequences, embeds them into dense representations using an embedding layer,
# processes the sequence through an LSTM layer to capture temporal dependencies, 
# and finally maps the outputs to probabilities over the vocabulary using a fully connected layer.
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # Embedding layer
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)  # Fully connected layer to output vocab probabilities

    def forward(self, x, hidden):
        x = self.embedding(x)  # Embed input word indices into embeddings
        output, hidden = self.lstm(x, hidden)  # LSTM output and updated hidden states
        output = self.fc(output)  # Map to vocab size
        return output, hidden


# Function to initialize the hidden and cell states of the LSTM to zeros
def initialize_hidden(batch_size, hidden_size, num_layers, device):
    """
    Initializes the LSTM hidden and cell states to zeros for the start of training or validation.
    
    Args:
        batch_size (int): Number of samples in each training batch.
        hidden_size (int): Number of LSTM hidden units per layer.
        num_layers (int): Number of stacked LSTM layers in the network.
        device (torch.device): The device (CPU or GPU) to allocate memory for the tensors.
    
    Returns:
        tuple: A tuple containing the initialized (hidden state, cell state), 
               each of shape (num_layers, batch_size, hidden_size), 
               to reset the learning context between different sequences or batches.
    """
    return (torch.zeros(num_layers, batch_size, hidden_size, device=device),
            torch.zeros(num_layers, batch_size, hidden_size, device=device))


In [None]:
# Define hyperparameters: these are the tunable parameters that control the model structure and learning process.
embed_size = 128  # Dimensionality of the word embeddings.
hidden_size = 256  # Number of units in the hidden layers of the LSTM.
num_layers = 2  # Number of stacked LSTM layers.
num_epochs = 10  # Number of complete passes through the training data.
learning_rate = 0.001  # Learning rate for the optimizer.

# Step 2: Define the model, loss function, and optimizer
# Set the device to GPU if available, otherwise use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the LSTM model and push it to the chosen device.
model = LSTMModel(vocab_size, embed_size, hidden_size, num_layers).to(device)

# Define the loss function as CrossEntropyLoss, which is suitable for multi-class classification.
criterion = nn.CrossEntropyLoss()

# Adam optimizer is used for efficient gradient-based optimization of the model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Step 3: Train and Validate
# Loop over the number of epochs, training and validating the model in each epoch.
for epoch in range(1, num_epochs + 1):
    # Train the model using the training data and compute training loss and accuracy.
    train_loss, train_accuracy = train_model(model, train_loader, criterion, optimizer, device, seq_length,
                                             vocab_size)
    # Validate the model using the validation data and compute validation loss and accuracy.
    val_loss, val_accuracy = validate_model(model, val_loader, criterion, device, seq_length, vocab_size)

    # Print the results for the current epoch, including training and validation performance metrics.
    print(
        f"Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
        f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

In [8]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)

In [9]:
class TextProcess(object):
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        with open(path, "r") as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Create a 1-D tensor that contains the index of all the words in the file
        rep_tensor = torch.LongTensor(tokens)
        index = 0
        with open(path, "r") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    rep_tensor[index] = self.dictionary.word2idx[word]
                    index += 1

        # Find out how many batches we need
        num_batches = rep_tensor.shape[0] // batch_size
        # Remove the remainder (Filter out the ones that don't fit)
        rep_tensor = rep_tensor[:num_batches * batch_size]
        # return (batch_size, numb_batches)
        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor

In [10]:
with open("data/alice.txt", "r") as f:
    for line in f:
        print(line.split() + ['<eos>'])

['CHAPTER', 'I.', 'Down', 'the', 'Rabbit-Hole', '<eos>']
['<eos>']
['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', '<eos>']
['bank,', 'and', 'of', 'having', 'nothing', 'to', 'do:', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', '<eos>']
['book', 'her', 'sister', 'was', 'reading,', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', '<eos>']
['it,', "'and", 'what', 'is', 'the', 'use', 'of', 'a', "book,'", 'thought', 'Alice', "'without", 'pictures', 'or', '<eos>']
["conversations?'", '<eos>']
['<eos>']
['So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(as', 'well', 'as', 'she', 'could,', 'for', 'the', '<eos>']
['hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid),', 'whether', 'the', 'pleasure', '<eos>']
['of', 'making', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', '<eos>']
['picking', 'the', 'daisies,', 'when', 'su

In [23]:
embed_size = 128 # Input features to LSTM
hidden_size = 1024 # Number of LSTM Units
num_layers = 1
num_epochs = 20
batch_size = 20
timesteps = 30
learning_rate = 0.002

In [24]:
corpus = TextProcess()

In [25]:
# Split the rep_tensor into training and validation datasets
def split_data(tensor, train_split=0.8):
    split_idx = int(tensor.size(1) * train_split)
    train_tensor = tensor[:, :split_idx]
    val_tensor = tensor[:, split_idx:]
    return train_tensor, val_tensor

# Get the data and split it
rep_tensor = corpus.get_data("data/alice.txt", batch_size)
train_tensor, val_tensor = split_data(rep_tensor)

print(f"Train Tensor Shape: {train_tensor.shape}")
print(f"Validation Tensor Shape: {val_tensor.shape}")

Train Tensor Shape: torch.Size([20, 1187])
Validation Tensor Shape: torch.Size([20, 297])


In [26]:
# rep_tensor is the tensor that contains the index of all the words. Each row contains 1659 words
print(rep_tensor.shape)

torch.Size([20, 1484])


In [27]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

5290


In [28]:
num_batches = rep_tensor.shape[1] // timesteps
print(num_batches)

49


In [29]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(TextGenerator, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        # Perform word embedding
        x = self.embed(x)
        # Reshape the input tensor
        # x = x.view(batch_size, timesteps, embed_size)
        out, (h, c) = self.lstm(x, h)
        # Reshape the output from (samples, timesteps, output_features) to shape appropriate for the FC layer
        # (batch_size * timesteps, hidden_size)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))
        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)


In [30]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)

In [31]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [34]:
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (
        torch.zeros(num_layers, batch_size, hidden_size),
        torch.zeros(num_layers, batch_size, hidden_size),
    )
    train_correct = 0
    train_total = 0

    for i in range(0, train_tensor.size(1) - timesteps, timesteps):
        # Get mini-batch inputs and targets
        inputs = train_tensor[:, i:i + timesteps]
        targets = train_tensor[:, (i + 1):(i + 1) + timesteps]

        # Forward pass
        outputs, _ = model(inputs, states)  # outputs shape: (batch_size * timesteps, vocab_size)

        # Calculate loss
        loss = loss_fn(outputs, targets.reshape(-1))

        # Calculate accuracy
        predicted = torch.argmax(outputs, dim=1)  # Predicted indices
        train_correct += (predicted == targets.reshape(-1)).sum().item()
        train_total += targets.numel()

        # Backpropagation and Weight Update
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i + 1) // timesteps
        if step % 100 == 0:
            train_accuracy = (train_correct / train_total) * 100  # Convert accuracy to percentage
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], Step [{step}/{num_batches}], Loss: {loss.item():.4f}, Accuracy: {train_accuracy:.2f}%")

    # --- Validation Phase ---
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():  # Disable gradient calculation
        states = (
            torch.zeros(num_layers, batch_size, hidden_size),
            torch.zeros(num_layers, batch_size, hidden_size),
        )
        for i in range(0, val_tensor.size(1) - timesteps, timesteps):  # Assume val_tensor is your validation data
            inputs = val_tensor[:, i:i + timesteps]
            targets = val_tensor[:, (i + 1):(i + 1) + timesteps]

            # Forward pass
            outputs, states = model(inputs, states)
            loss = loss_fn(outputs, targets.reshape(-1))
            val_loss += loss.item()

            # Calculate validation accuracy
            predicted = torch.argmax(outputs, dim=1)
            val_correct += (predicted == targets.reshape(-1)).sum().item()
            val_total += targets.numel()

    val_loss /= (val_tensor.size(1) // timesteps)
    val_accuracy = (val_correct / val_total) * 100  # Convert accuracy to percentage
    print(
        f"Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")


Epoch [1/20], Step [0/49], Loss: 0.0544, Accuracy: 98.00%
Epoch [1/20], Validation Loss: 8.5483, Validation Accuracy: 16.00%
Epoch [2/20], Step [0/49], Loss: 0.0540, Accuracy: 98.00%
Epoch [2/20], Validation Loss: 8.5619, Validation Accuracy: 16.06%
Epoch [3/20], Step [0/49], Loss: 0.0539, Accuracy: 98.00%
Epoch [3/20], Validation Loss: 8.5762, Validation Accuracy: 15.98%
Epoch [4/20], Step [0/49], Loss: 0.0535, Accuracy: 98.00%
Epoch [4/20], Validation Loss: 8.5891, Validation Accuracy: 16.04%
Epoch [5/20], Step [0/49], Loss: 0.0534, Accuracy: 98.00%
Epoch [5/20], Validation Loss: 8.6026, Validation Accuracy: 15.96%
Epoch [6/20], Step [0/49], Loss: 0.0531, Accuracy: 98.00%
Epoch [6/20], Validation Loss: 8.6150, Validation Accuracy: 15.96%
Epoch [7/20], Step [0/49], Loss: 0.0530, Accuracy: 98.00%
Epoch [7/20], Validation Loss: 8.6280, Validation Accuracy: 15.98%
Epoch [8/20], Step [0/49], Loss: 0.0527, Accuracy: 98.00%
Epoch [8/20], Validation Loss: 8.6399, Validation Accuracy: 16.00%


In [22]:
# Test the model
with torch.no_grad():
    with open("data/results.txt", "w") as f:
        # Set initial hidden ane cell states
        state = (
            torch.zeros(num_layers, 1, hidden_size),
            torch.zeros(num_layers, 1, hidden_size),
        )
        # Select one word id randomly and convert it ot shape (1, 1)
        input = torch.randint(0, vocab_size, (1,)).long().unsqueeze(1)

        for i in range(500):
            output, _ = model(input, state)
            print(output.shape)
            # Sample a word id from the exponential of the output
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()
            print(word_id)
            # Replace the input with sampled word id for the next timestep
            input.fill_(word_id)

            # Write the results to file
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            if (i+1) % 100 == 0:
                print(f'Sampled [{i+1}/{500}] words and save to results.txt file')

torch.Size([1, 5290])
362
torch.Size([1, 5290])
389
torch.Size([1, 5290])
69
torch.Size([1, 5290])
1194
torch.Size([1, 5290])
41
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
776
torch.Size([1, 5290])
103
torch.Size([1, 5290])
93
torch.Size([1, 5290])
1388
torch.Size([1, 5290])
9
torch.Size([1, 5290])
3737
torch.Size([1, 5290])
153
torch.Size([1, 5290])
1196
torch.Size([1, 5290])
25
torch.Size([1, 5290])
2971
torch.Size([1, 5290])
2591
torch.Size([1, 5290])
165
torch.Size([1, 5290])
2812
torch.Size([1, 5290])
5
torch.Size([1, 5290])
4033
torch.Size([1, 5290])
7
torch.Size([1, 5290])
300
torch.Size([1, 5290])
4202
torch.Size([1, 5290])
3
torch.Size([1, 5290])
4008
torch.Size([1, 5290])
4168
torch.Size([1, 5290])
44
torch.Size([1, 5290])
497
torch.Size([1, 5290])
55
torch.Size([1, 5290])
5
torch.Size([1, 5290])
5
torch.Size([1, 5290])
3830
torch.Size([1, 5290])
329
torch.Size([1, 5290])
4755
torch.Size([1, 5290])
5
torch.Size([1, 5290])
1697
torch.Size([1, 5290])
