In [1]:
import requests

import os

# Project Gutenberg plain-text edition of "Dracula" (ebook #345).
url = "https://www.gutenberg.org/files/345/345-0.txt"
output_path = "data/text.txt"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on HTTP errors (404, 500, ...) so an error page is never saved as the corpus.
response.raise_for_status()

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the content to a file
with open(output_path, "w", encoding="utf-8") as file:
    file.write(response.text)

# Report the path that was actually written so the message cannot drift from the code.
print(f"Dracula text downloaded and saved as {output_path}")


Dracula text downloaded and saved as dracula.txt


In [None]:
from collections import Counter, \
    defaultdict  # Import utilities for character frequency counting and default dictionary creation.

import numpy as np  # Import for numerical operations.
import pandas as pd  # Import for handling tabular data.
import torch  # Import PyTorch framework for building deep learning models.
import torch.nn as nn  # Import neural network modules from PyTorch.
import torch.nn.functional as F  # Import functional methods for building layers/operations.
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler  # Tools for data handling and loading in PyTorch.
from tqdm import tqdm  # Import for progress bar visualization during training loops.


class CharacterDataset(Dataset):
    """Dataset of character windows paired with the character that follows each window.

    Sample ``ix`` consists of the ``window_size`` characters starting at position
    ``ix`` (encoded as vocabulary indices) and the index of the next character.

    Parameters
    ----------
    text : str
        Input text that will be used to create the entire database. Newlines
        are replaced with spaces before any other processing.

    window_size : int
        Number of characters to use as input features.

    vocab_size : int
        Number of characters in the vocabulary. Note that the last character
        is always reserved for a special "~" out-of-vocabulary character.

    Attributes
    ----------
    ch2ix : defaultdict
        Mapping from the character to the position of that character in the
        vocabulary. Note that all characters that are not in the vocabulary
        will get mapped into the index `vocab_size - 1`.

    ix2ch : dict
        Mapping from the character position in the vocabulary to the actual
        character. Positions that no character of the text occupies map to "~".

    vocabulary : list
        List of all characters. `len(vocabulary) == vocab_size`.

    Raises
    ------
    ValueError
        If `vocab_size` is smaller than 1.
    """

    def __init__(self, text, window_size=1, vocab_size=50):
        if vocab_size < 1:
            raise ValueError("`vocab_size` must be >= 1")

        # Replace newline characters with spaces to unify text formatting.
        self.text = text.replace("\n", " ")
        self.window_size = window_size

        oov_ix = vocab_size - 1
        self.ch2ix = defaultdict(lambda: oov_ix)

        # "~" is reserved for the out-of-vocabulary slot, so it must not compete
        # for a regular index: otherwise re-pointing it to `oov_ix` below would
        # leave a hole in the index range and break the vocabulary construction.
        counts = Counter(self.text)
        counts.pop("~", None)

        # The `vocab_size - 1` most common characters get indices 0, 1, 2, ...
        most_common_ch2ix = {
            ch: i for i, (ch, _) in enumerate(counts.most_common()[:oov_ix])
        }
        self.ch2ix.update(most_common_ch2ix)

        # The special character "~" stands for every out-of-vocabulary character.
        self.ch2ix["~"] = oov_ix

        # Start with "~" in every slot so that texts with fewer distinct
        # characters than `vocab_size - 1` still yield a full-length vocabulary,
        # then overwrite the slots that real characters occupy.
        self.ix2ch = {i: "~" for i in range(vocab_size)}
        self.ix2ch.update({i: ch for ch, i in most_common_ch2ix.items()})

        # Ordered list of characters; position `i` holds the character with index `i`.
        self.vocabulary = [self.ix2ch[i] for i in range(vocab_size)]

    def __len__(self):
        # Every sample needs `window_size` inputs plus one target character.
        # Clamp at zero so texts shorter than the window give an empty dataset
        # instead of an invalid negative length.
        return max(len(self.text) - self.window_size, 0)

    def __getitem__(self, ix):
        # Encode the window of characters as vocabulary indices.
        X = torch.LongTensor(
            [self.ch2ix[c] for c in self.text[ix: ix + self.window_size]]
        )
        # The target is the index of the character right after the window.
        y = self.ch2ix[self.text[ix + self.window_size]]

        return X, y


class Network(nn.Module):
    """Character-level model: embedding -> LSTM -> dense layer -> vocabulary logits.

    Parameters
    ----------
    vocab_size : int
        Number of characters in the vocabulary.
    embedding_dim : int
        Dimension of the character embedding vectors.
    dense_dim : int
        Number of neurons in the intermediate dense layer.
    hidden_dim : int
        Size of the LSTM hidden state.
    max_norm : int, optional
        Maximum L2 norm of the embedding vectors, default is 2.
    n_layers : int, optional
        Number of LSTM layers, default is 1.
    dropout_rate : float, optional
        Probability of dropout applied after the dense layer's activation,
        default is 0.5.

    Raises
    ------
    ValueError
        If `vocab_size` < 1, `max_norm` <= 0 or `dropout_rate` is outside [0, 1].
    """

    def __init__(
            self,
            vocab_size,
            embedding_dim=64,
            dense_dim=64,
            hidden_dim=128,
            max_norm=2,
            n_layers=1,
            dropout_rate=0.5,
    ):
        super().__init__()

        # Reject unusable hyperparameters before any layer is built.
        if vocab_size < 1:
            raise ValueError("`vocab_size` must be >= 1")
        if max_norm <= 0:
            raise ValueError("`max_norm` must be positive")
        if not (0 <= dropout_rate <= 1):
            raise ValueError("`dropout_rate` must be between 0 and 1")

        # Remembered so that `init_hidden` can build correctly shaped states.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Character index -> dense vector. The last vocabulary index is passed
        # as `padding_idx`.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=vocab_size - 1,
            max_norm=max_norm,
        )
        # Recurrent encoder; inputs are laid out batch-first.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )
        # Dense head: hidden state -> dense_dim -> vocabulary logits.
        self.linear_1 = nn.Linear(hidden_dim, dense_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear_2 = nn.Linear(dense_dim, vocab_size)

    def forward(self, x, hidden=None):
        """
        Run one forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape `(batch_size, sequence_length)` with dtype `torch.int64`.
        hidden : tuple of torch.Tensor, optional
            Previous `(hidden_state, cell_state)` of the LSTM, each of shape
            `(n_layers, batch_size, hidden_dim)`. Zero states are used when omitted.

        Returns
        -------
        logits : torch.Tensor
            Output tensor of shape `(batch_size, vocab_size)` with unnormalized scores for each class.
        hidden : tuple of torch.Tensor
            Updated hidden states from the LSTM.
        """
        # Put the input on whatever device the parameters live on.
        model_device = next(self.parameters()).device
        tokens = x.to(model_device)

        # Fall back to all-zero states when the caller supplies none.
        state = self.init_hidden(tokens.size(0)) if hidden is None else hidden

        # Embed the characters and run them through the LSTM; only the final
        # states are needed, the per-step outputs are discarded.
        _, state = self.lstm(self.embedding(tokens), state)

        # Average the final hidden state over the LSTM layers
        # -> (batch_size, hidden_dim).
        layer_avg = state[0].mean(dim=0)

        # Dense layer + ReLU, then dropout, then projection to the vocabulary.
        features = self.dropout(self.activation(self.linear_1(layer_avg)))
        logits = self.linear_2(features)

        return logits, state

    def init_hidden(self, batch_size):
        """
        Build all-zero initial states for the LSTM.

        Parameters
        ----------
        batch_size : int
            The batch size of the input.

        Returns
        -------
        hidden : tuple of torch.Tensor
            Initial hidden_state and cell_state, both of shape `(n_layers, batch_size, hidden_dim)`.
        """
        # Allocate on the same device as the model parameters.
        model_device = next(self.parameters()).device
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden_state = torch.zeros(state_shape, device=model_device)
        cell_state = torch.zeros(state_shape, device=model_device)
        return hidden_state, cell_state


def compute_loss(cal, net, dataloader):
    """Compute the average per-batch loss over a dataset.

    The model is switched to evaluation mode and is left in that mode when
    the function returns.

    Parameters
    ----------
    cal : callable
        Loss function called as `cal(logits, targets)`; its result must
        support `.item()`.

    net : Module
        Model whose forward pass returns a `(logits, hidden)` pair.

    dataloader : iterable
        Yields `(X_batch, y_batch)` pairs.

    Returns
    -------
    float
        Mean of the per-batch losses, or NaN when `dataloader` yields nothing.
    """
    net.eval()
    all_losses = []

    # Evaluation only: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            # The forward pass returns (logits, hidden); only the logits matter here.
            logits, _ = net(X_batch)
            # Keep the targets on the same device as the predictions.
            all_losses.append(cal(logits, y_batch.to(logits.device)).item())

    if not all_losses:
        # An empty loader has no meaningful average.
        return float("nan")

    return np.mean(all_losses)


def generate_text(n_chars, net, dataset, initial_text="Hello", random_state=None):
    """Generate text with the character-level model.

    The model is first fed the whole `initial_text`; afterwards it is fed one
    sampled character at a time while the recurrent state is carried along.

    Parameters
    ----------
    n_chars : int
        Number of characters to generate.

    net : Module
        Character-level model. Called as `net(features, hidden)` and expected
        to return a `(logits, hidden)` pair.

    dataset : CharacterDataset
        Instance of the `CharacterDataset`. Its `ch2ix` mapping and
        `vocabulary` list are used.

    initial_text : str
        The starting text to be used as the initial condition for the model.

    random_state : None or int
        If not None, then the result is reproducible.

    Returns
    -------
    res : str
        `initial_text` followed by the generated characters.

    Raises
    ------
    ValueError
        If `initial_text` is empty.
    """
    if not initial_text:
        raise ValueError("You need to specify the initial text")

    net.eval()

    # A private generator gives reproducible sampling without reseeding the
    # global NumPy RNG; without a seed the global generator is used.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    generated = []
    hidden = None  # The model creates its own initial state on the first call.
    previous_chars = initial_text

    with torch.no_grad():
        for _ in range(n_chars):
            features = torch.LongTensor([[dataset.ch2ix[c] for c in previous_chars]])
            logits, hidden = net(features, hidden)

            # Softmax turns the logits of the single batch row into
            # probabilities over the vocabulary.
            probs = F.softmax(logits[0], dim=0).cpu().numpy()
            new_ch = dataset.vocabulary[rng.choice(len(probs), p=probs)]

            generated.append(new_ch)
            # After the prompt, only the newest character is fed back in.
            previous_chars = new_ch

    return initial_text + "".join(generated)


if __name__ == "__main__":
    # Read the training corpus from the path the download step writes to.
    # `read()` keeps the text as-is; joining `readlines()` with "\n" would
    # double every line break because each line already ends with one.
    with open("data/text.txt", "r", encoding="utf-8") as f:
        text = f.read()

    # Hyperparameters model
    vocab_size = 70
    window_size = 10
    embedding_dim = 2
    hidden_dim = 16
    dense_dim = 32
    n_layers = 1
    max_norm = 2

    # Training config
    n_epochs = 25  # Number of epochs the model will train for.
    train_val_split = 0.8
    batch_size = 128
    random_state = 13

    torch.manual_seed(random_state)

    loss_f = torch.nn.CrossEntropyLoss()
    # Prepare the character-level dataset.
    dataset = CharacterDataset(text, window_size=window_size, vocab_size=vocab_size)

    # Contiguous split: the first 80% of the windows train, the rest validate.
    n_samples = len(dataset)
    split_ix = int(n_samples * train_val_split)

    train_indices, val_indices = np.arange(split_ix), np.arange(split_ix, n_samples)

    train_dataloader = DataLoader(
        dataset, sampler=SubsetRandomSampler(train_indices), batch_size=batch_size
    )
    val_dataloader = DataLoader(
        dataset, sampler=SubsetRandomSampler(val_indices), batch_size=batch_size
    )

    net = Network(
        vocab_size,
        hidden_dim=hidden_dim,
        n_layers=n_layers,
        dense_dim=dense_dim,
        embedding_dim=embedding_dim,
        max_norm=max_norm,
    )
    optimizer = torch.optim.Adam(
        net.parameters(),
        lr=1e-2,
    )

    # One DataFrame of embedding weights per epoch.
    emb_history = []

    for e in range(n_epochs + 1):
        # Epoch 0 only records the untrained baseline; training starts at epoch 1.
        if e > 0:
            net.train()
            for X_batch, y_batch in tqdm(train_dataloader):
                optimizer.zero_grad()
                # The forward pass returns (logits, hidden); the state is not needed here.
                logits, _ = net(X_batch)
                loss = loss_f(logits, y_batch)
                loss.backward()

                optimizer.step()

        train_loss = compute_loss(loss_f, net, train_dataloader)
        val_loss = compute_loss(loss_f, net, val_dataloader)
        print(f"Epoch: {e}, {train_loss=:.3f}, {val_loss=:.3f}")

        # Generate one sentence
        initial_text = "I hope it works "
        generated_text = generate_text(
            100, net, dataset, initial_text=initial_text, random_state=random_state
        )
        print(generated_text)

        # Prepare DataFrame: snapshot the embedding matrix, one row per character.
        weights = net.embedding.weight.detach().clone().cpu().numpy()

        df = pd.DataFrame(weights, columns=[f"dim_{i}" for i in range(embedding_dim)])
        df["epoch"] = e
        df["character"] = dataset.vocabulary

        emb_history.append(df)

    # Kept inside the `__main__` guard: `emb_history` only exists when the
    # script is run, so doing this at module level would fail on import.
    final_df = pd.concat(emb_history)
    final_df.to_csv("res.csv", index=False)  # Save the history of embeddings over epochs to a CSV file.

In [None]:
import torch
import torch.nn as nn


class TextClassificationLSTM(nn.Module):
    """LSTM-based sequence classifier.

    Token indices are embedded, run through an LSTM, and the final hidden
    state of the top LSTM layer is mapped to class logits by a two-layer head.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state.
    num_classes : int
        Number of output classes.
    num_layers : int, optional
        Number of stacked LSTM layers, default is 1.
    dropout : float, optional
        Dropout probability applied to the LSTM's final hidden state,
        default is 0.5.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim, 64)  # Intermediate hidden-to-hidden layer
        self.fc2 = nn.Linear(64, num_classes)  # Final hidden-to-output layer

    def forward(self, x):
        """Return class logits of shape `(batch_size, num_classes)` for token indices `x`."""
        # Embedding layer
        x = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)

        # LSTM layer
        _, (hidden, _) = self.lstm(x)  # hidden: (num_layers, batch_size, hidden_dim)

        # Take the last hidden state from the last LSTM layer
        hidden = hidden[-1]  # Shape: (batch_size, hidden_dim)

        # Apply dropout to the hidden layer output
        hidden = self.dropout(hidden)

        # A nonlinearity between the two dense layers is required: without it
        # fc1 followed by fc2 is just a single affine map. The functional ReLU
        # adds no parameters, so existing state dicts still load.
        x = torch.relu(self.fc1(hidden))
        x = self.fc2(x)  # Output logits for num_classes

        return x


In [None]:
class LanguageModelLSTM(nn.Module):
    """LSTM language model that emits vocabulary logits for every input position.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table and number of output logits.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state.
    num_layers : int, optional
        Number of stacked LSTM layers, default is 1.
    dropout : float, optional
        Dropout probability applied to the LSTM outputs, default is 0.5.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return `(logits, hidden)`.

        `logits` has one row per (batch element, time step) pair; `hidden` is
        the state returned by the LSTM and can be fed back in on the next call.
        """
        # Token indices -> vectors: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(x)

        # Per-step LSTM outputs: (batch_size, seq_len, hidden_dim)
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Regularize the per-step outputs.
        regularized = self.dropout(lstm_out)

        # Merge the batch and time axes: (batch_size * seq_len, hidden_dim)
        flat = regularized.reshape(-1, regularized.size(2))

        # Project every row to the vocabulary: (batch_size * seq_len, vocab_size)
        logits = self.fc(flat)

        return logits, hidden


In [None]:
class Seq2SeqLSTM(nn.Module):
    """Encoder-decoder LSTM: the encoder's final state initialises the decoder.

    Parameters
    ----------
    input_vocab_size : int
        Number of rows in the encoder embedding table.
    output_vocab_size : int
        Number of rows in the decoder embedding table and number of output logits.
    embedding_dim : int
        Size of each embedding vector (shared by encoder and decoder).
    hidden_dim : int
        Size of the LSTM hidden state (shared by encoder and decoder).
    num_layers : int, optional
        Number of stacked LSTM layers in each of the two LSTMs, default is 1.
    dropout : float, optional
        Dropout probability applied to the decoder outputs, default is 0.5.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.5):
        super().__init__()

        # Encoder: source token indices -> final LSTM state.
        self.encoder_embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True
        )

        # Decoder: target token indices + encoder state -> per-step outputs.
        self.decoder_embedding = nn.Embedding(output_vocab_size, embedding_dim)
        self.decoder_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True
        )

        # Regularization applied to the decoder outputs.
        self.dropout = nn.Dropout(p=dropout)

        # Maps each decoder output to logits over the output vocabulary.
        self.fc = nn.Linear(hidden_dim, output_vocab_size)

    def forward(self, src, trg, hidden=None):
        """Return logits of shape `(batch_size, trg_len, output_vocab_size)`.

        `hidden` is an optional initial state for the encoder LSTM.
        """
        # Encode the source; only the final state is kept.
        src_embedded = self.encoder_embedding(src)  # (batch_size, src_len, embedding_dim)
        _, encoder_state = self.encoder_lstm(src_embedded, hidden)

        # Decode the target sequence, starting from the encoder's final state.
        trg_embedded = self.decoder_embedding(trg)  # (batch_size, trg_len, embedding_dim)
        decoder_out, _ = self.decoder_lstm(trg_embedded, encoder_state)

        # Dropout on the decoder outputs, then projection to vocabulary logits.
        return self.fc(self.dropout(decoder_out))


In [None]:
class TimeSeriesLSTM(nn.Module):
    """LSTM regressor that predicts from the output at the last time step.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Size of the LSTM hidden state.
    output_dim : int
        Number of values predicted per sequence.
    num_layers : int, optional
        Number of stacked LSTM layers, default is 1.
    dropout : float, optional
        Dropout probability applied before the output layer, default is 0.5.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, hidden=None):
        """Return `(prediction, hidden)`; `prediction` is `(batch_size, output_dim)`."""
        # Per-step outputs: (batch_size, seq_len, hidden_dim)
        outputs, hidden = self.lstm(x, hidden)

        # Keep only the final time step: (batch_size, hidden_dim)
        last_step = outputs[:, -1, :]

        # Dropout, then the linear read-out: (batch_size, output_dim)
        prediction = self.fc(self.dropout(last_step))

        return prediction, hidden


The decision of whether to apply dropout between `fc1` and `fc2` in the **TextClassificationLSTM** model depends on how you want to regularize the model and which part of the network you want to prevent from overfitting. Let’s break this down conceptually so it makes sense why I chose to apply dropout before `fc1` and why it could (or could not) also be applied between `fc1` and `fc2`.
### **Where Dropout Is Currently Applied**
In the corrected example, dropout is applied **before `fc1`**, which means dropout acts on the high-dimensional representation taken from the LSTM’s final hidden state. This is the critical feature space that carries most of the information extracted from the input sequence, so regularizing this representation has the biggest impact on generalization.
By regularizing these features **before any dense layer**, we ensure:
1. We prevent over-reliance on specific neurons in the hidden LSTM representation.
2. `fc1` processes robust features that have already been regularized.

This strategic placement ensures the dense transformation (`fc1`) is working on stabilized inputs.
### **Why Not Apply Dropout Between `fc1` and `fc2`?**
#### 1. **Prevent Excessive Regularization**
Including dropout **both before `fc1` and between `fc1` and `fc2` could result in excessive regularization**:
- Dropout in multiple places along a small MLP (multi-layer perceptron) can hurt the model's ability to learn detailed patterns because too many features are "dropped out" at various stages.
- For smaller models like this, it’s generally sufficient to apply dropout **once** at the input to the fully connected layers (directly after the LSTM in this case).

#### 2. **fc1 is Acting as a Feature Reduction Layer**
The first fully connected layer (`fc1`) is reducing the dimensionality of the hidden state output from the LSTM (e.g., from `hidden_dim` → 64 in the example). This feature reduction itself acts as a type of regularization because it forces the model to condense the information into a smaller space. Applying dropout after the dimensionality has already been reduced may not add much benefit.
#### 3. **Extra Dropout Adds Overhead for Little Gain**
Dropout adds some computational cost during training, although the cost of a single dropout layer is small. Applying it too frequently in smaller architectures, especially in simple MLP chains, will generally result in diminishing returns since the primary goal of dropout—to regularize activations—has already been achieved after the LSTM output stage.
#### 4. **It's More Common to Regularize Larger Feature Spaces**
Dropout is most effective when applied to **high-dimensional feature spaces**, such as the outputs of an LSTM, convolutional layers, or very large dense layers. Since the output of `fc1` (64 units in the example) is already compressed, the likelihood of overfitting at this stage is lower, so dropout may not have a major impact here.
### **When Should You Apply Dropout Between `fc1` and `fc2`?**
While the design choice not to apply dropout between `fc1` and `fc2` is generally fine for the current model, there are situations where it might make sense to do so:
1. **Larger Models with Deeper FC Layers**
    - If the fully connected section of your network consists of multiple dense layers (e.g., `fc1`, `fc2`, `fc3`...), then applying dropout between layers can help regularize each transformation in this deeper part of the model.
    - Example:
``` python
x = self.fc1(hidden)
x = self.relu(x)
x = self.dropout(x)  # Dropout applied between fc1 and fc2
x = self.fc2(x)
```
2. **When Overfitting is Observed**
    - If validation performance significantly lags behind training performance in your current model, you can experiment with adding dropout between dense layers to further reduce overfitting.

3. **Higher Dimensional Representations**
    - If `fc1` has a much larger output dimension (e.g., reducing from 1024 → 512 → 256), then applying dropout between dense layers (e.g., after `fc1`) can help ensure that intermediate representations don’t overfit.

### **Updated TextClassificationLSTM Example with Dropout Between fc1 and fc2**
Here’s how the model would look if dropout **is applied between `fc1` and `fc2`**:
``` python
class TextClassificationLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
        super(TextClassificationLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 64)  # Reduce dimensionality
        self.relu = nn.ReLU()  # Activation
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(64, num_classes)  # Map to output classes

    def forward(self, x):
        # Embedding layer
        x = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)

        # LSTM layer
        _, (hidden, _) = self.lstm(x)  # hidden: (num_layers, batch_size, hidden_dim)
        hidden = hidden[-1]  # Take the last hidden state: (batch_size, hidden_dim)

        # Pass hidden state through two FC layers
        x = self.fc1(hidden)
        x = self.relu(x)
        x = self.dropout(x)  # Dropout applied between FC layers
        x = self.fc2(x)

        return x
```
### **Key Takeaways**
- Dropout placement depends on how rich and high-dimensional the feature space is:
    - Apply **dropout once to the largest feature representation** (e.g., the LSTM output).
    - Adding dropout **between smaller dense layers** might help if overfitting is still present.

- Avoid **over-regularizing** small models by applying dropout too frequently, as it can degrade performance.
- Test dropout placement experimentally if you're unsure—it’s often a hyperparameter choice that depends on the task and dataset.
