Import libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import requests
import re

Load and preprocess Data

In [3]:
# URL for the text of "Alice's Adventures in Wonderland"
url = "https://www.gutenberg.org/files/11/11-0.txt"

response = requests.get(url)
if response.status_code == 200:
    text = response.text
    print("Dataset downloaded successfully!")
else:
    print("Error downloading dataset:", response.status_code)

# Optionally, inspect the first few hundred characters
print(text[:500])

Dataset downloaded successfully!
*** START OF THE PROJECT GUTENBERG EBOOK 11 ***

[Illustration]




Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Grou


EDA

In [4]:
# Basic cleaning:
# Remove Project Gutenberg header and footer if present.
start_marker = r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*"
end_marker = r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*"
text = re.split(start_marker, text, flags=re.IGNORECASE)[-1]
text = re.split(end_marker, text, flags=re.IGNORECASE)[0]

# Convert to lowercase and strip extra spaces/newlines
text = text.lower().strip()

# Basic preprocessing: convert text to lowercase (optional) and remove header/footer
#text = text.lower()

# Create a set of characters (vocabulary)
vocab = sorted(set(text))
vocab_size = len(vocab)
print("Unique characters:", vocab_size)

# Create mappings from characters to integers and vice versa
char_to_idx = {ch: i for i, ch in enumerate(vocab)}
idx_to_char = {i: ch for i, ch in enumerate(vocab)}

# Encode the full text to integers
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Set sequence length for training examples
seq_length = 100

# Create input and target sequences using a sliding window approach
inputs = []
targets = []

for i in range(len(encoded_text) - seq_length):
    inputs.append(encoded_text[i:i+seq_length])
    targets.append(encoded_text[i+1:i+seq_length+1])  # target is shifted by one

inputs = np.array(inputs)
targets = np.array(targets)

print("Input shape:", inputs.shape)
print("Target shape:", targets.shape)

Unique characters: 51
Input shape: (147978, 100)
Target shape: (147978, 100)


Define the LSTM Model


Set Hyperparameters and Initialize Model

In [5]:
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(CharLSTM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, dropout=0.5, batch_first=True) ## Added dropout for regularization
        self.fc = nn.Linear(hidden_size, vocab_size)
        if embed_size == hidden_size:
            self.fc.weight = self.embed.weight
    
    def forward(self, x, hidden):
        # x: (batch, seq_length)
        x = self.embed(x)  # (batch, seq_length, embed_size)
        out, hidden = self.lstm(x, hidden)  # out: (batch, seq_length, hidden_size)
        # Reshape output for the fully connected layer
        out = out.contiguous().view(-1, out.shape[2])
        out = self.fc(out)  # (batch * seq_length, vocab_size)
        return out, hidden
    
    def init_hidden(self, batch_size):
        # Initialize hidden state and cell state with zeros
        weight = next(self.parameters()).data
        hidden = (weight.new(num_layers, batch_size, hidden_size).zero_(),
                  weight.new(num_layers, batch_size, hidden_size).zero_())
        return hidden
    

# Hyperparameters
embed_size = 256 ## Set to the same as hidden_size for weight tying
hidden_size = 256
num_layers = 2
learning_rate = 0.002
num_epochs = 50 ## Increased epochs for better training
batch_size = 256 ## Increased batch size for faster training

# Initialize the model, loss function and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CharLSTM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2) ## Added learning rate scheduler

Train and test dataset

In [None]:
# Utility function to get batches
def get_batches(inputs, targets, batch_size):
    total_batches = len(inputs) // batch_size
    for i in range(0, total_batches * batch_size, batch_size):
        x = inputs[i:i+batch_size]
        y = targets[i:i+batch_size]
        yield torch.LongTensor(x), torch.LongTensor(y)

# Training loop
all_losses = []
model.train()
for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size)
    epoch_loss = 0.0
    batch_count = 0
    for x_batch, y_batch in get_batches(inputs, targets, batch_size):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        # Detach hidden state to prevent backprop through entire training history
        hidden = tuple([h.detach() for h in hidden])
        
        optimizer.zero_grad()
        outputs, hidden = model(x_batch, hidden)
        
        # Reshape target to match outputs
        loss = criterion(outputs, y_batch.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)  ## Added gradient clipping to prevent exploding gradients
        optimizer.step()
        
        epoch_loss += loss.item()
        batch_count += 1
    
    avg_loss = epoch_loss / batch_count
    all_losses.append(avg_loss)
    scheduler.step(avg_loss) ## Adjust learning rate based on validation loss
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Plot training loss
plt.figure(figsize=(8, 4))
plt.plot(all_losses, label='Training Loss')
plt.title("Training Loss over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()


Epoch [1/50], Loss: 1.9125
Epoch [2/50], Loss: 1.5796
Epoch [3/50], Loss: 1.4494
Epoch [4/50], Loss: 1.3664
Epoch [5/50], Loss: 1.3043
Epoch [6/50], Loss: 1.2551
Epoch [7/50], Loss: 1.2139


Evaluate the Model with test data

In [None]:
def sample(model, start_str, length=200, temperature=1.0): ## Added temperature parameter for sampling
    model.eval()
    # Convert start string to tensor
    input_seq = torch.LongTensor([char_to_idx[ch] for ch in start_str]).unsqueeze(0).to(device)
    hidden = model.init_hidden(1)
    predicted = start_str
    
    for _ in range(length):
        output, hidden = model(input_seq, hidden)
        # Get last character's prediction from the output
        output = output[-1] / (temperature if temperature > 0 else 1.0)  ## Added temperature scaling for sampling
        prob = torch.softmax(output, dim=0).data
        char_idx = torch.multinomial(prob, 1).item()
        predicted_char = idx_to_char[char_idx]
        predicted += predicted_char
        
        # Prepare next input
        input_seq = torch.LongTensor([[char_idx]]).to(device)
    return predicted

# Example: generate text starting with "alice"
seed = "alice"
generated_text = sample(model, seed, length=300, temperature=0.8)  ## Adjusted temperature for more creative sampling
print("Generated text:\n")
print(generated_text)

Generated text:

alice first) all voice bringing from the officerfully as well as she evening all. there try
thundink three to one of this time, in the dormouse
speaking again, the long passing the new was no one great made of the drace. “offence, and it
lessons to—the whiter side,” said the king.

“call has began o
