# Character-Level RNN and Hidden State Evolution

## Step 1: Setting Up the RNN Environment


In [None]:
import torch
import torch.nn as nn

# Seed PyTorch's global random number generator so that the randomly
# initialized model weights (and the values computed from them) are the
# same on every run.
torch.manual_seed(0)

# Define a basic RNN
class CharRNN(nn.Module):
    """Recurrent layer (`nn.RNN`) followed by a per-time-step linear read-out.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of values produced per time step
            (e.g. the vocabulary size for next-character logits).
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True means input/output tensors are (batch, seq, feature)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Run the RNN over `x` and project every time step to `output_size`.

        Args:
            x: Input of shape (batch_size, seq_len, input_size).
            hidden: Initial hidden state of shape
                (num_layers * num_directions, batch_size, hidden_size).

        Returns:
            A tuple `(out, hidden)` where `out` has shape
            (batch_size, seq_len, output_size) and `hidden` is the hidden
            state after the last time step (same shape as the input state).
        """
        # out shape after the RNN: (batch_size, seq_len, hidden_size)
        out, hidden = self.rnn(x, hidden)

        # nn.Linear acts on the last dimension only, so every time step is
        # projected independently: (..., hidden_size) -> (..., output_size)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size: int) -> torch.Tensor:
        """Return an all-zero initial hidden state for `batch_size` sequences.

        The tensor is created with the same dtype and on the same device as
        the model's parameters, so it stays usable after the module has been
        moved or cast (e.g. `.to(device)`, `.double()`).

        Returns:
            Tensor of shape (num_layers * num_directions, batch_size, hidden_size).
        """
        reference = next(self.parameters())
        # Derive the leading dimension from the RNN's own configuration
        # (1 for the single-layer, unidirectional RNN built in __init__).
        num_directions = 2 if self.rnn.bidirectional else 1
        return torch.zeros(
            self.rnn.num_layers * num_directions,
            batch_size,
            self.hidden_size,
            dtype=reference.dtype,
            device=reference.device,
        )

## Step 2: Demonstrating Character-Level Input Handling


In [None]:
# Vocabulary: each character gets an integer id; idx2char is the inverse lookup.
char2idx = {'a': 0, 'b': 1, 'c': 2}
idx2char = dict((idx, ch) for ch, idx in char2idx.items())
vocab_size = len(char2idx)

# The demo sequence and its integer encoding.
input_seq = 'abc'
input_indices = list(map(char2idx.__getitem__, input_seq))

# Build the RNN input with layout (batch_size, seq_len, input_size):
#   batch_size = 1  -> a single sequence
#   seq_len         -> one step per character of input_seq
#   input_size = 1  -> the raw character index is the only feature
#                      (with one-hot encoding input_size would be vocab_size)
flat_indices = torch.tensor(input_indices, dtype=torch.float32)  # (seq_len,)
input_tensor = flat_indices.unsqueeze(0).unsqueeze(-1)           # (1, seq_len, 1)

print(f"Input sequence: '{input_seq}'")
print(f"Character indices: {input_indices}")
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Input tensor preview: {input_tensor}")

Input sequence: 'abc'
Character indices: [0, 1, 2]
Input tensor shape: torch.Size([1, 3, 1])
Input tensor preview: tensor([[[0.],
         [1.],
         [2.]]])


## Step 3: Running the Forward Pass and Observing Hidden State Evolution


In [None]:
# Hyper-parameters of the demo network
rnn_input_size = 1            # matches the feature dimension of input_tensor
rnn_hidden_size = 5           # hidden-state width, chosen arbitrarily
rnn_output_size = vocab_size  # one logit per character in the vocabulary

# Build the network and its starting hidden state. The hidden state's batch
# size (1) has to agree with the batch size of input_tensor.
model = CharRNN(rnn_input_size, rnn_hidden_size, rnn_output_size)
hidden = model.init_hidden(1)

print(f"\nModel: CharRNN(input_size={rnn_input_size}, hidden_size={rnn_hidden_size}, output_size={rnn_output_size})")
print(f"Initial hidden state shape: {hidden.shape}")
print(f"\nProcessing sequence '{input_seq}' step-by-step:\n")

# Feed the sequence one character at a time, threading the hidden state
# returned by each call into the next one.
for i, char_index in enumerate(input_indices):
    # Length-1 window along the sequence axis; keeps all three dimensions,
    # so the slice is (batch_size, 1, input_size) == (1, 1, 1).
    current_char_tensor = input_tensor.narrow(1, i, 1)

    # output: logits for the next character, shape (batch_size, 1, output_size)
    # hidden: updated state, shape (1, batch_size, hidden_size)
    output, hidden = model(current_char_tensor, hidden)

    current_char = idx2char[char_index]
    print(f"Step {i+1}: Input Char '{current_char}'")
    # .item() pulls the single float out of the (1, 1, 1) slice
    print(f"  Tensor: {current_char_tensor.item():.1f}")
    # .detach() drops gradient tracking so the tensor can be converted to
    # NumPy; .squeeze() removes the size-1 axes for a compact printout
    print(f"  Hidden State: {hidden.detach().numpy().squeeze()}")
    print(f"  Output Logits (for next char): {output.detach().numpy().squeeze()}\n")

# Once the loop finishes, `output` contains the logits produced for the
# *last* character ('c') and `hidden` is the final hidden state.


Model: CharRNN(input_size=1, hidden_size=5, output_size=3)
Initial hidden state shape: torch.Size([1, 1, 5])

Processing sequence 'abc' step-by-step:

Step 1: Input Char 'a'
  Tensor: 0.0
  Hidden State: [-0.5321693   0.12468402  0.09426841 -0.00631211 -0.09539835]
  Output Logits (for next char): [-0.26382893  0.26871914 -0.4557599 ]

Step 2: Input Char 'b'
  Tensor: 1.0
  Hidden State: [-0.5642533   0.38454792 -0.21128117 -0.36226147 -0.26284283]
  Output Logits (for next char): [-0.14454445  0.34262192 -0.4753492 ]

Step 3: Input Char 'c'
  Tensor: 2.0
  Hidden State: [-0.642596    0.7131781  -0.44515392 -0.60197914 -0.09150449]
  Output Logits (for next char): [-0.19470005  0.43686768 -0.35626823]



## Step 4: Visualizing and Interpreting Output Predictions


In [None]:
# `output` is left over from the step-by-step loop: it holds the logits
# computed after the last character, with shape (batch_size, 1, output_size).

# The predicted next character is the one with the largest logit.
# dim=2 is the output_size (vocabulary) axis, so the result has shape
# (batch_size, 1), i.e. (1, 1) here.
predicted_index_tensor = torch.argmax(output, dim=2)

# .item() converts the single-element tensor to a plain Python int.
predicted_index = predicted_index_tensor.item()

# Map the index back to its character.
predicted_char = idx2char[predicted_index]

# Plain string literal: this line has no placeholders, so no f-prefix is needed.
print("--- Final Prediction ---")
print(f"After processing the full sequence '{input_seq}', the final output logits are: {output.detach().numpy().squeeze()}")
print(f"The model predicts the next character could be: '{predicted_char}' (Index: {predicted_index})")
print("\nNote: This prediction is from an untrained model with random initial weights.")
print("The purpose is to show the *mechanism* of prediction and hidden state flow, not to demonstrate learned behavior.")

--- Final Prediction ---
After processing the full sequence 'abc', the final output logits are: [-0.19470005  0.43686768 -0.35626823]
The model predicts the next character could be: 'b' (Index: 1)

Note: This prediction is from an untrained model with random initial weights.
The purpose is to show the *mechanism* of prediction and hidden state flow, not to demonstrate learned behavior.
