# 数据准备

In [5]:
import os
import numpy as np
import tensorflow as tf

# 读取Shakespeare文本文件
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# 打印文本的前100个字符
print(f"文本长度: {len(text)}")
print(f"文本前100个字符:\n{text[:100]}")

# 创建字符级别的字典
vocab = sorted(set(text))
print(f"字典大小: {len(vocab)}")
print(f"字典内容: {vocab}")

# 创建字符到索引的映射
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

# 打印映射示例
print("\n字符到索引的映射示例:")
for char in text[:20]:
    print(f"'{char}' -> {char_to_idx[char]}")

# 将文本转换为数字序列
text_as_int = np.array([char_to_idx[c] for c in text]) #把全部文本都变为id
print(f"\n文本转换为数字序列的前20个元素:\n{text_as_int[:20]}")
print(f"将数字序列转回字符:\n{''.join([idx_to_char[idx] for idx in text_as_int[:20]])}")



文本长度: 1115394
文本前100个字符:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
字典大小: 65
字典内容: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

字符到索引的映射示例:
'F' -> 18
'i' -> 47
'r' -> 56
's' -> 57
't' -> 58
' ' -> 1
'C' -> 15
'i' -> 47
't' -> 58
'i' -> 47
'z' -> 64
'e' -> 43
'n' -> 52
':' -> 10
'
' -> 0
'B' -> 14
'e' -> 43
'f' -> 44
'o' -> 53
'r' -> 56

文本转换为数字序列的前20个元素:
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56]
将数字序列转回字符:
First Citizen:
Befor


# 把莎士比亚文集分成一个一个的样本

In [None]:
# 定义序列长度和批次大小
import torch
from torch.utils.data import Dataset, DataLoader

seq_length = 100  # 每个样本的序列长度
batch_size = 64   # 每个批次的样本数量

# 创建自定义数据集类
class ShakespeareDataset(Dataset):
    def __init__(self, text_as_int, seq_length):
        self.text_as_int = text_as_int
        self.seq_length = seq_length
        self.sub_len = seq_length + 1 #一个样本的长度
        
    def __len__(self):
        # 计算可能的序列数量
        return len(self.text_as_int)//(self.seq_length+1) #+1是因为要预测下一个字符
        
    def __getitem__(self, idx):
        return self.text_as_int[idx*self.sub_len:(idx+1)*self.sub_len] #返回一个样本

# 定义collate函数，用于处理批次数据
def collate_fct(batch):
    # 将批次数据转换为张量
    batch = torch.tensor(batch)
    # 输入序列是除了最后一个字符的所有字符
    input_batch = batch[:, :-1]
    # 目标序列是除了第一个字符的所有字符
    target_batch = batch[:, 1:]
    return input_batch, target_batch

# 创建数据集实例
shakespeare_dataset = ShakespeareDataset(text_as_int, seq_length)

# 创建数据加载器
dataloader = DataLoader(shakespeare_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fct)

# 打印示例，查看输入和目标
for input_batch, target_batch in dataloader:
    print(f"输入批次形状: {input_batch.shape}")
    print(f"目标批次形状: {target_batch.shape}")
    
    # 打印第一个样本的输入和目标
    print(input_batch)
    print(target_batch)
    break

print(f"\n数据集大小: {len(shakespeare_dataset)}")
print(f"批次数量: {len(dataloader)}")


输入批次形状: torch.Size([64, 100])
目标批次形状: torch.Size([64, 100])
tensor([[53, 59, 56,  ...,  6,  0, 32],
        [58, 46,  1,  ..., 57, 58, 56],
        [52, 42,  1,  ..., 52, 41, 43],
        ...,
        [43, 43,  1,  ..., 43, 39, 58],
        [39, 52, 42,  ...,  1, 57, 53],
        [52, 58, 11,  ..., 21, 26, 19]], dtype=torch.int32)
tensor([[59, 56,  1,  ...,  0, 32, 46],
        [46,  1, 53,  ..., 58, 56, 47],
        [42,  1, 35,  ..., 41, 43,  1],
        ...,
        [43,  1, 53,  ..., 39, 58, 46],
        [52, 42,  1,  ..., 57, 53, 52],
        [58, 11,  1,  ..., 26, 19,  1]], dtype=torch.int32)

数据集大小: 11043
批次数量: 172


  batch = torch.tensor(batch)


# 搭建模型

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# 定义莎士比亚文本生成的RNN模型
class ShakespeareRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        """
        初始化RNN模型
        
        参数:
        vocab_size: 词汇表大小，即字符数量
        embedding_dim: 嵌入层维度，将字符转换为向量表示
        hidden_dim: RNN隐藏层维度
        batch_size: 批次大小
        """
        super(ShakespeareRNN, self).__init__()
        
        # 嵌入层：将字符ID转换为密集向量表示
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # RNN层：处理序列信息
        self.rnn = nn.RNN(
            embedding_dim,      # 输入特征维度
            hidden_dim,         # 隐藏状态维度
            num_layers=1,       # RNN层数
            bidirectional=False, # 单向RNN
            batch_first=True    # 批次维度在前
        )
        
        # 全连接层：将RNN输出映射到词汇表大小
        self.dense = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden=None):
        """
        前向传播
        
        参数:
        x: 输入序列，形状为 [batch_size, sequence_length]
        hidden: 初始隐藏状态，默认为None
        
        返回:
        output: 模型输出，形状为 [batch_size, sequence_length, vocab_size]
        hidden: 最终隐藏状态
        """
        # 输入形状: [batch_size, sequence_length]
        x = self.embedding(x)  # 嵌入层：形状变为 [batch_size, sequence_length, embedding_dim]
        output, hidden = self.rnn(x, hidden)  # RNN层：形状为 [batch_size, sequence_length, hidden_dim]
        output = self.dense(output)  # 全连接层：形状为 [batch_size, sequence_length, vocab_size]
        return output, hidden
    

# 定义模型超参数
vocab_size = len(char_to_idx)  # 词汇表大小：字符到索引的映射数量
embedding_dim = 256  # 嵌入维度：字符向量表示的维度
rnn_units = 1024  # RNN隐藏单元数量：控制模型的表达能力

# 实例化模型
model = ShakespeareRNN(vocab_size, embedding_dim, rnn_units, batch_size)
print(model)


ShakespeareRNN(
  (embedding): Embedding(65, 256)
  (rnn): RNN(256, 1024, batch_first=True)
  (dense): Linear(in_features=1024, out_features=65, bias=True)
)


In [10]:
# 创建测试数据
batch_size = 2
sequence_length = 10

# 生成随机输入序列（字符ID）
test_input = torch.randint(0, vocab_size, (batch_size, sequence_length))
print(f"输入形状: {test_input.shape}")
print(f"输入数据:\n{test_input}")

# 进行前向传播
with torch.no_grad():  # 不计算梯度，节省内存
    output, hidden = model(test_input)

# 打印输出信息
print(f"\n输出形状: {output.shape}")
print(f"隐藏状态形状: {hidden.shape}")
print(f"输出数据形状: {output.shape}")
print(f"词汇表大小: {vocab_size}")

# 验证输出维度是否正确
expected_output_shape = (batch_size, sequence_length, vocab_size)
expected_hidden_shape = (1, batch_size, rnn_units)  # (num_layers, batch_size, hidden_dim)

print(f"\n验证结果:")
print(f"输出形状正确: {output.shape == expected_output_shape}")
print(f"隐藏状态形状正确: {hidden.shape == expected_hidden_shape}")


print("\n✅ 模型前向传播测试通过！")


输入形状: torch.Size([2, 10])
输入数据:
tensor([[23,  3, 55,  6, 13, 41, 51, 18, 64, 54],
        [ 0, 50, 41, 15, 47, 59, 37, 40,  7, 55]])

输出形状: torch.Size([2, 10, 65])
隐藏状态形状: torch.Size([1, 2, 1024])
输出数据形状: torch.Size([2, 10, 65])
词汇表大小: 65

验证结果:
输出形状正确: True
隐藏状态形状正确: True

✅ 模型前向传播测试通过！
