In [15]:
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Print the installed PyTorch version string.
print(torch.__version__)

2.3.1+cu121


In [3]:
# Accumulator shared by all read_text_to_list calls below.
text_list = []
def read_text_to_list(filepath,lines):
    """
    Read a UTF-8 text file and append every line to ``lines``.

    Each line has leading and trailing whitespace removed (``str.strip``)
    before it is stored. The file is read completely before ``lines`` is
    modified, so a failure part-way through the file (for example a decoding
    error) leaves ``lines`` exactly as it was instead of holding a partial file.

    Errors are reported with ``print`` and are not re-raised.

    :param filepath: path of the text file to read
    :param lines: list that is extended in place with the stripped lines
    :return: None
    """
    try:
        # Collect into a temporary list first so the update of ``lines`` is all-or-nothing.
        with open(filepath, 'r', encoding='utf-8') as file:
            new_lines = [line.strip() for line in file]
        lines.extend(new_lines)
        # The reported count is the running total in ``lines``, not just this file.
        print(f"文件已成功读取，共 {len(lines)} 行。")
    except Exception as e:
        print(f"读取文件时出错：{e}")

# Load the three corpus files in order, accumulating every line into text_list.
for corpus_path in (
    '../../dataset/data_pro/唐诗/七言唐诗.txt',
    '../../dataset/data_pro/唐诗/六言唐诗.txt',
    '../../dataset/data_pro/唐诗/五言唐诗.txt',
):
    read_text_to_list(corpus_path, text_list)

文件已成功读取，共 812548 行。
文件已成功读取，共 815991 行。
文件已成功读取，共 1599430 行。


In [4]:
# Split every corpus line into its first two whitespace-separated fields:
# the first becomes the model input, the second the target.
label_text, target_text = [], []
for line_no, line in enumerate(text_list, start=1):
    fields = line.split()  # split once instead of once per field
    if not fields:
        # Blank lines carry no pair; skip them rather than crash.
        continue
    if len(fields) < 2:
        # Fail with context instead of a bare IndexError.
        raise ValueError(
            f"text_list entry {line_no} has no target field: {line!r}"
        )
    label_text.append(fields[0])
    target_text.append(fields[1])

In [5]:
# Preview the first ten input lines.
label_text[:10]

['少年力学志须彊',
 '试问邯郸欹枕客',
 '脂脸轻匀作艳粧',
 '夭红不见凌霜操',
 '纷纷朝市竞秋毫',
 '不问扬澜与彭浪',
 '似闻疏雨打篷声',
 '明日觉来浑不记',
 '夹屋青松翠霭中',
 '重来乌石冈头路']

In [6]:
# Preview the first ten target lines.
target_text[:10]

['得失由来一梦长',
 '人间几度熟黄粱',
 '未应洁白似梅香',
 '漫向春前取次芳',
 '江上霜风正怒号',
 '翩然东下日千艘',
 '枕上悠扬梦半醒',
 '隔船相语过前汀',
 '去年经此亦匆匆',
 '依旧松声带晓风']

In [8]:
# Build the vocabulary
def yield_tokens(data):
    """Lazily yield the token list of every text in ``data``.

    Uses the module-level ``tokenizer`` callable, which is resolved when the
    generator is consumed, not when this function is defined.
    """
    yield from (tokenizer(sample) for sample in data)


# Character-level tokenizer: every character of a verse is one token.
# get_tokenizer(None) is NOT character-level -- it returns a whitespace
# splitter, which would turn each unsegmented verse into a single token.
# The built-in ``list`` splits a string into its characters.
tokenizer = list

# Vocabulary over both halves of every pair; unknown tokens map to <unk>.
vocab = build_vocab_from_iterator(yield_tokens(label_text + target_text), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

# Convert each text into a tensor of vocabulary indices
label_sequences = [torch.tensor(vocab(tokenizer(text)), dtype=torch.long) for text in label_text]
target_sequences = [torch.tensor(vocab(tokenizer(text)), dtype=torch.long) for text in target_text]

# Pad with <pad> so each collection becomes one (num_samples, max_len) tensor
label_sequences = pad_sequence(label_sequences, batch_first=True, padding_value=vocab["<pad>"])
target_sequences = pad_sequence(target_sequences, batch_first=True, padding_value=vocab["<pad>"])

In [9]:
# Number of entries in the vocabulary.
len(vocab)

10203

In [11]:
# Persist the vocabulary object to disk with pickle
with open('./vocab.pkl', mode='wb') as vocab_file:
    pickle.dump(vocab, vocab_file)

In [None]:
# Restore a previously pickled vocabulary from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
with open('./model/v1.0_lstm_单字/vocab.pkl', mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [16]:
class TextDataset(Dataset):
    """Dataset of (input, target) pairs aligned by index.

    :param label_sequences: indexable collection of model inputs
    :param target_sequences: indexable collection of targets, same length
    :raises ValueError: if the two collections differ in length
    """

    def __init__(self, label_sequences, target_sequences):
        # __len__ only consults the inputs, so a mismatch would otherwise
        # surface late as an IndexError or as silently ignored targets.
        if len(label_sequences) != len(target_sequences):
            raise ValueError(
                "label_sequences and target_sequences must have the same length, "
                f"got {len(label_sequences)} and {len(target_sequences)}"
            )
        self.label_sequences = label_sequences
        self.target_sequences = target_sequences

    def __len__(self):
        """Return the number of pairs."""
        return len(self.label_sequences)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        return self.label_sequences[idx], self.target_sequences[idx]

In [17]:
# Number of input lines collected in label_text.
len(label_text)

1599430

In [18]:
# Wrap the padded index tensors, not the raw strings: the default collate
# function would turn strings into lists of str, which cannot be moved to a
# device or fed to the embedding layer.
dataset = TextDataset(label_sequences, target_sequences)
# The model emits one score per vocabulary entry for every position, so the
# logits of a batch hold batch_size * seq_len * len(vocab) floats. A batch of
# 600000 would need well over 100 GB for the logits alone; use a size that fits.
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

In [19]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection onto the vocabulary.

    :param vocab_size: number of embedding rows and of output scores per position
    :param embedding_dim: size of each token embedding
    :param hidden_dim: size of the LSTM hidden state
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # batch_first=True: tensors are laid out as (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token indices ``x`` to per-position scores over the vocabulary."""
        hidden_states, _ = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

# Hyperparameters
vocab_size = len(vocab)
embedding_dim, hidden_dim = 64, 128

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model directly on the chosen device and show its layers
model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)
print(model)

LSTMModel(
  (embedding): Embedding(10203, 64)
  (lstm): LSTM(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=10203, bias=True)
)


In [None]:
import torch.optim as optim

# Loss and optimizer; positions holding <pad> do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch tensors must live on the same device as the model.
# The inputs are label_sequences (the original referenced an undefined
# ``input_sequences``).
# NOTE(review): one forward pass over the entire padded dataset produces
# num_samples * seq_len * vocab_size logits; on the full corpus that will not
# fit in memory, so this loop is presumably only usable on a small subset -- confirm.
full_inputs = label_sequences.to(device)
full_targets = target_sequences.to(device)

# Training loop: one optimizer step per epoch over the whole batch
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass; flatten to (positions, vocab_size) vs. (positions,) for the loss
    outputs = model(full_inputs)
    loss = criterion(outputs.reshape(-1, vocab_size), full_targets.reshape(-1))

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:

# Mini-batch size
batch_size = 32
# Batches must be index tensors, so build the dataset from the padded
# sequences rather than from the raw strings.
dataset = TextDataset(label_sequences, target_sequences)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Fresh model on the selected device (replaces the undefined YourModel placeholder)
model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)

# Loss and optimizer; positions holding <pad> do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch_input, batch_target in dataloader:
        # Move the batch to the same device as the model
        batch_input, batch_target = batch_input.to(device), batch_target.to(device)

        optimizer.zero_grad()

        # Forward pass; flatten to (positions, vocab_size) vs. (positions,) for the loss
        outputs = model(batch_input)
        loss = criterion(outputs.reshape(-1, vocab_size), batch_target.reshape(-1))

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')