In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch
import pickle
from torch.utils.data import DataLoader, Dataset
import random

In [13]:
# # 数据准备
# input_texts = ["床前明月光", "举头望明月", "千山鸟飞绝"]
# target_texts = ["疑是地上霜", "低头思故乡", "万径人踪灭"]
text_list = []
def read_text_to_list(filepath,lines):
    """
    读取文本文件，将每一行作为一项保存到列表中。

    :param filepath: 文本文件路径
    :return: 包含文件每一行内容的列表
    """
    try:
        # 打开文件，使用 'r' 模式读取，指定编码为 UTF-8
        with open(filepath, 'r', encoding='utf-8') as file:
            # 逐行读取文件内容
            for line in file:
                lines.append(line.strip())  # 去掉行末的换行符并添加到列表
        print(f"文件已成功读取，共 {len(lines)} 行。")
    except Exception as e:
        print(f"读取文件时出错：{e}")

read_text_to_list('../../dataset/data_pro/唐诗/七言唐诗.txt',text_list)
read_text_to_list('../../dataset/data_pro/唐诗/六言唐诗.txt',text_list)
read_text_to_list('../../dataset/data_pro/唐诗/五言唐诗.txt',text_list)

input_texts,target_texts = [],[]
for i in text_list:
    input_texts.append(i.split()[0])
    target_texts.append(i.split()[1])

文件已成功读取，共 812548 行。
文件已成功读取，共 815991 行。
文件已成功读取，共 1599430 行。


In [6]:
# 创建字符到索引的映射
chars = list(set("".join(input_texts + target_texts)))
pad =list({"<PAD>","<EOS>"})
chars = pad+chars  # 使用集合的并集操作

In [4]:
# char_to_idx = {char: idx for idx, char in enumerate(chars)}
# idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [5]:
# with open('char_to_idx_1.pkl', 'wb') as f:
#     pickle.dump(char_to_idx, f)

# # 保存 idx_to_char 到文件
# with open('idx_to_char_1.pkl', 'wb') as f:
#     pickle.dump(idx_to_char, f)

In [15]:
# 读取 char_to_idx
with open('char_to_idx_1.pkl', 'rb') as f:
    char_to_idx = pickle.load(f)

# 读取 idx_to_char
with open('idx_to_char_1.pkl', 'rb') as f:
    idx_to_char = pickle.load(f)

In [17]:
# 将文本转换为索引序列
def text_to_seq(text, char_to_idx):
    return [char_to_idx.get(char,random.randint(2, len(char_to_idx)-1)) for char in text]

In [19]:
input_seqs = [text_to_seq(text, char_to_idx) for text in input_texts]
target_seqs = [text_to_seq(text, char_to_idx) for text in target_texts]

In [21]:
target_seqs[1]

[1975, 1723, 3163, 476, 1839, 2291, 2529]

In [10]:
target_seqs[1]

[1975, 1723, 3163, 476, 1839, 2291, 2529]

In [11]:
# 填充序列到相同长度
max_len = 10
# max_len = max(len(seq) for seq in input_seqs + target_seqs)
input_seqs = [seq +[1] + [0] * (max_len - len(seq) -1) for seq in input_seqs]
target_seqs = [seq + [1] + [0] * (max_len - len(seq) -1) for seq in target_seqs]

In [12]:
class TextDataset(Dataset):
    def __init__(self, label_sequences, target_sequences):
        self.label_sequences =  torch.tensor(label_sequences, dtype=torch.long)
        self.target_sequences = torch.tensor(target_sequences, dtype=torch.long)
        
    def __len__(self):
        return len(self.label_sequences)
    
    def __getitem__(self, idx):
        return self.label_sequences[idx], self.target_sequences[idx]

In [12]:
# input_tensor = torch.tensor(input_seqs, dtype=torch.long)
# target_tensor = torch.tensor(target_seqs, dtype=torch.long)

In [13]:
dataset = TextDataset(input_seqs, target_seqs)
dataloader = DataLoader(dataset, batch_size=80000, shuffle=True)

In [16]:
testset = TextDataset(input_seqs[0:50], target_seqs[0:50])
dataloader = DataLoader(testset, batch_size=10, shuffle=True)
# input_tests = input_tests[0:50]

In [23]:
# 定义Seq2Seq模型
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.encoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, cell) = self.encoder(embedded)
        output, _ = self.decoder(embedded, (hidden, cell))
        output = self.fc(output)
        return output

In [17]:
# 模型参数
input_size = len(chars)
hidden_size = 64
output_size = len(chars)

In [21]:
# 初始化模型、损失函数和优化器
model = Seq2Seq(input_size, hidden_size, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (embedding): Embedding(3865, 64)
  (encoder): LSTM(64, 64, batch_first=True)
  (decoder): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3865, bias=True)
)

In [12]:
model = torch.load('./model_seq2seq_h64.pth')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (embedding): Embedding(3865, 64)
  (encoder): LSTM(64, 64, batch_first=True)
  (decoder): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3865, bias=True)
)

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

NameError: name 'model' is not defined

In [18]:
# 训练模型
epochs = 1
losss = 15
for epoch in range(epochs):
    optimizer.zero_grad()
    for input_tensor, target_tensor in dataloader:
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)
        output = model(input_tensor)
        loss = criterion(output.view(-1, output_size), target_tensor.view(-1))
        loss.backward()
        optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
    if loss.item() < losss:
        losss = loss.item()
        torch.save(model, 'model_seq2seq_h64.pth')
        # print(f"Loss: {loss.item():.4f}，效果更好，进行保存")

Epoch [1/1], Loss: 0.1337


In [None]:
torch.save(model, 'model_seq2seq_h64.pth')

In [17]:
input_size = len(chars)
hidden_size =128
output_size = len(chars)

In [19]:
# 初始化模型、损失函数和优化器
model = Seq2Seq(input_size, hidden_size, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 训练模型
epochs = 2000
losss = 15
for epoch in range(epochs):
    optimizer.zero_grad()
    for input_tensor, target_tensor in dataloader:
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)
        output = model(input_tensor)
        loss = criterion(output.view(-1, output_size), target_tensor.view(-1))
        loss.backward()
        optimizer.step()
        # if (epoch + 1) % 10 == 0:
        #     print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
    if loss.item() < losss:
        losss = loss.item()
        torch.save(model, 'model_seq2seq_h128.pth')
        print(f"Loss: {loss.item():.4f}，效果更好，进行保存")

Epoch [1/2000], Loss: 6.0268
Loss: 6.0268，效果更好，进行保存
Epoch [2/2000], Loss: 4.7342
Loss: 4.7342，效果更好，进行保存
Epoch [3/2000], Loss: 4.4887
Loss: 4.4887，效果更好，进行保存
Epoch [4/2000], Loss: 4.3923
Loss: 4.3923，效果更好，进行保存
Epoch [5/2000], Loss: 4.3086
Loss: 4.3086，效果更好，进行保存
Epoch [6/2000], Loss: 4.2662
Loss: 4.2662，效果更好，进行保存
Epoch [7/2000], Loss: 4.2190
Loss: 4.2190，效果更好，进行保存
Epoch [8/2000], Loss: 4.1741
Loss: 4.1741，效果更好，进行保存
Epoch [9/2000], Loss: 4.1307
Loss: 4.1307，效果更好，进行保存
Epoch [10/2000], Loss: 4.0865
Loss: 4.0865，效果更好，进行保存
Epoch [11/2000], Loss: 4.0340
Loss: 4.0340，效果更好，进行保存
Epoch [12/2000], Loss: 3.9992
Loss: 3.9992，效果更好，进行保存
Epoch [13/2000], Loss: 3.9539
Loss: 3.9539，效果更好，进行保存
Epoch [14/2000], Loss: 3.9336
Loss: 3.9336，效果更好，进行保存
Epoch [15/2000], Loss: 3.8856
Loss: 3.8856，效果更好，进行保存
Epoch [16/2000], Loss: 3.7783
Loss: 3.7783，效果更好，进行保存
Epoch [17/2000], Loss: 3.6825
Loss: 3.6825，效果更好，进行保存
Epoch [18/2000], Loss: 3.6002
Loss: 3.6002，效果更好，进行保存
Epoch [19/2000], Loss: 3.5009
Loss: 3.5009，效果更好，进行保存
Ep

In [27]:
model = torch.load('./model_seq2seq_h128.pth',weights_only=False,map_location=torch.device('cpu'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (embedding): Embedding(3865, 128)
  (encoder): LSTM(128, 128, batch_first=True)
  (decoder): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=3865, bias=True)
)

In [29]:
def test_met(input_text):
    input_seq = [text_to_seq(text, char_to_idx) for text in [input_text]]
    max_len = 10
    input_seq = [seq +[1] + [0] * (max_len - len(seq) -1) for seq in input_seq]
    input_seq = torch.tensor(input_seq)
    input_seq = input_seq.to(device)
    test_output = model(input_seq)
    predicted = torch.argmax(test_output, dim=-1)
    # print(f"Predicted: {''.join([idx_to_char[idx.item()] for idx in predicted[0]])}")
    return {''.join([idx_to_char[idx.item()] for idx in predicted[0]])}

In [31]:
for i in range(10):
    print(input_texts[i])
    print(test_met(input_texts[i]))
    print(target_texts[i])
    print('__________________')

少年力学志须彊
{'得失由来一梦长<PAD><EOS><EOS>'}
得失由来一梦长
__________________
试问邯郸欹枕客
{'人间几度熟黄粱<PAD><EOS><EOS>'}
人间几度熟黄粱
__________________
脂脸轻匀作艳粧
{'未应洁白似梅香<PAD><EOS><EOS>'}
未应洁白似梅香
__________________
夭红不见凌霜操
{'漫向春前取次芳<PAD><EOS><EOS>'}
漫向春前取次芳
__________________
纷纷朝市竞秋毫
{'江上霜风正怒号<PAD><EOS><EOS>'}
江上霜风正怒号
__________________
不问扬澜与彭浪
{'翩然东下日千艘<PAD><EOS><EOS>'}
翩然东下日千艘
__________________
似闻疏雨打篷声
{'枕上悠扬梦半醒<PAD><EOS><EOS>'}
枕上悠扬梦半醒
__________________
明日觉来浑不记
{'隔船相语过前汀<PAD><EOS><EOS>'}
隔船相语过前汀
__________________
夹屋青松翠霭中
{'去年经此亦匆匆<PAD><EOS><EOS>'}
去年经此亦匆匆
__________________
重来乌石冈头路
{'依旧松声带晓风<PAD><EOS><EOS>'}
依旧松声带晓风
__________________


In [68]:
test_met('风急天高鸟飞回')

{'老栖衣白龙九和<PAD><EOS><EOS>'}