# 文本生成案例

### 词嵌入层 Embedding

In [23]:
import torch
import torch.nn as nn
import jieba #分词工具 pip install jieba

def test01():
    text = '北京东奥的进度条已经过半，不少外国运动员在完成自己的比赛后踏上归途。'
    
    # 1.分词
    words = jieba.lcut(text) 
    print(words)
    
    # 2.构建词表
    index_to_word = {} #用索引区词
    word_to_index = {} #用word取词索引
    # 去重
    unique_words = list(set(words))
    for idx, word in enumerate(unique_words):
        index_to_word[idx] = word
        word_to_index[word] = idx
    
    # print(index_to_word)
    
    ##########
    # 3.构建词嵌入层    num_embeddings:词表大小;   embedding_dim:词向量维度
    embed = nn.Embedding(num_embeddings=len(index_to_word), embedding_dim=4)
    print(embed)
    
    # 4.句子用词向量表示
    for word in words:
        # 
        idx = word_to_index[word]
        # 获取这个词的词向量表示
        word_vec = embed(torch.tensor(idx))
        print(word, '->', word_vec)
        
    
test01()

['北京', '东奥', '的', '进度条', '已经', '过半', '，', '不少', '外国', '运动员', '在', '完成', '自己', '的', '比赛', '后', '踏上', '归途', '。']
Embedding(18, 4)
北京 -> tensor([0.3457, 0.3825, 0.5380, 1.7388], grad_fn=<EmbeddingBackward0>)
东奥 -> tensor([0.3363, 0.7970, 1.7221, 0.0421], grad_fn=<EmbeddingBackward0>)
的 -> tensor([-0.2941, -0.8324,  1.8048, -0.0365], grad_fn=<EmbeddingBackward0>)
进度条 -> tensor([ 0.7872, -0.1584,  0.8657, -1.6459], grad_fn=<EmbeddingBackward0>)
已经 -> tensor([ 0.7600, -1.7611,  1.2300,  0.3257], grad_fn=<EmbeddingBackward0>)
过半 -> tensor([-1.1398,  0.8186, -2.8867,  2.1403], grad_fn=<EmbeddingBackward0>)
， -> tensor([ 0.2748, -0.1309, -0.6391,  0.2122], grad_fn=<EmbeddingBackward0>)
不少 -> tensor([ 1.1781,  0.7161, -1.8390, -1.4239], grad_fn=<EmbeddingBackward0>)
外国 -> tensor([-0.5269,  0.2707,  0.8005, -0.5764], grad_fn=<EmbeddingBackward0>)
运动员 -> tensor([ 1.9003, -0.2485, -0.8052,  0.1101], grad_fn=<EmbeddingBackward0>)
在 -> tensor([1.5074, 0.5562, 1.0836, 1.3871], grad_fn=<EmbeddingBackwa

In [26]:
# rnn输入单个单词
def test02(): 

    # input_size: 单个词向量的维度
    # hidden_size: 隐藏层大小，也就是隐藏层中神经元个数，同时也是这个隐藏层输出数据的个数(每个神经元输出一个标量，多个神经元就输出多个标量)
    # num_layers: 隐藏层的个数
    input_size = 128
    hidden_size = 3
    num_layers = 1
    rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
    
    # 模拟要输入的一个词向量
    # inputs形状 (seq_len, batch_size, input_size)
    # seq_len: 句子长度，句子中词向量个数
    seq_len = 1
    batch_size = 1
    inputs = torch.randn(seq_len,batch_size,input_size)
    
    # 初始隐藏层
    # 隐藏层形状 (num_layers, batch_size, hidden_size)
    hn = torch.zeros(num_layers, batch_size, hidden_size)
    
    # 将这个词向量送到rnn中
    # output表示每个输入词向量对应的中间状态，h0表示最后一个词向量的中间状态
    # 在Pytorch中定义的RNN，其实是没有y这个的输出。
    # Pytorch版本的两个输出，output=[h1, h2, h3, h4], hn = h4
    # 如果想要得到输出层y，可以自行加一个全连接层
    output, hn = rnn(inputs, hn)
    print(output)
    print(hn)
    
test02()

tensor([[[ 0.8623, -0.9483,  0.9983]]], grad_fn=<StackBackward0>)
tensor([[[ 0.8623, -0.9483,  0.9983]]], grad_fn=<StackBackward0>)


In [30]:
# rnn输入句子
def test03(): 
    # input_size: 单个词向量的维度
    # hidden_size: 隐藏层大小，也就是隐藏层中神经元个数，同时也是这个隐藏层输出数据的个数(每个神经元输出一个标量，多个神经元就输出多个标量)
    # num_layers: 隐藏层的个数
    input_size = 128
    hidden_size = 8
    num_layers = 1
    rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
    
    # 模拟要输入的一个词向量
    # seq_len: 句子长度，句子中词向量个数
    # inputs形状 (seq_len, batch_size, input_size)
    seq_len = 3 #3个词向量
    batch_size = 1
    inputs = torch.randn(seq_len,batch_size,input_size)
    
    # 初始隐藏层
    # 隐藏层形状 (num_layers, batch_size, hidden_size)
    hn = torch.zeros(num_layers, batch_size, hidden_size)
    
    # 将句子送到rnn中
    # output表示每个输入词向量对应的中间状态，h0表示最后一个词向量的中间状态
    # 在Pytorch中定义的RNN，其实是没有y这个的输出。
    # Pytorch版本的两个输出，output=[h1, h2, h3, h4], hn = h4
    # 如果想要得到输出层y，可以自行加一个全连接层
    output, hn = rnn(inputs, hn)
    print(output)
    print(hn)
    
test03()

tensor([[[ 0.4931, -0.6627, -0.9259,  0.2352, -0.8524, -0.6563, -0.9168,
          -0.9999]],

        [[ 0.9473, -0.9958,  0.2207, -0.9973, -0.6312,  0.9970, -0.9940,
          -0.7935]],

        [[-0.9988,  0.4833,  0.9190, -0.0450,  0.8556, -0.2926,  0.3449,
           0.9977]]], grad_fn=<StackBackward0>)
tensor([[[-0.9988,  0.4833,  0.9190, -0.0450,  0.8556, -0.2926,  0.3449,
           0.9977]]], grad_fn=<StackBackward0>)


# 文本生成---周杰伦歌词

In [381]:
import torch
import re
import jieba
from torch.utils.data import DataLoader
import torch.nn.functional as F
import time

# 构建词典
def build_vocab():
    fname = 'data/jaychou_lyrics.txt'
    
    # 1.文本数据清晰
    clean_sentences = [] #存放的是清理好的句子
    for line in open(fname, 'r'):
        # 去除某些内容
        line = line.replace('〖韩语Rap译文〗','') 
        # 只保留特定符号 (中文、英文、数字、部分标点符号)
        line = re.sub(r'[^\u4e00-\u9fa5 a-zA-Z0-9!?,]', '', line) 
        # 连续空格替换成1个
        line = re.sub(r'[ ]{2,}', ' ', line)
        # 去除两侧空格、换行
        line = line.strip()
        # 去除单字的行
        if len(line) <= 1:
            continue
        
        #去掉重复行
        if line not in clean_sentences:
            clean_sentences.append(line)
        
    # print(clean_sentences)
    # 2.分词并构建重要词表映射
    index_to_word = [] # 索引到词的映射 ['词1'，'词2'，'词3']
    all_sentences = [] # 存放所有句子分词后的形式[[句子1分词],[句子2分词],[]]
    for line in clean_sentences:
        #分词
        words = jieba.lcut(line)
        # 分好词的句子存放到all_sentences中
        all_sentences.append(words)
        
        ####构造index_to_word 索引到词的 词表映射
        for word in words:
            if word not in index_to_word:
                index_to_word.append(word)
    
    # print(index_to_word)
    ##### 构建词到索引的词表映射
    word_to_index = {word:idx for idx, word in enumerate(index_to_word)}
    
    ##### 将输入的句子转换为索引表示
    # 把all_sentences中的每个词变成索引 ->放到 corpus_index
    corpus_index = []
    for line in all_sentences:
        temp = []
        for word in line:
            temp.append(word_to_index[word])
        
        # 每个句子最后加个空格 ' ' ?????
        temp.append(word_to_index[' '])
        #corpus_index.append(temp)
        corpus_index.extend(temp) # 变成一维数组? 其实无所谓，做能做成datasets就行
    
    # [0, 1, 2, 39,   0, 3, 4, 5, 6, 7, 39,    ]
    # print(corpus_index)
    # 词的数量
    word_count = len(index_to_word)
    return index_to_word, word_to_index, word_count,corpus_index,all_sentences
   
    
# test
def test00():
    index_to_word, word_to_index, word_count,corpus_index, all_sentences = build_vocab()
    print('word_count-> ',word_count)
    print('index_to_word-> ',index_to_word[:10])
    print('word_to_index-> ',dict(list(word_to_index.items())[:7]))
    
    print('corpus_index-> ',corpus_index[:10]) #显示前10个词
    print('all_sentences-> ',all_sentences[:2]) #显示前2个句子

# test00()

In [395]:
# 构建数据集对象
class LyricsDataset:
    # corpus_idx: 所有句子的索引表示
    # num_chars: 用它表示句子的长度，也就是一个句子的单词个数，但实际情况句子长度不是固定的，这里就假设固定了
    #           
    def __init__(self, corpus_index, num_chars):
        # 所有句子的索引表示
        self.corpus_index = corpus_index
        # 句子固定长度
        self.num_chars = num_chars
        # 词的个数
        self.word_count = len(self.corpus_index)
        # 整个数据集中有多少个句子
        self.number = self.word_count // self.num_chars
    
    # 整个数据集中有多少个句子
    def __len__(self):
        return self.number
    
    # 获取一个样本, idx:表示取第几个样本
    def __getitem__(self, idx):
        # 修改索引值: [0, self.word_count - 1]
        start = min(max(idx,0), self.word_count - self.num_chars - 2)
        
        # 假设样本      x = [0, 1, 9, 8, ... ]
        # 那么目标值就是 y = [1, 9, 8, ... ], 正好跟上面的x错开一位
        # start = start * self.num_chars # 要不要加这一句 ?????
        x = self.corpus_index[start: start + self.num_chars]
        y = self.corpus_index[start+1: start+1  + self.num_chars]

        return torch.tensor(x), torch.tensor(y)

    
    
def test02():
        _, _, _, corpus_idx,_ = build_vocab()
        # 数据集
        lyrics = LyricsDataset(corpus_idx, 5) # 句子长度是5
        # 加载数据
        dataloader = DataLoader(lyrics, batch_size=2)

        
        print('corpus_index-> ',corpus_idx[:15])
        for x,y in dataloader:
            print(x)
            print(y)
            break
        
# test02()

In [594]:
# 构建网络模型
class TextGenerator(nn.Module):
    # word_len: 词表大小
    def __init__(self, word_len):
        super(TextGenerator, self).__init__()
        
        self.dropout = nn.Dropout()
        
        # 初始化词嵌入层
        # num_embeddings:词表大小
        # embedding_dim: 词向量维度
        embedding_dim = 128
        self.ebd = nn.Embedding(num_embeddings=word_len, embedding_dim=embedding_dim)
        # print(self.ebd(torch.tensor(4)))
        
        # 循环网络 
        # input_size: 词向量维度
        # hidden_size: 隐藏层大小,输出维度(随便写),神经元个数,中间状态个数
        # num_layers: 网络层个数
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=128, num_layers=1) 
        
        # 输出层, 输出值个数是word_len的长度，应为要从整个此表中预测一个词的概率
        self.out = nn.Linear(128, word_len)
      
    
    def get_embd(self):
        return self.ebd

    # 前向传播
    # inputs: 输入的样本  (btach_size, seq_len)
    # hn: 中间隐藏层状态  (num_layers, batch_size, hidden_size)
    def forward(self, inputs, hn): 
        
        # 词嵌入
        embed = self.ebd(inputs)
        # print('inputs-> ',inputs.shape)
        # print('embed-> ', embed.shape)

        # 正则化
        embed = self.dropout(embed)
        
        # 送入rnn
        # (btach_size, seq_len, word_dim) -> (seq_len, btach_size, word_dim)
        # output: 包含每个时刻中间状态
        # hn: 最后一个时刻的中间状态
        # print('-->111111 ',inputs.shape, embed.shape, hn.shape)
        output, hn = self.rnn(embed.transpose(0,1), hn)
        
        # 放入全连接层  output -> out
        out = self.out(output)

        # hn = self.out(hn)
        return out, hn
       
        
    def init_hn(self,batch_size):
        # print('batch_size-> ', batch_size)
        # (num_layers, batch_size, hidden_size)
        return torch.zeros(1,batch_size,128)

    
    
def test03():
    index_to_word, word_to_index, word_count, corpus_idx,_ = build_vocab()   
    lyrics = LyricsDataset(corpus_idx, 5) # 5:句子长度
   
    batch_size = 1
    lyrics_dataloader = DataLoader(lyrics, shuffle=False, batch_size=batch_size)
    model = TextGenerator(word_count)
    
    for x, y in lyrics_dataloader:
        h0 = model.init_hn(batch_size)
        print('h0.shape-> ',h0.shape) #(num_layers, batch_size, hidden_size)
        print('x.shape-> ', x.shape)  #(btach_size, seq_len)
        print('y.shape-> ', y.shape)  #(btach_size, seq_len)
       
        output, hn = model(x, h0)
        print('output.shape->',output.shape) #(seq_len, btach_size, 词表长度)
        print('hn.shape->', hn.shape)

        break

# test03()



### 训练
def train():
    # 构建词典
    index_to_word, word_to_index, word_count, corpus_idx,_ = build_vocab()
    # 数据集
    num_chars = 32  # 句子长度
    lyrics = LyricsDataset(corpus_idx, num_chars)
    
    # 初始化模型
    model = TextGenerator(word_count)
    emdb = model.get_embd()
    
    # 损失函数
    loss_fun = nn.CrossEntropyLoss()
     # 优化方法
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)
    
    # 训练轮数
    epoch = 10

    # 开始训练
    for epoch_idx in range(epoch):
        # print('词向量 --->',epoch_idx ,'--- ',emdb(torch.tensor(4)))
        for name,param in model.named_parameters():
            print(name,'---> ',param)
            

        batch_size = 333
 
         # 数据加载器
        lyrics_dataloader = DataLoader(lyrics, shuffle=True, batch_size=batch_size)
        # 训练时间
        start = time.time()
        # 迭代次数
        iter_num = 0
        # 训练损失
        total_loss = 0.0
        
        for x, y in lyrics_dataloader:
            # x形状 (btach_size, seq_len)
            
            # 初始隐藏状态  batch_size == x.shape[0]
            hn = model.init_hn(x.shape[0])
            
            # 模型计算
            # out形状 (seq_len, btach_size, 词表长度)
            out, hn = model(x, hn)
            # print('out.shape-> ',out.shape) #[seq_len, batch_size, 词表长度]
            # print('y.shape-> ',y.shape,y) #[batch_size, seq_len]
            # print('out before-> ', out)
            
            # out形状 [seq_len, batch_size, 词表长度(分类个数)] -> [batch_size * seq_len, 词表长度(分类个数)]
            out = out.permute(1,0,2)
            out = out.reshape(out.shape[0]*out.shape[1], out.shape[2])
            # print(out.shape)
            # print('out after-> ', out)

            
            # y形状 (batch_size, seq_len) -> (batch_size * seq_len)
            # print('y before->',y)
            y = y.reshape(y.shape[0] * y.shape[1])
            # print('y after->',y)
            
            # 计算损失
            loss = loss_fun(out, y)
            
            # 梯度清零
            optimizer.zero_grad()
            # 反向传播
            loss.backward()
            # 参数更新
            optimizer.step()
            
            iter_num += 1
            total_loss += loss.item()
        
        
        message = 'epoch %3s loss: %.5f time %.2f' % \
                  (epoch_idx + 1,
                   total_loss / iter_num,
                   time.time() - start)
        print(message)  
        
    # 模型存储
    torch.save(model.state_dict(), 'model/text-generator_10.pth')

# 训练
train()

--->  <generator object Module.parameters at 0x147f7d000>
epoch   1 loss: 6.79065 time 3.09
--->  <generator object Module.parameters at 0x147f7d000>
epoch   2 loss: 4.55815 time 3.04
--->  <generator object Module.parameters at 0x147f7d000>


KeyboardInterrupt: 

In [592]:
# 预测函数
# text :输入的句子
# sentence_length: 要预测的单词个数
def predict(text, sentence_length):
    # 构建词典
    index_to_word, word_to_index, word_count, _ ,_= build_vocab()
    # 构建模型
    model = TextGenerator(word_count)
    # 加载参数
    model.load_state_dict(torch.load('model/text-generator_10.pth'))
    model.eval()
    
    emdb = model.get_embd()
    print(emdb(torch.tensor(4)))
    
    #分词
    words = jieba.lcut(text)
    word_idx_s = [] #分好的词放到这里
    generate_sentence = [] #预测的值放到这里
    for word in words:
        word_idx = word_to_index[word]
        word_idx_s.append(word_idx)
        # 存放预测的结果
        generate_sentence.append(word_idx)
  
    # print('---> ',word_idx,index_to_word[word_idx])
    
    # 初始隐藏状态
    hn = model.init_hn(1)
    # print(h0)
        
    # 开始预测
    for _ in range(sentence_length):
        # print(word_idx_s)
        out, hn = model(torch.tensor([word_idx_s]), hn)
        # print(out.shape)
        
        # 取最后一个预测的词
        out = out[out.shape[0] - 1]
        # print('out.shape-> ',out.shape)

        # 选择分数最大的词作为预测词
        word_idx = torch.argmax(out,dim=1).item()
        # print('--> ',word_idx)
  
        # 预测词放入到预测列表
        generate_sentence.append(word_idx)
        
        word_idx_s = [word_idx]
    
    # 打印预测的词
    for idx in generate_sentence:
        print(index_to_word[idx],'(',idx,')', end='')
        pass
        
    

predict('你是谁',50)

tensor([-1.6032,  1.0777,  1.0448,  0.3063,  1.0159, -1.1750,  0.1273, -0.6190,
         0.0452, -0.4296, -1.4291,  0.5964,  1.1408,  1.9881, -0.6465, -0.1233,
        -0.5359,  0.4323, -0.9345, -0.5459, -0.5024, -0.6420,  0.1008,  1.9956,
        -1.7389, -1.7927,  0.5973,  1.9554,  0.1012,  0.3456,  0.4889, -0.0502,
        -0.8623,  2.0572,  0.5201, -1.3454, -0.9763, -1.9414,  1.2468,  0.3165,
         0.8494,  1.0635, -1.0093,  0.9467,  0.8961, -0.3868,  0.6270,  1.0438,
         1.2811,  0.4118, -1.7610, -0.4277,  0.6290,  1.3352, -1.0936, -0.1874,
        -1.0669, -1.2879, -0.7912, -0.0576,  1.5061, -0.9747, -0.2678, -0.1020,
         0.8284, -0.8102,  0.2684, -1.2154, -0.2392,  0.1568, -1.9914, -1.6981,
        -1.4385,  0.4117, -0.5493, -0.6133, -1.1851, -0.9440, -1.6551,  0.8597,
        -1.1864, -2.1758, -0.7756, -0.4431, -0.6054, -1.7768, -0.9309,  0.0692,
        -0.4577,  0.4310, -1.3327, -0.9335, -0.3493, -0.0701, -0.1699, -0.4641,
        -0.4948, -0.8849,  0.5000, -1.02