In [1]:
import numpy as np
import random
import os

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [3]:
use_cuda = torch.cuda.is_available()

In [4]:
class Config(object):
    """RNNLM模型配置项"""
    embedding_dim = 256  # 词向量维度

    rnn_type = 'LSTM'  # 支持RNN/LSTM/GRU
    hidden_dim = 256  # 隐藏层维度
    num_layers = 1  # RNN 层数

    dropout = 0.5  # 丢弃概率
    tie_weights = False  # 是否绑定参数

    clip = 0.25  # 用于梯度规范化
    learning_rate = 0.01  # 初始学习率

    log_interval = 500  # 每隔多少个批次输出一次状态
    save_interval = 3  # 每个多少个轮次保存一次参数

In [5]:
class RNNLM(nn.Module):
    """基于RNN的语言模型，包含一个encoder，一个rnn模块，一个decoder。"""

    def __init__(self, config):
        super(RNNLM, self).__init__()

        v_size = config.vocab_size
        em_dim = config.embedding_dim
        dropout = config.dropout
        
        self.rnn_type = rnn_type = config.rnn_type
        self.hi_dim = hi_dim = config.hidden_dim
        self.n_layers = n_layers = config.num_layers

        # self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(v_size, em_dim)

        # rnn: RNN / LSTM / GRU
        self.rnn = getattr(nn, rnn_type)(em_dim, hi_dim, n_layers)
        self.decoder = nn.Linear(hi_dim, v_size)
        self.softmax = nn.LogSoftmax(dim=1)

        # tie_weights将encoder和decoder的参数绑定为同一参数。
        if config.tie_weights:
            if hi_dim != em_dim:  # 这两个维度必须相同
                raise ValueError('When using the tied flag, hi_dim must be equal to em_dim')
            self.decoder.weight = self.encoder.weight

        self.init_weights()  # 初始化权重

    def forward(self, inputs, hidden):
        seq_len = len(inputs)
        emb = self.encoder(inputs).view(seq_len, 1, -1)
        output, hidden = self.rnn(emb, hidden)
        output = F.relu(self.decoder(output.view(seq_len, -1)))
        output = self.softmax(output)
        return output, hidden  # 复原

    def init_weights(self):
        """权重初始化，如果tie_weights，则encoder和decoder权重是相同的"""
        init_range = 0.1
        self.encoder.weight.data.uniform_(-init_range, init_range)
        self.decoder.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.fill_(0)

    def init_hidden(self):
        """初始化隐藏层"""
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':  # lstm：(h0, c0)
            return (Variable(weight.new(self.n_layers, 1, self.hi_dim).zero_()),
                    Variable(weight.new(self.n_layers, 1, self.hi_dim).zero_()))
        else:  # gru 和 rnn：h0
            return Variable(weight.new(self.n_layers, 1, self.hi_dim).zero_())

In [6]:
def open_file(filename, mode='r'):
    return open(filename, mode=mode, encoding='utf-8', errors='ignore')

class Corpus(object):
    """
    文本预处理，获取词汇表，并将字符串文本转换为数字序列。
    """

    def __init__(self, train_dir, vocab_dir):
        assert os.path.exists(train_dir), 'File %s does not exist.' % train_dir
        
        if not os.path.exists(vocab_dir):
            words = list(set(list(open_file(train_dir).read().replace('\n', ''))))
            open_file(vocab_dir, 'w').write('\n'.join(sorted(words)) + '\n')
        
        words = open_file(vocab_dir).read().strip().split('\n')
        words = ['<SOS>', '<EOS>'] + words
        word_to_id = dict(zip(words, range(len(words))))
        
        data = []
        with open_file(train_dir) as f:
            for line in f:
                poem = [word_to_id[x] for x in line.strip() if x in word_to_id]
                data.append([0] + poem + [1])
        
        self.words = words
        self.word_to_id = word_to_id
        self.data = data

    def __repr__(self):
        return "Corpus length: %d, Vocabulary size: %d" % (len(self.data), len(self.words))

In [7]:
config = Config()
corpus = Corpus('data/poem.tang.txt', 'data/poem.vocab.txt')
config.vocab_size = len(corpus.words)
corpus

Corpus length: 51836, Vocabulary size: 7355

In [8]:
model = RNNLM(config)
if use_cuda:
    model.cuda()

In [9]:
criterion = nn.NLLLoss()

In [10]:
optimizer = optim.RMSprop(model.parameters(), lr=0.01, weight_decay=0.0001)

In [11]:
def generate(model, words, word_len=50, temperature=1.0):
    """生成一定数量的文本，temperature结合多项式分布可增添抽样的多样性。"""
    model.eval()
    hidden = model.init_hidden()  # batch_size为1
    inputs = Variable(torch.rand(1, 1).mul(len(words)).long(), volatile=True)  # 随机选取一个字作为开始
    if use_cuda:
        inputs = inputs.cuda()

    word_list = []
    for i in range(word_len):  # 逐字生成
        output, hidden = model(inputs, hidden)
        word_weights = output.squeeze().data.div(temperature).exp().cpu()

        # 基于词的权重，对其再进行一次抽样，增添其多样性，如果不使用此法，会导致常用字的无限循环
        word_idx = torch.multinomial(word_weights, 1)[0]
        inputs.data.fill_(word_idx)  # 将新生成的字赋给inputs
        word = words[word_idx]
        word_list.append(word)
    return word_list

In [12]:
def generate2(model, words, word_len=50, temperature=1.0):
    """生成一定数量的文本，temperature结合多项式分布可增添抽样的多样性。"""
    model.eval()
    hidden = model.init_hidden()  # batch_size为1
    #inputs = Variable(torch.rand(1, 1).mul(len(words)).long(), volatile=True)  # 随机选取一个字作为开始
    inputs = Variable(torch.LongTensor([0]))
    if use_cuda:
        inputs = inputs.cuda()

    word_list = []
    for i in range(word_len):  # 逐字生成
        output, hidden = model(inputs, hidden)
        topv, topi = output.data.topk(1)
        word_idx = topi[0][0]
        inputs.data.fill_(word_idx)  # 将新生成的字赋给inputs
        word = words[word_idx]
        word_list.append(word)
    return word_list

In [None]:
total_loss = 0.0
for i in range(50000):
    model.train()
    rand_data = random.choice(corpus.data)
    inputs = Variable(torch.LongTensor(rand_data[:-1]))
    targets = Variable(torch.LongTensor(rand_data[1:]))
    
    if use_cuda:
        inputs = inputs.cuda()
        targets = targets.cuda()

    hidden = model.init_hidden()
    outputs, hidden = model(inputs, hidden)
    loss = criterion(outputs, targets)
    model.zero_grad()
    loss.backward()
    optimizer.step()
    
    total_loss += loss
    if i % 50 == 0 and i > 0:
        print(total_loss.data[0] / 50)
        total_loss = 0.0
    
    if i % 100 == 0 and i > 0:
        gen_words = generate2(model, corpus.words)
        print(''.join(gen_words))

8.608308715820312
8.002685546875
<EOS>堂堂，，，，。<EOS>，。<EOS>，，。<EOS>，。时。。<EOS>。<EOS>，。<EOS>。<EOS>。<EOS>，。<EOS>，。<EOS>，。<EOS>。<EOS>。<EOS>，，。<EOS>，。
7.96868896484375
7.855473022460938
今来不事，，，。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。<EOS>。
