In [1]:
import os
import random
from collections import Counter

import time
import math

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()

In [3]:
def open_file(filename, mode='r'):
    """Open ``filename`` as UTF-8 text in the requested ``mode``.

    Encoding/decoding errors are ignored (``errors='ignore'``) rather than
    raised. The caller owns the returned handle and must close it.
    """
    handle = open(filename, mode=mode, encoding='utf-8', errors='ignore')
    return handle


def read_vocab(vocab_path):
    """Load a vocabulary file that stores one token per line.

    Args:
        vocab_path: path of the vocabulary file.

    Returns:
        A ``(words, word_to_id)`` tuple: ``words`` is the list of tokens in
        file order and ``word_to_id`` maps each token to its index in that
        list (for a duplicated token the last index wins).
    """
    # Use a context manager so the handle is closed deterministically
    # instead of relying on garbage collection.
    with open_file(vocab_path) as vocab_file:
        words = vocab_file.read().strip().split('\n')
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def build_vocab(data_path, vocab_path, vocab_size):
    """Build a vocabulary file from a whitespace-tokenised corpus.

    The output file holds one token per line: the special tokens
    ``<sos>``, ``<eos>``, ``<unk>`` first, followed by the ``vocab_size``
    most frequent words of the corpus in descending frequency.

    Args:
        data_path: path of the corpus text file.
        vocab_path: path of the vocabulary file to write (overwritten).
        vocab_size: number of corpus words to keep, not counting the three
            special tokens.
    """
    tokens = ['<sos>', '<eos>', '<unk>']  # special tokens reserved at the start of the vocabulary

    # Close the corpus handle as soon as its content has been read.
    with open_file(data_path) as data_file:
        all_words = data_file.read().strip().replace('\n', ' ').split()

    count_pairs = Counter(all_words).most_common(vocab_size)
    # A comprehension (rather than unpacking zip(*count_pairs)) also works
    # when the corpus is empty and there are no pairs to unzip.
    tokens.extend(word for word, _ in count_pairs)

    # Closing the output handle guarantees the vocabulary is flushed to disk
    # before anyone reads it back.
    with open_file(vocab_path, 'w') as vocab_file:
        vocab_file.write('\n'.join(tokens) + '\n')


def text_to_id(text, w2id, unk_token):
    """Convert whitespace-separated ``text`` into a list of ids.

    Each word is looked up in ``w2id``; words that are not present are
    replaced by ``unk_token``.
    """
    ids = []
    for word in text.split():
        ids.append(w2id.get(word, unk_token))
    return ids


def id_to_text(ids, words):
    """Turn a sequence of ids back into a space-separated string.

    ``words`` is indexed by each id in ``ids``, in order.
    """
    looked_up = []
    for idx in ids:
        looked_up.append(words[idx])
    return ' '.join(looked_up)

def variables_from_ids(ids):
    """Wrap a sequence of ids in a ``LongTensor`` ``Variable``.

    The result is moved to the GPU when the module-level ``use_cuda`` flag
    is set.
    """
    wrapped = Variable(torch.LongTensor(ids))
    return wrapped.cuda() if use_cuda else wrapped

def variables_from_pair(pair):
    """Convert an ``(input_ids, target_ids)`` pair into two ``Variable``s.

    Both elements are wrapped as ``LongTensor`` ``Variable``s and moved to
    the GPU when the module-level ``use_cuda`` flag is set. Returns the
    tuple ``(input_var, target_var)``.
    """
    source, target = (Variable(torch.LongTensor(pair[k])) for k in (0, 1))
    if not use_cuda:
        return (source, target)
    return (source.cuda(), target.cuda())

In [4]:
class Corpus(object):
    """Lyrics corpus turned into (line, next line) training pairs.

    On construction the vocabulary is built (if ``vocab_path`` does not exist
    yet) and loaded, and the data file is tokenised into pairs of consecutive
    lines. Blank lines in the data file separate stanzas; pairs never cross a
    blank line.

    Attributes:
        words: list of vocabulary tokens, indexed by id.
        word_to_id: dict mapping a token to its id.
        data_train / data_val / data_test: lists of ``(input, target)``
            pairs as returned by ``variables_from_pair``.
    """

    def __init__(self, data_path, vocab_path, vocab_size=10000):
        """Load (building it first if needed) the vocabulary and tokenise the data.

        Args:
            data_path: path of the corpus text file; must exist.
            vocab_path: path of the vocabulary file; created when missing.
            vocab_size: total vocabulary size including the three special
                tokens added by ``build_vocab``.
        """
        assert os.path.exists(data_path), 'data file not found: {}'.format(data_path)

        if not os.path.exists(vocab_path):
            # build_vocab adds three special tokens on top of the counted words.
            build_vocab(data_path, vocab_path, vocab_size - 3)

        self.words, self.word_to_id = read_vocab(vocab_path)

        self.tokenize(data_path)

    def tokenize(self, data_path):
        """Read the corpus and fill ``data_train``, ``data_val`` and ``data_test``.

        Every non-blank line is converted to ids with ``<eos>`` appended and
        kept only if fewer than 20% of its tokens are ``<unk>``. Consecutive
        kept lines of the same stanza form ``(line, next_line)`` pairs. The
        pairs are shuffled and split by fixed counts: the first 7000 for
        training, the next 1000 for validation and the next 2000 for testing.
        """
        unk_token = self.word_to_id['<unk>']
        lines = []  # kept lines of the stanza currently being read
        data = []   # all (line, next_line) pairs collected so far

        with open_file(data_path) as data_file:
            for line in data_file:
                if len(line.strip()) == 0:
                    # Blank line: close the current stanza. Skip the rest of
                    # the loop body so the separator itself is not stored as
                    # a lone '<eos>' line that would pair with the next stanza.
                    data.extend(zip(lines[:-1], lines[1:]))
                    lines = []
                    continue
                line_ids = text_to_id(line + ' <eos>', self.word_to_id, unk_token)
                if line_ids.count(unk_token) < len(line_ids) * 0.2:
                    lines.append(line_ids)

        # Flush the last stanza when the file does not end with a blank line.
        data.extend(zip(lines[:-1], lines[1:]))

        # Shuffle, then split into the three datasets.
        random.shuffle(data)

        self.data_train = [variables_from_pair(pair) for pair in data[:7000]]
        self.data_val = [variables_from_pair(pair) for pair in data[7000:8000]]
        self.data_test = [variables_from_pair(pair) for pair in data[8000:10000]]

    def __repr__(self):
        """Summarise the vocabulary size and the length of each split."""
        return "Vocab size: {}\nTrain len: {}\nValidation len: {}\nTest len: {}".format(
            len(self.words),
            len(self.data_train),
            len(self.data_val),
            len(self.data_test))

In [5]:
# Load the lyrics corpus (the vocabulary file is created on first use) and
# print the vocabulary size and split lengths.
corpus = Corpus('lyric_full.txt', 'lyric_vocab.txt')
print(corpus)

Vocab size: 10000
Train len: 7000
Validation len: 1000
Test len: 2000


In [6]:
# Sanity check: decode one random training pair back to text
# (first line = input, second line = target).
r_t = random.choice(corpus.data_train)
print(id_to_text(r_t[0].data.cpu().numpy(), corpus.words))
print(id_to_text(r_t[1].data.cpu().numpy(), corpus.words))

承受 不了 你 的 善变 我 <unk> <eos>
不管 对 不 对 是 对 爱 不想 有所 违背 <eos>


In [7]:
class EncoderRNN(nn.Module):
    """LSTM encoder that reads one input sequence (batch size 1).

    Args:
        embedding: embedding module applied to the input ids; it is stored
            as-is, so the same instance can be shared with other modules.
        hidden_size: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, embedding, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = embedding
        # The LSTM consumes embedding vectors, so its input width must be the
        # embedding dimension. Fall back to hidden_size (the previous
        # behaviour) when the embedding module does not expose embedding_dim.
        input_size = getattr(embedding, 'embedding_dim', hidden_size)
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers, dropout=dropout)

    def forward(self, input_s, hidden=None):
        """Encode ``input_s``.

        Args:
            input_s: sequence of token ids for a single sentence.
            hidden: optional initial LSTM state, passed straight to the LSTM.

        Returns:
            ``(outputs, hidden)`` as returned by ``nn.LSTM``; the embedded
            input is shaped ``(len(input_s), 1, -1)`` before being fed in.
        """
        embedded = self.embedding(input_s).view(len(input_s), 1, -1)
        outputs, hidden = self.rnn(embedded, hidden)
        return outputs, hidden

In [8]:
class DecoderRNN(nn.Module):
    """LSTM decoder that is run one token at a time (batch size 1).

    Args:
        embedding: embedding module applied to the input id; it is stored
            as-is, so the same instance can be shared with other modules.
        hidden_size: size of the LSTM hidden state.
        output_size: number of output classes (width of the final linear layer).
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability used for the embedding dropout and
            passed to ``nn.LSTM``.
    """

    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()

        # Keep for reference
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        # The LSTM consumes embedding vectors, so its input width must be the
        # embedding dimension. Fall back to hidden_size (the previous
        # behaviour) when the embedding module does not expose embedding_dim.
        input_size = getattr(embedding, 'embedding_dim', hidden_size)
        self.rnn = nn.LSTM(input_size, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_s, last_hidden):
        """Run a single decoding step.

        Args:
            input_s: id of the current input token (a single token).
            last_hidden: LSTM state from the previous step (or from the
                encoder for the first step).

        Returns:
            ``(output, hidden)``: ``output`` holds the unnormalised scores
            from the final linear layer (no softmax is applied) and
            ``hidden`` is the updated LSTM state.
        """
        # Embed the current input token and reshape it to a length-1,
        # batch-1 sequence for the LSTM.
        embedded = self.embedding(input_s)
        embedded = self.embedding_dropout(embedded).view(1, 1, -1)

        # Advance the LSTM by one step from the previous state.
        rnn_output, hidden = self.rnn(embedded, last_hidden)

        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N

        # Project to vocabulary scores (logits; softmax is left to the loss).
        output = self.out(rnn_output)

        return output, hidden

In [9]:
# Model hyper-parameters. embedding_size and hidden_size are set to the same
# value here.
embedding_size = 500
hidden_size = 500
n_layers = 2

# One embedding table sized to the corpus vocabulary; the same instance is
# handed to the encoder here and to the decoder in the next cell.
vocab_size = len(corpus.words)
embedding = nn.Embedding(vocab_size, embedding_size)
encoder_test = EncoderRNN(embedding, hidden_size, n_layers)

In [10]:
# The decoder receives the same embedding instance as the encoder and
# predicts over the full vocabulary; the bare name displays its layers.
decoder_test = DecoderRNN(embedding, hidden_size, vocab_size, n_layers)
decoder_test

DecoderRNN(
  (embedding): Embedding(10000, 500)
  (embedding_dropout): Dropout(p=0.1)
  (rnn): LSTM(500, 500, num_layers=2, dropout=0.1)
  (out): Linear(in_features=500, out_features=10000)
)

In [11]:
# Move both models to the GPU when one is available (use_cuda is set in the
# import cell).
if use_cuda:
    encoder_test.cuda()
    decoder_test.cuda()

In [12]:
# Shape smoke test: encode one random training input, then step the decoder
# through the target tokens and print the tensor sizes.
input_var, target_var = random.choice(corpus.data_train)
print(input_var.size(), target_var.size())
encoder_outputs, encoder_hidden = encoder_test(input_var)
print(encoder_outputs.size())

# The decoder starts from the encoder's final state; here each target token is
# fed directly as the decoder input (no <sos> token is prepended).
decoder_hidden = encoder_hidden
for i in range(len(target_var)):
    decoder_output, decoder_hidden = decoder_test(target_var[i], decoder_hidden)
    print(decoder_output.size(), decoder_hidden[0].size())

torch.Size([7]) torch.Size([8])
torch.Size([7, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])
torch.Size([1, 10000]) torch.Size([2, 1, 500])


In [13]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one optimisation step on a single (input, target) pair.

    The decoder starts from the encoder's final state with ``<sos>`` as its
    first input and is then teacher-forced: at every step the ground-truth
    target token becomes the next decoder input. Gradients of both models
    are clipped to the module-level ``clip`` value before the optimisers step.

    Returns:
        The accumulated loss divided by the target length.
    """
    # Start from clean gradients on both models.
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    n_targets = target_variable.size(0)
    loss = 0

    # Only the encoder's final state is needed; it seeds the decoder.
    _, decoder_hidden = encoder(input_variable)

    sos_id = corpus.word_to_id['<sos>']
    decoder_input = Variable(torch.LongTensor([[sos_id]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    for step in range(n_targets):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_variable[step])
        # Teacher forcing: feed the ground-truth token as the next input.
        decoder_input = target_variable[step]

    # Backpropagation, gradient clipping, then parameter updates.
    loss.backward()
    for module in (encoder, decoder):
        torch.nn.utils.clip_grad_norm(module.parameters(), clip)
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.step()

    return loss.data[0] / n_targets

In [14]:
# Optimisation settings. `clip` is read as a global inside train() when it
# clips gradients; the decoder uses a learning rate decoder_learning_ratio
# times larger than the encoder's.
learning_rate = 0.0001
clip = 5
decoder_learning_ratio = 5.0
encoder_optimizer = optim.Adam(encoder_test.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_test.parameters(), lr=learning_rate * decoder_learning_ratio)

# size_average=False: per-call losses are summed rather than averaged;
# train() and evaluate_one() divide by the target length themselves.
criterion = nn.CrossEntropyLoss(size_average=False)

In [15]:
# Smoke test: one training step on the pair currently held in
# input_var / target_var, displaying the per-token loss.
loss = train(input_var, target_var, encoder_test, decoder_test, encoder_optimizer, decoder_optimizer, criterion)
loss

9.208130836486816

In [16]:
def evaluate(max_length=20):
    """Greedy-decode a reply for one random test pair and print the result.

    Uses the module-level ``corpus``, ``encoder_test`` and ``decoder_test``.
    Three lines are printed: the input text, the reference target text and
    the decoded text.

    Args:
        max_length: maximum number of tokens to decode before giving up on
            seeing ``<eos>``.
    """
    input_var, target_var = random.choice(corpus.data_test)
    sos_id = corpus.word_to_id['<sos>']
    eos_id = corpus.word_to_id['<eos>']

    # Decoded tokens, in order.
    decoded_words = []

    # Set to not-training mode to disable dropout
    encoder_test.eval()
    decoder_test.eval()
    try:
        # Run through encoder
        encoder_outputs, encoder_hidden = encoder_test(input_var)

        # Create starting vectors for decoder
        decoder_input = Variable(torch.LongTensor([sos_id]), volatile=True)  # SOS
        # The LSTM state is an (h, c) tuple; hand it over unchanged. Slicing
        # it by n_layers would slice the tuple, not the layer dimension.
        decoder_hidden = encoder_hidden

        if use_cuda:
            decoder_input = decoder_input.cuda()

        # Run through decoder
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder_test(
                decoder_input, decoder_hidden
            )

            # Choose top word from output
            topv, topi = decoder_output.data.topk(1)
            ni = int(topi[0][0])  # plain int so it can index corpus.words
            if ni == eos_id:
                decoded_words.append('<eos>')
                break
            decoded_words.append(corpus.words[ni])

            # Next input is chosen word
            decoder_input = Variable(torch.LongTensor([ni]))
            if use_cuda:
                decoder_input = decoder_input.cuda()
    finally:
        # Set back to training mode, even if decoding failed
        encoder_test.train()
        decoder_test.train()

    # Print the sampled pair itself (the old code referenced an undefined
    # name `pair` here and raised NameError).
    print(id_to_text(input_var.data.cpu().numpy(), corpus.words))
    print(id_to_text(target_var.data.cpu().numpy(), corpus.words))
    print(' '.join(decoded_words))

In [17]:
def evaluate_one(input_variable, target_variable, encoder, decoder, criterion):
    """Compute the teacher-forced loss of one (input, target) pair.

    Both models are switched to eval mode for the computation and back to
    training mode afterwards. No parameters are updated.

    Returns:
        The accumulated loss divided by the target length.
    """
    encoder.eval()
    decoder.eval()

    loss = 0
    n_targets = target_variable.size(0)

    # Only the encoder's final state is needed; it seeds the decoder.
    _, decoder_hidden = encoder(input_variable)

    sos_id = corpus.word_to_id['<sos>']
    decoder_input = Variable(torch.LongTensor([[sos_id]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    for step in range(n_targets):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_variable[step])
        # Feed the ground-truth token as the next decoder input.
        decoder_input = target_variable[step]

    encoder.train()
    decoder.train()

    return loss.data[0] / n_targets

def evaluate_full(eval_data):
    """Return the mean ``evaluate_one`` loss over all pairs in ``eval_data``.

    Each element of ``eval_data`` is an ``(input, target)`` pair; the
    module-level ``encoder_test``, ``decoder_test`` and ``criterion`` are used.
    """
    per_pair = [
        evaluate_one(source, target, encoder_test, decoder_test, criterion)
        for source, target in eval_data
    ]
    return sum(per_pair, 0.0) / len(eval_data)

In [18]:
# print_per_epoch = 200
# epochs = 50000
# total_loss = 0
# for i in range(1, epochs):
#     input_var, target_var = variables_from_pair(random.choice(corpus.data_train))
#     total_loss += train(input_var, target_var, encoder_test, decoder_test, encoder_optimizer, decoder_optimizer, criterion)
#     if i % print_per_epoch == 0:
#         print(i, total_loss / print_per_epoch)
#         total_loss = 0
        
#     if i % 2000 == 0:
#         print(evaluate_full(corpus.data_val))
#         print(evaluate_full(corpus.data_test))
#         evaluate()

In [19]:
def as_minutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def time_since(since, percent):
    """Report elapsed time and the estimated time remaining.

    Args:
        since: start timestamp, as returned by ``time.time()``.
        percent: fraction of the work completed so far; the total duration
            is estimated as ``elapsed / percent``.

    Returns:
        A string ``'<elapsed> (- <remaining>)'`` with both parts formatted
        by ``as_minutes``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))

In [20]:
# Reference timestamp for the elapsed-time figures printed by the training
# loop; the second line just checks the formatting helper.
start_time = time.time()
as_minutes(time.time() - start_time)

'0m 0s'

In [None]:
# One pass over the training pairs, one optimisation step per pair.
# Progress is reported every `print_per_epoch` steps; every 2000 steps the
# validation and test losses are printed and one sample is decoded.
print_per_epoch = 500
epochs = 50000  # not used by this loop
total_loss = 0  # running sum over the whole pass (not read in this cell)
print_loss = 0  # running sum since the last progress line
for i in range(len(corpus.data_train)):
    input_var, target_var = corpus.data_train[i]
    cur_loss = train(input_var, target_var, encoder_test, decoder_test, encoder_optimizer, decoder_optimizer, criterion)
    total_loss += cur_loss
    print_loss += cur_loss

    # Count completed steps from 1 so every progress window holds exactly
    # print_per_epoch losses (a 0-based check put 501 losses in the first one).
    step = i + 1
    if step % print_per_epoch == 0:
        print(step, print_loss / print_per_epoch, as_minutes(time.time() - start_time))
        print_loss = 0

    if step % 2000 == 0:
        print(evaluate_full(corpus.data_val))
        print(evaluate_full(corpus.data_test))
        evaluate()
        print(as_minutes(time.time() - start_time))

500 6.949021146893936 0m 28s
1000 6.567773572145218 0m 56s


In [None]:
# Mean teacher-forced loss over the whole training split.
evaluate_full(corpus.data_train)