In [1]:
import os
import time
import math
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from masked_cross_entropy import masked_cross_entropy

In [2]:
# Whether PyTorch reports an available CUDA device. Later cells check this
# flag before moving tensors and modules to the GPU with .cuda().
USE_CUDA = torch.cuda.is_available()
USE_CUDA

True

In [3]:
# Input files: the question/answer text file and the vocabulary file,
# both located under the same data directory.
base_dir = 'data/twitter'
train_dir, vocab_dir = (
    os.path.join(base_dir, filename)
    for filename in ('twitter_clean.txt', 'twitter_vocab.txt')
)

In [4]:
class Corpus(object):
    """
    Text preprocessing: load the vocabulary and convert string text into
    sequences of integer word ids.

    The vocabulary file holds one token per line; a token's id is its
    zero-based line index and ``<pad>`` must have id 0. Each non-blank line
    of the training file holds one ``question ==> answer`` pair of
    whitespace-separated tokens. The first 90% of the pairs form the
    training split and the rest the test split (file order, no shuffling).

    Attributes:
        words: list of vocabulary tokens, indexed by id.
        word_to_id: dict mapping token -> id.
        x_train, y_train: encoded questions / answers of the training split.
        x_test, y_test: encoded questions / answers of the test split.
    """

    def __init__(self, train_dir, vocab_dir):
        """Read the vocabulary from ``vocab_dir`` and encode ``train_dir``.

        Raises:
            AssertionError: if either file is missing or ``<pad>`` is not id 0.
            KeyError: if ``<pad>`` is absent from the vocabulary.
        """
        assert os.path.exists(train_dir), 'File %s does not exist.' % train_dir
        assert os.path.exists(vocab_dir), 'File %s does not exist.' % vocab_dir

        # Context manager so the file handle is closed deterministically.
        with open(vocab_dir, encoding='utf-8') as vocab_file:
            words = vocab_file.read().strip().split('\n')
        word_to_id = dict(zip(words, range(len(words))))

        assert word_to_id['<pad>'] == 0, "<pad> id should be 0."

        self.words = words
        self.word_to_id = word_to_id

        self.tokenize(train_dir)

    def tokenize(self, train_dir):
        """Encode every ``question ==> answer`` line and split 90/10.

        Blank lines are skipped.

        Raises:
            ValueError: if a non-blank line is not exactly one
                ``question ==> answer`` pair.
        """
        with open(train_dir, encoding='utf-8') as train_file:
            data = train_file.read().strip().split('\n')

        questions, answers = [], []
        for line_no, line in enumerate(data, 1):
            if not line.strip():
                # An empty file or a stray blank line carries no pair.
                continue
            parts = line.split(" ==> ")
            if len(parts) != 2:
                raise ValueError(
                    "Line %d of %s is not a 'question ==> answer' pair: %r"
                    % (line_no, train_dir, line))
            question, answer = parts
            questions.append(self.text_to_ids(question))
            answers.append(self.text_to_ids(answer))

        total_num = len(questions)
        train_num = int(0.9 * total_num)
        self.x_train, self.y_train = questions[:train_num], answers[:train_num]
        self.x_test, self.y_test = questions[train_num:], answers[train_num:]

    def text_to_ids(self, text):
        """Convert whitespace-separated ``text`` to ids, appending ``<eos>``.

        Tokens missing from the vocabulary map to the ``<unk>`` id when the
        vocabulary defines ``<unk>``; otherwise a KeyError is raised, as for
        any unknown key.
        """
        unk_id = self.word_to_id.get('<unk>')
        ids = []
        for token in text.split() + ['<eos>']:
            token_id = self.word_to_id.get(token, unk_id)
            if token_id is None:
                raise KeyError(token)
            ids.append(token_id)
        return ids

    def ids_to_text(self, ids):
        """Convert a sequence of ids back to the list of vocabulary tokens."""
        return [self.words[x] for x in ids]

    def __repr__(self):
        return "Train length: %d, Test length: %d, Vocabulary size: %d" % (len(self.x_train),
                                                                           len(self.x_test),
                                                                           len(self.words))

In [5]:
class DataSet(object):
    """Pre-batched, padded view over parallel lists of id sequences.

    The inputs are cut into consecutive batches of ``batch_size`` pairs;
    a trailing remainder smaller than one batch is dropped. Every batch is
    padded and converted to tensors once, at construction time, and stored
    as a tuple ``(x_pad, x_lengths, y_pad, y_lengths)``.
    """

    def __init__(self, data, labels, batch_size=64):
        num_batches = len(data) // batch_size
        usable = num_batches * batch_size
        data, labels = data[:usable], labels[:usable]
        self.data = [
            self.pad_batch(data[start:start + batch_size],
                           labels[start:start + batch_size])
            for start in range(0, usable, batch_size)
        ]

    def pad_batch(self, x_batch, y_batch):
        """Sort one batch by source length (longest first) and pad it.

        Returns ``(input_var, x_lengths, target_var, y_lengths)`` where the
        two tensors are transposed to (max_len, batch) and moved to the GPU
        when ``USE_CUDA`` is set.
        """
        # Pairs stay aligned while being ordered by the length of the source.
        pairs = list(zip(x_batch, y_batch))
        pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
        x_batch, y_batch = zip(*pairs)

        x_lengths = [len(seq) for seq in x_batch]
        y_lengths = [len(seq) for seq in y_batch]
        longest_x, longest_y = max(x_lengths), max(y_lengths)

        x_padded = [self.pad_seq(seq, longest_x) for seq in x_batch]
        y_padded = [self.pad_seq(seq, longest_y) for seq in y_batch]

        # (batch, max_len) -> (max_len, batch)
        input_var = Variable(torch.LongTensor(x_padded)).transpose(0, 1)
        target_var = Variable(torch.LongTensor(y_padded)).transpose(0, 1)

        if USE_CUDA:
            input_var, target_var = input_var.cuda(), target_var.cuda()

        return input_var, x_lengths, target_var, y_lengths

    def pad_seq(self, seq, max_len):
        """Return ``seq`` right-padded with 0 up to ``max_len`` items."""
        missing = max_len - len(seq)
        return seq + [0] * missing

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

In [6]:
class EncoderRNN(nn.Module):
    """GRU encoder that runs over a padded batch of token ids.

    The embedding module is supplied by the caller, and the GRU input
    width equals ``hidden_size``.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept as plain attributes for later inspection.
        self.hidden_size, self.n_layers, self.dropout = hidden_size, n_layers, dropout

        self.embedding = embedding
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout,
        )

    def forward(self, input_seqs, input_lengths, hidden=None):
        """Encode ``input_seqs`` and return ``(outputs, hidden)``.

        ``input_lengths`` gives the unpadded length of every sequence;
        ``pack_padded_sequence`` is called with its default arguments.
        """
        rnn_utils = nn.utils.rnn
        packed = rnn_utils.pack_padded_sequence(self.embedding(input_seqs), input_lengths)
        packed_outputs, hidden = self.gru(packed, hidden)
        # Unpack back to a padded tensor; the returned lengths are not needed.
        outputs, _ = rnn_utils.pad_packed_sequence(packed_outputs)
        return outputs, hidden

In [7]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder with a linear projection to vocabulary scores.

    The embedding module is supplied by the caller, and the GRU input
    width equals ``hidden_size``.
    """

    def __init__(self, hidden_size, output_size, embedding, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept as plain attributes for later inspection.
        self.hidden_size, self.output_size = hidden_size, output_size
        self.n_layers, self.dropout = n_layers, dropout

        # Layers (registration order: embedding, dropout, gru, out)
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, last_hidden):
        """Advance the decoder by one time step.

        ``input_seq`` holds one token id per batch element. Returns
        ``(output, hidden)``: unnormalised scores of shape
        (batch, output_size) and the updated GRU hidden state.
        """
        step_input = self.embedding_dropout(self.embedding(input_seq))
        # The GRU expects a leading sequence axis: (1, batch, hidden_size).
        step_input = step_input.view(1, input_seq.size(0), self.hidden_size)

        rnn_output, hidden = self.gru(step_input, last_hidden)

        # Drop the length-1 sequence axis and project to scores (no softmax).
        return self.out(rnn_output.squeeze(0)), hidden

In [8]:
def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer, decoder_optimizer):
    """Run one teacher-forced optimisation step on a single mini-batch.

    Args:
        input_batches: source token ids, shape (max_src_len, batch).
        input_lengths: unpadded source lengths, one per batch element.
        target_batches: target token ids, shape (max_tgt_len, batch).
        target_lengths: unpadded target lengths, one per batch element.
        encoder, decoder: the modules being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.

    Returns:
        ``(loss, ec, dc)``: the scalar loss value and the values returned by
        ``clip_grad_norm`` for the encoder and the decoder parameters.

    Reads the notebook globals ``corpus`` (for the ``<sos>`` id), ``clip``
    and ``USE_CUDA``.
    """
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Take the batch size from the data itself rather than from a
    # notebook-level variable, so any batch width works.
    batch_size = input_batches.size(1)

    # Run words through encoder
    _, encoder_hidden = encoder(input_batches, input_lengths, None)

    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers]  # first decoder.n_layers layers of the encoder state

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Run through decoder one time step at a time
    for t in range(max_target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t]  # Teacher forcing: next input is current target

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths,
        USE_CUDA
    )
    loss.backward()

    # Clip gradient norms.
    # NOTE(review): if encoder and decoder share parameters (for example a
    # common embedding module), those gradients are rescaled by both calls.
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0], ec, dc

In [9]:
def evaluate(input_seq, max_length=20):
    """Greedily decode a reply for one encoded input sequence.

    Uses the notebook-level ``encoder``, ``decoder``, ``corpus`` and
    ``USE_CUDA``. Decoding stops after ``<eos>`` is produced or after
    ``max_length`` steps. Both models are switched to eval mode while
    decoding and set back to training mode before returning.

    Returns the decoded words as a list of strings.
    """
    input_lengths = [len(input_seq)]
    # One sequence as a (seq_len, 1) batch.
    input_batches = Variable(torch.LongTensor([input_seq]), volatile=True).transpose(0, 1)
    if USE_CUDA:
        input_batches = input_batches.cuda()

    # Eval mode disables dropout.
    for model in (encoder, decoder):
        model.eval()

    _, encoder_hidden = encoder(input_batches, input_lengths, None)

    # The decoder starts from <sos> and the encoder's hidden state.
    decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']]), volatile=True)
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    decoded_words = []
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # Greedy choice: highest-scoring word at this step.
        _, best = decoder_output.data.topk(1)
        token_id = best[0][0]
        if token_id == corpus.word_to_id['<eos>']:
            decoded_words.append('<eos>')
            break
        decoded_words.append(corpus.words[token_id])

        # Feed the chosen word back in as the next input.
        next_input = Variable(torch.LongTensor([token_id]))
        decoder_input = next_input.cuda() if USE_CUDA else next_input

    # Back to training mode.
    for model in (encoder, decoder):
        model.train()

    return decoded_words

In [10]:
def evaluate_randomly():
    """Decode one randomly chosen test pair and print input, target and prediction."""
    pick = random.choice(range(len(corpus.x_test)))
    source_ids, reference_ids = corpus.x_test[pick], corpus.y_test[pick]
    predicted = evaluate(source_ids)
    for label, ids in (("Input:", source_ids), ("Target:", reference_ids)):
        print(label, ' '.join(corpus.ids_to_text(ids)))
    print("Predicted:", ' '.join(predicted))

In [11]:
# Load the vocabulary and encode every question/answer pair
# (Corpus.__init__ calls tokenize(), which also makes the train/test split).
corpus = Corpus(train_dir, vocab_dir)
corpus

Train length: 194785, Test length: 21643, Vocabulary size: 10000

In [12]:
# Configure models
hidden_size = 500  # used for both the embedding width and the GRU hidden width
n_layers = 2  # stacked GRU layers in the encoder and in the decoder
dropout = 0.1
batch_size = 100  # number of question/answer pairs per mini-batch

# Batches and pads the whole training split up front; DataSet drops a
# trailing remainder smaller than one batch.
train_data = DataSet(corpus.x_train, corpus.y_train, batch_size)

In [13]:
# Configure training/optimization
clip = 50.0  # max gradient norm passed to clip_grad_norm in train()
learning_rate = 0.0001
decoder_learning_ratio = 5.0  # decoder lr = learning_rate * decoder_learning_ratio
n_epochs = 50000  # counts training steps on one mini-batch each, not passes over the data
epoch = 0
plot_every = 20
print_every = 500
evaluate_every = 2000

vocab_size = len(corpus.words)
# A single embedding module is handed to both the encoder and the decoder.
embedding = nn.Embedding(vocab_size, hidden_size)

# Initialize models
encoder = EncoderRNN(hidden_size, embedding, n_layers, dropout=dropout)
decoder = DecoderRNN(hidden_size, vocab_size, embedding, n_layers, dropout=dropout)

# Initialize optimizers and criterion
# NOTE(review): `embedding` is an attribute of both modules, so its weight is
# presumably returned by encoder.parameters() AND decoder.parameters() and is
# therefore stepped by both optimizers (with different learning rates) --
# confirm this is intended.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [14]:
def as_minutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def time_since(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the fraction of the work already done; the total duration
    is extrapolated linearly as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % tuple(as_minutes(span) for span in (elapsed, remaining))

In [15]:
# Begin!
# `epoch` here counts single training steps: every iteration draws ONE random
# pre-built mini-batch from train_data and runs one optimisation step on it.
# The loop continues from the current value of `epoch`, so re-running this cell
# resumes instead of restarting unless the setup cell is re-run first.
ecs = []
dcs = []
# Running sums of the gradient-norm values returned by train(); they are only
# consumed by the disabled plotting code at the bottom of this cell.
eca = 0
dca = 0

while epoch < n_epochs:
    epoch += 1
    
    # Get training data for this cycle
    input_batches, input_lengths, target_batches, target_lengths = random.choice(train_data)

    # Run the train function
    loss, ec, dc = train(
        input_batches, input_lengths, target_batches, target_lengths,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer
    )

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss
    eca += ec
    dca += dc
    
    # job.record(epoch, loss)

    # Report elapsed / estimated remaining time, progress and the mean loss
    # over the last print_every steps.
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)
        
    # Print two decoded samples from the test split.
    if epoch % evaluate_every == 0:
        evaluate_randomly()
        evaluate_randomly()

# NOTE(review): disabled plotting code below; it references names
# (`hostname`, `vis`, `np`) that are not defined in the visible cells.
#     if epoch % plot_every == 0:
#         plot_loss_avg = plot_loss_total / plot_every
#         plot_losses.append(plot_loss_avg)
#         plot_loss_total = 0
        
#         # TODO: Running average helper
#         ecs.append(eca / plot_every)
#         dcs.append(dca / plot_every)
#         ecs_win = 'encoder grad (%s)' % hostname
#         dcs_win = 'decoder grad (%s)' % hostname
#         vis.line(np.array(ecs), win=ecs_win, opts={'title': ecs_win})
#         vis.line(np.array(dcs), win=dcs_win, opts={'title': dcs_win})
#         eca = 0
#         dca = 0

2m 25s (- 239m 58s) (500 1%) 5.9261
4m 9s (- 204m 5s) (1000 2%) 5.3262
5m 54s (- 190m 53s) (1500 3%) 5.0771
7m 39s (- 183m 46s) (2000 4%) 4.9334
Input: watching friday night lights , <unk> hella socks lmaooo . like a house wife 💁 🏽 <eos>
Target: this show is fucking stressful <eos>
Predicted: i was just thinking about it <eos>
Input: almost a year with joe :) ) <eos>
Target: i'm so excited ! ! ! <eos>
Predicted: i was thinking about it . <eos>
9m 27s (- 179m 38s) (2500 5%) 4.8400
11m 12s (- 175m 38s) (3000 6%) 4.7588
12m 57s (- 172m 13s) (3500 7%) 4.7014
14m 43s (- 169m 14s) (4000 8%) 4.6281
Input: <unk> vodka is the worst thing ever created <eos>
Target: i fucked with it <eos>
Predicted: i know , i think it's the <unk> of the <unk> <eos>
Input: <unk> : she really meant <unk> <unk> <eos>
Target: he <unk> his first name differently <eos>
Predicted: she is a <unk> <eos>
16m 28s (- 166m 33s) (4500 9%) 4.5872
18m 13s (- 163m 59s) (5000 10%) 4.5265
19m 58s (- 161m 38s) (5500 11%) 4.5012
21m

116m 17s (- 59m 54s) (33000 66%) 3.3534
118m 2s (- 58m 8s) (33500 67%) 3.3385
119m 47s (- 56m 22s) (34000 68%) 3.3275
Input: i been doing so good ... but in and out is calling me . <eos>
Target: no . come to the gym . i'll treat u to in n out on friday <eos>
Predicted: i hope you are ok . <eos>
Input: can u not keep posting pics of or chat bc those icons are very embarrassing <unk> <eos>
Target: urs is the best <eos>
Predicted: i mean , i thought the same thing . <eos>
121m 33s (- 54m 36s) (34500 69%) 3.2980
123m 18s (- 52m 50s) (35000 70%) 3.2713
125m 3s (- 51m 4s) (35500 71%) 3.2553
126m 49s (- 49m 19s) (36000 72%) 3.2585
Input: damn man ! you really are obsessed with <unk> .. is your mom as cute as you are ? <eos>
Target: my mother hates nazi <unk> <unk> like you worse than i do , thx for asking . <eos>
Predicted: i think she is very <unk> . <eos>
Input: oh this is all you <eos>
Target: so little time ! <eos>
Predicted: i am not being <unk> . <eos>
128m 34s (- 47m 33s) (36500 73%) 3

In [45]:
# Spot-check: decode one random pair from the test split and print
# its input, target and predicted reply.
evaluate_randomly()

Input: like this here . <eos>
Target: oh , but wait ! <eos>
Predicted: wait , what a <unk> of humanity . <eos>


In [46]:
# Persist the trained weights as state dicts in the current working directory.
# NOTE(review): `embedding` is a submodule of both encoder and decoder, so its
# weights are presumably already contained in encoder.pt and decoder.pt;
# embedding.pt looks redundant -- confirm before removing.
torch.save(encoder.state_dict(), 'encoder.pt')
torch.save(decoder.state_dict(), 'decoder.pt')
torch.save(embedding.state_dict(), 'embedding.pt')

In [48]:
# Build a second, freshly initialised encoder/decoder pair with the same
# hyper-parameters; as above, one embedding module is shared by both.
vocab_size = len(corpus.words)
embedding2 = nn.Embedding(vocab_size, hidden_size)

# Initialize models
encoder2 = EncoderRNN(hidden_size, embedding2, n_layers, dropout=dropout)
decoder2 = DecoderRNN(hidden_size, vocab_size, embedding2, n_layers, dropout=dropout)

In [65]:
# Restore the saved weights into the second model set. The map_location
# callback returns the storage unchanged, so tensors are loaded onto the CPU
# first, whatever device they were saved from.
embedding2.load_state_dict(torch.load('embedding.pt', map_location=lambda storage, loc: storage))
encoder2.load_state_dict(torch.load('encoder.pt', map_location=lambda storage, loc: storage))
decoder2.load_state_dict(torch.load('decoder.pt', map_location=lambda storage, loc: storage))

# Then move everything to the GPU when one is available.
if USE_CUDA:
    embedding2.cuda()
    encoder2.cuda()
    decoder2.cuda()

In [66]:
def evaluate2(encoder2, decoder2, input_seq, max_length=20):
    """Greedily decode a reply for ``input_seq`` with the given models.

    Args:
        encoder2: encoder module, called as ``encoder2(input, lengths, None)``.
        decoder2: decoder module, called one step at a time; must expose
            ``n_layers``.
        input_seq: list of source token ids (a single sequence).
        max_length: maximum number of decoding steps.

    Returns:
        The decoded words as a list of strings; the last one is ``'<eos>'``
        when it was produced within ``max_length`` steps.

    Reads the notebook globals ``corpus`` and ``USE_CUDA``. Both models are
    switched to eval mode while decoding and set back to training mode
    before returning.
    """
    input_lengths = [len(input_seq)]
    input_batches = Variable(torch.LongTensor([input_seq]), volatile=True).transpose(0, 1)

    if USE_CUDA:
        input_batches = input_batches.cuda()

    # Set to not-training mode to disable dropout
    encoder2.eval()
    decoder2.eval()

    # Run through encoder
    encoder_outputs, encoder_hidden = encoder2(input_batches, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']]), volatile=True) # SOS
    # Slice by the layer count of the decoder that was passed in, not by the
    # notebook-global `decoder`.
    decoder_hidden = encoder_hidden[:decoder2.n_layers]

    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    # Store output words
    decoded_words = []

    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden = decoder2(
            decoder_input, decoder_hidden
        )

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == corpus.word_to_id['<eos>']:
            decoded_words.append('<eos>')
            break
        else:
            decoded_words.append(corpus.words[ni])

        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder2.train()
    decoder2.train()

    return decoded_words

In [67]:
def evaluate_randomly2():
    """Decode one random test pair with ``encoder2``/``decoder2`` and print it."""
    pick = random.choice(range(len(corpus.x_test)))
    source_ids, reference_ids = corpus.x_test[pick], corpus.y_test[pick]
    predicted = evaluate2(encoder2, decoder2, source_ids)
    for label, ids in (("Input:", source_ids), ("Target:", reference_ids)):
        print(label, ' '.join(corpus.ids_to_text(ids)))
    print("Predicted:", ' '.join(predicted))

In [73]:
# Spot-check the reloaded models (encoder2 / decoder2) on one random test pair.
evaluate_randomly2()

Input: do i have to read art of war to figure out your strategy ! ? <eos>
Target: there isn't that i have seen work . many hours of study <eos>
Predicted: it is pretty good , it's just a bunch of habit of knowledge . <eos>


In [58]:
# NOTE(review): leftover inspection cell. No visible cell assigns `input`, and
# its execution count (58) is out of order, so on a fresh kernel this would
# show the Python builtin `input` rather than the 10000x500 Parameter below.
input

Parameter containing:
-1.7006e+00  6.8077e-01 -1.2997e+00  ...   1.2223e+00 -4.0719e-01 -7.5291e-02
 5.2378e-01  1.8418e-01  1.9792e+00  ...  -2.6599e-02  6.9974e-01  5.8068e-01
-1.2809e-02  2.0259e+00 -2.4834e-01  ...   2.0912e-01 -6.8432e-01  7.9243e-02
                ...                   ⋱                   ...                
 5.4996e-01 -4.5863e-02  9.0763e-01  ...   9.0863e-01 -9.7212e-01 -6.7547e-01
 2.3864e-01 -9.5131e-01  9.1130e-01  ...   6.0205e-01  7.4683e-01  3.3002e-01
-2.3100e-01 -9.5948e-02  5.5557e-01  ...  -7.7474e-01 -1.7721e-01 -5.0797e-01
[torch.cuda.FloatTensor of size 10000x500 (GPU 0)]