In [87]:
import os
import time
import math
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from masked_cross_entropy import masked_cross_entropy

In [13]:
USE_CUDA = torch.cuda.is_available()
USE_CUDA

False

In [14]:
base_dir = 'data/twitter'
train_dir = os.path.join(base_dir, 'twitter_clean.txt')
vocab_dir = os.path.join(base_dir, 'twitter_vocab.txt')

In [15]:
class Corpus(object):
    """
    文本预处理，获取词汇表，并将字符串文本转换为数字序列。
    """

    def __init__(self, train_dir, vocab_dir):
        assert os.path.exists(train_dir), 'File %s does not exist.' % train_dir
        assert os.path.exists(vocab_dir), 'File %s does not exist.' % vocab_dir

        words = open(vocab_dir, encoding='utf-8').read().strip().split('\n')
        word_to_id = dict(zip(words, range(len(words))))
        
        assert word_to_id['<pad>'] == 0, "<pad> id should be 0."
        
        self.words = words
        self.word_to_id = word_to_id
        
        self.tokenize(train_dir)
        
    def tokenize(self, train_dir):
        data = open(train_dir, encoding='utf-8').read().strip().split('\n')
        questions, answers = [], []
        for line in data:
            question, answer = line.split(" ==> ")
            questions.append(self.text_to_ids(question))
            answers.append(self.text_to_ids(answer))
            
        total_num = len(questions)
        train_num = int(0.9 * total_num)
        self.x_train, self.y_train = questions[:train_num], answers[:train_num]
        self.x_test, self.y_test = questions[train_num:], answers[train_num:]
        
    def text_to_ids(self, text):
        return [self.word_to_id[x] for x in (text.split() + ['<eos>'])]
    
    def ids_to_text(self, ids):
        return [self.words[x] for x in ids]

    def __repr__(self):
        return "Train length: %d, Test length: %d, Vocabulary size: %d" % (len(self.x_train), 
                                                                           len(self.x_test), 
                                                                           len(self.words))

TypeError: object of type 'Corpus' has no len()

In [16]:
corpus = Corpus(train_dir, vocab_dir)
corpus

Train length: 194785, Test length: 21643, Vocabulary size: 10000

In [17]:
class DataSet(object):
    def __init__(self, data, labels, batch_size=64):
        num_batches = len(data) // batch_size
        data, labels = data[:(num_batches * batch_size)], labels[:(num_batches * batch_size)]
        self.data = []
        for i in range(num_batches):
            x_batch = data[(i * batch_size):((i+1)*batch_size)]
            y_batch = labels[(i * batch_size):((i+1)*batch_size)]
            x_pad, x_lengths, y_pad, y_lengths = self.pad_batch(x_batch, y_batch)
            self.data.append((x_pad, x_lengths, y_pad, y_lengths))
            
    def pad_batch(self, x_batch, y_batch):
        seq_pairs = sorted(zip(x_batch, y_batch), key=lambda p: len(p[0]), reverse=True)
        x_batch, y_batch = zip(*seq_pairs)
        
        x_lengths = list(map(len, x_batch))
        x_padded = [self.pad_seq(s, max(x_lengths)) for s in x_batch]
        
        y_lengths = list(map(len, y_batch))
        y_padded = [self.pad_seq(s, max(y_lengths)) for s in y_batch]
        
        input_var = Variable(torch.LongTensor(x_padded)).transpose(0, 1)
        target_var = Variable(torch.LongTensor(y_padded)).transpose(0, 1)
        
        if USE_CUDA:
            input_var = input_var.cuda()
            target_var = target_var.cuda()
            
        return input_var, x_lengths, target_var, y_lengths
    
    def pad_seq(self, seq, max_len):
        return seq + [0] * (max_len - len(seq))
    
    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return len(self.data)

In [34]:
batch_size = 64
train_data = DataSet(corpus.x_train, corpus.y_train, batch_size)

In [35]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        
    def forward(self, input_seqs, input_lengths, hidden=None):
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
        return outputs, hidden

In [36]:
x_pad, x_lengths, y_pad, y_lengths = random.choice(train_data)

In [37]:
x_pad

Variable containing:
    3    26   354  ...    294     3   102
   27   441   414  ...      3   574  2750
    3     8   337  ...     12   100     2
       ...          ⋱          ...       
 6293   295    12  ...      0     0     0
  120     6     2  ...      0     0     0
    2     2     0  ...      0     0     0
[torch.LongTensor of size 21x64]

In [38]:
embedding = nn.Embedding(10000, 128)
encoder_test = EncoderRNN(10000, 128, embedding)
encoder_test

EncoderRNN(
  (embedding): Embedding(10000, 128)
  (gru): GRU(128, 128, dropout=0.1)
)

In [39]:
encoder_outputs, encoder_hidden = encoder_test(x_pad, x_lengths, None)

In [40]:
encoder_outputs

Variable containing:
( 0 ,.,.) = 
  0.0967  0.0174 -0.1583  ...   0.1991 -0.1848 -0.3645
  0.3320 -0.1663 -0.0866  ...  -0.3093  0.3217  0.3075
 -0.3506  0.0435  0.1432  ...   0.0499 -0.3180 -0.2288
           ...             ⋱             ...          
  0.0418 -0.1770  0.2808  ...   0.3639 -0.3336  0.1129
  0.0967  0.0174 -0.1583  ...   0.1991 -0.1848 -0.3645
  0.3248  0.0684 -0.0827  ...  -0.0618 -0.2805 -0.2774

( 1 ,.,.) = 
  0.0791  0.0415  0.1415  ...  -0.3874 -0.1195 -0.5748
  0.3152  0.1634  0.3865  ...  -0.3589  0.4808 -0.4656
 -0.2568 -0.3121 -0.1265  ...   0.3454  0.2699 -0.2856
           ...             ⋱             ...          
  0.0958 -0.0335  0.0219  ...   0.4159 -0.2700 -0.3298
 -0.0577 -0.0062 -0.2672  ...  -0.1798 -0.3036 -0.4636
 -0.1150  0.3221 -0.2650  ...  -0.3449 -0.0219 -0.1261

( 2 ,.,.) = 
  0.1644  0.0734 -0.0544  ...  -0.0364 -0.2185 -0.5937
  0.3672  0.1820  0.2226  ...  -0.1956 -0.1193 -0.5428
 -0.0770  0.0233 -0.3029  ...   0.3143 -0.0126 -0.3889
   

In [41]:
encoder_hidden

Variable containing:
( 0 ,.,.) = 
 -8.6541e-03 -1.4094e-01 -2.8744e-01  ...  -3.4583e-01  3.2744e-01  1.9311e-02
 -1.0469e-01  3.7888e-01 -7.9401e-02  ...  -1.0945e-01  3.5973e-01 -2.4294e-01
 -2.2767e-01  1.0918e-01  9.6437e-02  ...  -5.2132e-01 -1.1806e-01  1.3289e-01
                 ...                   ⋱                   ...                
 -8.3465e-02 -4.5670e-02  1.3540e-01  ...  -2.8234e-01  8.8925e-02  3.5188e-02
 -3.1121e-01  1.0943e-01 -3.9635e-01  ...  -3.6520e-02  1.2125e-01 -1.9541e-01
 -2.5543e-01  2.5967e-01 -3.4784e-01  ...  -3.0540e-01  1.1317e-01 -5.7166e-02
[torch.FloatTensor of size 1x64x128]

In [42]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, embedding, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()

        # Keep for reference
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, last_hidden):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N

        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(rnn_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden

In [43]:
decoder_test = DecoderRNN(128, 10000, embedding)
decoder_test

DecoderRNN(
  (embedding): Embedding(10000, 128)
  (embedding_dropout): Dropout(p=0.1)
  (gru): GRU(128, 128, dropout=0.1)
  (out): Linear(in_features=128, out_features=10000)
)

In [44]:
max_target_length = max(y_lengths)

In [45]:
decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']] * batch_size))
decoder_hidden = encoder_hidden[:decoder_test.n_layers]

In [46]:
decoder_hidden

Variable containing:
( 0 ,.,.) = 
 -8.6541e-03 -1.4094e-01 -2.8744e-01  ...  -3.4583e-01  3.2744e-01  1.9311e-02
 -1.0469e-01  3.7888e-01 -7.9401e-02  ...  -1.0945e-01  3.5973e-01 -2.4294e-01
 -2.2767e-01  1.0918e-01  9.6437e-02  ...  -5.2132e-01 -1.1806e-01  1.3289e-01
                 ...                   ⋱                   ...                
 -8.3465e-02 -4.5670e-02  1.3540e-01  ...  -2.8234e-01  8.8925e-02  3.5188e-02
 -3.1121e-01  1.0943e-01 -3.9635e-01  ...  -3.6520e-02  1.2125e-01 -1.9541e-01
 -2.5543e-01  2.5967e-01 -3.4784e-01  ...  -3.0540e-01  1.1317e-01 -5.7166e-02
[torch.FloatTensor of size 1x64x128]

In [47]:
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder_test.output_size))

In [48]:
if USE_CUDA:
    all_decoder_outputs = all_decoder_outputs.cuda()
    decoder_input = decoder_input.cuda()

In [49]:
# Run through decoder one time step at a time
for t in range(max_target_length):
    decoder_output, decoder_hidden = decoder_test(
        decoder_input, decoder_hidden
    )
    all_decoder_outputs[t] = decoder_output # Store this step's outputs
    decoder_input = y_pad[t] # Next input is current target

In [50]:
loss = masked_cross_entropy(
    all_decoder_outputs.transpose(0, 1).contiguous(),
    y_pad.transpose(0, 1).contiguous(),
    y_lengths, 
    USE_CUDA
)
print('loss', loss.data[0])

loss 9.22211742401123


In [77]:
def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer, decoder_optimizer):
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Run words through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Run through decoder one time step at a time
    for t in range(max_target_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t] # Next input is current target

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_batches.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths,
        USE_CUDA
    )
    loss.backward()
    
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0], ec, dc

In [105]:
index = random.choice(range(len(corpus.x_test)))
print(index)
input_seq, target_seq = corpus.x_test[index], corpus.y_test[index]
print(' '.join(corpus.ids_to_text(input_seq)))
print(' '.join(corpus.ids_to_text(target_seq)))

15988
bruh this whole time i've been looking for a plug while i had a <unk> charger in my bag 😑 <eos>
but if you're looking for a real plug it my line <eos>


In [151]:
def evaluate_randomly():
    index = random.choice(range(len(corpus.x_test)))
    input_seq, target_seq = corpus.x_test[index], corpus.y_test[index]
    output_words = evaluate(input_seq)
    print("Input:", ' '.join(corpus.ids_to_text(input_seq)))
    print("Target:", ' '.join(corpus.ids_to_text(target_seq)))
    print("Predicted:", ' '.join(output_words))
    


In [154]:
evaluate_randomly()

Input: ( video ) high school <unk> from florida turns all the way up for homecoming - <eos>
Target: support and retweet <eos>
Predicted: <unk> <unk> <unk> . <eos>


In [119]:
def evaluate(input_seq, max_length=20):
    input_lengths = [len(input_seq)]
    input_batches = Variable(torch.LongTensor([input_seq]), volatile=True).transpose(0, 1)
    
    if USE_CUDA:
        input_batches = input_batches.cuda()
        
    # Set to not-training mode to disable dropout
    encoder.eval()
    decoder.eval()
    
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([corpus.word_to_id['<sos>']]), volatile=True) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    # Store output words and attention states
    decoded_words = []
    
    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden
        )

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == corpus.word_to_id['<eos>']:
            decoded_words.append('<eos>')
            break
        else:
            decoded_words.append(corpus.words[ni])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train()
    decoder.train()
    
    return decoded_words

In [155]:
# Configure models
hidden_size = 500
n_layers = 2
dropout = 0.1
batch_size = 50

train_data = DataSet(corpus.x_train, corpus.y_train, batch_size)

# Configure training/optimization
clip = 50.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_epochs = 50000
epoch = 0
plot_every = 20
print_every = 2
evaluate_every = 10

vocab_size = len(corpus.words)
embedding = nn.Embedding(vocab_size, hidden_size)

# Initialize models
encoder = EncoderRNN(vocab_size, hidden_size, embedding, n_layers, dropout=dropout)
decoder = DecoderRNN(hidden_size, vocab_size, embedding, n_layers, dropout=dropout)

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [None]:
# Begin!
ecs = []
dcs = []
eca = 0
dca = 0

while epoch < n_epochs:
    epoch += 1
    
    # Get training data for this cycle
    input_batches, input_lengths, target_batches, target_lengths = random.choice(train_data)

    # Run the train function
    loss, ec, dc = train(
        input_batches, input_lengths, target_batches, target_lengths,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer
    )

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss
    eca += ec
    dca += dc
    
    # job.record(epoch, loss)

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)
        
    if epoch % evaluate_every == 0:
        evaluate_randomly()

#     if epoch % plot_every == 0:
#         plot_loss_avg = plot_loss_total / plot_every
#         plot_losses.append(plot_loss_avg)
#         plot_loss_total = 0
        
#         # TODO: Running average helper
#         ecs.append(eca / plot_every)
#         dcs.append(dca / plot_every)
#         ecs_win = 'encoder grad (%s)' % hostname
#         dcs_win = 'decoder grad (%s)' % hostname
#         vis.line(np.array(ecs), win=ecs_win, opts={'title': ecs_win})
#         vis.line(np.array(dcs), win=dcs_win, opts={'title': dcs_win})
#         eca = 0
#         dca = 0

0m 5s (- 2428m 18s) (2 0%) 9.1916
0m 10s (- 2261m 46s) (4 0%) 9.0860
0m 17s (- 2378m 54s) (6 0%) 8.9007
0m 22s (- 2362m 18s) (8 0%) 8.4297
0m 27s (- 2312m 44s) (10 0%) 7.6483
Input: funny how he has a phd and you don't <eos>
Target: uhh first of all i put the wrong teachers name down that's my fault <eos>
Predicted: <unk> <eos>
0m 32s (- 2276m 48s) (12 0%) 6.9668
0m 37s (- 2243m 0s) (14 0%) 6.6498
