In [1]:
import os
import json
import random
import pickle
import math
from pathlib import Path
from utils import Tokenizer, Embedding
from torch.nn.utils import clip_grad_norm
from dataset import Seq2SeqDataset
from tqdm import tqdm
from tqdm import tnrange
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import time

In [2]:
with open("../datasets/seq2seq/train.pkl", "rb") as f:
    training = pickle.load(f)
with open("../datasets/seq2seq/valid.pkl", "rb") as f:
    valid = pickle.load(f)
with open("../datasets/seq2seq/embedding.pkl", "rb") as f:
    embedding = pickle.load(f)
    
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.__version__)

cuda
1.0.0


In [3]:
import math
import torch
import random
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_size, embedding, embedding_dim, hidden_size,
                 n_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        emb = nn.Embedding(input_size, embedding_dim)
        emb.weight.data.copy_(embedding.vectors)
        self.embedding = emb
        self.embedding.weight.requires_grad = False
        self.rnn = nn.GRU(300, hidden_size, n_layers, dropout = dropout ,bidirectional=True)
        self.dropout = nn.Dropout(dropout, inplace=False)

    def forward(self, src, hidden=None):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # sum bidirectional outputs
        hidden = torch.tanh(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
#         print(hidden)
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, embedding, embedding_dim, enc_hidden_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hidden_size = hid_dim
        self.n_layers = n_layers
        self.emb_dim = embedding_dim
    
        emb = nn.Embedding(output_dim, embedding_dim)
        emb.weight.data.copy_(embedding.vectors)
        self.embedding = emb
        
        self.rnn = nn.GRU(embedding_dim, hid_dim, dropout=0.5)
        
        self.fc_out = nn.Linear(hid_dim, output_dim , bias=True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, hidden = self.rnn(embedded, hidden)
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        prediction = self.fc_out(output.squeeze(0))
        #prediction = [batch size, output dim]
        return prediction, hidden



class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
#         assert encoder.hidden_size == decoder.hidden_size, \
#             "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
#         print(src)
        encoder_outputs, hidden = self.encoder(src)
        hidden = hidden.unsqueeze(0)
#         hidden = hidden.view(self.encoder.n_layers, batch_size, -1)
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden = self.decoder(input, hidden)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs
    

In [10]:
INPUT_DIM = len(embedding.vocab)
OUTPUT_DIM = len(embedding.vocab)
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
ENC_HID_DIM = 128
DEC_HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, embedding,ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, embedding, ENC_EMB_DIM,ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)
# model.load_state_dict(torch.load('./save/seq2seq(no_attention)-model3.pt'))

In [6]:
# def init_weights(m):
#     for name, param in m.named_parameters():
#         nn.init.uniform_(param.data, -0.08, 0.08)
        
# model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(97513, 300)
    (rnn): GRU(300, 128, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(97513, 300)
    (rnn): GRU(300, 256, dropout=0.5)
    (fc_out): Linear(in_features=256, out_features=97513, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [11]:
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index = 0)

In [12]:
from tqdm.notebook import tqdm
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    print_every = 500
    print_loss = 0

    for i, batch in tqdm(enumerate(iterator)):
        src = batch['text'].t().to(device)
        trg = batch['summary'].t().to(device)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        print_loss += loss.item()
        if(i % print_every == 0 and i):
            print("avg training loss:", print_loss/print_every)
            print_loss = 0

        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
#     model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch['text'].t().to(device)
            trg = batch['summary'].t().to(device)
            output = model(src, trg, 0) #turn off teacher forcing
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [13]:
N_EPOCHS = 10
CLIP = 30
BATCH_SIZE = 16

best_valid_loss = float('inf')
train_iterator = torch.utils.data.DataLoader(training, BATCH_SIZE, shuffle=True,collate_fn=valid.collate_fn)
valid_iterator = torch.utils.data.DataLoader(valid, BATCH_SIZE, shuffle=True,collate_fn=valid.collate_fn)

for epoch in tnrange(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './save/seq2seq(no_attention)-model3.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  if __name__ == '__main__':


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

avg training loss: 5.007659989356995
avg training loss: 4.97812202835083
avg training loss: 4.988719362258911
avg training loss: 4.976480197906494
avg training loss: 4.98577742099762
avg training loss: 4.997849676132202
avg training loss: 5.012466372013092
avg training loss: 4.982708010673523

Epoch: 01 | Time: 27m 27s
	Train Loss: 4.989 | Train PPL: 146.772
	 Val. Loss: 6.423 |  Val. PPL: 616.011


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

avg training loss: 4.92975253200531
avg training loss: 4.925217782497406
avg training loss: 4.94553226184845
avg training loss: 4.920001559257507
avg training loss: 4.919572986125946
avg training loss: 4.903115026473999
avg training loss: 4.9238519268035885
avg training loss: 4.934251097202301

Epoch: 02 | Time: 27m 24s
	Train Loss: 4.925 | Train PPL: 137.705
	 Val. Loss: 6.399 |  Val. PPL: 601.464


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [17]:
SOS_token = 1
EOS_token = 2

def generate_pair(data):
    pairs = []
    input_length = len(data.data)
    for i in range(input_length):
        input = data.__getitem__(i)['text']
        output = data.__getitem__(i)['summary']
        id = data.__getitem__(i)['id']
        pairs.append((input, output, id))
    return pairs


class Sentence:
    def __init__(self, decoder_hidden, last_idx=SOS_token, sentence_idxes=[], sentence_scores=[]):
        if(len(sentence_idxes) != len(sentence_scores)):
            raise ValueError("length of indexes and scores should be the same")
        self.decoder_hidden = decoder_hidden
        self.last_idx = last_idx
        self.sentence_idxes =  sentence_idxes
        self.sentence_scores = sentence_scores

    def avgScore(self):
        if len(self.sentence_scores) == 0:
            raise ValueError("Calculate average score of sentence, but got no word")
        # return mean of sentence_score
        return sum(self.sentence_scores) / len(self.sentence_scores)

    def addTopk(self, topi, topv, decoder_hidden, beam_size, voc):
        topv = torch.log(topv)
        terminates, sentences = [], []
        for i in range(beam_size):
            if topi[0][i] == EOS_token:
                terminates.append(([voc[idx.item()] for idx in self.sentence_idxes] + ['<EOS>'],
                                   self.avgScore())) # tuple(word_list, score_float
                continue
            idxes = self.sentence_idxes[:] # pass by value
            scores = self.sentence_scores[:] # pass by value
            idxes.append(topi[0][i])
            scores.append(topv[0][i])
            sentences.append(Sentence(decoder_hidden, topi[0][i], idxes, scores))
        return terminates, sentences

    def toWordScore(self, voc):
        words = []
        for i in range(len(self.sentence_idxes)):
            if self.sentence_idxes[i] == EOS_token:
                words.append('<EOS>')
            else:
                words.append(voc[self.sentence_idxes[i].item()])
        if self.sentence_idxes[-1] != EOS_token:
            words.append('<EOS>')
        return (words, self.avgScore())

def beam_decode(decoder, decoder_hidden, encoder_outputs, voc, beam_size, max_length=40):
    terminal_sentences, prev_top_sentences, next_top_sentences = [], [], []
    prev_top_sentences.append(Sentence(decoder_hidden))
    for i in range(max_length):
        for sentence in prev_top_sentences:
            decoder_input = torch.LongTensor([sentence.last_idx])
            decoder_input = decoder_input.to(device)
            decoder_hidden = sentence.decoder_hidden
            decoder_output, decoder_hidden, _ = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            topv, topi = decoder_output.topk(beam_size)
            term, top = sentence.addTopk(topi, topv, decoder_hidden, beam_size, voc)
            terminal_sentences.extend(term)
            next_top_sentences.extend(top)

        next_top_sentences.sort(key=lambda s: s.avgScore(), reverse=True)
        prev_top_sentences = next_top_sentences[:beam_size]
        next_top_sentences = []

    terminal_sentences += [sentence.toWordScore(voc) for sentence in prev_top_sentences]
    terminal_sentences.sort(key=lambda x: x[1], reverse=True)

    n = min(len(terminal_sentences), 15)
    return terminal_sentences[:n]

def decode(decoder, decoder_hidden, encoder_outputs, voc, max_length=40):

    decoder_input = torch.LongTensor([SOS_token])
    decoder_input = decoder_input.to(device)

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length) #TODO: or (MAX_LEN+1, MAX_LEN+1)
#     print(decoder_hidden.size())

    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
        _, topi = decoder_output.topk(3)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(voc[ni.item()])

        decoder_input = torch.LongTensor([ni])
        decoder_input = decoder_input.to(device)

    return decoded_words, decoder_attentions[:di + 1]


def evaluate(encoder, decoder, voc, sentence, beam_size = 1, max_length=40):
    indexes_batch = [sentence] #[1, seq_len]
    lengths = [len(indexes) for indexes in indexes_batch]
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    batch_size = input_batch.size(1)
    encoder_outputs, encoder_hidden = encoder(input_batch, None)

    decoder_hidden = encoder_hidden.view(encoder.n_layers, batch_size, -1)
    if beam_size == 1:
        return decode(decoder, decoder_hidden, encoder_outputs, voc)
    else:
        return beam_decode(decoder, decoder_hidden, encoder_outputs, voc, beam_size)


def evaluateRandomly(encoder, decoder, tokenizer, voc, pairs, reverse, beam_size, n=10):
    for _ in range(n):
        pair = random.choice(pairs)
        print("=============================================================")
        if reverse:
            print('>', " ".join(reversed(pair[0].split())))
        else:
            print('>', tokenizer.decode(pair[0]))
            print('=', tokenizer.decode(pair[1]))
        if beam_size == 1:
            output_words, _ = evaluate(encoder, decoder, voc, pair[0], beam_size)
            output_sentence = ' '.join(output_words)
            print('<', output_sentence)
        else:
            output_words_list = evaluate(encoder, decoder, voc, pair[0], beam_size)
            for output_words, score in output_words_list:
                output_sentence = ' '.join(output_words)
                print("{:.3f} < {}".format(score, output_sentence))

In [15]:
tokenizer = Tokenizer(embedding.vocab, lower=False)
pairs = generate_pair(valid)
evaluateRandomly(enc, dec, tokenizer, embedding.vocab, pairs, False, 1, 10)

> the case relates to a visit last year from a lawyer from the international criminal court ( icc ) who was accused of passing information to mr gaddafi . <unk> mr gaddafi has also been indicted for war crimes during the 2011 uprising . <unk> both libya and the icc claim jurisdiction for that trial . <unk> when asked whether he was in good health , mr gaddafi said that he was and gave a thumbs - up sign , the bbc 's foreign editor , john simpson , reports from <unk> . <unk> mr gaddafi faces charges of complicity in exchanging information , obtaining documents that threaten national security and insulting the national flag . <unk> evidence was briefly presented at the hearing , including a pen with a camera in it and a watch , which the prosecution alleges were used in passing illicit information . <unk> representatives from local and international human rights organisations were also present at the hearing . <unk> the trial has been postponed until 19 september . <unk> in june last yea

< a man - old man has been arrested in connection with the death of a man in a . <unk> <EOS>
> the ex - england hooker , 57 , won the competition with devon in 2004 as a player and as coach in 2007 , and played in cornwall 's triumphant 1991 campaign . <unk> dawe has been cornwall head coach since 2013 , leading them to victories in the 2015 and 2016 twickenham finals . <unk> " it 's a nice local game for me , but i 'm travelling as a <unk> , " he said . <unk> " i 'm 100 % behind cornwall , there 's no emotion at all . " <unk> the game will be played at ivybridge on saturday , with cornwall favourites as they continue their pursuit of a hat - trick of county championship titles against a devon side who were only promoted to division one because of a restructuring of the competition . <unk> devon head coach dan parkes played under dawe at plymouth albion , and acknowledges his side are underdogs . <unk> " it 's going to be hard , the boys are going to be up against it , " he told bbc sp

In [20]:
model = Seq2Seq(enc, dec, device)
model.load_state_dict(torch.load('./save/seq2seq(no_attention)-model-final.pt'))

FileNotFoundError: [Errno 2] No such file or directory: './save/seq2seq(no_attention)-model-final.pt'

In [18]:
import json

def predict_out(list_dict, file_path):
    with open(file_path , 'w') as outfile:
        for entry in list_dict:
            json.dump(entry, outfile)
            outfile.write('\n')
            
def predict(encoder, decoder, voc, pairs):
    n = len(pairs)
    out = []
    show_per = 4000
    for i in tnrange(n):
        predict = {}
        pair = pairs[i]
        output_words, attention = evaluate(encoder, decoder, voc, pair[0])
        output_sentence = ' '.join(output_words)
        if(i % show_per == 0):
            print('>', tokenizer.decode(pair[0]))
            print('<', output_sentence[:-6])
#             show_attention(tokenizer.decode(pair[0]), output_sentence, attention)
        predict['id'] = pair[2]
        predict['predict'] = output_sentence[:-6]
        out.append(predict)
    return out

pairs = generate_pair(valid)
out = predict(model.encoder, model.decoder, embedding.vocab, pairs)
predict_out(out, "../data/seq2seq_output.jsonl")

  


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

> the building was to be pulled down as part of a redevelopment of the lawn complex in lincoln . <unk> bosses at woodside wildlife park have now stepped in to save it and will move it to their site near <unk> . <unk> it will be used as an attraction housing exotic animals and coral reef aquariums . <unk> the conservatory is named after the lincolnshire botanist who travelled with captain james cook on his first voyage to the south pacific in 1768 . <unk> the grade ii listed lawn complex was sold by city of lincoln council last year to the stokes coffee company which plans to open a cafe and museum on the site . <unk> neil <unk> , director of the wildlife park , said he wanted to save the building , which housed exotic plants and koi carp and was popular with generations of families . <unk> " i , like a lot of people , spent my younger days coming here and bringing my children here , " he said . <unk> " when i heard that it was being demolished and closing down , i thought we were proba

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

