# Paraphrase Generator

## 1. ETL

In [1]:
import pickle

with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

In [2]:
with open("./data/nmt_trainset.pkl", "rb") as file:
    nmt_trainset = pickle.load(file)
with open("./data/nmt_validset.pkl", "rb") as file:
    nmt_validset = pickle.load(file)

## 2. EDA

## 3. Preprocessing

## 4. Preparing the dataloader

In [3]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

pad_idx = dictionary.word2idx['<pad>'] ##get the pad index from the vocab

def collate_batch(batch):
    sent_list, synt_lst, trg_list, adv_list = [], [], [], []
    # print(len(batch))
    # sens_, syns_, trgs_ = batch
    # for sen_, syn_, trg_ in zip(sens_, syns_, trgs_):
    for sen_, syn_, trg_, adv_ in batch:
        processed_sent = torch.tensor(sen_, dtype=torch.int64)
        sent_list.append(processed_sent)
        processed_synt = torch.tensor(syn_, dtype=torch.int64)
        synt_lst.append(processed_synt)
        processed_trg = torch.tensor(trg_, dtype=torch.int64)
        trg_list.append(processed_trg)
        adv_ = torch.tensor(adv_, dtype=torch.float32)
        adv_list.append(adv_)

    return pad_sequence(sent_list, padding_value=pad_idx, batch_first=True), pad_sequence(synt_lst, padding_value=pad_idx, batch_first=True), pad_sequence(trg_list, padding_value=pad_idx, batch_first=True), pad_sequence(adv_list, padding_value=pad_idx, batch_first=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from torch.utils.data import DataLoader
import random

generator = torch.Generator().manual_seed(6969)
train_dataloader = DataLoader(nmt_trainset, batch_size=16, shuffle=True, collate_fn=collate_batch, generator=generator)
valid_dataloader = DataLoader(nmt_validset, batch_size=16, shuffle=False, collate_fn=collate_batch)

In [5]:
# for sen,syn,trg,bow in train_dataloader:
#     print(sen.shape)
#     print(syn.shape)
#     print(trg.shape)
#     print(bow.shape)
#     break

## 5. model

In [6]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator
import numpy as np

#make our work comparable if restarted the kernel
SEED = 6969
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout = 0.1, max_len = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-np.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding):
        # Residual connection + pos encoding
        token_embedding = token_embedding + self.pos_encoding[:token_embedding.size(0), :]
        return self.dropout(token_embedding)

### Transformer

In [8]:
class seq2seqTransformer(nn.Module):
    def __init__(self, input_dim, emb_dim, device, word_dropout = 0.4, dropout = 0.1):
        super(seq2seqTransformer, self).__init__()
        
        self.input_dim = input_dim 
        self.emb_dim = emb_dim 

        self.word_dropout = word_dropout
        self.dropout = dropout

        self.scale = np.sqrt(self.emb_dim)
        self.device = device
        
        # vcocabulary embedding
        self.embedding_encoder = nn.Embedding(input_dim, emb_dim)
        self.embedding_decoder = nn.Embedding(input_dim, emb_dim)
        # positional encoding
        self.positional_encoder = PositionalEncoding(emb_dim, dropout = 0.0)

        self.transformer = nn.Transformer(d_model = emb_dim, nhead = 12, dropout = dropout)
        # linear Transformation
        self.linear = nn.Linear(emb_dim, input_dim)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        # initialize cocabulary matrix weight
        self.embedding_encoder.weight.data.uniform_(-initrange, initrange)
        self.embedding_decoder.weight.data.uniform_(-initrange, initrange)
        # initialize linear weight
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def load_embedding(self, embedding):  #synPG applied with GloVe glove.840B.300d.txt
        self.embedding_encoder.weight.data.copy_(torch.from_numpy(embedding)) 
        self.embedding_decoder.weight.data.copy_(torch.from_numpy(embedding))  

    def generate_square_mask(self, max_sent_len, max_synt_len):
        size = max_sent_len + max_synt_len + 2 #<sos> and <eos>
        mask = torch.zeros((size, size))
        mask[:max_sent_len, max_sent_len:] = float("-inf")
        mask[max_sent_len:, :max_sent_len] = float("-inf")
        return mask
        
    def forward(self, sents, synts, trg):
        #sents : batch_size, sent_len, emb_dim
        #synts : batch_size, synt_len, emb_dim
        #trgs  : batch_size, trg_len, emb_dim 
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2    # count without <sos> and <eos>
        max_targ_len = trg.size(1)   - 2    # count without <sos> and <eos>

        # apply word dropout
        drop_mask = torch.bernoulli(self.word_dropout * torch.ones(max_sent_len)).bool().to(self.device)
        sents = sents.masked_fill(drop_mask, 0)

        #print(sents.shape)
        #print(self.embedding_encoder(sents).shape)

        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale # sent_len, batch_size, emb_dim
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale # synt_len, batch_size, emb_dim
        synt_embeddings = self.positional_encoder(synt_embeddings) # synt_len, batch_size, emb_dim
        encoder_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0) # synt_len + seq_len, batch size, emb_size
       
        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # target => embedding
        decoder_embeddings = self.embedding_decoder(trg[:, :-1]).transpose(0, 1) * self.scale
        decoder_embeddings = self.positional_encoder(decoder_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(max_targ_len+1).to(self.device)
        
        # forward 
        outputs = self.transformer(encoder_embeddings, decoder_embeddings, src_mask=src_mask, tgt_mask=trg_mask)
        #decoder outputs

        # apply linear layer to vocabulary size
        outputs = outputs.transpose(0, 1)
        outputs = self.linear(outputs.contiguous().view(-1, self.emb_dim)) # batch_size*trg_len, input_dim
        outputs = outputs.view(batch_size, max_targ_len + 1, self.input_dim) # batch_size, trg_len, input_dim
        return outputs
    
    def generate(self, sents, synts, max_len = 30, sample=True, temp=0.5):
        #sents  : batch_size, seq_len
        #synts  : batch_size, seq_len
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2  # count without <sos> and <eos>
        max_targ_len = max_len
        
        # output index starts with <sos>
        idxs = torch.zeros((batch_size, max_targ_len+2), dtype=torch.long).to(self.device)
        idxs[:, 0] = 1
        
        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale
        synt_embeddings = self.positional_encoder(synt_embeddings)
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)
        
        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # starting index => embedding
        de_embeddings = self.embedding_decoder(idxs[:, :1]).transpose(0, 1) * self.scale
        de_embeddings = self.positional_encoder(de_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        # encode outputs
        memory = self.transformer.encoder(en_embeddings, mask=src_mask)
        
        # auto-regressively generate output
        for i in range(1, max_targ_len+2):
            
            #if i % 5 == 0:
                #print(f'epoch : {i}')
            
            # decode
            outputs = self.transformer.decoder(de_embeddings, memory, tgt_mask=trg_mask)
            outputs = self.linear(outputs[-1].contiguous().view(-1, self.emb_dim))
            
            # get argmax index or sample index
            if not sample:
                values, idx = torch.max(outputs, 1)
            else:
                probs = F.softmax(outputs/temp, dim=1)
                idx = torch.multinomial(probs, 1).squeeze(1)
            
            # save to output index
            idxs[:, i] = idx
            
            # concatenate index to decoding
            de_embeddings = self.embedding_decoder(idxs[:, :i+1]).transpose(0, 1) * self.scale
            de_embeddings = self.positional_encoder(de_embeddings)
            
            # new sequential mask
            trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        return idxs[:, 1:]

### Glove Embedding

In [9]:
from gensim.test.utils import datapath

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
# glove_file = datapath('/root/synpg/glove.6B.300d.txt')
glove_file = './data/glove.6B.300d.txt'

In [10]:
with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

vocab_dict = dictionary.word2idx

In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

from utils import load_embedding

input_dim   = len(vocab_dict)
emb_dim     = 300  #fasttext
word_dropout = 0.4 #following SynPG
dropout      = 0.1

embedding = load_embedding(glove_file, dictionary)

model = seq2seqTransformer(input_dim=input_dim, emb_dim = emb_dim, device=device, word_dropout = word_dropout, dropout = dropout)
model = model.to(device)
model.load_embedding(embedding)
# model.embedding_encoder.weight.data = fast_embedding #apply fasttext instead of Glove 840b 300d.txt (5.56 GB) TT
# model.embedding_decoder.weight.data = fast_embedding

cuda:0
load 22696 of 31414 from pretrained word embeddings



## 6. Training

In [12]:
from tqdm import tqdm

def train(model, loader, optimizer, criterion, clip, loader_length):
    
    model.train()
    epoch_loss = 0
    for sents_, synts_, trgs_, adv_ in tqdm(loader):

        batch_size   = sents_.size(0)
        max_sent_len = sents_.size(1)
        max_synt_len = synts_.size(1) - 2  # count without <sos> and <eos>
        
        optimizer.zero_grad()

        # Put input into device
        sents_ = sents_.to(device)
        synts_ = synts_.to(device)
        trgs_ = trgs_.to(device)
        
        #forward 
        outputs = model(sents_, synts_, trgs_)

        # calculate loss
        targs_ = trgs_[:, 1:].contiguous().view(-1) #Without <SOS>
        outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
        
        optimizer.zero_grad()

        loss = criterion(outputs_, targs_)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / loader_length

def evaluate(model, loader, criterion, loader_length):

    #turn off dropout (and batch norm if used)
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():

        for sents_, synts_, trgs_, adv_ in tqdm(loader):

            batch_size   = sents_.size(0)
            max_sent_len = sents_.size(1)
            max_synt_len = synts_.size(1) - 2  # count without <sos> and <eos>

            # Put into your device
            sents_ = sents_.to(device)
            synts_ = synts_.to(device)
            trgs_ = trgs_.to(device)
            
            #forward 
            outputs = model(sents_, synts_, trgs_)

            # calculate loss
            targs_ = trgs_[:, 1:].contiguous().view(-1) #Without <SOS>
            outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
            
            loss = criterion(outputs_, targs_)
            
            epoch_loss += loss.item()
        
    return epoch_loss / loader_length

In [13]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
import torch.optim as optim

pad_idx = dictionary.word2idx['<pad>'] ##get the pad index from the vocab

lr = 10e-4 #Following SynPG
wd = 10e-5 #Following SynPG
#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx) #combine softmax with cross entropy

In [15]:
train_loader_length = len(list(iter(train_dataloader)))
val_loader_length   = len(list(iter(valid_dataloader)))
train_loader_length, val_loader_length

(62500, 313)

In [16]:
import time
import math
from tqdm import tqdm
best_valid_loss = float('inf')
num_epochs = 5
clip       = 1

save_path = './models/nmt_paraphase_1m.pt'

train_losses = []
valid_losses = []

for epoch in range(num_epochs):

    start_time = time.time()

    # training
    train_loss = train(model, train_dataloader, optimizer, criterion, clip, train_loader_length)
    valid_loss = evaluate(model, valid_dataloader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # save model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    #lower perplexity is better

100%|██████████| 62500/62500 [1:26:03<00:00, 12.11it/s]
100%|██████████| 313/313 [00:07<00:00, 42.09it/s]


Epoch: 01 | Time: 86m 10s
	Train Loss: 4.166 | Train PPL:  64.453
	 Val. Loss: 4.517 |  Val. PPL:  91.572


100%|██████████| 62500/62500 [1:26:16<00:00, 12.07it/s]
100%|██████████| 313/313 [00:07<00:00, 41.92it/s]


Epoch: 02 | Time: 86m 24s
	Train Loss: 3.624 | Train PPL:  37.496
	 Val. Loss: 4.365 |  Val. PPL:  78.674


100%|██████████| 62500/62500 [1:26:13<00:00, 12.08it/s]
100%|██████████| 313/313 [00:07<00:00, 41.96it/s]


Epoch: 03 | Time: 86m 21s
	Train Loss: 3.623 | Train PPL:  37.438
	 Val. Loss: 4.506 |  Val. PPL:  90.520


100%|██████████| 62500/62500 [1:26:15<00:00, 12.08it/s]
100%|██████████| 313/313 [00:07<00:00, 41.85it/s]


Epoch: 04 | Time: 86m 22s
	Train Loss: 3.669 | Train PPL:  39.218
	 Val. Loss: 4.435 |  Val. PPL:  84.381


100%|██████████| 62500/62500 [1:26:13<00:00, 12.08it/s]
100%|██████████| 313/313 [00:07<00:00, 42.07it/s]

Epoch: 05 | Time: 86m 20s
	Train Loss: 3.696 | Train PPL:  40.284
	 Val. Loss: 4.610 |  Val. PPL: 100.501





In [17]:
from utils import synt2str, sent2str, load_embedding, reverse_bpe
    
def generate(model, loader, vocab_transform, device):
    #turn off dropout (and batch norm if used)
    model.eval()
    epoch_loss = 0
    with open("./eval/target_sents.txt", "w") as fp1, \
         open("./eval/target_synts.txt", "w") as fp2, \
         open("./eval/outputs.txt", "w") as fp3:
        with torch.no_grad():
            #for i, x in enumerate( tqdm(my_list) ):
            for sents_, synts_, trgs_, adv_ in tqdm(loader):

                sents_ = sents_.to(device)
                synts_ = synts_.to(device)
                trgs_  = trgs_.to(device)

                batch_size   = sents_.size(0)
                max_sent_len = sents_.size(1)
                max_synt_len = synts_.size(1) - 2  # count without <sos> and <eos>
                
                # generate
                idxs = model.generate(sents_, synts_, sents_.size(1), temp=0.5)
                
                # # write output
                # for sent, idx, targ, synt_ in zip(sents_, idxs.cpu().numpy(), trgs_, synts_):
                    
                #     print(targ)
                    
                #     fp1.write(targ+'\n')
                #     fp2.write(synt_+'\n')
                #     fp3.write(reverse_bpe(synt2str(idx, vocab_transform))+'\n')
                
                for sent, idx, synt in zip(sents_.cpu().numpy(), idxs.cpu().numpy(), synts_.cpu().numpy()):
                    
                    #print(sent2str(sent, vocab_transform))

                    convert_sent = reverse_bpe(sent2str(sent, vocab_transform).split()) + '\n'
                    convert_synt = synt2str(synt[1:], vocab_transform).replace("<pad>", "") + '\n' 
                    convert_idx = synt2str(idx, vocab_transform) +'\n'
                    
                    fp1.write(convert_sent)
                    fp2.write(convert_synt)
                    fp3.write(convert_idx)
                    
                    # fp1.write(sent2str(sent, vocab_transform) +'\n')
                    # fp2.write(synt2str(synt[1:], vocab_transform)+'\n')
                    # fp3.write(reverse_bpe(synt2str(idx, vocab_transform).replace("<pad>", "")) +'\n')
                    
                    # print(synt2str(synt[1:], vocab_transform)+'\n')
                    # print(sent2str(sent, vocab_transform)+'\n')
                    # print(synt2str(idx, vocab_transform)+'\n')
                    # print("--\n")

## 7. Evaluate

In [18]:
import h5py, os
print("==== loading data ====")
mrpc_set = h5py.File(os.path.join('./test_data/test_data_mrpc.h5'), 'r')
mrpc_set.keys()

==== loading data ====


<KeysViewHDF5 ['sents1', 'sents2', 'synts1', 'synts2']>

In [19]:
mrpc_set['sents1'][0].decode(), mrpc_set['synts2'][0].decode(), mrpc_set['sents2'][0].decode()

("amrozi accused his brother , whom he called `` the witness '' , of deliberately distorting his evidence .",
 "(ROOT (S (VP (VBG referring) (PP (TO to) (NP (PRP him))) (PP (IN as) (NP (NP (RB only) (`` ``) (NP (DT the) (NN witness)) ('' '')) (, ,) (SBAR (IN amrozi) (S (VP (VBN accused) (NP (NP (PRP$ his) (NN brother)) (PP (IN of) (S (ADVP (RB deliberately)) (VP (VBG distorting) (NP (PRP$ his) (NN evidence)))))))))))) (. .)))",
 "referring to him as only `` the witness '' , amrozi accused his brother of deliberately distorting his evidence .")

In [20]:
def is_paren(tok):
    return tok == ")" or tok == "("

def getleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '
    
    leaves = []
    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                leaves.append(arr[n])

    return leaves

def deleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '

    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                arr[n + 1] = ""

    nonleaves = " ".join(arr)
    return nonleaves.split()

In [21]:
#Tokenizer BPE
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs
import numpy as np

# load bpe codes
bpe_codes = codecs.open('./data/bpe.codes', encoding='utf-8')
bpe_vocab = codecs.open('./data/vocab.txt', encoding='utf-8')
bpe_vocab = read_vocabulary(bpe_vocab, 50)
bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

import pickle
with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

def bpe_tokenizer(sent_, target = False):
 # bpe segment and convert to tensor
    sent_ = bpe.segment(sent_).split()
    sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_]
    if target:
        sent_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
    return sent_

#syntax to syntatic tokenzier
from nltk import ParentedTree
def parser_tokenizer(synt_):
    synt_  = ParentedTree.fromstring(synt_)
    synt_ = deleaf(synt_)
    synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx]
    synt_ = [dictionary.word2idx["<sos>"]] + synt_ + [dictionary.word2idx["<eos>"]]
    return synt_

import pickle
with open('synt_vocab.pkl', 'rb') as f:
    synt_vocab = pickle.load(f)

def bow(synt_):
    synt_bow = np.ones(74)
    synt_ = ['<sos>'] + deleaf(synt_) + ['<eos>']
    for tag in synt_:
        if tag != '<sos>' and tag != '<eos>':
            synt_bow[synt_vocab[tag]-3] += 1
    synt_bow /= synt_bow.sum()
    return synt_bow 

In [22]:
from tqdm import tqdm

def prepare_paraphrase_dataset(sent1, synt2, sent2):
    lists_ = list()
    for sen1, syn2, sen2 in tqdm(zip(sent1,synt2,sent2)):
        sent_ = bpe_tokenizer(sen1.decode())
        syn_  = parser_tokenizer(syn2.decode())
        trg_  = bpe_tokenizer(sen2.decode(), target = True)
        bow_  = bow(syn2.decode())
        lists_.append((sent_, syn_, trg_, bow_))
    return lists_

In [23]:
mrpc_dataset = prepare_paraphrase_dataset(mrpc_set['sents1'], mrpc_set['synts2'], mrpc_set['sents2'])

1920it [00:01, 1093.73it/s]


In [24]:
from torch.utils.data import DataLoader
import random

random.seed(6969)
random.shuffle(mrpc_dataset)

test_dataloader = DataLoader(mrpc_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch)

In [25]:
print(device)

from utils import load_embedding

input_dim   = len(vocab_dict)
emb_dim     = 300  #fasttext
word_dropout = 0.4 #following SynPG
dropout      = 0.1

embedding = load_embedding(glove_file, dictionary)

model = seq2seqTransformer(input_dim=input_dim, emb_dim = emb_dim, device=device, word_dropout = word_dropout, dropout = dropout)
model = model.to(device)
model.load_embedding(embedding)

save_path = './models/nmt_paraphase_1m.pt'
model.load_state_dict(torch.load(save_path))

generate(model, test_dataloader, dictionary, device)

cuda:0
load 22696 of 31414 from pretrained word embeddings



100%|██████████| 120/120 [00:25<00:00,  4.69it/s]


## 8. Metric

In [26]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

def cal_bleu(hypothesis, reference, n):
    hypothesis = hypothesis.strip().split(' ')
    reference = reference.strip().split(' ')

    if n == 0:
        return sentence_bleu([reference], hypothesis)
    elif n == 1:
        weights = (1, 0, 0, 0)
    elif n == 2:
        weights = (0, 1, 0, 0)
    elif n == 3:
        weights = (0, 0, 1, 0)
    elif n == 4:
        weights = (0, 0, 0, 1)

    return sentence_bleu([reference], hypothesis, weights=weights)     

In [1]:
from tqdm import tqdm

with open('./eval/target_sents.txt') as fp:
    targs = fp.readlines()
with open('./eval/outputs.txt') as fp:
    preds = fp.readlines()

print(f"number of examples: {len(preds)}")

number of examples: 1920


In [34]:
targs[2], preds[2]

("`` i 'm pleased by the fact that we are bringing this to a conclusion , '' he said .\n",
 "` i 'm sure you 're it to to the wife , ' . '' .\n")

In [30]:
scores = [cal_bleu(pred, targ, 1) for pred, targ in tqdm(zip(preds, targs))]

print(f"BLEU: {np.mean(scores)*100.0}")

1920it [00:00, 7678.20it/s]

BLEU: 26.041819089925017





In [2]:
import numpy as np
from nltk.translate import meteor

def cal_meteor(hypothesis, reference):
    hypothesis = hypothesis.strip().split(' ')
    reference = reference.strip().split(' ')

    return meteor([reference], hypothesis)   

scoresm = [cal_meteor(pred, targ) for pred, targ in tqdm(zip(preds, targs))]
print(f"METHEO: {np.mean(scoresm)*100.0}")

1920it [00:02, 716.24it/s] 

METHEO: 16.89159103185199





In [5]:
from rouge import Rouge
scorer = Rouge()
scoresR = [scorer.get_scores(pred,refs= targ)[0]['rouge-1']['r'] for pred, targ in tqdm(zip(preds, targs))]
scoresP = [scorer.get_scores(pred,refs= targ)[0]['rouge-1']['p'] for pred, targ in tqdm(zip(preds, targs))]
scoresF = [scorer.get_scores(pred,refs= targ)[0]['rouge-1']['f'] for pred, targ in tqdm(zip(preds, targs))]

# # for ind,k in enumerate(scoresโร้ก):
print(f"Rouge-r: {np.mean(scoresR)*100.0}") 
print(f"Rouge-p: {np.mean(scoresP)*100.0}") 
print(f"Rouge-f: {np.mean(scoresF)*100.0}") 

1920it [00:00, 2961.20it/s]
1920it [00:00, 3114.54it/s]
1920it [00:00, 3108.73it/s]

Rouge-r: 26.50552242208582
Rouge-p: 31.260649462989644
Rouge-f: 28.381930033027004



