In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator
import numpy as np
import pickle

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 6969
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


cuda:2


In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout = 0.1, max_len = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-np.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding):
        # Residual connection + pos encoding
        token_embedding = token_embedding + self.pos_encoding[:token_embedding.size(0), :]
        return self.dropout(token_embedding)

In [8]:
class seq2seqTransformer(nn.Module):
    def __init__(self, input_dim, emb_dim, device, word_dropout = 0.4, dropout = 0.1):
        super(seq2seqTransformer, self).__init__()
        self.input_dim = input_dim 
        self.emb_dim = emb_dim 
        self.word_dropout = word_dropout
        self.dropout = dropout
        self.device = device
        
        self.scale = np.sqrt(self.emb_dim)
        # self.scale = torch.sqrt(torch.IntTensor([self.hid_dim])).to(device)

        # vcocabulary embedding
        self.embedding_encoder = nn.Embedding(input_dim, emb_dim)
        self.embedding_decoder = nn.Embedding(input_dim, emb_dim)
        # positional encoding
        self.pos_encoder = PositionalEncoding(emb_dim, dropout = 0.0)

        self.transformer = nn.Transformer(d_model = emb_dim, nhead = 12, dropout = dropout)
        # linear Transformation
        self.linear = nn.Linear(emb_dim, input_dim)
        self.init_weights()

    def load_embedding(self, embedding):  #synPG applied with GloVe glove.840B.300d.txt
        self.embedding_encoder.weight.data.copy_(torch.from_numpy(embedding)) 
        self.embedding_decoder.weight.data.copy_(torch.from_numpy(embedding))  

    def init_weights(self):
        initrange = 0.1
        # initialize cocabulary matrix weight
        self.embedding_encoder.weight.data.uniform_(-initrange, initrange)
        self.embedding_decoder.weight.data.uniform_(-initrange, initrange)
        # initialize linear weight
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def generate_square_mask(self, max_sent_len, max_synt_len):
        size = max_sent_len + max_synt_len + 2 #<sos> and <eos>
        mask = torch.zeros((size, size))
        mask[:max_sent_len, max_sent_len:] = float("-inf")
        mask[max_sent_len:, :max_sent_len] = float("-inf")
        return mask
        
    def forward(self, sents, synts, trg):
        #sents  : batch_size, seq_len
        #synts  : batch_size, seq_len
        #trgs   : batch_size, seq_len 
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2    # count without <sos> and <eos>
        max_targ_len = trg.size(1) - 2      # count without <sos> and <eos>

        # apply word dropout
        drop_mask = torch.bernoulli(self.word_dropout * torch.ones(max_sent_len)).bool().to(self.device)
        #sents = sents.masked_fill(drop_mask, -1e10)
        sents = sents.masked_fill(drop_mask, 0)

        #print(sents.shape)
        #print(self.embedding_encoder(sents).shape)

        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale
        #sent_emb = [seq_len, batch size, emb_size]
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale
        synt_embeddings = self.pos_encoder(synt_embeddings) 
        #synt_emb = [seq_len, batch size, emb_size]
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)
        #en_emb = [seq_len, batch size, emb_size*2]

        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # target => embedding
        de_embeddings = self.embedding_decoder(trg[:, :-1]).transpose(0, 1) * self.scale
        de_embeddings = self.pos_encoder(de_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(max_targ_len+1).to(self.device)
        
        # forward
        outputs = self.transformer(en_embeddings, de_embeddings, src_mask=src_mask, tgt_mask=trg_mask)
        
        # apply linear layer to vocabulary size
        outputs = outputs.transpose(0, 1)
        outputs = self.linear(outputs.contiguous().view(-1, self.emb_dim))
        outputs = outputs.view(batch_size, max_targ_len + 1, self.input_dim)
        #output = [batch size, trg_len, vocab_size]
        return outputs
    
    def generate(self, sents, synts, max_len = 30, sample=True, temp=0.5):
        #sents  : batch_size, seq_len
        #synts  : batch_size, seq_len
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2  # count without <sos> and <eos>
        max_targ_len = max_len
        
        # output index starts with <sos>
        idxs = torch.zeros((batch_size, max_targ_len+2), dtype=torch.long).to(self.device)
        idxs[:, 0] = 1
        
        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale
        synt_embeddings = self.pos_encoder(synt_embeddings)
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)
        
        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # starting index => embedding
        de_embeddings = self.embedding_decoder(idxs[:, :1]).transpose(0, 1) * self.scale
        de_embeddings = self.pos_encoder(de_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        # encode
        memory = self.transformer.encoder(en_embeddings, mask=src_mask)
        
        # auto-regressively generate output
        for i in range(1, max_targ_len+2):
            
            #if i % 5 == 0:
                #print(f'epoch : {i}')
            
            # decode
            outputs = self.transformer.decoder(de_embeddings, memory, tgt_mask=trg_mask)
            outputs = self.linear(outputs[-1].contiguous().view(-1, self.emb_dim))
            
            # get argmax index or sample index
            if not sample:
                values, idx = torch.max(outputs, 1)
            else:
                probs = F.softmax(outputs/temp, dim=1)
                idx = torch.multinomial(probs, 1).squeeze(1)
            
            # save to output index
            idxs[:, i] = idx
            
            # concatenate index to decoding
            de_embeddings = self.embedding_decoder(idxs[:, :i+1]).transpose(0, 1) * self.scale
            de_embeddings = self.pos_encoder(de_embeddings)
            
            # new sequential mask
            trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        return idxs[:, 1:]

In [9]:
from gensim.test.utils import datapath

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
# glove_file = datapath('/root/synpg/glove.6B.300d.txt')
glove_file = './Datasets/glove.6B.300d.txt'

In [10]:
with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

vocab_dict = dictionary.word2idx

## QQP

In [11]:
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = "cpu"
print(device)

from utils import load_embedding

input_dim   = len(vocab_dict)
emb_dim     = 300  #fasttext
word_dropout = 0.4 #following SynPG
dropout      = 0.1

save_path = './models/qq_Paraphrase_1m.pt'

embedding = load_embedding(glove_file, dictionary)

model = seq2seqTransformer(input_dim=input_dim, emb_dim = emb_dim, device=device, word_dropout = word_dropout, dropout = dropout)
model = model.to(device)
model.load_embedding(embedding)
model.load_state_dict(torch.load(save_path, map_location=device))

cuda:2
load 22696 of 31414 from pretrained word embeddings



<All keys matched successfully>

In [2]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

def cal_bleu(hypothesis, reference, n):
    hypothesis = hypothesis.strip().split(' ')
    reference = reference.strip().split(' ')

    if n == 0:
        return sentence_bleu([reference], hypothesis)
    elif n == 1:
        weights = (1, 0, 0, 0)
    elif n == 2:
        weights = (0, 1, 0, 0)
    elif n == 3:
        weights = (0, 0, 1, 0)
    elif n == 4:
        weights = (0, 0, 0, 1)

    return sentence_bleu([reference], hypothesis, weights=weights)     

In [3]:
from tqdm import tqdm

with open('./eval/target_sents.txt') as fp:
    targs_nmt = fp.readlines()
with open('./eval/outputs.txt') as fp:
    preds_nmt = fp.readlines()

with open('./eval/target_sents_qq.txt') as fp:
    targs_qq = fp.readlines()
with open('./eval/outputs_qq.txt') as fp:
    preds_qq = fp.readlines()

In [4]:
import numpy as np
from nltk.translate import meteor

def cal_meteor(hypothesis, reference):
    hypothesis = hypothesis.strip().split(' ')
    reference = reference.strip().split(' ')

    return meteor([reference], hypothesis)    

In [14]:
scores = [cal_bleu(pred, targ, 0) for pred, targ in zip(preds_qq, targs_qq)]

print(f"BLEU: {np.mean(scores)*100.0}")

BLEU: 3.5626649255114042


In [16]:
scores = [cal_meteor(pred, targ) for pred, targ in tqdm(zip(preds_qq, targs_qq))]

print(f"METEOR: {np.mean(scores)*100.0}")

300000it [04:19, 1157.58it/s]

METEOR: 29.364122519615808





In [17]:
from rouge import Rouge

rouge_scorer = Rouge()

scores = [rouge_scorer.get_scores(pred, targ, avg=True) for pred, targ in tqdm(zip(preds_qq, targs_qq))]

300000it [01:49, 2748.38it/s]


In [20]:
scores[0]['rouge-l']['r']

0.35714285714285715

In [21]:
temp_rouge_lr = temp_rouge_lp = temp_rouge_lf = list()

for result in scores:
    temp_rouge_lr.append(result['rouge-l']['r'])
    temp_rouge_lp.append(result['rouge-l']['p'])
    temp_rouge_lf.append(result['rouge-l']['f'])

print(f'Average rouge l-r {np.mean(temp_rouge_lr)}')
print(f'Average rouge l-p {np.mean(temp_rouge_lp)}')
print(f'Average rouge l-f {np.mean(temp_rouge_lf)}')

Average rouge l-r 0.37538309912074685
Average rouge l-p 0.37538309912074685
Average rouge l-f 0.37538309912074685


In [None]:
# QQP
scores = [cal_bleu(pred, pred, 0) for pred, targ in zip(preds_qq, targs_qq)]

print(f"BLEU: {np.mean(scores)*100.0}")


### NMT

In [22]:
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = "cpu"
print(device)

from utils import load_embedding

input_dim   = len(vocab_dict)
emb_dim     = 300  #fasttext
word_dropout = 0.4 #following SynPG
dropout      = 0.1

embedding = load_embedding(glove_file, dictionary)

model = seq2seqTransformer(input_dim=input_dim, emb_dim = emb_dim, device=device, word_dropout = word_dropout, dropout = dropout)
model = model.to(device)
model.load_embedding(embedding)

save_path = './models/nmt_paraphase_1m.pt'
model.load_state_dict(torch.load(save_path))
# model.embedding_encoder.weight.data = fast_embedding #apply fasttext instead of Glove 840b 300d.txt (5.56 GB) TT
# model.embedding_decoder.weight.data = fast_embedding

#generate(model, valid_dataloader, criterion, val_loader_length, dictionary)

cuda:2
load 22696 of 31414 from pretrained word embeddings



<All keys matched successfully>

In [23]:
scores = [cal_bleu(pred, targ, 0) for pred, targ in zip(preds_nmt, targs_nmt)]

print(f"BLEU: {np.mean(scores)*100.0}")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU: 2.1898707901576775


In [24]:
scores = [cal_meteor(pred, targ) for pred, targ in tqdm(zip(preds_nmt, targs_nmt))]

print(f"METEOR: {np.mean(scores)*100.0}")

300000it [04:21, 1148.56it/s]

METEOR: 29.40658699741652





In [25]:
from rouge import Rouge

rouge_scorer = Rouge()

scores = [rouge_scorer.get_scores(pred, targ, avg=True) for pred, targ in tqdm(zip(preds_qq, targs_qq))]

300000it [01:49, 2748.00it/s]


In [26]:
temp_rouge_lr = temp_rouge_lp = temp_rouge_lf = list()

for result in scores:
    temp_rouge_lr.append(result['rouge-l']['r'])
    temp_rouge_lp.append(result['rouge-l']['p'])
    temp_rouge_lf.append(result['rouge-l']['f'])

print(f'Average rouge l-r {np.mean(temp_rouge_lr)}')
print(f'Average rouge l-p {np.mean(temp_rouge_lp)}')
print(f'Average rouge l-f {np.mean(temp_rouge_lf)}')

Average rouge l-r 0.37538309912074685
Average rouge l-p 0.37538309912074685
Average rouge l-f 0.37538309912074685


## COPY-INPUT

In [5]:
scores = [cal_bleu(targ, targ, 0) for pred, targ in zip(preds_qq, targs_qq)]

print(f"BLEU: {np.mean(scores)*100.0}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU: 97.812


In [6]:
scores = [cal_meteor(targ, targ) for pred, targ in tqdm(zip(preds_qq, targs_qq))]

print(f"METEOR: {np.mean(scores)*100.0}")

300000it [00:13, 22843.64it/s]

METEOR: 99.74582727897156





In [7]:
from rouge import Rouge

rouge_scorer = Rouge()

scores = [rouge_scorer.get_scores(targ, targ, avg=True) for pred, targ in tqdm(zip(preds_qq, targs_qq))]
temp_rouge_lr = temp_rouge_lp = temp_rouge_lf = list()

for result in scores:
    temp_rouge_lr.append(result['rouge-l']['r'])
    temp_rouge_lp.append(result['rouge-l']['p'])
    temp_rouge_lf.append(result['rouge-l']['f'])

print(f'Average rouge l-r {np.mean(temp_rouge_lr)}')
print(f'Average rouge l-p {np.mean(temp_rouge_lp)}')
print(f'Average rouge l-f {np.mean(temp_rouge_lf)}')

300000it [01:46, 2806.31it/s]


Average rouge l-r 0.9999999983333314
Average rouge l-p 0.9999999983333314
Average rouge l-f 0.9999999983333314


In [8]:
scores = [cal_bleu(targ, targ, 0) for pred, targ in zip(preds_nmt, targs_nmt)]

print(f"BLEU: {np.mean(scores)*100.0}")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU: 96.68266666666666


In [9]:
scores = [cal_meteor(targ, targ) for pred, targ in tqdm(zip(preds_nmt, targs_nmt))]

print(f"METEOR: {np.mean(scores)*100.0}")

300000it [00:12, 23263.44it/s]

METEOR: 99.74372844691173





In [10]:
from rouge import Rouge

rouge_scorer = Rouge()

scores = [rouge_scorer.get_scores(targ, targ, avg=True) for pred, targ in tqdm(zip(preds_nmt, targs_nmt))]
temp_rouge_lr = temp_rouge_lp = temp_rouge_lf = list()

for result in scores:
    temp_rouge_lr.append(result['rouge-l']['r'])
    temp_rouge_lp.append(result['rouge-l']['p'])
    temp_rouge_lf.append(result['rouge-l']['f'])

print(f'Average rouge l-r {np.mean(temp_rouge_lr)}')
print(f'Average rouge l-p {np.mean(temp_rouge_lp)}')
print(f'Average rouge l-f {np.mean(temp_rouge_lf)}')

300000it [02:02, 2444.37it/s]


Average rouge l-r 0.9999999983333314
Average rouge l-p 0.9999999983333314
Average rouge l-f 0.9999999983333314
