## 1.ETL

In [1]:
import pandas as pd
qq = pd.read_csv('./Datasets/quora_question.csv')
qq.drop(columns=['test_id','question2'], inplace=True)
qq.shape

(2345796, 1)

In [2]:
random_sample = qq.sample(n=1000,random_state=6969) #try only 1000 samples
random_sample.shape

(1000, 1)

In [3]:
random_sample.head()

Unnamed: 0,question1
614123,Why won't China let Pope Francis visit?
795359,"Is it common to say ""you are welcome"" in when ..."
2209942,"Do G+ ""plus ones"" on posts actually do anythin..."
1383030,Can llp give loan to its partners?
529755,How many medals become won in Olympics ?


In [4]:
qq1000 = random_sample['question1'].values.tolist()

In [5]:
import nltk
# benepar.download('benepar_en3')
import benepar, spacy
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

def constituency_parser(text):
    doc = nlp(text)
    sent = list(doc.sents)[0]
    return  "(ROOT "+sent._.parse_string+")"

In [6]:
from tqdm import tqdm_notebook
train_data = list()
for idx in tqdm_notebook(range(len(qq1000))):
    train_data.append([qq1000[idx],constituency_parser(qq1000[idx])])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(len(qq1000))):


  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
#Load data
train_data = pd.DataFrame(train_data)
train_data.rename(columns={0:'sentence',1:'parser'},inplace=True)
train_data.head()

Unnamed: 0,sentence,parser
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...


In [33]:
# def target(text):
#     return  "<SOS> "+ text + " <EOS>"

train_data['target'] = train_data['sentence'] #.apply(target)

In [34]:
#Proprocessed Data
train_data.head()

Unnamed: 0,sentence,parser,target
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...,Why won't China let Pope Francis visit?
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...,"Is it common to say ""you are welcome"" in when ..."
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ...","Do G+ ""plus ones"" on posts actually do anythin..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...,Can llp give loan to its partners?
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...,How many medals become won in Olympics ?


In [35]:
def is_paren(tok):
    return tok == ")" or tok == "("

def deleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '

    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                arr[n + 1] = ""

    nonleaves = " ".join(arr)
    return nonleaves.split()

from nltk import ParentedTree

def Parsertokenize(synt_):
    synt_ = ParentedTree.fromstring(synt_)
    synt_ = deleaf(synt_)
    synt_ = [f'<{w}>' for w in synt_]
    return synt_

In [49]:
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs

# load bpe codes
bpe_codes = codecs.open('./data/bpe.codes', encoding='utf-8')
bpe_vocab = codecs.open('./data/vocab.txt', encoding='utf-8')
bpe_vocab = read_vocabulary(bpe_vocab, 50)
bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

def bpetokenize(sent_, target = False):
 # bpe segment and convert to tensor
    sent_ = bpe.segment(sent_).split()
    if target:
        sent_.insert(0, "<SOS>")
        sent_.insert(-1, "<EOS>")
    return sent_

In [50]:
train_load = pd.DataFrame()
train_load['sentence'] = train_data['sentence'].apply(bpetokenize)
train_load['parser'] = train_data['parser'].apply(Parsertokenize)
train_load['target'] = train_data['target'].apply(bpetokenize, target = True)

In [51]:
train_load.head()

Unnamed: 0,sentence,parser,target
0,"[W@@, hy, won@@, 't, C@@, h@@, ina, let, P@@, ...","[<(>, <ROOT>, <(>, <SBARQ>, <(>, <WHADVP>, <(>...","[<SOS>, W@@, hy, won@@, 't, C@@, h@@, ina, let..."
1,"[I@@, s, it, common, to, say, ""@@, you, are, w...","[<(>, <ROOT>, <(>, <SQ>, <(>, <VBZ>, <)>, <(>,...","[<SOS>, I@@, s, it, common, to, say, ""@@, you,..."
2,"[D@@, o, G@@, +, ""@@, plus, on@@, es@@, "", on,...","[<(>, <ROOT>, <(>, <SQ>, <(>, <VBP>, <)>, <(>,...","[<SOS>, D@@, o, G@@, +, ""@@, plus, on@@, es@@,..."
3,"[C@@, an, ll@@, p, give, loan, to, its, partne...","[<(>, <ROOT>, <(>, <SQ>, <(>, <MD>, <)>, <(>, ...","[<SOS>, C@@, an, ll@@, p, give, loan, to, its,..."
4,"[H@@, ow, many, medals, become, won, in, O@@, ...","[<(>, <ROOT>, <(>, <SBARQ>, <(>, <WHNP>, <(>, ...","[<SOS>, H@@, ow, many, medals, become, won, in..."


In [125]:
train_list = train_load.values.tolist()
print(train_list[0][0])
print(train_list[0][1])
print(train_list[0][2])

['W@@', 'hy', 'won@@', "'t", 'C@@', 'h@@', 'ina', 'let', 'P@@', 'op@@', 'e', 'F@@', 'ran@@', 'cis', 'visit@@', '?']
['<(>', '<ROOT>', '<(>', '<SBARQ>', '<(>', '<WHADVP>', '<(>', '<WRB>', '<)>', '<)>', '<(>', '<SQ>', '<(>', '<MD>', '<)>', '<(>', '<RB>', '<)>', '<(>', '<NP>', '<(>', '<NNP>', '<)>', '<)>', '<(>', '<VP>', '<(>', '<VB>', '<)>', '<(>', '<S>', '<(>', '<NP>', '<(>', '<NNP>', '<)>', '<(>', '<NNP>', '<)>', '<)>', '<(>', '<VP>', '<(>', '<VB>', '<)>', '<)>', '<)>', '<)>', '<)>', '<(>', '<.>', '<)>', '<)>', '<)>']
['<SOS>', 'W@@', 'hy', 'won@@', "'t", 'C@@', 'h@@', 'ina', 'let', 'P@@', 'op@@', 'e', 'F@@', 'ran@@', 'cis', 'visit@@', '<EOS>', '?']


In [14]:
import pickle
with open('./data/dictionary.pkl', 'rb') as f:
    vocab_transform = pickle.load(f)
vocab_dict = vocab_transform.word2idx

In [28]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy',language='en_core_web_sm')
text_pipeline = lambda x: [vocab_dict[x_] if x_ in vocab_dict else vocab_dict["<unk>"] for x_ in x ]

In [29]:
setence_token_id = text_pipeline(train_load['sentence'].iloc[0])

In [30]:
import torch 

torch.tensor(setence_token_id, dtype=torch.int64)

tensor([21396, 16180, 14060, 16778, 14933,   775,  2824,   216, 22548,  1939,
          614, 23229,  3860, 18657, 28121,   119])

In [128]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

pad_idx = vocab_dict['<pad>'] ##get the pad index from the vocab

def collate_batch(batch):
    sent_list, synt_lst, trg_list = [], [], []
    # print(len(batch))
    # sens_, syns_, trgs_ = batch
    # for sen_, syn_, trg_ in zip(sens_, syns_, trgs_):
    for sen_, syn_, trg_ in batch:
        processed_sent = torch.tensor(text_pipeline(sen_), dtype=torch.int64)
        sent_list.append(processed_sent)
        processed_synt = torch.tensor(text_pipeline(syn_), dtype=torch.int64)
        synt_lst.append(processed_synt)
        processed_trg = torch.tensor(text_pipeline(trg_), dtype=torch.int64)
        trg_list.append(processed_trg)

    return pad_sequence(sent_list, padding_value=pad_idx, batch_first=True), pad_sequence(synt_lst, padding_value=pad_idx, batch_first=True), pad_sequence(trg_list, padding_value=pad_idx, batch_first=True)

## 3. DataLoader

In [152]:
from torch.utils.data import DataLoader
import torch
torch.manual_seed(6969)

train = train_list[:800]
val   = train_list[800:900]
test  = train_list[900:]
train_dataloader = DataLoader(train,batch_size=16, shuffle=True,collate_fn=collate_batch)
valid_dataloader = DataLoader(val,  batch_size=16,collate_fn=collate_batch)
test_dataloader = DataLoader(test,batch_size=16,collate_fn=collate_batch)

In [133]:
for sen,syn,trg in train_dataloader:
    print(sen.shape)
    print(syn.shape)
    print(trg.shape)
    break

torch.Size([16, 49])
torch.Size([16, 192])
torch.Size([16, 51])


## 4.Model

In [135]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import random, math, time
from torch.autograd import Variable
import operator
import numpy as np

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 6969
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


In [136]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout = 0.1, max_len = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-np.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding):
        # Residual connection + pos encoding
        token_embedding = token_embedding + self.pos_encoding[:token_embedding.size(0), :]
        return self.dropout(token_embedding)

In [195]:
class seq2seqTransformer(nn.Module):
    def __init__(self, input_dim, emb_dim, device, word_dropout = 0.4, dropout = 0.1):
        super(seq2seqTransformer, self).__init__()
        self.input_dim = input_dim 
        self.emb_dim = emb_dim 
        self.word_dropout = word_dropout
        self.dropout = dropout
        self.device = device
        
        self.scale = np.sqrt(self.emb_dim)
        # self.scale = torch.sqrt(torch.IntTensor([self.hid_dim])).to(device)

        # vcocabulary embedding
        self.embedding_encoder = nn.Embedding(input_dim, emb_dim)
        self.embedding_decoder = nn.Embedding(input_dim, emb_dim)
        # positional encoding
        self.pos_encoder = PositionalEncoding(emb_dim, dropout = 0.0)

        self.transformer = nn.Transformer(d_model = emb_dim, nhead = 12, dropout = dropout)
        # linear Transformation
        self.linear = nn.Linear(emb_dim, input_dim)
        # self.init_weights()

    def load_embedding(self, embedding):  #synPG applied with GloVe glove.840B.300d.txt
        self.embedding_encoder.weight.data.copy_(torch.from_numpy(embedding)) 
        self.embedding_decoder.weight.data.copy_(torch.from_numpy(embedding))  

    def init_weights(self):
        initrange = 0.1
        # initialize cocabulary matrix weight
        self.embedding_encoder.weight.data.uniform_(-initrange, initrange)
        self.embedding_decoder.weight.data.uniform_(-initrange, initrange)
        # initialize linear weight
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def generate_square_mask(self, max_sent_len, max_synt_len):
        size = max_sent_len + max_synt_len + 2 #<sos> and <eos>
        mask = torch.zeros((size, size))
        mask[:max_sent_len, max_sent_len:] = float("-inf")
        mask[max_sent_len:, :max_sent_len] = float("-inf")
        return mask
        
    def forward(self, sents, synts, trg):
        #sents  : batch_size, seq_len
        #synts  : batch_size, seq_len
        #trgs   : batch_size, seq_len 
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2    # count without <sos> and <eos>
        max_targ_len = trg.size(1) - 2      # count without <sos> and <eos>

        # apply word dropout
        drop_mask = torch.bernoulli(self.word_dropout * torch.ones(max_sent_len)).bool().to(self.device)
        sents = sents.masked_fill(drop_mask, -1e10)
        print(sents.shape)
        print(self.embedding_encoder(sents).shape)

        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale
        #sent_emb = [seq_len, batch size, emb_size]
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale
        synt_embeddings = self.pos_encoder(synt_embeddings) 
        #synt_emb = [seq_len, batch size, emb_size]
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)
        #en_emb = [seq_len, batch size, emb_size*2]

        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # target => embedding
        de_embeddings = self.embedding_decoder(trg[:, :-1]).transpose(0, 1) * self.scale
        de_embeddings = self.pos_encoder(de_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(max_targ_len+1).to(self.device)
        
        # forward
        outputs = self.transformer(en_embeddings, de_embeddings, src_mask=src_mask, tgt_mask=trg_mask)
        
        # apply linear layer to vocabulary size
        outputs = outputs.transpose(0, 1)
        outputs = self.linear(outputs.contiguous().view(-1, self.emb_dim))
        outputs = outputs.view(batch_size, max_targ_len + 1, self.input_dim)
        #output = [batch size, trg_len, vocab_size]
        return outputs
    
    def generate(self, sents, synts, max_len = 30, sample=True, temp=0.5):
        #sents  : batch_size, seq_len
        #synts  : batch_size, seq_len
        batch_size   = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2  # count without <sos> and <eos>
        max_targ_len = max_len
        
        # output index starts with <sos>
        idxs = torch.zeros((batch_size, max_targ_len+2), dtype=torch.long).to(self.device)
        idxs[:, 0] = 1
        
        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * self.scale
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * self.scale
        synt_embeddings = self.pos_encoder(synt_embeddings)
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)
        
        # do not allow cross attetion
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).to(self.device)
        
        # starting index => embedding
        de_embeddings = self.embedding_decoder(idxs[:, :1]).transpose(0, 1) * self.scale
        de_embeddings = self.pos_encoder(de_embeddings)
        
        # sequential mask
        trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        # encode
        memory = self.transformer.encoder(en_embeddings, mask=src_mask)
        
        # auto-regressively generate output
        for i in range(1, max_targ_len+2):
            if i % 5 == 0:
                print(f'epoch : {i}')
            # decode
            outputs = self.transformer.decoder(de_embeddings, memory, tgt_mask=trg_mask)
            outputs = self.linear(outputs[-1].contiguous().view(-1, self.emb_dim))
            
            # get argmax index or sample index
            if not sample:
                values, idx = torch.max(outputs, 1)
            else:
                probs = F.softmax(outputs/temp, dim=1)
                idx = torch.multinomial(probs, 1).squeeze(1)
            
            # save to output index
            idxs[:, i] = idx
            
            # concatenate index to decoding
            de_embeddings = self.embedding_decoder(idxs[:, :i+1]).transpose(0, 1) * self.scale
            de_embeddings = self.pos_encoder(de_embeddings)
            
            # new sequential mask
            trg_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).to(self.device)
        
        return idxs[:, 1:]

In [196]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple') ##Load fasttext with language=simple
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab_transform).to(device)

In [197]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

input_dim   = len(vocab_transform)
emb_dim     = 300  #fasttext
word_dropout = 0.4 #following SynPG
dropout      = 0.1

model = seq2seqTransformer(input_dim=input_dim, emb_dim = emb_dim, device=device, word_dropout = word_dropout, dropout = dropout)
model.embedding_encoder.weight.data = fast_embedding #apply fasttext instead of Glove 840b 300d.txt (5.56 GB) TT
model.embedding_decoder.weight.data = fast_embedding

cpu


In [198]:
def train(model, loader, optimizer, criterion, clip, loader_length):
    
    model.train()
    epoch_loss = 0
    for sents_, synts_, trgs_ in loader:

        batch_size   = sents_.size(0)
        max_sent_len = sents_.size(1)
        max_synt_len = synts_.size(1) - 2  # count without <sos> and <eos>
        
        optimizer.zero_grad()
        
        #forward 
        outputs = model(sents_, synts_, trgs_)

        # calculate loss
        targs_ = trgs_[:, 1:].contiguous().view(-1) #Without <SOS>
        outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
        
        optimizer.zero_grad()

        loss = criterion(outputs_, targs_)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / loader_length

def evaluate(model, loader, criterion, loader_length):

    #turn off dropout (and batch norm if used)
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():

        for sents_, synts_, trgs_ in loader:

            batch_size   = sents_.size(0)
            max_sent_len = sents_.size(1)
            max_synt_len = synts_.size(1) - 2  # count without <sos> and <eos>
            
            #forward 
            outputs = model(sents_, synts_, trgs_)

            # calculate loss
            targs_ = trgs_[:, 1:].contiguous().view(-1) #Without <SOS>
            outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
            
            loss = criterion(outputs_, targs_)
            
            epoch_loss += loss.item()
        
    return epoch_loss / loader_length

In [199]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [200]:
import torch.optim as optim

lr = 10e-4 #Following SynPG
wd = 10e-5 #Following SynPG
#training hyperparameters
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx) #combine softmax with cross entropy

In [201]:
train_loader_length = len(list(iter(train_dataloader)))
val_loader_length   = len(list(iter(valid_dataloader)))
train_loader_length, val_loader_length

(50, 7)

In [202]:
import time
import math
best_valid_loss = float('inf')
num_epochs = 5
clip       = 1

save_path = f'models/{model.__class__.__name__}.pt' #Change here

train_losses = []
valid_losses = []

for epoch in range(num_epochs):

    start_time = time.time()

    # training
    train_loss = train(model, train_dataloader, optimizer, criterion, clip, train_loader_length)
    valid_loss = evaluate(model, test_dataloader, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # save model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    
    #lower perplexity is better

torch.Size([16, 45])


RuntimeError: 'weight' must be 2-D