## 1.ETL

In [1]:
import pandas as pd
qq = pd.read_csv('./Datasets/quora_question.csv')
qq.drop(columns=['test_id','question2'], inplace=True)
qq.shape

(2345796, 1)

In [2]:
random_sample = qq.sample(n=1000,random_state=6969) #try only 1000 samples
random_sample.shape

(1000, 1)

In [3]:
random_sample.head()

Unnamed: 0,question1
614123,Why won't China let Pope Francis visit?
795359,"Is it common to say ""you are welcome"" in when ..."
2209942,"Do G+ ""plus ones"" on posts actually do anythin..."
1383030,Can llp give loan to its partners?
529755,How many medals become won in Olympics ?


In [4]:
qq1000 = random_sample['question1'].values.tolist()

In [5]:
import nltk
# benepar.download('benepar_en3')
import benepar, spacy
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

def constituency_parser(text):
    doc = nlp(text)
    sent = list(doc.sents)[0]
    return  "(ROOT "+sent._.parse_string+")"

In [6]:
from tqdm import tqdm_notebook
train_data = list()
for idx in tqdm_notebook(range(len(qq1000))):
    train_data.append([qq1000[idx],constituency_parser(qq1000[idx])])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(len(qq1000))):


  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
#Load data
train_data = pd.DataFrame(train_data)
train_data.rename(columns={0:'sentence',1:'parser'},inplace=True)
train_data.head()

Unnamed: 0,sentence,parser
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...


In [33]:
# def target(text):
#     return  "<SOS> "+ text + " <EOS>"

train_data['target'] = train_data['sentence'] #.apply(target)

In [34]:
#Proprocessed Data
train_data.head()

Unnamed: 0,sentence,parser,target
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...,Why won't China let Pope Francis visit?
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...,"Is it common to say ""you are welcome"" in when ..."
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ...","Do G+ ""plus ones"" on posts actually do anythin..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...,Can llp give loan to its partners?
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...,How many medals become won in Olympics ?


In [35]:
def is_paren(tok):
    return tok == ")" or tok == "("

def deleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '

    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                arr[n + 1] = ""

    nonleaves = " ".join(arr)
    return nonleaves.split()

from nltk import ParentedTree

def Parsertokenize(synt_):
    synt_ = ParentedTree.fromstring(synt_)
    synt_ = deleaf(synt_)
    synt_ = [f'<{w}>' for w in synt_]
    return synt_

In [49]:
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs

# load bpe codes
bpe_codes = codecs.open('./data/bpe.codes', encoding='utf-8')
bpe_vocab = codecs.open('./data/vocab.txt', encoding='utf-8')
bpe_vocab = read_vocabulary(bpe_vocab, 50)
bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

def bpetokenize(sent_, target = False):
 # bpe segment and convert to tensor
    sent_ = bpe.segment(sent_).split()
    if target:
        sent_.insert(0, "<SOS>")
        sent_.insert(-1, "<EOS>")
    return sent_

In [50]:
train_load = pd.DataFrame()
train_load['sentence'] = train_data['sentence'].apply(bpetokenize)
train_load['parser'] = train_data['parser'].apply(Parsertokenize)
train_load['target'] = train_data['target'].apply(bpetokenize, target = True)

In [51]:
train_load.head()

Unnamed: 0,sentence,parser,target
0,"[W@@, hy, won@@, 't, C@@, h@@, ina, let, P@@, ...","[<(>, <ROOT>, <(>, <SBARQ>, <(>, <WHADVP>, <(>...","[<SOS>, W@@, hy, won@@, 't, C@@, h@@, ina, let..."
1,"[I@@, s, it, common, to, say, ""@@, you, are, w...","[<(>, <ROOT>, <(>, <SQ>, <(>, <VBZ>, <)>, <(>,...","[<SOS>, I@@, s, it, common, to, say, ""@@, you,..."
2,"[D@@, o, G@@, +, ""@@, plus, on@@, es@@, "", on,...","[<(>, <ROOT>, <(>, <SQ>, <(>, <VBP>, <)>, <(>,...","[<SOS>, D@@, o, G@@, +, ""@@, plus, on@@, es@@,..."
3,"[C@@, an, ll@@, p, give, loan, to, its, partne...","[<(>, <ROOT>, <(>, <SQ>, <(>, <MD>, <)>, <(>, ...","[<SOS>, C@@, an, ll@@, p, give, loan, to, its,..."
4,"[H@@, ow, many, medals, become, won, in, O@@, ...","[<(>, <ROOT>, <(>, <SBARQ>, <(>, <WHNP>, <(>, ...","[<SOS>, H@@, ow, many, medals, become, won, in..."


In [14]:
import pickle
with open('./data/dictionary.pkl', 'rb') as f:
    vocab_transform = pickle.load(f)
vocab_dict = vocab_transform.word2idx

In [28]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy',language='en_core_web_sm')
text_pipeline = lambda x: [vocab_dict[x_] if x_ in vocab_dict else vocab_dict["<unk>"] for x_ in x ]

In [29]:
setence_token_id = text_pipeline(train_load['sentence'].iloc[0])

In [30]:
import torch 

torch.tensor(setence_token_id, dtype=torch.int64)

tensor([21396, 16180, 14060, 16778, 14933,   775,  2824,   216, 22548,  1939,
          614, 23229,  3860, 18657, 28121,   119])

In [52]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

pad_idx = vocab_dict['<pad>'] ##get the pad index from the vocab

def collate_batch(batch):
    sent_list, synt_lst, trg_list = [], [], []
    for sen_, syn_, trg_ in batch:
        processed_sent = torch.tensor(text_pipeline(sen_), dtype=torch.int64)
        sent_list.append(processed_sent)
        processed_synt = torch.tensor(text_pipeline(syn_), dtype=torch.int64)
        synt_lst.append(processed_synt)
        processed_trg = torch.tensor(text_pipeline(trg_), dtype=torch.int64)
        trg_list.append(processed_trg)

    return pad_sequence(sent_list, padding_value=pad_idx, batch_first=True), pad_sequence(synt_lst, padding_value=pad_idx, batch_first=True), pad_sequence(trg_list, padding_value=pad_idx, batch_first=True)

## 3. DataLoader

In [53]:
from torch.utils.data import DataLoader, Dataset

class DataWrap(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]

In [54]:
from torch.utils.data import DataLoader
import torch
torch.manual_seed(6969)

train = DataWrap(train_load.iloc[:800])
val   = DataWrap(train_load.iloc[800:900])
test  = DataWrap(train_load.iloc[900:])
train_dataloader = DataLoader(train,batch_size=16, shuffle=True)
test_dataloader = DataLoader(val,  batch_size=16)
test_dataloader = DataLoader(test,batch_size=16)

In [78]:
# for sen,syn,trg in train_dataloader:
#     print(sen)
#     print(syn)
#     print(trg)
#     break
i,j,k =  train_load.iloc[:10]


ValueError: not enough values to unpack (expected 3, got 1)

In [77]:
for l,h2,h in [i,j,k]:
    print(l,h2,h)

ValueError: too many values to unpack (expected 3)