## 1. ETL

In [1]:
import pandas as pd
# Read the Quora question CSV and drop the `test_id` and `question2` columns
# in one chained expression (no in-place mutation of the loaded frame).
qq = pd.read_csv('./Datasets/quora_question.csv').drop(columns=['test_id', 'question2'])
qq.shape

(2345796, 1)

In [2]:
# Draw a fixed-seed random subset of the questions; the cells below work on this sample.
random_sample = qq.sample(n=1000,random_state=6969) # try with only 1000 samples
random_sample.shape

(1000, 1)

In [3]:
# Preview the first rows of the sampled questions.
random_sample.head()

Unnamed: 0,question1
614123,Why won't China let Pope Francis visit?
795359,"Is it common to say ""you are welcome"" in when ..."
2209942,"Do G+ ""plus ones"" on posts actually do anythin..."
1383030,Can llp give loan to its partners?
529755,How many medals become won in Olympics ?


In [4]:
# Plain Python list holding the `question1` values of the sample.
qq1000 = random_sample['question1'].values.tolist()

In [5]:
import nltk
# benepar.download('benepar_en3')
import benepar, spacy
# Load the 'en_core_web_md' spaCy pipeline and append the benepar component
# (model 'benepar_en3'); `constituency_parser` below runs text through this pipeline.
# NOTE(review): presumably the 'benepar_en3' model has to be downloaded once beforehand
# (see the commented-out `benepar.download` call above) — confirm.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

def constituency_parser(text):
    """Return the bracketed parse of the first sentence of ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline. Only the first
    sentence reported by the pipeline is used; its ``_.parse_string`` is wrapped
    in an outer ``(ROOT ...)`` node and returned as a string.
    """
    sentences = list(nlp(text).sents)
    first_sentence = sentences[0]  # any further sentences are ignored
    return "(ROOT " + first_sentence._.parse_string + ")"

In [6]:
# `tqdm.tqdm_notebook` is deprecated (it emits a warning asking for `tqdm.notebook.tqdm`),
# so import the notebook progress bar from its current location.
from tqdm.notebook import tqdm

tqdm_notebook = tqdm  # keep the old name bound in case another cell still refers to it

# Pair every sampled question with its bracketed constituency parse.
# Each entry is a two-item list: [question, parse string].
train_data = [
    [question, constituency_parser(question)]
    for question in tqdm(qq1000)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(len(qq1000))):


  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [15]:
# Wrap the collected rows in a DataFrame and give positional columns 0 and 1
# the names 'sentence' and 'parser'.
train_data = pd.DataFrame(train_data).rename(columns={0: 'sentence', 1: 'parser'})
train_data.head()

Unnamed: 0,sentence,parser
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...


In [29]:
def target(text):
    """Return ``text`` surrounded by the start and end markers.

    The result is the start marker, a space, ``text``, a space and the end marker.

    NOTE(review): the vocabulary built in this notebook declares '<bos>'/'<eos>'
    as special tokens, whereas the markers here are '<SOS>'/'<EOS>' — confirm
    which spelling the downstream code expects.
    """
    prefix, suffix = "<SOS> ", " <EOS>"
    return prefix + text + suffix

# Add a 'target' column holding each sentence passed through `target()`.
# This modifies the existing `train_data` frame rather than creating a new one.
train_data['target'] = train_data['sentence'].apply(target)

In [35]:
# Preview the first rows of `train_data`.
train_data.head()

Unnamed: 0,sentence,parser,target
0,Why won't China let Pope Francis visit?,(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (MD wo) (R...,<SOS> Why won't China let Pope Francis visit? ...
1,"Is it common to say ""you are welcome"" in when ...",(ROOT (SQ (VBZ Is) (NP (NP (PRP it))) (ADJP (J...,"<SOS> Is it common to say ""you are welcome"" in..."
2,"Do G+ ""plus ones"" on posts actually do anythin...","(ROOT (SQ (VBP Do) (NP (NP (`` G+) (`` "") (CC ...","<SOS> Do G+ ""plus ones"" on posts actually do a..."
3,Can llp give loan to its partners?,(ROOT (SQ (MD Can) (NP (NN llp)) (VP (VB give)...,<SOS> Can llp give loan to its partners? <EOS>
4,How many medals become won in Olympics ?,(ROOT (SBARQ (WHNP (WHADJP (WRB How) (JJ many)...,<SOS> How many medals become won in Olympics ?...


In [18]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# torchtext tokenizer backed by spaCy ('en_core_web_sm'); `yield_tokens` calls it on each sentence.
tokenizer = get_tokenizer('spacy',language='en_core_web_sm')

def yield_tokens(data_iter):
    """Yield the token list of every sentence in ``data_iter``.

    Args:
        data_iter: pandas DataFrame whose first column holds the raw sentence
            text. Any further columns are ignored.

    Yields:
        The result of calling the module-level ``tokenizer`` on each sentence,
        in row order.
    """
    # Read only the first field of each row. The earlier two-name unpacking
    # (`for text, _ in ...`) raised ValueError whenever the frame did not have
    # exactly two columns, e.g. once the extra 'target' column has been added.
    for row in data_iter.itertuples(index=False):
        yield tokenizer(row[0])

specials = ['<unk>','<pad>','<bos>','<eos>'] # special tokens; special_first=True puts them at the start of the vocab
vocab_transform  = build_vocab_from_iterator(yield_tokens(train_data), specials = specials, special_first=True)

# Make the index of '<unk>' the default index of the vocab
vocab_transform.set_default_index(vocab_transform["<unk>"]) # a word that has no id in the vocab is mapped to '<unk>'

In [24]:
# Sanity check on the vocabulary size; the expected number is tied to the 1000-sample run.
assert len(vocab_transform) == 3407 #only for 1000 samples

3407

In [23]:
# Show the first ten entries of the index-to-token list.
vocab_transform.get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', '?', 'the', 'What', 'a', 'is', 'I']

In [25]:
import os
import pickle

# Reuse the pickled vocabulary when the file exists; otherwise save the vocabulary
# built above. Previously the save step was commented out, so on a machine without
# the pickle file this cell stopped with FileNotFoundError.
# SECURITY: pickle.load can execute arbitrary code — only load files you created yourself.
VOCAB_PICKLE_PATH = 'vocab_transform_cnn.pickle'

if os.path.exists(VOCAB_PICKLE_PATH):
    # NOTE(review): this replaces the vocabulary built in the cell above with the
    # pickled one — confirm the file was produced from the same data.
    with open(VOCAB_PICKLE_PATH, 'rb') as f:
        vocab_transform = pickle.load(f)
else:
    with open(VOCAB_PICKLE_PATH, 'wb') as f:
        pickle.dump(vocab_transform, f, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

from torchtext.vocab import FastText
fast_vectors = FastText(language='simple') # load the FastText vectors for language='simple'
# Look up the vector of every vocab token (in index order) and move the result to `device`.
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab_transform.get_itos()).to(device)

# Expect one 300-dimensional vector per vocab entry
assert fast_embedding.shape == (len(vocab_transform), 300)

cpu


.vector_cache\wiki.simple.vec: 293MB [00:25, 11.4MB/s]                               
  0%|          | 0/111051 [00:00<?, ?it/s]Skipping token b'111051' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 111051/111051 [00:11<00:00, 9527.14it/s] 


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab_transform['<pad>'] # index of the '<pad>' token; used as the padding value in collate_batch

def collate_batch(batch):
    """Collate ``(text, label)`` pairs into a label tensor and a padded text tensor.

    Each label goes through ``label_pipeline`` and each text through
    ``text_pipeline``; the processed texts become int64 tensors that are padded
    with ``pad_idx`` (batch first).

    Returns:
        A tuple ``(labels, texts)``: an int64 tensor built from the processed
        labels, and the padded batch of text tensors.

    NOTE(review): ``label_pipeline`` and ``text_pipeline`` are not defined in the
    visible part of this notebook, and the frame built earlier has the columns
    sentence/parser/target rather than (text, label) pairs — this function still
    needs adapting to that dataset; confirm before use.
    """
    labels, sequences = [], []
    for text, label in batch:
        labels.append(label_pipeline(label))
        sequences.append(torch.tensor(text_pipeline(text), dtype=torch.int64))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    padded_texts = pad_sequence(sequences, padding_value=pad_idx, batch_first=True)
    return label_tensor, padded_texts

## 3. DataLoader

In [7]:
from torch.utils.data import DataLoader
import torch
torch.manual_seed(6969)  # fixed seed for torch's random number generator

# Positional split: first 800 items for training, the next 100 for validation,
# everything from position 900 onwards for testing.
# NOTE(review): DataLoader fetches samples by integer index (`dataset[i]`); confirm that
# `train_data` is an indexable sequence of samples at this point rather than the pandas
# DataFrame built in the ETL section, where `df[i]` is a column lookup.
train = train_data[:800]
val   = train_data[800:900]
test  = train_data[900:]

train_dataloader = DataLoader(train, batch_size=16, shuffle=True)
# Fix: the validation loader used to be assigned to `test_dataloader` and was then
# overwritten by the test loader on the next line, so `val` was never served by a loader.
val_dataloader = DataLoader(val, batch_size=16)
test_dataloader = DataLoader(test, batch_size=16)

In [8]:
# Print the three fields of the first training batch, then stop iterating.
for batch in train_dataloader:
    sents, syns, trgs = batch
    for field in (sents, syns, trgs):
        print(field)
    break

('Finance: Which is be an Auroville resident for a long time?', 'How do pool leave my country?', 'What is food?', 'What does "in line boys mean of the phrase this?', 'For atheists, are morals to subjective?', 'Which programming language was used to Facebook?', 'Despite being relatively poor at programming, I want to work harder, and one day reach the International Finals of ACM ICPC. How do I go about it?', 'How can I reduce my belly fat in 2 months?', 'What does Stanley leather master mean?', 'How do I earn money in my free energy?', 'Which operating system is safer: Windows, OS X, or Linux? Why?', 'What does ";-;" exam after 10th?', 'How can I study communication?', 'How do you clean general a coffee maker?', 'Why do assassin?', 'How was your experience palace in a relationship?')
('(ROOT (NP (NP (NN Finance)) (: :) (WHNP (WDT Which)) (SQ (VBZ is) (VP (VB be) (NP (DT an) (NNP Auroville) (NN resident)) (PP (IN for) (NP (DT a) (JJ long) (NN time))))) (. ?)))', '(ROOT (SBARQ (WHADVP (WR

In [32]:
def is_paren(tok):
    """Return True when ``tok`` is exactly ")" or "(", otherwise False."""
    return tok in (")", "(")

def deleaf(tree):
    """Strip the words from a bracketed parse, keeping brackets and node labels.

    ``str(tree)`` is taken, newlines are removed and every parenthesis becomes
    its own token. A non-parenthesis token is then kept only when it is the very
    first token or directly follows a parenthesis, so only the first symbol
    after each bracket survives.

    Returns:
        The remaining tokens as a list of strings.
    """
    flat = str(tree).replace('\n', '')
    tokens = flat.replace('(', '( ').replace(')', ' )').split()

    kept = []
    prev_is_paren = True  # nothing precedes the first token, so it is always kept
    for tok in tokens:
        tok_is_paren = is_paren(tok)
        if tok_is_paren or prev_is_paren:
            kept.append(tok)
        prev_is_paren = tok_is_paren
    return kept

from nltk import ParentedTree

def Parsetokenize(synt_):
    """Turn a bracketed parse string into the token list produced by ``deleaf``.

    The string is first parsed with ``ParentedTree.fromstring``; the resulting
    tree is handed to ``deleaf`` and its return value is passed back unchanged.
    """
    parsed_tree = ParentedTree.fromstring(synt_)
    return deleaf(parsed_tree)

In [36]:
from nltk import ParentedTree
# Worked example: tokenize one bracketed parse with `Parsetokenize` (defined above)
# and show the first ten tokens.
synt_ = '(ROOT (NP (NP (NN Finance)) (: :) (WHNP (WDT Which)) (SQ (VBZ is) (VP (VB be) (NP (DT an) (NNP Auroville) (NN resident)) (PP (IN for) (NP (DT a) (JJ long) (NN time))))) (. ?)))'
synt_ = Parsetokenize(synt_)
synt_[:10]

['(', 'ROOT', '(', 'NP', '(', 'NP', '(', 'NN', ')', ')']