# Preparing Dataset

In [1]:
## ParaNMT: read the raw corpus into a list with one entry per line of the file
with open('./Datasets/para-nmt-5m-processed.txt', encoding="utf8") as nmt_file:
    nmt = list(nmt_file)

In [2]:
# Keep only the text before the first tab of every line (non-parallel data).
nmt_sentences = [line.split("\t")[0] for line in nmt]
nmt_sentences[:5]  # preview the first five entries of the list

["so , unless that 's gon na be feasible , then ...",
 'of course you did .',
 "by now , singh 's probably been arrested .",
 'not our shit . i swear .',
 '“ why not ?']

In [3]:
len(nmt_sentences) #5370128 sentences

5370128

In [4]:
##Quora
import pandas as pd
# Read the Quora file and discard the id column and the second question of
# each pair in one chained expression (non-parallel data).
qq = pd.read_csv('./Datasets/quora_question.csv').drop(columns=['test_id', 'question2'])
qq.head()

Unnamed: 0,question1
0,How does the Surface Pro himself 4 compare wit...
1,Should I have a hair transplant at age 24? How...
2,What but is the best way to send money from Ch...
3,Which food not emulsifiers?
4,"How ""aberystwyth"" start reading?"


In [5]:
qq.shape #2345796 sentences

(2345796, 1)

In [6]:
# Flatten the question column into a plain Python list.
# pandas reads empty CSV fields as NaN (a float); dropping them here keeps
# non-text entries out of the list that is later handed, item by item, to the
# BPE tokenizer and the constituency parser.
qq_sentences = qq['question1'].dropna().tolist()
qq_sentences[:5] #list

['How does the Surface Pro himself 4 compare with iPad Pro?',
 'Should I have a hair transplant at age 24? How much would it cost?',
 'What but is the best way to send money from China to the US?',
 'Which food not emulsifiers?',
 'How "aberystwyth" start reading?']

In [7]:
import nltk
# benepar.download('benepar_en3')
import benepar, spacy
# Load the spaCy 'en_core_web_md' pipeline and register the benepar component
# (model 'benepar_en3') on it. constituency_parser() below calls this `nlp`
# object and reads `_.parse_string` from the sentences of the resulting doc.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

def is_paren(tok):
    """Return True when ``tok`` is exactly "(" or ")", otherwise False."""
    return tok in ("(", ")")

def getleaf(tree):
    """Collect the first token of every adjacent pair of non-bracket tokens.

    ``str(tree)`` is flattened onto one line and every bracket is split off as
    its own token. Inside a bracketed preterminal such as ``(DT the)`` the
    label and the word are neighbouring non-bracket tokens, so the collected
    token is the label (``DT``); despite the function's name, the words
    themselves are not returned.

    Returns:
        list of str, in the order the pairs occur.
    """
    # Pad each bracket with a space so a whitespace split isolates it.
    padded = " ".join(
        piece.replace('(', '( ').replace(')', ' )')
        for piece in str(tree).replace('\n', '').split()
    )
    tokens = padded.split()

    # zip() pairs every token with its successor; the last token has none.
    return [
        current
        for current, following in zip(tokens, tokens[1:])
        if not (is_paren(current) or is_paren(following))
    ]

def deleaf(tree):
    """Return the bracket and label tokens of ``tree`` with the words removed.

    ``str(tree)`` is flattened onto one line and every bracket becomes its own
    token. Whenever two neighbouring tokens are both non-brackets, the second
    one is blanked out; in ``(DT the)`` that removes ``the`` and keeps ``DT``.

    Returns:
        list of str: the remaining tokens (brackets and labels) in order.
    """
    # Pad each bracket with a space so a whitespace split isolates it.
    padded = " ".join(
        piece.replace('(', '( ').replace(')', ' )')
        for piece in str(tree).replace('\n', '').split()
    )
    tokens = padded.split()

    for pos in range(len(tokens) - 1):
        if is_paren(tokens[pos]) or is_paren(tokens[pos + 1]):
            continue
        # A blanked token still counts as a non-bracket on the next step, so a
        # run of consecutive non-bracket tokens is erased down to its first.
        tokens[pos + 1] = ""

    # Blanked slots are empty strings; drop them.
    return [tok for tok in tokens if tok]

import pickle
# Load the pickled vocabulary object; the tokenizers below read its
# `word2idx` mapping.
# NOTE: pickle.load can execute arbitrary code - only use it on a trusted file.
with open("./data/dictionary.pkl", "rb") as dict_file:
    dictionary = pickle.load(dict_file)

# Sentence to syntax
def constituency_parser(text):
    """Return the bracketed constituency parse of ``text`` under one ROOT node.

    ``text`` is run through the module-level ``nlp`` pipeline and the
    ``_.parse_string`` of every sentence in the resulting doc is placed, in
    order, under a single ``(ROOT ...)`` wrapper. When the pipeline finds
    exactly one sentence the result is the same string as before:
    ``"(ROOT " + parse + ")"``.

    Previously only the first detected sentence was parsed, so an input that
    the pipeline splits into several sentences produced a parse covering just
    part of the text, while the BPE tokenizer still encoded all of it.

    Args:
        text: the raw text to parse.

    Returns:
        str: the bracketed parse string.

    Raises:
        IndexError: if the pipeline yields no sentence for ``text`` (same
            exception type as before, now with a message naming the input).
    """
    doc = nlp(text)
    parses = [sent._.parse_string for sent in doc.sents]
    if not parses:
        raise IndexError(f"no sentence found to parse in {text!r}")
    return "(ROOT " + " ".join(parses) + ")"

#Syntax to syntactic tokenizer
from nltk import ParentedTree
def parser_tokenizer(synt_):
    """Convert a bracketed parse string into a list of vocabulary ids.

    The string is read as a ``ParentedTree``, its words are removed with
    ``deleaf`` and each remaining token ``w`` is looked up as ``"<w>"`` in
    ``dictionary.word2idx``. Tokens without an entry are skipped.

    Returns:
        list: the ids of the tokens that were found, in order.
    """
    tree = ParentedTree.fromstring(synt_)
    ids = []
    for label in deleaf(tree):
        key = f"<{label}>"
        if key in dictionary.word2idx:
            ids.append(dictionary.word2idx[key])
    return ids

In [8]:
#Tokenizer BPE
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs

# Load the BPE merge codes and the vocabulary. Both files are opened in a
# `with` statement so their handles are closed once the BPE object is built;
# previously they stayed open for the life of the kernel.
# The vocabulary handle gets its own name instead of being rebound to the
# vocabulary read from it.
# NOTE(review): assumes BPE(...) reads the codes stream inside its constructor
# and does not keep reading from it later - confirm against the bundled
# subwordnmt copy.
with codecs.open('./data/bpe.codes', encoding='utf-8') as bpe_codes, \
        codecs.open('./data/vocab.txt', encoding='utf-8') as vocab_file:
    # 50 is passed as read_vocabulary's second argument (presumably a
    # frequency threshold - confirm).
    bpe_vocab = read_vocabulary(vocab_file, 50)
    bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

def bpe_tokenizer(sent_, target = False):
    """Segment ``sent_`` with BPE and map the pieces to vocabulary ids.

    Args:
        sent_: the text handed to ``bpe.segment``.
        target: when true, the ids are wrapped in the ``"<sos>"`` and
            ``"<eos>"`` ids.

    Returns:
        list: one id per whitespace-separated piece of the segmented text;
        pieces missing from ``dictionary.word2idx`` get the ``"<unk>"`` id.
    """
    pieces = bpe.segment(sent_).split()
    vocab = dictionary.word2idx

    ids = []
    for piece in pieces:
        # Out-of-vocabulary pieces fall back to the "<unk>" entry.
        ids.append(vocab[piece] if piece in vocab else vocab["<unk>"])

    if not target:
        return ids
    return [vocab["<sos>"]] + ids + [vocab["<eos>"]]

# def pos_tag(sent_):
#     return [token.ent_type_ if token.ent_type_ else "" for token in nlp(sent_)]

# def dependency_parser(sent_):
#     return [token.dep_ for token in nlp(sent_)]

## Paraphrase Generator

In [9]:
from tqdm import tqdm

def prepare_paraphrase_dataset(sentences):
    """Build (source ids, syntax ids, target ids) triples for each sentence.

    For every sentence the source is its BPE ids, the syntax is the id
    sequence of its constituency parse, and the target is the same BPE ids
    produced with ``target = True``.

    Args:
        sentences: list of sentence strings.

    Returns:
        list of 3-tuples ``(sent_, synt_, trg_)``, one per input sentence.
    """
    examples = []
    for sentence in tqdm(sentences):
        source_ids = bpe_tokenizer(sentence)
        syntax_ids = parser_tokenizer(constituency_parser(sentence))
        target_ids = bpe_tokenizer(sentence, target = True)
        examples.append((source_ids, syntax_ids, target_ids))
    return examples

In [11]:
import random
# Seed Python's RNG so the shuffle below is repeatable on a fresh kernel.
random.seed(6969)
# NOTE: this shuffles nmt_sentences in place, and the parse-dataset cell
# further down slices the same list. Re-running this cell re-seeds the RNG but
# shuffles the already-shuffled list, so the selected subset changes.
random.shuffle(nmt_sentences)

# Build the paraphrase examples from the first 100000 shuffled sentences.
nmt_dataset = prepare_paraphrase_dataset(nmt_sentences[:100000]) #100000 sentences

In [None]:
# No re-seeding here: this shuffle continues from whatever RNG state the
# earlier seeded cell left behind, so reproducing it requires running the
# cells in order. Shuffles qq_sentences in place.
random.shuffle(qq_sentences)

# NOTE(review): the saved progress output under this cell reports 1000 items
# while this slice requests 10000 - the output looks stale; confirm.
qq_dataset = prepare_paraphrase_dataset(qq_sentences[:10000])

100%|██████████| 1000/1000 [01:11<00:00, 13.98it/s]


In [None]:
import pickle

# Write both paraphrase datasets to disk with the highest pickle protocol.
for out_path, dataset in (
    ('./Datasets/nmt_dataset.pkl', nmt_dataset),
    ('./Datasets/qq_dataset.pkl', qq_dataset),
):
    with open(out_path, 'wb') as f:
        pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)

## Parse Generator

In [None]:
import pickle
# Load the pickled vocabulary object (the same file an earlier cell reads);
# the functions below look tokens up in its `word2idx` mapping.
# NOTE: pickle.load can execute arbitrary code - only use it on a trusted file.
with open("./data/dictionary.pkl", "rb") as dict_file:
    dictionary = pickle.load(dict_file)

def syntax_tensor(synt_):
    """Convert a bracketed parse string into ids wrapped in <sos>/<eos>.

    The string is read as a ``ParentedTree``, its words are removed with
    ``deleaf`` and each remaining token ``w`` is looked up as ``"<w>"`` in
    ``dictionary.word2idx``; tokens without an entry are skipped.

    Returns:
        list: ``"<sos>"`` id, the ids that were found, ``"<eos>"`` id.
    """
    tree = ParentedTree.fromstring(synt_)
    body = []
    for label in deleaf(tree):
        key = f"<{label}>"
        if key in dictionary.word2idx:
            body.append(dictionary.word2idx[key])
    return [dictionary.word2idx["<sos>"], *body, dictionary.word2idx["<eos>"]]

def tag_sequence(sent_):
    """Convert a bracketed parse string into the ids of its ``getleaf`` tokens.

    The string is read as a ``ParentedTree`` and passed to ``getleaf``; each
    returned token ``w`` is looked up as ``"<w>"`` in ``dictionary.word2idx``
    and tokens without an entry are skipped. No <sos>/<eos> ids are added.

    Returns:
        list: the ids that were found, in order.
    """
    tree = ParentedTree.fromstring(sent_)
    ids = []
    for tag in getleaf(tree):
        key = f"<{tag}>"
        if key in dictionary.word2idx:
            ids.append(dictionary.word2idx[key])
    return ids

#parse syntax and get template
from nltk import ParentedTree

def tree2tmpl(tree, level, mlevel):
    """Collapse, in place, every subtree hanging below depth ``mlevel``.

    Walks down from ``tree`` (considered to be at depth ``level``). At the
    nodes whose depth equals ``mlevel``, each child that is a ``ParentedTree``
    is replaced by the string ``"(" + label + ")"``; children that are not
    trees are left as they are.

    Args:
        tree: the ``ParentedTree`` node to process; it is modified in place.
        level: depth assigned to ``tree``.
        mlevel: depth at which children are collapsed.

    Returns:
        None.
    """
    if level == mlevel:
        for idx, child in enumerate(tree):
            if isinstance(child, ParentedTree):
                tree[idx] = "(" + child.label() + ")"
        return

    for child in tree:
        # Only subtrees can hold deeper nodes. Descending into a leaf string
        # would iterate over its characters, and a one-character string
        # iterates to itself, so a call with level > mlevel never terminated.
        if isinstance(child, ParentedTree):
            tree2tmpl(child, level + 1, mlevel)

def template(tmpl_):
    """Convert a bracketed parse string into the ids of its top-level template.

    The string is read as a ``ParentedTree`` and collapsed with
    ``tree2tmpl(tree, 1, 2)``. The printed tree is then split into bracket and
    label tokens, and each token ``w`` is looked up as ``"<w>"`` in
    ``dictionary.word2idx``; tokens without an entry are skipped.

    Returns:
        list: ``"<sos>"`` id, the ids that were found, ``"<eos>"`` id.
    """
    tree = ParentedTree.fromstring(tmpl_)
    tree2tmpl(tree, 1, 2)
    # str(tree) can spread a long tree over several indented lines. Splitting
    # on any whitespace (not just on single spaces) keeps a newline from
    # sticking to a token such as "ROOT\n" or ")\n", which would then miss the
    # vocabulary lookup and silently vanish from the template.
    tokens = str(tree).replace(")", " )").replace("(", "( ").split()
    ids = [dictionary.word2idx[f"<{w}>"] for w in tokens if f"<{w}>" in dictionary.word2idx]
    return [dictionary.word2idx["<sos>"]] + ids + [dictionary.word2idx["<eos>"]]

In [None]:
from tqdm import tqdm

def prepare_parse_dataset(sentences):
    """Build (tag ids, template ids, syntax ids) triples for each sentence.

    Each sentence is parsed once with ``constituency_parser``; the resulting
    parse string is passed to ``tag_sequence``, ``template`` and
    ``syntax_tensor`` in that order.

    Args:
        sentences: list of sentence strings.

    Returns:
        list of 3-tuples ``(sent_, tmpl_, synt_)``, one per input sentence.
    """
    examples = []
    for sentence in tqdm(sentences):
        parse = constituency_parser(sentence)
        examples.append((tag_sequence(parse), template(parse), syntax_tensor(parse)))
    return examples

In [None]:
# Uses nmt_sentences as left by the earlier cells (shuffled in place above)
# and takes its first 10000 entries.
# NOTE(review): the saved progress output under this cell reports 1000 items -
# it looks stale; confirm.
nmt_parse = prepare_parse_dataset(nmt_sentences[:10000]) #10000 sentences

100%|██████████| 1000/1000 [01:12<00:00, 13.80it/s]


In [None]:
# Uses qq_sentences as left by the earlier cells (shuffled in place above)
# and takes its first 10000 entries.
# NOTE(review): the saved progress output under this cell reports 1000 items -
# it looks stale; confirm.
qq_parse = prepare_parse_dataset(qq_sentences[:10000]) #10000 sentences

100%|██████████| 1000/1000 [01:14<00:00, 13.41it/s]


In [None]:
# Write both parse datasets to disk with the highest pickle protocol.
for out_path, dataset in (
    ('./Datasets/nmt_parse.pkl', nmt_parse),
    ('./Datasets/qq_parse.pkl', qq_parse),
):
    with open(out_path, 'wb') as f:
        pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)