## Preparing Dataset

In [1]:
import h5py, os
# Open the training split read-only. The handle is kept open on purpose:
# later cells index into it lazily.
print("==== loading data ====")
nmt_train = h5py.File('./data/train_data.h5', 'r')
nmt_train.keys()

==== loading data ====


<KeysViewHDF5 ['sents', 'synts']>

In [2]:
# Peek at training example 1: the raw sentence and its bracketed parse string (both stored as bytes).
nmt_train['sents'][1], nmt_train['synts'][1]

(b'you were trying to take something back .',
 b'(ROOT (S (NP (PRP you)) (VP (VBD were) (VP (VBG trying) (S (VP (TO to) (VP (VB take) (NP (NN something)) (PRT (RB back))))))) (. .)))')

In [3]:
# Validation split, opened read-only and left open for the cells below.
nmt_val = h5py.File('./data/valid_data.h5', 'r')
nmt_val.keys()

<KeysViewHDF5 ['sents', 'synts']>

In [4]:
# Number of sentences available in the training and validation files.
len(nmt_train['sents']), len(nmt_val['sents'])

(45377426, 12800)

In [5]:
# Peek at training example 0: raw sentence bytes and the matching bracketed parse.
nmt_train['sents'][0],nmt_train['synts'][0]

(b"ah , yes , i 'm kang-mo ... - do n't , dad !",
 b"(ROOT (S (S (ADVP (RB ah)) (, ,) (INTJ (UH yes)) (, ,) (FW i) (VP (VBP 'm) (ADJP (JJ kang-mo)))) (: ...) (S (: -) (VP (VBP do) (RB n't) (, ,) (NP (NN dad)))) (. !)))")

In [15]:
# Dependency labels that get added to the vocabulary.
# The ORDER of these lists must not change: a tag's position in
# `dependency_tags` determines the id it is given when the vocabulary is
# extended further down.
# NOTE(review): `new_dep` / `deprecated` presumably separate current from
# legacy label names of the dependency parser — confirm.
new_dep = [
    'ACL', 'ACOMP', 'ADVCL', 'ADVMOD', 'AGENT', 'AMOD', 'APPOS', 'ATTR',
    'AUX', 'AUXPASS', 'CASE', 'CC', 'CCOMP', 'COMPOUND', 'CONJ', 'CSUBJ',
    'CSUBJPASS', 'DATIVE', 'DEP', 'DET', 'DOBJ', 'EXPL', 'INTJ', 'MARK',
    'META', 'NEG', 'NOUNMOD', 'NPMOD', 'NSUBJ', 'NSUBJPASS', 'NUMMOD',
    'OPRD', 'PARATAXIS', 'PCOMP', 'POBJ', 'POSS', 'PRECONJ', 'PREDET',
    'PREP', 'PRT', 'PUNCT', 'QUANTMOD', 'RELCL', 'ROOT', 'XCOMP',
]
deprecated = [
    'COMPLM', 'INFMOD', 'PARTMOD', 'HMOD', 'HYPH', 'IOBJ', 'NUM', 'NUMBER',
    'NMOD', 'NN', 'NPADVMOD', 'POSSESSIVE', 'RCMOD',
]

# Full tag inventory: the `new_dep` entries first, then the `deprecated` ones.
dependency_tags = [*new_dep, *deprecated]

In [16]:
import pickle

# Load the pickled vocabulary object; the cells below read and extend its
# `word2idx` and `idx2word` mappings.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles from a trusted source.
with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

In [17]:
# Register every dependency tag in both vocabulary mappings, handing out
# consecutive ids that start at 31414 (tag k of `dependency_tags` gets id 31414 + k).
# NOTE(review): 31414 is presumably the vocabulary size before this cell ran — confirm.
for i, tag in enumerate(dependency_tags, start=31414):
    a = tag.upper()
    dictionary.idx2word[i] = a
    dictionary.word2idx[a] = i

In [18]:
# Sanity check of the extension: the lower-case word 'root' and the upper-case tag 'ROOT'
# are separate entries, and id 31456 maps back to a dependency tag.
dictionary.word2idx['root'], dictionary.idx2word[31456], dictionary.word2idx['ROOT']

(4827, 'RELCL', 31457)

In [19]:
def is_paren(tok):
    """Return True when ``tok`` is exactly one bracket token, ``"("`` or ``")"``."""
    return tok == "(" or tok == ")"

def _bracket_tokens(tree):
    """Split the string form of ``tree`` into a flat token list.

    ``str(tree)`` is taken, newlines are removed, and every ``(`` / ``)`` is
    padded with a space so that brackets become tokens of their own.

    Shared by ``getleaf`` and ``deleaf`` so both tokenize identically.
    """
    flat = str(tree).replace('\n', '')
    return flat.replace('(', '( ').replace(')', ' )').split()

def getleaf(tree):
    """Return the first token of every adjacent pair of non-bracket tokens.

    Args:
        tree: anything whose ``str()`` is a bracketed parse, e.g. a plain
            string such as ``"(NP (PRP you))"``.

    Returns:
        list[str]: for ``"(NP (PRP you))"`` this is ``['PRP']``.

    NOTE(review): despite the name, the collected token is the one *before*
    each leaf (normally the POS tag), not the leaf word itself. This is the
    original behaviour and is kept unchanged — confirm it is intended.
    """
    arr = _bracket_tokens(tree)
    return [
        tok for tok, nxt in zip(arr, arr[1:])
        if not is_paren(tok) and not is_paren(nxt)
    ]

def deleaf(tree):
    """Return the bracket/label tokens of ``tree`` with the leaf words removed.

    A token is dropped when neither it nor the token right before it is a
    bracket. In ``( PRP you )`` that removes ``you`` and keeps ``PRP``; a run
    of several words after one label is removed entirely.

    Args:
        tree: anything whose ``str()`` is a bracketed parse.

    Returns:
        list[str]: e.g. ``"(NP (PRP you))"`` gives
        ``['(', 'NP', '(', 'PRP', ')', ')']``.
    """
    arr = _bracket_tokens(tree)
    # The first token has no predecessor and is always kept.
    return [
        tok for n, tok in enumerate(arr)
        if n == 0 or is_paren(tok) or is_paren(arr[n - 1])
    ]

In [20]:
#Tokenizer BPE
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs
import numpy as np

# load bpe codes and the vocabulary used together with them
# The files are only needed while the vocabulary and the BPE object are being
# built, so they are opened in a `with` block and closed afterwards instead of
# staying open for the lifetime of the kernel.
# NOTE(review): assumes read_vocabulary() and BPE() read their file arguments
# eagerly, as upstream subword-nmt does — confirm for the vendored `subwordnmt` copy.
with codecs.open('./data/bpe.codes', encoding='utf-8') as bpe_codes, \
        codecs.open('./data/vocab.txt', encoding='utf-8') as vocab_file:
    # 50 is forwarded as read_vocabulary's second argument
    # (a frequency threshold in upstream subword-nmt — TODO confirm).
    bpe_vocab = read_vocabulary(vocab_file, 50)
    bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

def bpe_tokenizer(sent_, target = False):
    """Segment a sentence with BPE and map the pieces to vocabulary ids.

    Args:
        sent_: sentence text handed to ``bpe.segment``.
        target: when true, the ids of ``"<sos>"`` and ``"<eos>"`` are put
            around the sequence.

    Returns:
        list: one ``dictionary.word2idx`` entry per BPE piece; pieces that are
        not in the vocabulary get the ``"<unk>"`` entry. The result is a plain
        Python list, not a tensor.
    """
    vocab = dictionary.word2idx
    ids = []
    for piece in bpe.segment(sent_).split():
        ids.append(vocab[piece] if piece in vocab else vocab["<unk>"])
    if not target:
        return ids
    return [vocab["<sos>"]] + ids + [vocab["<eos>"]]

# syntax to syntactic tokenizer
from nltk import ParentedTree
def parser_tokenizer(synt_):
    """Turn a bracketed parse string into a list of vocabulary ids.

    The string is parsed with ``ParentedTree.fromstring``, its leaf words are
    removed with ``deleaf``, and each remaining token ``w`` is looked up under
    the key ``"<w>"``. Tokens whose key is not in ``dictionary.word2idx`` are
    skipped without an error.

    Args:
        synt_: bracketed parse as text, e.g. ``"(ROOT (S ...))"``.

    Returns:
        list: the ``"<sos>"`` id, the ids of the known tokens, then the
        ``"<eos>"`` id.
    """
    vocab = dictionary.word2idx
    tree = ParentedTree.fromstring(synt_)
    tag_ids = []
    for tok in deleaf(tree):
        key = f"<{tok}>"
        if key in vocab:
            tag_ids.append(vocab[key])
    return [vocab["<sos>"]] + tag_ids + [vocab["<eos>"]]

import pickle
# Load the tag vocabulary that `bow` indexes with `synt_vocab[tag]`.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles from a trusted source.
with open('synt_vocab.pkl', 'rb') as f:
    synt_vocab = pickle.load(f)

def bow(synt_):
    """Build a normalised bag-of-tags vector for a bracketed parse.

    Every slot starts at 1, each token returned by ``deleaf(synt_)`` adds 1
    at position ``synt_vocab[token] - 3``, and the vector is finally divided
    by its sum.

    NOTE(review): 74 is presumably the number of tag types and the ``-3``
    offset presumably skips three special entries in ``synt_vocab`` — confirm.

    Args:
        synt_: bracketed parse (anything ``deleaf`` accepts).

    Returns:
        numpy.ndarray: vector of length 74 whose entries sum to 1.
    """
    special = ('<sos>', '<eos>')
    counts = np.ones(74)
    for tag in ['<sos>'] + deleaf(synt_) + ['<eos>']:
        if tag in special:
            continue
        counts[synt_vocab[tag]-3] += 1
    counts /= counts.sum()
    return counts

import spacy
nlp = spacy.load('en_core_web_md')
def dependency_parser(sent_):
    """Map every spaCy token of ``sent_`` to the id of its dependency label.

    The sentence is run through ``nlp``; each token's ``dep_`` label is
    upper-cased and looked up in ``dictionary.word2idx`` by plain indexing,
    so a label that is missing from the vocabulary is not handled here
    (there is no ``<unk>`` fallback as in ``bpe_tokenizer``).

    Args:
        sent_: sentence text.

    Returns:
        list: one vocabulary entry per spaCy token, in token order.
    """
    labels = []
    for token in nlp(sent_):
        labels.append(token.dep_.upper())
    return [dictionary.word2idx[label] for label in labels]

In [21]:
# dependency_parser(nmt_train['sents'][1].decode()) 

In [22]:
from tqdm import tqdm

def prepare_paraphrase_dataset(sent1, synt2, sent2):
    """Convert parallel byte-string sequences into training tuples.

    The three inputs are walked in lock-step (``zip`` stops at the shortest)
    and every element is ``.decode()``-d before it is tokenised.

    Args:
        sent1: source sentences (bytes), encoded with ``bpe_tokenizer``.
        synt2: bracketed parses (bytes), encoded with ``parser_tokenizer``.
        sent2: target sentences (bytes), encoded with
            ``bpe_tokenizer(..., target = True)`` and ``dependency_parser``.

    Returns:
        list[tuple]: one ``(source_ids, syntax_ids, target_ids, dependency_ids)``
        tuple per example.
    """
    examples = []
    for src_raw, synt_raw, trg_raw in tqdm(zip(sent1, synt2, sent2)):
        src_ids = bpe_tokenizer(src_raw.decode())
        synt_ids = parser_tokenizer(synt_raw.decode())
        trg_text = trg_raw.decode()
        trg_ids = bpe_tokenizer(trg_text, target = True)
        dep_ids = dependency_parser(trg_text)
        examples.append((src_ids, synt_ids, trg_ids, dep_ids))
    return examples

In [23]:
# Build the training set from the first `num` examples of the training file.
# The same sentences are passed as both source (sent1) and target (sent2).
num = 1_000_000
nmt_trainset = prepare_paraphrase_dataset(
    sent1=nmt_train['sents'][:num],
    synt2=nmt_train['synts'][:num],
    sent2=nmt_train['sents'][:num],
)

1000000it [1:59:49, 139.10it/s]


In [24]:
# Persist the processed training tuples so the slow preprocessing step does not have to be repeated.
with open('./data/nmt_trainset_dep.pkl', 'wb') as f:
    pickle.dump(nmt_trainset, f, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Build the validation set from the first `num` examples of the validation file.
# The same sentences are passed as both source (sent1) and target (sent2).
num = 5_000
nmt_validset = prepare_paraphrase_dataset(
    sent1=nmt_val['sents'][:num],
    synt2=nmt_val['synts'][:num],
    sent2=nmt_val['sents'][:num],
)

5000it [00:36, 135.65it/s]


In [26]:
# Persist the processed validation tuples next to the training pickle.
with open('./data/nmt_validset_dep.pkl', 'wb') as f:
    pickle.dump(nmt_validset, f, protocol=pickle.HIGHEST_PROTOCOL)