# Load Dataset

## NMT Dataset

In [1]:
##ParaNMT
with open('./data/para-nmt-5m-processed.txt', encoding="utf8") as file:
    nmt = file.readlines()

In [2]:
nmt_sentences = list()
for idx, sentence in enumerate(nmt):
    nmt_sentences.append(nmt[idx].split("\t")[0]) #non-parallel data
nmt_sentences[:5] #list

["so , unless that 's gon na be feasible , then ...",
 'of course you did .',
 "by now , singh 's probably been arrested .",
 'not our shit . i swear .',
 '“ why not ?']

In [3]:
len(nmt_sentences) #5370128 sentences

5370128

## Configuration

In [4]:
import nltk
# benepar.download('benepar_en3')
import benepar, spacy
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

def is_paren(tok):
    return tok == ")" or tok == "("

def getleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '
    
    leaves = []
    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                leaves.append(arr[n])

    return leaves

def deleaf(tree):
    nonleaves = ''
    for w in str(tree).replace('\n', '').split():
        w = w.replace('(', '( ').replace(')', ' )')
        nonleaves += w + ' '

    arr = nonleaves.split()
    for n, i in enumerate(arr):
        if n + 1 < len(arr):
            tok1 = arr[n]
            tok2 = arr[n + 1]
            if not is_paren(tok1) and not is_paren(tok2):
                arr[n + 1] = ""

    nonleaves = " ".join(arr)
    return nonleaves.split()

import pickle
with open("./data/dictionary.pkl", "rb") as file:
    dictionary = pickle.load(file)

#Setence to syntax
def constituency_parser(text):
    doc = nlp(text)
    sent = list(doc.sents)[0]
    return "(ROOT "+sent._.parse_string+")"

#syntax to syntatic tokenzier
from nltk import ParentedTree
def parser_tokenizer(synt_):
    synt_ = ParentedTree.fromstring(synt_)
    synt_ = deleaf(synt_)
    # synt_ = [f'<{w}>' for w in synt_]
    synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_ if f"<{w}>" in dictionary.word2idx]
    return synt_

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Tokenizer BPE
from subwordnmt.apply_bpe import BPE, read_vocabulary
import codecs

# load bpe codes
bpe_codes = codecs.open('./data/bpe.codes', encoding='utf-8')
bpe_vocab = codecs.open('./data/vocab.txt', encoding='utf-8')
bpe_vocab = read_vocabulary(bpe_vocab, 50)
bpe = BPE(bpe_codes, '@@', bpe_vocab, None)

def bpe_tokenizer(sent_, target = False):
 # bpe segment and convert to tensor
    sent_ = bpe.segment(sent_).split()
    sent_ = [dictionary.word2idx[w] if w in dictionary.word2idx else dictionary.word2idx["<unk>"] for w in sent_]
    if target:
        sent_ = [dictionary.word2idx["<sos>"]] + sent_ + [dictionary.word2idx["<eos>"]]
    return sent_

# def pos_tag(sent_):
#     return [token.ent_type_ if token.ent_type_ else "" for token in nlp(sent_)]

# def dependency_parser(sent_):
#     return [token.dep_ for token in nlp(sent_)]

In [6]:
import numpy as np
import tqdm

import pickle
with open('synt_vocab.pkl', 'rb') as f:
    synt_vocab = pickle.load(f)

print(synt_vocab)

def bow(synt_):
    synt_ = ParentedTree.fromstring(synt_)
    synt_ = deleaf(synt_)
    synt_bow = np.ones(74)
    #non-inculde <s> , <pad>, </s>
    #start from '(': 3 instead
    for tag in synt_:
        if tag != '<sos>' and tag != '<eos>':
            try:
                synt_bow[synt_vocab[tag]-3] += 1
            except:
                pass
    synt_bow /= synt_bow.sum()
    return synt_bow 

{'<s>': 0, '<pad>': 1, '</s>': 2, '(': 3, 'ROOT': 4, 'S': 5, 'ADVP': 6, 'RB': 7, ')': 8, ',': 9, 'INTJ': 10, 'UH': 11, 'FW': 12, 'VP': 13, 'VBP': 14, 'ADJP': 15, 'JJ': 16, ':': 17, 'NP': 18, 'NN': 19, '.': 20, 'PRP': 21, 'VBD': 22, 'VBG': 23, 'TO': 24, 'VB': 25, 'PRT': 26, 'FRAG': 27, 'SBAR': 28, 'IN': 29, 'QP': 30, 'VBZ': 31, 'CD': 32, 'PP': 33, 'VBN': 34, 'DT': 35, 'CC': 36, 'NNS': 37, 'PRP$': 38, 'WHNP': 39, 'WP': 40, 'LS': 41, 'NNP': 42, 'SINV': 43, 'PRN': 44, '``': 45, "''": 46, 'JJR': 47, 'WDT': 48, 'POS': 49, 'MD': 50, 'SQ': 51, 'SBARQ': 52, 'WHADVP': 53, 'WRB': 54, 'RP': 55, 'EX': 56, 'JJS': 57, 'X': 58, 'LST': 59, '-LRB-': 60, '-RRB-': 61, 'RBS': 62, 'UCP': 63, 'RBR': 64, 'WHPP': 65, 'PDT': 66, 'WHADJP': 67, 'NX': 68, 'CONJP': 69, '$': 70, 'WP$': 71, '#': 72, 'SYM': 73, 'NNPS': 74, 'RRC': 75, 'NAC': 76}


# Adversary Generator

In [7]:
from tqdm import tqdm

def prepare_paraphrase_dataset(sentences):
    lists_ = list()
    for idx in tqdm(range(len(sentences))):

        sent_ = bpe_tokenizer(sentences[idx])
        synt_ = parser_tokenizer(constituency_parser(sentences[idx]))
        trg_  = bpe_tokenizer(sentences[idx], target = True)
        bow_  = bow(constituency_parser(sentences[idx]))
        lists_.append((sent_, synt_, trg_, bow_))
    return lists_

In [8]:
# sentences_1m = prepare_paraphrase_dataset(nmt_sentences[:200000])

In [9]:
from tqdm import tqdm

def prepare_paraphrase_dataset_onlybow(sentences):
    lists_ = list()
    for idx in tqdm(range(len(sentences))):

        # sent_ = bpe_tokenizer(sentences[idx])
        # synt_ = parser_tokenizer(constituency_parser(sentences[idx]))
        # trg_  = bpe_tokenizer(sentences[idx], target = True)
        bow_  = bow(constituency_parser(sentences[idx]))
        # lists_.append((sent_, synt_, trg_, bow_))
        lists_.append((bow_))
    return lists_

In [11]:
bow_900k = prepare_paraphrase_dataset_onlybow(nmt_sentences[800000:9000000])

  0%|          | 168/4570128 [00:10<86:06:24, 14.74it/s]

In [None]:
import pickle

with open('./data/bow_900k.pkl', 'wb') as f:
    pickle.dump(bow_900k, f, protocol=pickle.HIGHEST_PROTOCOL)