In [1]:
import os
import subprocess
import sys

import spacy
import torch
from torch.utils.data import DataLoader, Dataset
from torchtext.data.functional import to_map_style_dataset


def _read_parallel_split(split, language_pair):
    """Read ``text/<split>.<lan>`` for every language and zip the lines into tuples."""
    lines_per_language = []

    for lan in language_pair:
        # Decode explicitly as UTF-8 instead of the platform's locale encoding,
        # so non-ASCII characters are read the same way on every machine.
        # Assumes the corpus files are UTF-8 encoded -- TODO confirm.
        with open('text/{}.{}'.format(split, lan), 'r', encoding='utf-8') as file:
            lines_per_language.append(file.read().splitlines())
        # end
    # end

    # zip() stops at the shortest file, so files of unequal length are
    # silently truncated to the common prefix.
    return list(zip(*lines_per_language))
# end


def Multi30k(language_pair=None):
    """Load the local parallel corpus from ``text/train.<lan>`` and ``text/val.<lan>``.

    Args:
        language_pair: iterable of language suffixes (e.g. ``("de", "en")``).
            Each suffix selects one file per split; the order of the suffixes
            is the order of the strings inside every returned tuple.
            ``None`` falls back to ``("de", "en")``.

    Returns:
        A 3-tuple ``(corpus_train, corpus_eval, None)``. The first two items
        are lists with one tuple per line, holding that line from each
        language file. The third item is always ``None``.

    Raises:
        OSError: if one of the expected files cannot be opened.
    """
    if language_pair is None:
        # Previously None crashed in the for-loop with a TypeError.
        language_pair = ("de", "en")
    # end

    corpus_train = _read_parallel_split('train', language_pair)
    corpus_eval = _read_parallel_split('val', language_pair)

    return corpus_train, corpus_eval, None
# end


def load_vocab(spacy_en):
    """Return the vocabulary cached in ``vocab.pt``, creating the cache on first use.

    Args:
        spacy_en: forwarded unchanged to ``build_vocabulary`` when no cache
            file exists; unused when the cache is present.

    Returns:
        The object loaded from (or just saved to) ``vocab.pt``.
    """
    cache_path = "vocab.pt"

    if os.path.exists(cache_path):
        # torch.load unpickles the file, so only load a vocab.pt from a
        # trusted source.
        vocab_tgt = torch.load(cache_path)
    else:
        # NOTE(review): build_vocabulary is not defined or imported in the
        # visible cells -- confirm it is in scope before this branch runs.
        vocab_tgt = build_vocabulary(spacy_en)
        torch.save(vocab_tgt, cache_path)
    # end

    print("Finished.\nVocabulary sizes: {}".format(len(vocab_tgt)))
    return vocab_tgt
# end

def load_spacy():
    """Load the ``en_core_web_sm`` spaCy pipeline, downloading it once if loading fails.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.

    Raises:
        IOError: if the download command exits with a non-zero status, or if
            the model still cannot be loaded after the download.
    """
    model_name = "en_core_web_sm"

    try:
        spacy_en = spacy.load(model_name)
    except IOError:
        # Run the download with the interpreter executing this code
        # (sys.executable) rather than whatever `python` is first on PATH, so
        # the model is installed into the environment spacy.load() reads from.
        status = subprocess.call(
            [sys.executable, "-m", "spacy", "download", model_name]
        )
        if status != 0:
            # Report the failed download directly instead of letting it show
            # up only as a second, less informative load error.
            raise IOError(
                "Downloading spaCy model '{}' failed with exit status {}".format(
                    model_name, status
                )
            )
        # end
        spacy_en = spacy.load(model_name)
    # end

    return spacy_en
# end


# Configuration values. None of them is referenced in the visible cells;
# presumably they are model/training hyperparameters used elsewhere -- TODO confirm.
seq_max = 64
batch_size = 3
dim_hidden = 128
dim_feedforward = 128
n_head = 4
n_layer = 2

# Load the English spaCy pipeline, then the cached vocabulary (load_vocab
# prints the vocabulary size as a side effect).
spacy_en = load_spacy()
vocab = load_vocab(spacy_en)

# Each split is a list of (de, en) line tuples; the third return value is
# always None and is discarded.
train_iter, valid_iter, _ = Multi30k(language_pair=("de", "en"))
# The similarity experiment in the following cells works on the validation
# split (valid_iter), not on train_iter.

# NOTE(review): despite its name, train_source wraps the *validation* split,
# and it is not used in the visible cells -- confirm this is intended.
train_source = to_map_style_dataset(valid_iter)

Finished.
Vocabulary sizes: 6191


In [4]:
# Keep only the second element (index 1) of every validation tuple; with the
# ("de", "en") language pair used above, that is the English sentence.
list_en = [sentence_pair[1] for sentence_pair in valid_iter]

In [6]:
# De-interleave the sentences: even positions go to list_en_0, odd positions
# to list_en_1, so neighbouring sentences can later be paired up.
list_en_0 = list_en[0::2]
list_en_1 = list_en[1::2]

In [9]:
# Trim the even-indexed list to the length of the odd-indexed one so the two
# pair up one-to-one. Unlike `[:-1]`, this slice is a no-op once the lengths
# match, so re-running the cell cannot drop further sentences, and no pair is
# lost when both lists already have equal length.
list_en_0 = list_en_0[:len(list_en_1)]

In [14]:
# Score every (even, odd) sentence pair with the Jaccard similarity of their
# whitespace-separated token sets: |A & B| / |A | B|.
# Each row of list_final is [sentence_0, sentence_1, similarity].
list_final = []
for en_0, en_1 in zip(list_en_0, list_en_1):
    # split() without a separator collapses runs of whitespace, so leading,
    # trailing or doubled spaces no longer add a spurious '' token that would
    # distort the score (split(' ') produced one).
    tokens_en_0 = set(en_0.split())
    tokens_en_1 = set(en_1.split())
    tokens_union = tokens_en_0 | tokens_en_1
    if tokens_union:
        sim = len(tokens_en_0 & tokens_en_1) / len(tokens_union)
    else:
        # Both sentences are blank: treat them as identical (the score the
        # previous split(' ') version gave) instead of dividing by zero.
        sim = 1.0
    # end
    list_final.append([en_0, en_1, sim])
# end

In [16]:
import json

# Persist the scored sentence pairs as one JSON array of
# [sentence_0, sentence_1, similarity] rows, overwriting any existing file.
with open('corpus.json', 'w+') as file:
    json.dump(list_final, file)
# end