In [1]:
import spacy
import os
import torch

from spacy.lang.en.examples import sentences
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

# import torchtext.datasets as datasets
# from multi30k import Multi30k
import re

def load_tokenizers():
    """Load spaCy's small English pipeline, downloading it on first use.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.

    Raises:
        Whatever ``spacy.load`` raises if the model is still unavailable
        after the download attempt (the first failure is caught as IOError).
    """
    # Function-scope imports keep this cell runnable on its own.
    import subprocess
    import sys

    model_name = "en_core_web_sm"

    try:
        return spacy.load(model_name)
    except IOError:
        # Download with the interpreter that backs this kernel. A bare
        # "python" resolved through PATH may belong to another environment,
        # in which case the model would be installed where this process
        # cannot import it.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=False,
        )
        # If the download failed, this second load raises as it did before.
        return spacy.load(model_name)
# end

def tokenize(text, tokenizer):
    """Split ``text`` into lower-cased token strings.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing a ``tokenizer(text)`` callable whose
            results each carry a ``.text`` string attribute.

    Returns:
        A list with the lower-cased ``.text`` of every token, in order.
    """
    # Digit stripping (re.sub(r'\d+', ' ', text)) is intentionally left disabled.
    lowered_tokens = []
    for token in tokenizer.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens
# end


def yield_tokens(data_iter, tokenizer):
    """Lazily yield ``tokenizer(item)`` for each item of ``data_iter``.

    Args:
        data_iter: Iterable of items to tokenize.
        tokenizer: Callable applied to each item.

    Yields:
        The result of ``tokenizer`` for every item, in iteration order.
    """
    yield from map(tokenizer, data_iter)
# end

def build_vocabulary(spacy_en, source, min_freq=20):
    """Build a vocabulary from an iterable of text lines.

    Args:
        spacy_en: Tokenizer object handed to ``tokenize`` for every line.
        source: Iterable of text lines to tokenize and count.
        min_freq: Forwarded to ``build_vocab_from_iterator`` as its
            ``min_freq`` argument. Defaults to 20, the value that used to be
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        The vocabulary returned by ``build_vocab_from_iterator``, with its
        default index set to the index of ``"<unk>"``.
    """

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building English Vocabulary ...")

    vocab_tgt = build_vocab_from_iterator(
        yield_tokens(source, tokenize_en),
        min_freq=min_freq,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    # Lookups of tokens missing from the vocabulary resolve to "<unk>".
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_tgt
# end


def load_vocab(spacy_en, source, path_vocab="vocab_quora.pt"):
    """Return the vocabulary, building and caching it when no cache exists.

    Args:
        spacy_en: Tokenizer object forwarded to ``build_vocabulary``.
        source: Iterable of text lines; only read when the cache is missing.
        path_vocab: File used as the on-disk cache. Defaults to
            ``"vocab_quora.pt"``, the path that used to be hard-coded. An
            existing cache is returned as-is regardless of ``source``, so
            use a different path for a different corpus.

    Returns:
        The vocabulary object, freshly built or loaded from ``path_vocab``.
    """
    if os.path.exists(path_vocab):
        # NOTE(review): torch.load unpickles the file; only point path_vocab
        # at cache files this notebook (or another trusted source) produced.
        vocab_tgt = torch.load(path_vocab)
    else:
        vocab_tgt = build_vocabulary(spacy_en, source)
        torch.save(vocab_tgt, path_vocab)
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_tgt))
    return vocab_tgt
# end

In [2]:
# from: /Users/jinyuj/Workspace/PythonProjects/FirstPython/src/Tests/csv2json
import json
import csv

def parse_csv_file_to_json(path_file_csv):
    """Read a tab-separated file into a list of row dictionaries.

    The first line of the file supplies the column names. Every value is
    kept exactly as ``csv.DictReader`` produced it; values that look like
    JSON/Python literals (starting with "[" or "{") are NOT parsed.

    Args:
        path_file_csv: Path of the UTF-8 encoded, tab-delimited file.

    Returns:
        A list with one ``{column_name: value}`` dict per data row, in
        file order.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise "\r" / "\r\n" inside quoted fields
    # are rewritten by the text layer before the reader sees them.
    with open(path_file_csv, encoding='utf-8', newline='') as file_csv:
        reader_csv = csv.DictReader(file_csv, delimiter="\t")

        # Copy each row into a plain dict. The former per-value branch on a
        # leading "[" / "{" stored the raw value in both arms, so it is gone.
        return [dict(dict_head_value) for dict_head_value in reader_csv]
# end

In [3]:
# Collect every distinct question text from both question columns.
contents_quora = parse_csv_file_to_json('quora_duplicate_questions.tsv')
set_line_quora = set()
for content_quora in contents_quora:
    set_line_quora.update(
        (content_quora['question1'], content_quora['question2'])
    )
# end
print(len(set_line_quora))

537362


In [4]:
# Load the spaCy pipeline, then build the vocabulary over the distinct
# Quora questions (or reload it from the on-disk cache if one exists).
spacy_en = load_tokenizers()
vocab_new = load_vocab(spacy_en=spacy_en, source=set_line_quora)

Building English Vocabulary ...
Finished.
Vocabulary sizes:
12804


In [7]:
# Sanity check: look up a token that is presumably not in the vocabulary.
# build_vocabulary sets the default index to that of "<unk>", the fourth
# special (index 3), so the expected result is [3].
vocab_new(['-1111223'])

[3]