## TSV to HDF5 conversion

In [2]:
import pathlib as pl
from utils import csv_to_hf5,csv_line_count

folder = "D:\\Datasets\\c4_200m\\data\\tsv"

In [None]:
%%time
tsv_files = list(pl.Path(folder).glob('C4_200M.tsv-*'))
for i,f in enumerate(tsv_files):
    print("Converting ",f.name)
    n = csv_line_count(f)
    elapsed = csv_to_hf5(str(f),num_lines=n)

## Virtual Dataset
It is possible to merge all the files into a Virtual dataset, which contains 183 millions of samples.
While I have provided the code to implement that, this will not be used in the following dataset.

In [11]:
import pathlib as pl
import h5py

folder = "D:\\Datasets\\c4_200m\\data\\hdf5"
file_names_to_concatenate = list(pl.Path(folder).glob('C4_200M.h5-*'))

entry_key = 'text'  # where the data is inside of the source files.
sh = h5py.File(file_names_to_concatenate[0], 'r')[entry_key].shape  # get the first ones shape.
dt = h5py.special_dtype(vlen=str)

layout = h5py.VirtualLayout(shape=(len(file_names_to_concatenate),) + sh,
                            dtype=dt)

with h5py.File(str(pl.Path(folder)/"c4_200m.h5"), 'w', libver='latest') as f:
    for i, filename in enumerate(file_names_to_concatenate):
        vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
        layout[i, :, :] = vsource

    f.create_virtual_dataset(entry_key, layout)

We can read each HDF5 files as a Numpy array, and most numpy operations such as slicing are supported.

In [3]:
import h5py

folder = "D:\\Datasets\\c4_200m\\data\\hdf5"
h5_files = list(pl.Path(folder).glob('C4_200M.h5-*'))

index = 0

with h5py.File(h5_files[0], 'r') as h5_file:
    input = h5_file['text'][0,index].decode('utf-8')
    label = h5_file['text'][1,index].decode('utf-8')

print("input:",input)
print("label:",label)

input: Bitcoin is for $7,094 this morning, which CoinDesk says.
label: Bitcoin goes for $7,094 this morning, according to CoinDesk.


## Dataset
To be able to use this dataset for a training process, I have created a custom class, based on this [notebook](https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/mechanics/custom-data-loader-csv.ipynb)

In [5]:
from torch.utils.data import Dataset
class Hdf5Dataset(Dataset):
    """Custom Dataset for loading entries from HDF5 databases"""

    def __init__(self, h5_path, transform=None,num_entries = None):

        self.h5f = h5py.File(h5_path, 'r')
        if num_entries:
            self.num_entries = num_entries
        else:
            self.num_entries = self.h5f['text'].shape[1]
        self.transform = transform

    def __getitem__(self, index):
        if index > self.num_entries:
            raise StopIteration
        input = self.h5f['text'][0,index].decode('utf-8')
        label = self.h5f['text'][1,index].decode('utf-8')
        if self.transform is not None:
            features = self.transform(input)
        return input, label

    def __len__(self):
        return self.num_entries

dset = Hdf5Dataset(h5_files[0])
print("Dataset length: ", len(dset))

for f in dset[0]:
    print(f)

Dataset length:  18386522
Bitcoin is for $7,094 this morning, which CoinDesk says.
Bitcoin goes for $7,094 this morning, according to CoinDesk.


## Vocabulary
It is now possible to obtain a vocabulary fitted on our dataset by performing a count of each tokenized sentence of the dataset, and piking the most frequent ones, based on the desired vocabulary dimensionality.

In [8]:
%%time

from typing import Iterable, List
import tqdm
import torchtext as text
import torch

token_transform = text.data.utils.get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens(data_iter: Iterable, index: int) -> List[str]:
    for i in data_iter.shape[0]:
        for data_sample in tqdm.tqdm(data_iter[i,:,:]):
            if data_sample[index] and isinstance(data_sample[index], str):
                yield token_transform(data_sample[index])

def build_vocab(dataset_iterator, col=1, vocab_size=None, out_folder=None,filename='vocab.pth'):
    # Define special symbols and indices
    UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
    # Make sure the tokens are in order of their indices to properly insert them in vocab
    special_symbols = ['<UNK>', '<PAD>', '<BOS>', '<EOS>']

    # Create torchtext's Vocab object
    vocab_transform = text.vocab.build_vocab_from_iterator(yield_tokens(dset, col),
                                                           max_tokens=vocab_size,
                                                           specials=special_symbols,
                                                           special_first=True)
    print("Built vocabulary")
    vocab_transform.set_default_index(UNK_IDX)
    torch.save(vocab_transform, pl.Path(out_folder) / filename)

#Path of the vocabulary file
out_folder = 'vocab'
filename = 'tgt_vocab_20K_18M'
vocab_size=20_000
col = 1

build_vocab(dset,vocab_size=vocab_size,out_folder=out_folder,filename=filename)

AttributeError: 'Hdf5Dataset' object has no attribute 'shape'