In [None]:
from datasets import load_dataset
import h5py
import os
import tiktoken
from tqdm import tqdm


In [None]:
def dataset_loader(dataset_name, config_name, split=None):
    return load_dataset(dataset_name, config_name, split=split, cache_dir="./my_cache")


In [None]:
def process_dataset1(dataset_name, config_name, split, output_file, tokenizer_name="gpt2"):
    # Initialize tokenizer
    print(f"Loading tokenizer: {tokenizer_name}")
    enc = tiktoken.get_encoding(tokenizer_name)
    
    # Load dataset
    print(f"Loading dataset: {dataset_name}/{config_name}")
    dataset = dataset_loader(dataset_name, config_name, split)

    if split is None:
        splits = dataset.keys()
    else:
        splits = split
    print(f"Available split: {splits}")

    # Create output dir if doesnt exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with h5py.File(output_file, "w") as f:
        dataset = f.create_dataset(dataset_name, (0,), maxshape=(None,), dtype='i')
        start_index = 0
        total_documents = 0

        for split_name in splits:
            print(f"Processing split: {split_name}")

            split_dataset = dataset[split_name]

            for idx, example in enumerate(tqdm(split_dataset, desc=f"Processing {split_name}")):
                # Find the key in dataset to extract text content
                key_field = None
                for key, value in example.items():
                    key_field = key
                    break

                if key_field:
                    text = example[key_field]
                else:
                    print(f"Could not find the key field in dataset")
                    continue
                
                # Append the end-of-text token and encode
                text_with_end = text + "<|endoftext|>"
                encoded = enc.encode(text_with_end, allowed_special={"<|endoftext|>"})
                encoded_len = len(encoded)

                # Calculate the end index for the new tokens
                end_index = start_index + encoded_len

                # Expand the dataset size and store the encoded tokens
                dataset.resize(dataset.shape[0] + encoded_len, axis=0)
                dataset[start_index:end_index] = encoded

                # Update the start index for the next batch of tokens
                start_index = end_index
                total_documents += 1
        
        print("Processing complete!")
        print(f"Total documents processed: {total_documents}")
        print(f"Total tokens: {start_index}")
        print(f"Output saved to: {output_file}")



In [None]:
def process_dataset(dataset, output_file, tokenizer_name="gpt2"):
    # Initialize tokenizer
    print(f"Loading tokenizer: {tokenizer_name}")
    enc = tiktoken.get_encoding(tokenizer_name)

    # Create output dir if doesnt exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with h5py.File(output_file, "w") as f:
        # Create an expandable dataset named 'dataset' in the HDF5 file
        h5py_dataset = f.create_dataset("dataset", (0,), maxshape=(None,), dtype='i')
        start_index = 0
        total_documents = 0

        for example in dataset:
            # Find the key in dataset to extract text content
            key_field = None
            for key, value in example.items():
                key_field = key
                print(key_field)
                break

            if key_field:
                text = example[key_field]
            else:
                print(f"Could not find the key field in dataset")
                continue
            
            # Append the end-of-text token and encode
            text_with_end = text + "<|endoftext|>"
            encoded = enc.encode(text_with_end, allowed_special={"<|endoftext|>"})
            encoded_len = len(encoded)

            # Calculate the end index for the new tokens
            end_index = start_index + encoded_len

            # Expand the dataset size and store the encoded tokens
            h5py_dataset.resize(h5py_dataset.shape[0] + encoded_len, axis=0)
            h5py_dataset[start_index:end_index] = encoded

            # Update the start index for the next batch of tokens
            start_index = end_index
            total_documents += 1
        
        print("Processing complete!")
        print(f"Total documents processed: {total_documents}")
        print(f"Total tokens: {start_index}")
        print(f"Output saved to: {output_file}")



In [45]:
dataset_name = "EleutherAI/wikitext_document_level"
config_name = "wikitext-2-raw-v1"
split = "train"
out_file = "data/train/wikitext-2-raw-v1.h5"

dataset = dataset_loader(dataset_name, config_name, split)



In [52]:
process_dataset(dataset, out_file)


Loading tokenizer: gpt2
page
shape 0
page
shape 4506
page
shape 9032
page
shape 12871
page
shape 13674
page
shape 15295
page
shape 19082
page
shape 19096
page
shape 19110
page
shape 19150
page
shape 19169
page
shape 19182
page
shape 19210
page
shape 19784
page
shape 21660
page
shape 22816
page
shape 23566
page
shape 26960
page
shape 30229
page
shape 32814
page
shape 34340
page
shape 45752
page
shape 48719
page
shape 56237
page
shape 58918
page
shape 60731
page
shape 66181
page
shape 68935
page
shape 72201
page
shape 78654
page
shape 83041
page
shape 89556
page
shape 90600
page
shape 98666
page
shape 104955
page
shape 107729
page
shape 109094
page
shape 111565
page
shape 113006
page
shape 124344
page
shape 134518
page
shape 135762
page
shape 141159
page
shape 149521
page
shape 151264
page
shape 153683
page
shape 156397
page
shape 162701
page
shape 165055
page
shape 172912
page
shape 175394
page
shape 181297
page
shape 182483
page
shape 185168
page
shape 186688
page
shape 193067
page
sha

In [None]:
for example in dataset:
    print(example)
    break


{'page': ' = Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving 

In [None]:
import numpy as np
import torch
data_path = out_file
context_length = 16
batch_size = 2
device = 'cpu'


In [None]:
def get_batch_iterator(data_path, batch_size, context_length):
    with h5py.File(data_path, 'r') as f:
        # Extract the dataset
        dataset = f["dataset"]

        # Get the total size of the dataset
        dataset_size = dataset.shape[0]

        # Calculate the number of examples can be made from the dataset
        n_examples = (dataset_size - 1) // context_length

        # Create an array of indices for examples and shuffle them for randomness
        example_idxs = np.arange(n_examples)
        np.random.shuffle(example_idxs)

        # Initialize epoch counter and example counter
        epochs = 0
        counter = 0
        
        while True:
            # Select a batch of random indices to generate sequences
            random_indices = example_idxs[counter: counter+batch_size] * context_length
            print(random_indices)

            # Retrieve sequences from the dataset based on the random indices
            random_samples = torch.tensor(np.array([dataset[idx:idx+context_length+1] for idx in random_indices]))
            
            # Separate the input sequences (xb) and target sequences (yb)
            xb = random_samples[:, :-1].to(device)  # Input sequence (first half of the random sample)
            yb = random_samples[:, 1:].to(device)  # Target sequence (second half of the random sample)

            # Increment the counter to move to the next batch
            counter += batch_size
            
            # Yield the input and target sequences as a tuple for the current batch
            yield xb, yb



[338752  30800]
tensor([[15579,  2921, 13181,  3241,   284,   465,  2712,   764, 22687,  1043,
         26075,  7487,   705,    82,  2712,  7306],
        [ 1816,   555, 12795,   287,   262, 12844,  9481, 21617, 13650,   764,
           679,   373,  3706,   284,   262, 17277]], dtype=torch.int32)
tensor([[ 2921, 13181,  3241,   284,   465,  2712,   764, 22687,  1043, 26075,
          7487,   705,    82,  2712,  7306,   329],
        [  555, 12795,   287,   262, 12844,  9481, 21617, 13650,   764,   679,
           373,  3706,   284,   262, 17277,  3687]], dtype=torch.int32)


In [None]:
dataset_size


2417785

In [None]:
n_examples


151111