This script processes text data into Hugging Face datasets and saves the result to disk.

This script fixes the issue where the Hugging Face dataset was split on paragraphs instead of whole stories. It also loads the dataset directly from Hugging Face.

In [1]:
# print every character with ord value 0..127 (the range accepted by the filter below),
# each followed by a single space
print(*(chr(code) for code in range(128)), sep=' ', end=' ')

         	 
                     ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~  

In [2]:
from tokenizers import Tokenizer

# Load the tokenizer from its serialized JSON file (path is relative to the working directory)
tokenizer = Tokenizer.from_file("./TinyStories_tokenizer_small_cleaned.json")

# Encode the end-of-text marker once. `.ids` is a list of token ids, so `endoftext_token`
# is a list (printed below), not a bare int: later cells concatenate it onto every story
# and use its first element as the padding id.
endoftext_token = tokenizer.encode("<|endoftext|>").ids  # ids of the end-of-text marker
print(endoftext_token)


[0]


In [3]:
from datasets import load_dataset

# Load the TinyStories dataset by its Hugging Face identifier.
# The cells below use its "train" and "validation" splits.
dataset = load_dataset("roneneldan/TinyStories")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Clean up cached files for this dataset. The displayed dict reports one number per split
# (presumably the count of cache files removed; confirm against the datasets documentation).
dataset.cleanup_cache_files()

{'train': 48, 'validation': 0}

In [5]:
# delete any examples with non-English (non-ASCII) characters or with an atypical length
def filter_func(example, min_chars=418, max_chars=2505):
    """
    Decides whether a story is kept in the dataset.

    Args:
        example: Dictionary holding the story under the "text" key
        min_chars: Shortest story length, in characters, that is kept (default: 418)
        max_chars: Longest story length, in characters, that is kept (default: 2505)

    Returns:
        True if the story length lies in [min_chars, max_chars] and every
        character has a code point <= 127, False otherwise
    """
    text = example["text"]

    # the default bounds remove the shortest and longest 1% of stories
    if len(text) < min_chars or len(text) > max_chars:
        return False

    # str.isascii() is True exactly when every character satisfies ord(char) <= 127
    return text.isascii()
# Apply the filter to the dataset; `dataset` is rebound to the filtered result,
# so re-running this cell filters the already-filtered data again.
dataset = dataset.filter(filter_func)

Filter: 100%|██████████| 2119719/2119719 [00:23<00:00, 89177.70 examples/s]
Filter: 100%|██████████| 21990/21990 [00:00<00:00, 87739.43 examples/s]


In [6]:
%%time

import numpy as np

# Character-length statistics of the filtered validation split.
# The lengths are computed once and reused, instead of re-reading the whole
# "text" column for every statistic.
text_lengths = np.array([len(x) for x in dataset["validation"]["text"]])

print("Dataset size:", len(dataset["validation"]))
print("median length:", np.median(text_lengths))
print("mean length:", np.mean(text_lengths))
print("stdev length:", np.std(text_lengths))
print("max length:", np.max(text_lengths))
print("min length:", np.min(text_lengths))

Dataset size: 20350
median length: 775.0
mean length: 863.1555282555282
stdev length: 331.29948451507494
max length: 2504
min length: 418
CPU times: user 220 ms, sys: 19.8 ms, total: 239 ms
Wall time: 238 ms


In [7]:
import random

# Print one randomly chosen training story as a sanity check.
dataset_size = len(dataset["train"])
# randrange excludes the upper bound. randint(0, dataset_size) is inclusive on both
# ends and could return dataset_size itself, one past the last valid index.
ind = random.randrange(dataset_size)
print(ind)
print(dataset["train"][ind]["text"])
# print([char for char in dataset["train"][ind]["text"] if ord(char) < 64])

267171
Lily and Ben are playing in the park. They like to swing, slide and run. They have a lot of fun.

But then, Lily notices something shiny on the ground. She runs to pick it up. It is a red handle. It looks like it belongs to a toy.

"Look, Ben, look!" Lily says, showing him the handle. "What is it?"

Ben comes closer and looks at the handle. He thinks hard. He remembers seeing something like it before.

"I know, I know!" Ben says. "It is a handle for a fire truck. A big, red fire truck. It makes a loud noise and sprays water."

Lily's eyes widen. She likes fire trucks. She wonders where the rest of the toy is.

"Maybe we can find it," Lily says. "Maybe someone lost it and is sad. We can help them."

Ben nods. He likes to help. He and Lily start to look around the park. They hope to find the fire truck and make someone happy.


In [8]:
def tokenize_function(examples):
    """
    Tokenizes a batch of stories and terminates each one with the end-of-text ids.

    Args:
        examples: Batch dictionary; its "text" entry is handed to the
            tokenizer's batch encoder

    Returns:
        Dictionary with "input_ids": one list of token ids per story, each
        ending with the ids stored in `endoftext_token`
    """
    batch_encodings = tokenizer.encode_batch_fast(examples["text"])

    token_id_lists = []
    for story_encoding in batch_encodings:
        # endoftext_token is a list, so `+` concatenates its ids onto the story
        token_id_lists.append(story_encoding.ids + endoftext_token)

    return {"input_ids": token_id_lists}

# Tokenize the dataset.
# batched=True hands tokenize_function a whole batch per call; the raw "text" column
# is removed so only the "input_ids" returned by the function remains.
# num_proc=23 is a machine-specific setting; adjust it to the available CPU cores.
dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    num_proc=23
)

Map (num_proc=23): 100%|██████████| 1949763/1949763 [00:32<00:00, 59708.98 examples/s]
Map (num_proc=23): 100%|██████████| 20350/20350 [00:00<00:00, 44825.55 examples/s]


In [9]:
%%time

import numpy as np

# Token-length statistics of the tokenized validation split.
# The "input_ids" column is read once and its lengths reused, instead of
# materialising the whole column again for every statistic.
token_lengths = np.array([len(x) for x in dataset["validation"]["input_ids"]])

print("Dataset size:", len(dataset["validation"]))
print("mean length:", np.mean(token_lengths))
print("stdev length:", np.std(token_lengths))
print("max length:", np.max(token_lengths))
print("min length:", np.min(token_lengths))

Dataset size: 20350
mean length: 215.74437346437347
stdev length: 90.53928796330516
max length: 717
min length: 90
CPU times: user 3.2 s, sys: 40.3 ms, total: 3.24 s
Wall time: 3.24 s


In [10]:
def pack_token_lists(stories, max_length=513):
    """
    Packs token lists into rows without exceeding max_length.

    Stories are visited from longest to shortest. Each story is appended to the
    first existing row that still has room for it; if no row has room, a new
    row is opened. A story with max_length or more tokens always gets a row of
    its own and is truncated to max_length.

    The lists inside stories["input_ids"] are never modified.

    Args:
        stories: Batch dictionary whose "input_ids" entry is a list of lists of token IDs
        max_length: Maximum allowed length for each packed row (default: 513)

    Returns:
        Dictionary with
            "packed_inputs": packed rows, none longer than max_length (not padded)
            "positions": for every row, the index of each token within its own story
    """
    # Sort token lists in descending order of length to improve packing efficiency
    stories_len_sorted = sorted(stories["input_ids"], key=len, reverse=True)

    packed_rows = []
    row_positions = []

    for story in stories_len_sorted:
        story_length = len(story)

        if story_length >= max_length:
            # Too long to share a row: truncate (the slice is a fresh list) and
            # give the story a row of its own
            truncated = story[:max_length]
            packed_rows.append(truncated)
            row_positions.append(list(range(len(truncated))))
            continue

        # First fit: use the first row that can still take the whole story
        for row, positions in zip(packed_rows, row_positions):
            if len(row) + story_length <= max_length:
                row.extend(story)
                # Positions restart at 0 for every story inside a row
                positions.extend(range(story_length))
                break
        else:
            # No existing row can accommodate the story, so open a new one.
            # Store a copy: rows are extended in place later, and appending the
            # caller's list itself would mutate the input batch.
            packed_rows.append(list(story))
            row_positions.append(list(range(story_length)))

    return {
        "packed_inputs": packed_rows,
        "positions": row_positions,
    }


In [11]:
# Pack the tokenized stories into rows using pack_token_lists' default max_length.
# batched=True lets the function return fewer rows than it receives. No batch_size is
# passed, so stories are only packed together with others from the same batch.
# The per-story "input_ids" column is replaced by "packed_inputs" and "positions".
dataset = dataset.map(
  pack_token_lists,
  batched=True,
  remove_columns=["input_ids"],
  num_proc=None,
)

Map: 100%|██████████| 1949763/1949763 [02:12<00:00, 14666.30 examples/s]
Map: 100%|██████████| 20350/20350 [00:01<00:00, 15191.19 examples/s]


In [12]:
# pad inputs to max length
def pad_sequences(example, max_length=513, padding_value=endoftext_token[0]):
    """
    Brings one packed row and its positions to exactly max_length entries.

    Args:
        example: Dictionary with the "packed_inputs" and "positions" lists of one row
        max_length: Length every returned sequence has (default: 513)
        padding_value: Token id appended to short inputs
            (default: first id of the end-of-text token)

    Returns:
        Dictionary with
            "input_ids": the packed inputs, padded with padding_value or cut to max_length
            "padded_positions": the positions, padded with 0 or cut to max_length
    """
    def _fit_to_length(values, fill):
        # Shorter than the target: append filler; otherwise cut to the target length
        if len(values) < max_length:
            return values + [fill] * (max_length - len(values))
        return values[:max_length]

    return {
        "input_ids": _fit_to_length(example["packed_inputs"], padding_value),
        "padded_positions": _fit_to_length(example["positions"], 0),
    }


In [13]:
# Pad (or cut) every packed row to a fixed length, one example per call (batched=False).
# "packed_inputs" and "positions" are replaced by "input_ids" and "padded_positions".
# num_proc=23 is a machine-specific setting; adjust it to the available CPU cores.
dataset = dataset.map(
  pad_sequences,
  batched=False,
  remove_columns=["packed_inputs", "positions"],
  num_proc=23,
)

Map (num_proc=23): 100%|██████████| 888135/888135 [00:18<00:00, 47269.37 examples/s]
Map (num_proc=23): 100%|██████████| 9005/9005 [00:00<00:00, 30093.37 examples/s]


In [14]:
# Switch the output format to "torch". This changes `dataset` in place (no reassignment),
# so cells below presumably receive torch tensors rather than Python lists.
dataset.set_format("torch")

In [15]:
# Display the dataset summary: splits, column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'padded_positions'],
        num_rows: 888135
    })
    validation: Dataset({
        features: ['input_ids', 'padded_positions'],
        num_rows: 9005
    })
})

In [16]:
import numpy as np

# create square attention mask for sequence packed inputs
def create_attention_mask(example, padding_value=endoftext_token[0]):
    """
    Creates a square attention mask for one packed and padded input row.

    Every occurrence of padding_value closes a segment, and the token itself
    belongs to the segment it closes. Two positions may attend to each other
    only if no padding_value token lies between them, i.e. only if they fall in
    the same segment. Trailing padding tokens each form a segment of their own.

    Args:
        example: Dictionary with the padded token ids under "input_ids"
        padding_value: Token id that both terminates stories and pads the row
            (default: first id of the end-of-text token)

    Returns:
        Dictionary with
            "packed_inputs": the example's "input_ids", unchanged
            "attention_mask": boolean array of shape (len(input_ids), len(input_ids)),
                True where both positions belong to the same segment
    """
    input_ids = np.asarray(example["input_ids"])

    # Honour the padding_value argument instead of always reading the global token
    is_delimiter = input_ids == padding_value

    # Segment id of a position = number of delimiters strictly before it.
    # Subtracting is_delimiter keeps each delimiter inside the segment it closes.
    segment_ids = np.cumsum(is_delimiter) - is_delimiter

    # Positions i and j are separated by a delimiter exactly when their segment ids
    # differ. The comparison yields a boolean array without the np.bool alias,
    # which NumPy 1.24 removed.
    attention_mask = segment_ids[:, None] == segment_ids[None, :]

    return {
        "packed_inputs": example["input_ids"],
        "attention_mask": attention_mask,
    }

In [17]:
# Build one square attention mask per row, one example per call (batched=False).
# "input_ids" is removed and comes back under the name "packed_inputs", alongside the
# new "attention_mask" column; "padded_positions" is kept as is.
# num_proc=23 is a machine-specific setting; adjust it to the available CPU cores.
dataset = dataset.map(
    create_attention_mask,
    batched=False,
    remove_columns=["input_ids"],
    num_proc=23,
)

  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_ids = np.array(example["input_ids"])
  input_id

In [18]:
# Write the finished dataset to a directory relative to the working directory;
# the usage cell below reloads it from the same path.
dataset.save_to_disk("packed_dataset_with_mask_smallVocab_cleaned")

Saving the dataset (77/77 shards): 100%|██████████| 888135/888135 [01:01<00:00, 14529.52 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9005/9005 [00:00<00:00, 20005.10 examples/s]


In [19]:
# Usage: reload the saved dataset and wrap both splits in PyTorch DataLoaders.

from datasets import load_from_disk
# Same relative path that save_to_disk wrote to
packed_dataset = load_from_disk("packed_dataset_with_mask_smallVocab_cleaned")
packed_dataset.set_format('torch')

from torch.utils.data import DataLoader
# Create DataLoader
# One row per batch; only the training loader shuffles
dataloader_train = DataLoader(packed_dataset["train"], batch_size=1, shuffle=True)
dataloader_valid = DataLoader(packed_dataset["validation"], batch_size=1, shuffle=False)


In [20]:
# look at the first batch
# next(iter(...)) fetches a single batch without a loop-and-break and without
# rebinding the builtin name `input` as a loop variable
first_batch = next(iter(dataloader_train))

In [21]:
# Display the fetched batch (a dictionary of tensors keyed by column name)
first_batch

{'padded_positions': tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
           14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
           28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
           42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
           56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
           70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
           84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
           98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
          112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
          126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
          140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
          154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
          168, 169, 