In [1]:
import os
import pickle

from datasets import Dataset
from transformers import AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [3]:
babylm_list_dataset = []
dir_name = "babylm_augment/"
for file_name in os.listdir(dir_name):
    if file_name.endswith(".pkl"):
        with open(dir_name + file_name, "rb") as f:
            print(f"Loading chunk {file_name}")
            data_chunk = pickle.load(f)
            babylm_list_dataset += data_chunk
print(f"Length of babylm_dataset: {len(babylm_list_dataset)}")

Loading chunk babylm_augment_17.pkl
Loading chunk babylm_augment_16.pkl
Loading chunk babylm_augment_14.pkl
Loading chunk babylm_augment_15.pkl
Loading chunk babylm_augment_11.pkl
Loading chunk babylm_augment_10.pkl
Loading chunk babylm_augment_12.pkl
Loading chunk babylm_augment_13.pkl
Loading chunk babylm_augment_0.pkl
Loading chunk babylm_augment_1.pkl
Loading chunk babylm_augment_3.pkl
Loading chunk babylm_augment_2.pkl
Loading chunk babylm_augment_6.pkl
Loading chunk babylm_augment_5.pkl
Loading chunk babylm_augment_4.pkl
Loading chunk babylm_augment_30.pkl
Loading chunk babylm_augment_18.pkl
Loading chunk babylm_augment_19.pkl
Length of babylm_dataset: 15678


In [4]:
babylm_dataset = Dataset.from_list(babylm_list_dataset)
babylm_dataset = babylm_dataset.train_test_split(test_size=0.2)
babylm_dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'text', 'emotion', 'augmented_text'],
        num_rows: 12542
    })
    test: Dataset({
        features: ['source', 'text', 'emotion', 'augmented_text'],
        num_rows: 3136
    })
})

In [5]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["augmented_text"]])

block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [6]:
tokenized_babylm = babylm_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=babylm_dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/12542 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1241 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1485 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1433 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1815 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/3136 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1631 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2340 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1361 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1574 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
babylm_dataset = tokenized_babylm.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/12542 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3136 [00:00<?, ? examples/s]

In [8]:
babylm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 104428
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25900
    })
})