In [24]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer
import os
import numpy as np
import torch
from datasets import load_dataset


In [28]:
dataset = "wikipedia_bpe"
data_dir = os.path.join('../data', dataset)
device = 'cpu'
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

batch_size = 6
block_size = 15

train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')


tokenizer = AutoTokenizer.from_pretrained(data_dir) #Load tokenizer
# Load tokenized data data =

def get_batch(split, batchloader='random', batch_idx = 0):

    data = train_data if split == 'train' else val_data
    if batchloader == 'random':
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        if device_type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y

    elif batchloader == 'full':
        #ix = torch.randint(len(data) - block_size, (batch_size,))
        #Basically get list of indices for the batch where batch_idx is the index of the batch and then its every ith element for batch size number of elements
        #Batch size is number of batches and block size is the number of tokens in each batch
        ix = torch.arange(batch_idx*batch_size*block_size, (batch_idx+1)*batch_size*block_size, block_size)
        print(ix)
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        if device_type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y




In [26]:



x,y = get_batch('train')

print(x.shape, y.shape)

print(x, y)

print(train_data[0:10])

torch.Size([6, 15]) torch.Size([6, 15])
tensor([[ 1546,    25, 12087,   761,   207, 13379,   203,  1578,   257,    14,
            57,    14,   960,   207,   173],
        [12780,  3904,    48,  8566,  3422,    12,   207,   173,  9029,    13,
            40, 11155,  2857,  2739,    57],
        [ 9025,   548,   202,   173,  5682,    56, 15326,   229,   173,   751,
          1891,    13,    19,  8449,  1544],
        [  620,  6192,    14,  3101,   196,   196,    50,  3891,   249,    12,
           312,  5876,  5744,  6680,   211],
        [ 1316,    14,    29,   531,   531,   892,  4911,   173,   966,   531,
           531,   531,   132,   411,  4911],
        [  132,   251,  2042,  3890,  2436,  2793,    41,  1759,   569,   173,
          6679,  1184,  2347,    26,   132]]) tensor([[   25, 12087,   761,   207, 13379,   203,  1578,   257,    14,    57,
            14,   960,   207,   173,  5195],
        [ 3904,    48,  8566,  3422,    12,   207,   173,  9029,    13,    40,
         111

In [39]:

batch_size = 5
block_size = 5
batch_idx_max = train_data.shape[0] // (batch_size * block_size)
for i in range(batch_idx_max):
    x,y = get_batch('train', batchloader='full', batch_idx=i)
    if i == 0:
        print(x.shape, y.shape)
        print(x, y)
        print(train_data[0:10])
print("Batch_idx_max", batch_idx_max)
print(batch_idx_max*batch_size*block_size)

print(train_data.shape)
print(x.shape, y.shape)
print(x, y)

#print(train_data[0:10])

tensor([ 0,  5, 10, 15, 20])
torch.Size([5, 5]) torch.Size([5, 5])
tensor([[   29,   531,   531,   808,  9783],
        [ 5710, 12316,   531,   531,   531],
        [  132,  1372,  9783,  5710, 12316],
        [  391,  1753,   271,   808,  9783],
        [ 8421,   169,    14,  5710, 12316]]) tensor([[  531,   531,   808,  9783,  5710],
        [12316,   531,   531,   531,   132],
        [ 1372,  9783,  5710, 12316,   391],
        [ 1753,   271,   808,  9783,  8421],
        [  169,    14,  5710, 12316,  3641]])
[   29   531   531   808  9783  5710 12316   531   531   531]
tensor([25, 30, 35, 40, 45])
tensor([50, 55, 60, 65, 70])
tensor([75, 80, 85, 90, 95])
tensor([100, 105, 110, 115, 120])
tensor([125, 130, 135, 140, 145])
tensor([150, 155, 160, 165, 170])
tensor([175, 180, 185, 190, 195])
tensor([200, 205, 210, 215, 220])
tensor([225, 230, 235, 240, 245])
tensor([250, 255, 260, 265, 270])
tensor([275, 280, 285, 290, 295])
tensor([300, 305, 310, 315, 320])
tensor([325, 330, 335, 340

In [18]:
collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data = train_data[:1000].tolist()

texts = [
  "The quick brown fox jumps over the lazy dog. He loves to play with balls. But he is not very friendly with other animals. Add",
  "I am learning about NLP and AI today",
    "bert is a transformer model",
    "43 is a prime number",
    "python is a programming language",
    "I am learning about transformers",

]

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokens = [tokenizer(t) for t in texts]

# print(tokens[0])
# print("\n")

dataloader = torch.utils.data.DataLoader(tokens, collate_fn=collate_fn, batch_size=5)

for batch in dataloader:


    print(batch["input_ids"].shape)

    break






torch.Size([5, 28])


In [40]:
train_data_path = "/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/babylm_data/babylm_10M"
dev_data_path = "/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/babylm_data/babylm_dev"

training_files = [f"{train_data_path}/simple_wikipedia.train",
                 f"{train_data_path}/gutenberg.train",
                 
                 f"{train_data_path}/aochildes.train",
                 f"{train_data_path}/bnc_spoken.train",
                 f"{train_data_path}/cbt.train",
                 f"{train_data_path}/children_stories.train",
                 f"{train_data_path}/open_subtitles.train",
                 f"{train_data_path}/qed.train",
                 
                 f"{train_data_path}/switchboard.train",
                 f"{train_data_path}/wikipedia.train"]

eval_files = [f"{dev_data_path}/aochildes.dev",
             f"{dev_data_path}/bnc_spoken.dev",
             f"{dev_data_path}/cbt.dev",
             f"{dev_data_path}/children_stories.dev",
             f"{dev_data_path}/gutenberg.dev",
             f"{dev_data_path}/open_subtitles.dev",
             f"{dev_data_path}/qed.dev",
             f"{dev_data_path}/simple_wikipedia.dev",
             f"{dev_data_path}/switchboard.dev",
             f"{dev_data_path}/wikipedia.dev"]

raw_datasets = load_dataset('text', data_files={'train': training_files, 
                                           'validation': eval_files})


print(type(raw_datasets), "Type of raw_datasets")
print(type(raw_datasets["train"]), "Type of raw_datasets[train]")

print("printing examples from raw_datasets[train]")
print(*raw_datasets["train"]["text"][:13], sep="<EOL> \n")

print("Printing raw_datasets", raw_datasets )

<class 'datasets.dataset_dict.DatasetDict'> Type of raw_datasets
<class 'datasets.arrow_dataset.Dataset'> Type of raw_datasets[train]
printing examples from raw_datasets[train]
Kate Mara (born February 27, 1983) is an American actress. She played Alma Del Mar Jr. in the movie "Brokeback Mountain". She also played Jessica Chandler for "Random Hearts" in 1999. She has also performed on stage. In 2017, she starred as Mary Jo Kopechne in "Chappaquiddick". She played Zoe Barnes on "House of Cards".<EOL> 
Mara was born in Bedford, New York.<EOL> 
<EOL> 
Hop<EOL> 
Hop or hops can mean several things:<EOL> 
<EOL> 
Battle of Wilson's Creek<EOL> 
The Battle of Wilson's Creek, also called the "Bull Run of the West", was a battle of the American Civil War. It was the first battle that took place west of the Mississippi River. Nathaniel Lyon, the Union general, was the first general killed in the Civil war.<EOL> 
Prelude to the battle.<EOL> 
During the summer of 1861, the Union and Confederate Armi

In [55]:
custom_tokenizer = AutoTokenizer.from_pretrained(data_dir)

context_length = 10

def tokenize(element):
    outputs = custom_tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        #if length == context_length:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/1058740 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026747 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1868240
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 1798331
    })
})

In [68]:
def clean_text(example):
    #remove the character G from the text

    return example.replace("Ġ", "")


print(*[( clean_text(custom_tokenizer.decode(tokenized_datasets["train"][x]["input_ids"])) 
        , tokenized_datasets["train"][x]["input_ids"]) for x in range(200, 220)], sep="\n")

('fly ", " run like a thief " and "', [2371, 944, 316, 4066, 381, 171, 7956, 2, 207, 316])
('tri angle ".', [7197, 5669, 846])
('glass er was born in chicago, illinois. he', [8161, 181, 242, 1420, 203, 4375, 12, 2798, 14, 213])
('was married to joan glass er until his death.', [242, 1844, 196, 12250, 2771, 181, 1116, 309, 1316, 14])
('they had four children. glass er died on january', [312, 366, 952, 1263, 14, 2771, 181, 1261, 232, 1796])
('2, 2014 from natural causes at his home in', [353, 12, 3079, 345, 2916, 5259, 290, 309, 812, 203])
('los angeles. he was aged 89.', [2671, 4872, 14, 213, 242, 4537, 13973, 14])
('', [])
('st cross college, oxford', [249, 1979, 2159, 12, 6298])
('st. cross college is one of the colleges of', [249, 14, 1979, 2159, 229, 352, 202, 173, 10568, 202])
('the university of oxford. it is a graduate college', [173, 1407, 202, 6298, 14, 222, 229, 171, 8138, 2159])
('only. it does not have any under grad uate', [597, 14, 222, 682, 289, 292, 450, 666, 8604, 8259]

In [21]:
from torchtext.datasets import PennTreebank
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import dataset
from torch import nn, Tensor
import torch
from typing import Tuple

train_iter = PennTreebank(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]

    pv = data[:5]
    print(*[(k, len(k)) for k in pv], sep="\n")

    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# ``train_iter`` was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = PennTreebank()
#print([item for item in train_iter])
train_data = data_process(train_iter)
#val_data = data_process(val_iter)
#test_data = data_process(test_iter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    print("seq_len, bsz, data.size(0)", seq_len, bsz, data.size(0))
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape ``[seq_len, batch_size]``
#val_data = batchify(val_data, eval_batch_size)
#test_data = batchify(test_data, eval_batch_size)

print(train_data.shape)
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
    data, targets = get_batch(train_data, i)
    print("\n","\n")
    print(i, "\n")
    print(data.shape, targets.shape, "\n")

    print(data, targets)
    if batch > 0:
        break

(tensor([9892, 9893, 9894, 9896, 9897, 9898, 9902, 9903, 9904, 9905, 9906, 9908,
        9909, 9910, 9911, 9913, 9914, 9915, 9916, 9917, 9918, 9919, 9920, 9921]), 24)
(tensor([9187,    0,    2,   74,  395,   34, 2126,    1,  146,   20,    5, 9139,
         275,  410,    8,    2]), 16)
(tensor([  24,    8,    0,   14,  141,    3,    0,    2,    8, 2506,    8,    1,
        3070, 1595,   97]), 15)
(tensor([7627,    0,    2,   74,  395,    7,  339,  141,    3, 2466,  659, 2162,
         956,   25,  524,    5, 9139,  275,    3,   40,  303,  441, 3667]), 23)
(tensor([   5,  943,    3, 3137,  499,  263,    4,  138, 6053, 4223, 5997,   32,
         988,    5,  241,  762,    3, 1016, 2778,  211,    5,   97,    3,  434,
        4099,    4,   15,   46,   56,    2,   74,  195, 1244,  220]), 34)
seq_len, bsz, data.size(0) 46220 20 924412
torch.Size([46220, 20])

 

0 

torch.Size([35, 20]) torch.Size([700]) 

tensor([[9892,    4,   31,   16, 1925,  178,    3, 2295,    5,   42, 3412, 1611,
        

In [None]:
class DataBatchingForLanguageModeling:
    def __init__(self, batch_size, block_size, data=None, batching_type='random', tokenizer=None):
        """
        Initializes a DataBatcher object.

        Args:
            batch_size (int): The number of sequences in each batch.
            block_size (int): The length of each sequence in the batch.
            data (list, optional): The input data to be batched. Defaults to None.
            batching_type (str, optional): The type of batching to be performed. Can be 'random' or 'full'. Defaults to 'random'.
            tokenizer (object, optional): The tokenizer object to be used for tokenization. Defaults to None.
        """
        self.curriculum = None  # None or define a curriculum later, basically selecting ordering of data before batching
        self.batch_size = batch_size
        self.block_size = block_size
        self.max_batch_idx = len(data) // (batch_size * block_size)
        self.batch_idx = 0
        self.batching_type = batching_type  # Random or full

        #If batching type is full it can be implemented in a few different ways
        #1. Simplest way, chunk data into block size chunks and each chunk becomes a training example
        #2. Iterate over the data in a sequential manner, using a sliding window (Maybe using a jump size if needed? Hyperparameter?)
        #3. Use the example from GPTWee where they take every new line as a training example. Again, not sure how effective this is. (Also confirm the approach from implementation code)
        

        self.tokenizer = tokenizer
        self.data = data


    def get_batch(self):
        
        if self.batching_type == 'random':
            ix = self.get_random_batch_indices()

        elif self.batching_type == 'full':
            if self.batch_idx == self.max_batch_idx:
                self.batch_idx = 0
            ix = self.get_full_batch_indices()
            self.batch_idx += 1

        x,y = self.get_batch_from_indices(ix)
        return x, y


    def get_random_batch_indices(self):
        return torch.randint(len(self.data) - self.block_size, (self.batch_size,))
    
    def get_full_batch_indices(self):
        return torch.arange(self.batch_idx*self.batch_size*self.block_size, (self.batch_idx+1)*self.batch_size*self.block_size, self.block_size)

    def get_batch_from_indices(self, indices):
        x = torch.stack([torch.from_numpy((self.data[i:i+self.block_size]).astype(np.int64)) for i in indices])
        y = torch.stack([torch.from_numpy((self.data[i+1:i+1+self.block_size]).astype(np.int64)) for i in indices])
        if device_type == 'cuda':
            # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y


    def __len__(self):
        return len(self.data)

