# Preprocessing GLUE for fine-tuning

In [107]:
import itertools
import os
import pickle

import numpy as np
import tiktoken
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [27]:
def get_dataset(task):
    """Load one GLUE task and report the size of its label space.

    Args:
        task: GLUE configuration name forwarded to ``load_dataset``
            (for example ``'cola'``).

    Returns:
        A ``(dataset, num_labels)`` tuple. ``dataset`` is the object returned
        by ``load_dataset``. ``num_labels`` is the number of class names on
        the train split's ``label`` feature, or ``1`` when that feature
        carries no ``names`` (a single-valued regression target, e.g. a
        float label such as the one used by GLUE's stsb).
    """
    dataset = load_dataset("nyu-mll/glue", task)
    label_feature = dataset['train'].features['label']

    # Only categorical label features expose `.names`; a plain value feature
    # would raise AttributeError here, so treat it as one regression output.
    class_names = getattr(label_feature, 'names', None)
    num_labels = len(class_names) if class_names is not None else 1

    return dataset, num_labels

In [28]:
# Pull the CoLA task; `num_labels` is the label-space size reported by get_dataset.
dataset, num_labels = get_dataset(task='cola')

In [148]:
def least_power_of_two(n):
    """Return the smallest power of two that is greater than or equal to ``n``.

    Args:
        n: An integer.

    Returns:
        ``1`` for any ``n <= 1`` (2**0 already bounds it), otherwise the
        smallest ``2**k`` with ``2**k >= n``. Exact powers of two map to
        themselves.
    """
    # int.bit_length() ignores the sign, so (n - 1).bit_length() is only
    # meaningful for n >= 1; without this guard n == 0 would yield 2.
    if n <= 1:
        return 1
    return 1 << (n - 1).bit_length()

In [162]:
# GPT-2 byte-pair encoding from tiktoken, shared by the tokenization cells below.
tokenizer = tiktoken.get_encoding("gpt2")

# Padding reuses the id of the '<|endoftext|>' special token: encode the marker
# with special tokens enabled and keep the first id that comes back.
pad_token = tokenizer.encode(
    "<|endoftext|>",
    allowed_special="all",
)[0]

def tokenize_batch(sents):
    """Tokenize a batch of sentences into one right-padded integer matrix.

    Uses the module-level ``tokenizer`` and ``pad_token``.

    Args:
        sents: List of strings to encode. Special-token markup inside the
            text is honoured (``allowed_special='all'``).

    Returns:
        ``np.ndarray`` of shape ``(len(sents), longest + 1)``, where
        ``longest`` is the largest token count in the batch. Each row holds
        the sentence's token ids, then ``pad_token`` up to ``longest``, and
        finally ``tokenizer.eot_token`` in the last column. An empty batch
        gives shape ``(0, 1)``; a batch whose sentences all encode to zero
        tokens gives one end-of-text column per row.
    """
    tokens = tokenizer.encode_batch(sents, allowed_special='all')
    longest = max((len(ids) for ids in tokens), default=0)

    # Pre-fill with padding so ragged rows only need their prefix written;
    # the extra trailing column is reserved for the end-of-text id.
    batch = np.full((len(tokens), longest + 1), pad_token)
    batch[:, -1] = tokenizer.eot_token

    for row, ids in enumerate(tokens):
        if ids:
            batch[row, :len(ids)] = ids
    return batch

In [163]:
# Sanity check on a ragged batch: shorter rows should be right-padded and
# every row should end with the end-of-text id.
tokenize_batch([
    'the dog',
    'went',
    'to school',
    'today in the store',
])

array([[ 1169,  3290, 50256, 50256, 50256],
       [19963, 50256, 50256, 50256, 50256],
       [ 1462,  1524, 50256, 50256, 50256],
       [40838,   287,   262,  3650, 50256]])

In [135]:
# Small shuffled loader over the raw train split, used to eyeball examples.
batch_size = 4
n_batches = 6  # NOTE(review): not referenced by any code visible in this notebook
dataloader = DataLoader(
    dataset['train'],
    shuffle=True,
    batch_size=batch_size,
)

In [171]:
# Print the first batch of two separate passes over the loader. Each pass
# builds a fresh iterator, and the loader shuffles, so the two batches differ.
for _pass in range(2):
    # islice(…, 1) takes at most one batch and prints nothing if the loader is empty.
    for batch in itertools.islice(dataloader, 1):
        print(batch)

{'sentence': ['When it rains harder, how much faster a flow that appears in the river?', 'John seems to be easy to fool Ben.', 'Emma gave bad advice to Harriet.', 'John drinks coffee at 11, and Mary, tea at 10:30.'], 'label': tensor([0, 0, 1, 0]), 'idx': tensor([ 127, 4280, 6696, 7093])}
{'sentence': ['That a review came out yesterday of this article is catastrophic.', 'the book of poems with a red cover by Robert Burns from Blackwell takes a very long time to read.', 'To train his horse would be desirable.', 'we need to provide two trees and.'], 'label': tensor([1, 1, 1, 1]), 'idx': tensor([1490, 5788, 4070, 7476])}


In [43]:
def process(example):
    """Tokenize one row for ``dataset.map``.

    NOTE(review): no ``process`` definition is visible elsewhere in this
    notebook; this one was reconstructed from the recorded example record
    (an ``ids`` list ending in 50256 plus ``len == len(ids)``) — confirm it
    matches the function that originally produced that output.

    Args:
        example: Mapping with a ``'sentence'`` string.

    Returns:
        Dict with ``'ids'`` (token ids from the module-level ``tokenizer``
        followed by its end-of-text id) and ``'len'`` (length of ``ids``).
    """
    # encode_ordinary treats any special-token markup in the text as plain text.
    ids = tokenizer.encode_ordinary(example['sentence'])
    ids.append(tokenizer.eot_token)
    return {'ids': ids, 'len': len(ids)}


# Tokenize every split; the raw text column is dropped once ids are computed.
tokenized = dataset.map(
    process,
    remove_columns=['sentence'],
    desc="tokenizing the splits"
)

tokenizing the splits:   0%|          | 0/8551 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/1043 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [44]:
# Write each tokenized split to `<split>.bin` as one flat uint16 token stream
# (the `ids` of all rows concatenated in order; other columns are not written).

# `__file__` does not exist inside a notebook kernel; fall back to the current
# working directory there and keep the script-relative location otherwise.
out_dir = os.path.dirname(__file__) if '__file__' in globals() else os.getcwd()

for split, dset in tokenized.items():
    if len(dset) == 0:
        # Nothing to write, and a zero-length memmap cannot be created.
        continue

    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = os.path.join(out_dir, f'{split}.bin')
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never ask for more shards than rows: an empty shard would make
    # np.concatenate fail on splits with fewer than 1024 examples.
    total_batches = min(1024, len(dset))

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    arr.flush()

{'label': 1,
 'idx': 0,
 'ids': [5122,
  2460,
  1839,
  470,
  2822,
  428,
  3781,
  11,
  1309,
  3436,
  262,
  1306,
  530,
  356,
  18077,
  13,
  50256],
 'len': 17}