<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_05/blob/main/LM_training_dataset_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## mc4 pt dataset samples preparation

This notebook prepares the mC4 pt dataset samples for CLM training: it tokenizes them with the facebook/opt-125m tokenizer and splits the resulting token sequences into 512-token chunks.

The final dataset is also split into blocks of 10k samples, so that it can be processed with limited RAM.

This notebook is meant to run on Colab; however, for some unknown reason, it created very large files there and took much longer to finish than when executed on a desktop with 32GB of RAM.

In [2]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os

from google.colab import drive

from transformers import AutoTokenizer

from multiprocessing import Pool

import pickle

import gc

import glob

import torch

import time

import numpy as np

In [4]:
# Folder the notebook switches into (os.chdir) once Google Drive is mounted.
WORKING_FOLDER = "drive/MyDrive/unicamp/ia368v_dd/aula_05"

# Text file holding the raw samples; it is read line by line further below.
SAMPLES_FILENAME = "sample-1gb.txt"

In [5]:
# Checkpoint whose tokenizer is loaded with AutoTokenizer.from_pretrained.
MODEL_NAME = "facebook/opt-125m"

In [6]:
# Length, in tokens, of every chunk produced by this notebook; it is also the
# length the tokenizer pads shorter samples to.
TEXT_CHUNK_SIZE = 512

In [7]:
# Mount Google Drive so the working folder becomes reachable.
drive.mount('/content/drive', force_remount=True)

# Change directory through an absolute path: WORKING_FOLDER is relative to
# /content (the parent of the mount point), so a relative chdir would fail
# when this cell is executed a second time from inside the working folder.
os.chdir(os.path.join("/content", WORKING_FOLDER))

Mounted at /content/drive


### Download the mc4 pt samples

In [8]:
if not os.path.exists(SAMPLES_FILENAME):
    !gsutil cp gs://unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt .
else:
    print("Samples file already downloaded...")

Samples file already downloaded...


### Split the dataset in blocks to be able to handle it in limited RAM

In [9]:
# Number of samples (lines of the input file) grouped into one block.
SAMPLES_BLOCK_SIZE = 10_000

In [10]:
# Load the whole samples file into memory, one list entry per line.
# The encoding is given explicitly so the result does not depend on the
# platform default (the samples are assumed to be UTF-8 text - TODO confirm).
with open(SAMPLES_FILENAME, encoding="utf-8") as inputFile:
    lines = inputFile.readlines()

In [11]:
# Slice the samples into consecutive blocks of SAMPLES_BLOCK_SIZE lines.
# Stepping through the start offsets (instead of counting whole blocks with a
# floor division) keeps the final, shorter block, so no trailing samples are
# silently dropped when len(lines) is not a multiple of the block size.
blocked_samples = [
    lines[start:start + SAMPLES_BLOCK_SIZE]
    for start in range(0, len(lines), SAMPLES_BLOCK_SIZE)
]

In [12]:
# Sanity check: number of sample blocks that will be tokenized.
len(blocked_samples)

25

### Tokenize the samples in blocks to avoid memory exhaustion

The tokenized blocks are saved to disk.

In [13]:
# Load the tokenizer of the checkpoint named by MODEL_NAME; it is used as a
# global by tokenize_block.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [14]:
def tokenize_block(samples_block, block_index):
    """Tokenize one block of samples and save the result to disk.

    Uses the module-level ``tokenizer``. Samples shorter than TEXT_CHUNK_SIZE
    tokens are padded up to that length; no truncation is requested, so longer
    samples keep all their tokens.

    Args:
        samples_block: list of text samples to tokenize.
        block_index: index of the block, used in the progress message and in
            the output file name (``tokenized_samples_block_NN.pkl``).

    Returns:
        None. The tokenized block is written to the current directory.
    """

    print("Processing samples block {}...".format(block_index))

    tokenized_block = tokenizer(samples_block, padding='max_length', max_length=TEXT_CHUNK_SIZE)

    # Persist only the plain key -> lists mapping, which is all the following
    # stage indexes. Pickling the BatchEncoding object itself also serialises
    # the per-sample encoding objects it keeps for fast tokenizers, which makes
    # the files considerably larger.
    with open("tokenized_samples_block_{:02d}.pkl".format(block_index), "wb") as outputFile:
        pickle.dump(dict(tokenized_block), outputFile, pickle.HIGHEST_PROTOCOL)


In [None]:
start_time = time.time()

# Tokenize the blocks in parallel, one task per (block, index) pair. Each
# worker writes its own output file, so the list returned by starmap merely
# collects the return values of tokenize_block.
# (The samples file is not opened here: the blocks are already in memory.)
with Pool(processes=8) as pool:
    tokenized_samples = pool.starmap(tokenize_block, zip(blocked_samples, range(len(blocked_samples))))

print("Time to tokenize all the samples blocks: {} s...".format(time.time() - start_time))

In [None]:
# The raw text is no longer needed once the tokenized blocks are on disk;
# drop both references so the memory can be reclaimed.
del lines, blocked_samples

In [None]:
# Force a garbage collection pass now that the large lists were deleted.
gc.collect()

### Now process the sample blocks to split samples longer than the model input size

In [None]:
# Collect the tokenized block files found in the current directory.
tokenized_block_files = glob.glob("tokenized_samples_block*")

In [None]:
# glob gives no ordering guarantee; sort so the blocks are processed (and the
# output files numbered) in file-name order.
tokenized_block_files = sorted(tokenized_block_files)

In [None]:
# A tail (tokens left after the last full chunk) is only kept as an extra,
# padded sample when it has more tokens than this threshold.
MIN_TAIL_TOKENS = 100

# Fill value for the padded part of a tail chunk; taken from the tokenizer so
# it matches the padding the tokenizer itself applied to short samples.
pad_token_id = tokenizer.pad_token_id

for k, block_filename in enumerate(tokenized_block_files):

    all_tokenized_samples = []

    print("Reading file {}...".format(block_filename))

    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this notebook.
    with open(block_filename, 'rb') as inputFile:
        block_data = pickle.load(inputFile)

    for input_ids, attention_mask in zip(block_data['input_ids'], block_data['attention_mask']):

        # Offset where the last complete TEXT_CHUNK_SIZE-token chunk ends.
        full_chunks_end = len(input_ids) - len(input_ids) % TEXT_CHUNK_SIZE

        # Emit every complete chunk of the sample as its own training sample.
        for start in range(0, full_chunks_end, TEXT_CHUNK_SIZE):
            all_tokenized_samples.append({'input_ids': input_ids[start:start + TEXT_CHUNK_SIZE],
                                          'attention_mask': attention_mask[start:start + TEXT_CHUNK_SIZE]})

        remaining_tokens = len(input_ids) - full_chunks_end

        #
        # Check if the remaining tokens are worth creating a new smaller sample...
        #

        if remaining_tokens > MIN_TAIL_TOKENS:
            padding_length = TEXT_CHUNK_SIZE - remaining_tokens

            # Build the padded tail from plain Python ints, like the full
            # chunks above; lists of numpy scalars would mix element types
            # across samples and pickle far less compactly.
            all_tokenized_samples.append({'input_ids': list(input_ids[full_chunks_end:]) + [pad_token_id] * padding_length,
                                          'attention_mask': [1] * remaining_tokens + [0] * padding_length})

    # One output file per input block, numbered by its position in the sorted list.
    with open("normalized_samples_block_{:02d}.pkl".format(k), "wb") as outputFile:
        pickle.dump(all_tokenized_samples, outputFile, pickle.HIGHEST_PROTOCOL)

Reading file tokenized_samples_block_00.pkl...
Reading file tokenized_samples_block_01.pkl...
Reading file tokenized_samples_block_02.pkl...
Reading file tokenized_samples_block_03.pkl...
Reading file tokenized_samples_block_04.pkl...
Reading file tokenized_samples_block_05.pkl...
Reading file tokenized_samples_block_06.pkl...
Reading file tokenized_samples_block_07.pkl...
Reading file tokenized_samples_block_08.pkl...
Reading file tokenized_samples_block_09.pkl...
Reading file tokenized_samples_block_10.pkl...
Reading file tokenized_samples_block_11.pkl...
Reading file tokenized_samples_block_12.pkl...
Reading file tokenized_samples_block_13.pkl...
Reading file tokenized_samples_block_14.pkl...
Reading file tokenized_samples_block_15.pkl...
Reading file tokenized_samples_block_16.pkl...
Reading file tokenized_samples_block_17.pkl...
Reading file tokenized_samples_block_18.pkl...
Reading file tokenized_samples_block_19.pkl...
Reading file tokenized_samples_block_20.pkl...
Reading file 