# Setup

In [None]:
# Standard-library helpers: importlib to reload modules, os and sys to extend the import path
import importlib
import os
import sys

# Make the parent directory importable so the `utils` package below can be found.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the utilities module
from utils import saffuutil

# Reload it so that edits made on disk since the first import are picked up
importlib.reload(saffuutil)

# Import the function needed from the (reloaded) utilities module
from utils.saffuutil import get_saffutok_data

# SAFFU Tokenizer

## Pre-Prof Adapted

The code in this section is custom and was written before trying to adapt/emulate the professor's code in the `saffu` folder, which was pulled from the professor's repo.

In [None]:
# Directory handed to get_saffutok_data as the location of the tokenizer data
tok_dir = "../data/500_train/"
# Call get_saffutok_data on that directory with threads=8; the result is later
# passed to tokenizer.pretokenize_documents in the training cell
tok_train_data = get_saffutok_data(dir_path=tok_dir, threads=8)

Tokenizing files: 100%|██████████| 400/400 [00:12<00:00, 32.81file/s]


Files/Tokens: 400/1276661


## Post-Adaptation

### Imports

In [None]:
# region = "Abandoned custom import, workout with prof"
# # Imports that are diff since we import only the func needed
# from saffu import utilities_saffu
# importlib.reload(utilities_saffu)
# # For tokenizer setup - [5] ipynb
# from saffu.utilities_saffu import get_config
# endregion

# Execute the SAFFU source scripts in this notebook's namespace so that the
# names they define are available to later cells (presumably including
# get_config and SAFFUTokenizer, which are used below without an import).
#
# Compared with a bare exec(open(path).read()):
#   * each file is read inside a `with` block, so its handle is closed
#     immediately instead of being left to the garbage collector;
#   * the file is decoded as UTF-8 (Python's default source encoding) rather
#     than with the platform's locale encoding;
#   * the source is compiled with its path, so a traceback raised by one of
#     the scripts names the file it came from instead of "<string>".
#
# The order is significant and preserved: later scripts may rely on names
# defined by earlier ones.
_saffu_scripts = (
    "../src/configuration_saffu.py",
    "../src/tokenization_saffu.py",
    "../src/utilities_saffu.py",
    "../src/data_saffu.py",
    "../src/modeling_saffu.py",
    "../src/training_saffu.py",
    "../src/inference_saffu.py",
    "../src/initialization_saffu.py",
    "../src/load_data.py",
)
for _saffu_script in _saffu_scripts:
    with open(_saffu_script, encoding="utf-8") as _saffu_fh:
        _saffu_source = _saffu_fh.read()
    exec(compile(_saffu_source, _saffu_script, "exec"))

### Setup Vars

In [None]:
# Model sizes applicable for get_config
model_sizes = ["micro", "tiny", "small", "medium", "big"]

# Use the first size in the list for this run
model_size = model_sizes[0]

# Custom label for the dataset
data_set = "500_train"

# Directory in which the tokenizer cache is stored
tokenizer_directory = "./cache/"

# When True, the training cell retrains even if a cached vocab file exists
reload = False

# Name used when saving/loading the tokenizer
tokenizer_name = f"{data_set}-{model_size}"

# Fetch the config for the chosen size and build the tokenizer from it
config = get_config(model_size=model_size)
tokenizer = SAFFUTokenizer(config)

# Full path of the vocabulary file for this tokenizer
vocab_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    f"{tokenizer_name}-vocab.json" if tokenizer_name else "vocab.json",
)

### Actual Tokenizer Training

In [None]:
# Train when a reload is requested or when no vocab file has been saved yet
if reload or not os.path.exists(vocab_file):

    ## Progress message
    print(f"Training tokenizer: {tokenizer_name}")

    # Pretokenize the training data, then train the tokenizer on the result
    pretokenized_data = tokenizer.pretokenize_documents(tok_train_data)
    tokenizer.train(pretokenized_data)

    # Save the tokenizer vocabulary in the specified directory
    tokenizer.save_vocabulary(tokenizer_name, save_directory=tokenizer_directory)

# Otherwise (no reload requested and the vocab file exists) load the tokenizer
else:

    ## Progress message
    print(f"Loading tokenizer: {tokenizer_name}")

    # Load the tokenizer from the same directory
    result = tokenizer.load(tokenizer_name, load_directory=tokenizer_directory)

# In both cases, set the vocabulary on the tokenizer
tokenizer.set_vocabulary()

# Report the vocabulary size for the experiment
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

Training tokenizer: 500_train-micro


Pre-tokenizing 1276661 documents: 100%|██████████| 1276661/1276661 [00:15<00:00, 81312.28it/s]
Counting token frequencies: 100%|██████████| 1276661/1276661 [00:00<00:00, 5625587.63it/s]


Training bpe tokenizer

numbers of samples, pre-tokens, and target bpe pieces for covering of pre-tokens:  1276661 3053249 2048


Initializing: 100%|██████████| 1048576/1048576 [00:22<00:00, 47047.15it/s]
Fitting:  91%|█████████ | 1868/2048 [00:43<00:04, 43.10it/s] 


Built a vocabulary of 2048 types


Building sub-token reference dictionary: 100%|██████████| 1048576/1048576 [01:36<00:00, 10892.94it/s]


Portion of model's 1048576 reference tokens covered: 0.9997978210449219
Portion of model's 1048576 reference tokens covered: 0.9997978210449219
Vocabulary size for experiment:  4035


In [None]:
# Report the vocabulary size for the experiment
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

# Path of a data file under the tokenizer directory; its name embeds the
# tokenizer name and the config values _space, _r, _b, _heads and _N
data_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    (f"{tokenizer_name}-" if tokenizer_name else "")
    + f"data-space_{tokenizer.config._space}-r_{tokenizer.config._r}-b_{tokenizer.config._b}-heads_{tokenizer.config._heads}-N_{tokenizer.config._N}.json",
)

# Show the tokens produced for a sample sentence
print(tokenizer._tokenize("These casseroles disgust Kayla."))

# For every token of that sentence, show whether it is in the vocabulary
print(
    [
        token in tokenizer._vocabulary
        for token in tokenizer._tokenize("These casseroles disgust Kayla.")
    ]
)

Vocabulary size for experiment:  4035
['Th', 'es', 'e', ' c', 'as', 's', 'er', 'ol', 'es', ' d', 'is', 'gu', 'st', ' K', 'ay', 'la', '.']
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [20]:
# Encode a sample sentence and print the encoded result
sentence = "These casseroles paagal hai kya disgust Kayla."
# NOTE(review): `checl` looks like a typo for `check`; the name is left unchanged
# because other cells may read this kernel-global variable
checl = tokenizer.encode(sentence)
print(checl)
# Decode the encoded result again and print it to inspect the round trip
print(tokenizer.decode(checl))

[872, 180, 94, 300, 263, 90, 88, 207, 180, 1167, 733, 124, 1146, 171, 1390, 1787, 335, 287, 1438, 312, 426, 930, 206, 0]
Th es e  c as s er ol es  pa ag al  ha i  k ya  d is gu st  K ay la .


In [None]:
# Run tokenizer.preprocess on a shorter sample sentence and print what it returns
yes = tokenizer.preprocess("These paagal hai kya disgust Kayla.")
print(yes)

[872, 180, 94, 1167, 733, 124, 1146, 171, 1390, 1787, 335, 287, 1438, 312, 426, 930, 206, 0]
