# Setup

In [None]:
# Standard-library helpers: importlib to hot-reload local modules, os/sys to
# extend the module search path
import importlib
import os
import sys

# Make the parent directory importable so the project's `utils` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries
# to sys.path.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Import the utilities and the dataloader
from utils import saffuutil

# Reload so edits made to the module on disk are picked up without restarting
# the kernel
importlib.reload(saffuutil)

# Import the function required from utils (after the reload, so the freshly
# loaded definition is the one bound here)
from utils.saffuutil import get_saffutok_data

# SAFFU Tokenizer

## Pre-Prof Adapted

The code in this section is custom and was written before the attempt to adapt/emulate the professor's code in the `saffu` folder (pulled from the professor's repo).

In [None]:
# Set the directory for the tokenizer data and call func
# NOTE(review): this reads from "all_train", while the tokenizer further down is
# named after data_set = "500_train" — confirm the naming mismatch is intended.
tok_dir = "../data/all_train/"
# The result is later passed to tokenizer.pretokenize_documents(...) and reused
# as `docs` for the initializer, so its structure must suit both consumers.
tok_train_data = get_saffutok_data(dir_path=tok_dir, threads=8)

Tokenizing files:  60%|██████    | 1177/1960 [00:40<00:15, 49.19it/s]

Error processing file ../data/all_train/AeMBR_LCI_Cost_9-9-15.xls: cannot convert float infinity to integer


Tokenizing files:  93%|█████████▎| 1823/1960 [00:59<00:02, 47.14it/s]



Tokenizing files: 100%|██████████| 1960/1960 [01:12<00:00, 26.98it/s]


Files/Tokens: 1960/5479125


## Post-Adaptation

### Imports

In [None]:
# region = "Abandoned custom import, workout with prof"
# # Imports that are diff since we import only the func needed
# from saffu import utilities_saffu
# importlib.reload(utilities_saffu)
# # For tokenizer setup - [5] ipynb
# from saffu.utilities_saffu import get_config
# endregion

# Execute the necessary scripts to set up the environment.
# Each script is run directly in this notebook's namespace, in the order listed
# (later scripts may rely on names defined by earlier ones, so keep the order).
_saffu_scripts = (
    "../src/configuration_saffu.py",
    "../src/tokenization_saffu.py",
    "../src/utilities_saffu.py",
    "../src/data_saffu.py",
    "../src/modeling_saffu.py",
    "../src/training_saffu.py",
    "../src/inference_saffu.py",
    "../src/initialization_saffu.py",
    "../src/load_data.py",
)
for _script_path in _saffu_scripts:
    # `with` closes the file handle deterministically (the bare open().read()
    # form left it to the garbage collector).
    with open(_script_path) as _script_file:
        # Compiling with the real path makes tracebacks point at the script
        # file instead of an anonymous "<string>".
        exec(compile(_script_file.read(), _script_path, "exec"))

### Setup Vars

In [None]:
# Model sizes accepted by get_config
model_sizes = ["micro", "tiny", "small", "medium", "big"]

# Custom name for the data set; only used here to build file/tokenizer names
data_set = "500_train"

# Directory where the tokenizer cache (vocabulary files) is stored
tokenizer_directory = "./cache/"

# Pick a model size from the list (index 0 -> "micro")
model_size = model_sizes[0]

# Get the config for the current model size
config = get_config(model_size=model_size)

# Set the tokenizer name and create the tokenizer
tokenizer_name = f"{data_set}-{model_size}"
tokenizer = SAFFUTokenizer(config)

# Path of the cached vocabulary file:
#   <tokenizer_directory>/<tokenizer._model_path>/<tokenizer_name>-vocab.json
# (the "<tokenizer_name>-" prefix is dropped when tokenizer_name is empty)
vocab_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    (tokenizer_name + "-" if tokenizer_name else "") + "vocab.json",
)

# When True, the next cell retrains the tokenizer even if vocab_file exists
reload = False

In [None]:
# Retrain when explicitly requested (reload) or when no cached vocabulary exists;
# otherwise reuse the cached vocabulary.
if reload or not os.path.exists(vocab_file):

    ## Progress message
    print(f"Training tokenizer: {tokenizer_name}")

    # Pretokenize the raw training documents, then fit the tokenizer on them
    pretokenized_data = tokenizer.pretokenize_documents(tok_train_data)
    tokenizer.train(pretokenized_data)

    # Persist the vocabulary so the next run can take the load branch
    tokenizer.save_vocabulary(tokenizer_name, save_directory=tokenizer_directory)

else:

    ## Progress message
    print(f"Loading tokenizer: {tokenizer_name}")

    # Restore the tokenizer from the cache directory
    result = tokenizer.load(tokenizer_name, load_directory=tokenizer_directory)

# Report the vocabulary size, whichever branch ran
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

Loading tokenizer: 500_train-micro
Vocabulary size for experiment:  4034


In [None]:
# Sanity-check the tokenizer that was just loaded/trained
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

# Name of the data file for this tokenizer; the config fields (_space, _r, _b,
# _heads, _N) are encoded in the file name. The initializer cell later strips
# the ".json" suffix from this path to build its model-file prefix.
data_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    (tokenizer_name + "-" if tokenizer_name else "")
    + f"data-space_{tokenizer.config._space}-r_{tokenizer.config._r}-b_{tokenizer.config._b}-heads_{tokenizer.config._heads}-N_{tokenizer.config._N}.json",
)

# Tokenize a sentence
print(tokenizer._tokenize("These casseroles disgust Kayla."))

# Check that every produced token is present in the vocabulary
# (one boolean per token, in token order)
print(
    [
        x in tokenizer._vocabulary
        for x in tokenizer._tokenize("These casseroles disgust Kayla.")
    ]
)

Vocabulary size for experiment:  4034
['Th', 'es', 'e', ' c', 'as', 's', 'er', 'ol', 'es', ' d', 'is', 'gu', 'st', ' K', 'ay', 'la', '.']
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]


In [None]:
from utils.saffuutil import get_saffu_tensors
from utils.selfutil import get_fileList

# Files
# get_fileList returns a pair; only the first element (the list of file paths
# displayed below) is kept.
# NOTE(review): binding the discarded element to `_` also overwrites IPython's
# "last output" variable of the same name — harmless here, but worth knowing.
train_files, _ = get_fileList("../data/500_train/")
# Bare last expression -> the notebook displays the whole list
train_files

['../data/500_train/retention-in-kindergarten.xlsx',
 '../data/500_train/meta%20deta%20set.xlsx',
 '../data/500_train/michelle_lokay_000_1_2_1.pst.263.xls',
 '../data/500_train/wais2012co2.xls',
 '../data/500_train/Table%20OTU%20within%20treatment%20SIMPER.xlsx',
 '../data/500_train/NPL%20Contaminants%2008%2025%202017.xlsx',
 '../data/500_train/SwintekEtAlSupplementaryData.xlsx',
 '../data/500_train/Vendor%20Study%20Metadata%20for%20Rapid.xlsx',
 '../data/500_train/appendix1.xls',
 '../data/500_train/darron_c_giron_000_1_1_1.pst.324.xls',
 '../data/500_train/darron_c_giron_002_1_1_1.pst.145.xls',
 '../data/500_train/Test%204.xlsx',
 '../data/500_train/Regeneration%20study%20GG2-90.xlsx',
 '../data/500_train/Figure2b_Right_Daytime%20East%20Dominated%20Wind%20Net%20NO2%20over%205%20ppb.xlsx',
 '../data/500_train/darrell_schoolcraft_000_1_1_1.pst.469.xls',
 '../data/500_train/Ohio%20County%20WV%20oil%20and%20gas%20mobile%20study%20dataset%20and%20data%20dictionary.xlsx',
 '../data/500_tra

In [None]:
import importlib
import os
from typing import List

import torch
from tqdm import tqdm

from utils import parseutil, saffuutil

importlib.reload(saffuutil)
importlib.reload(parseutil)

from utils.saffuutil import get_saffu_tensors


class LoaderSAFFU(torch.utils.data.Dataset):
    """In-memory dataset of per-file tensors produced by ``get_saffu_tensors``.

    All files are featurized eagerly in ``__init__``. Files whose featurization
    raises are recorded in ``failed_files`` and left out of the dataset.

    Attributes:
        x_tok, x_masks, y_tok: parallel lists, one entry per successfully
            featurized file, holding the three values returned by
            ``get_saffu_tensors``.
        file_paths: paths of the successfully featurized files (same order).
        failed_files: paths whose featurization failed.
    """

    def __init__(
        self,
        file_paths: List[str],
        tokenizer,
        max_rows: int = 100,
        max_cols: int = 100,
        pad_length: int = 32,
    ):
        """Featurize every path in ``file_paths`` and print a summary line.

        Args:
            file_paths: spreadsheet files to featurize.
            tokenizer: forwarded unchanged to ``get_saffu_tensors``.
            max_rows: forwarded unchanged to ``get_saffu_tensors``.
            max_cols: forwarded unchanged to ``get_saffu_tensors``.
            pad_length: forwarded unchanged to ``get_saffu_tensors``.
        """
        # Featurization settings
        self.tokenizer = tokenizer
        self.pad_length = pad_length
        self.max_cols = max_cols
        self.max_rows = max_rows

        # Per-file results (parallel lists) and bookkeeping
        self.x_tok, self.x_masks, self.y_tok = [], [], []
        self.file_paths, self.failed_files = [], []

        for path in tqdm(file_paths, desc="Processing files"):
            tokens, masks, labels = self.featurize(path)

            # featurize() signals failure with a (None, None, None) triple
            if tokens is None:
                self.failed_files.append(path)
                continue

            self.x_tok.append(tokens)
            self.x_masks.append(masks)
            self.y_tok.append(labels)
            self.file_paths.append(path)

        # Summary: processed = good + errored
        good = len(self.file_paths)
        errored = len(self.failed_files)
        print(f"\n{good + errored}(P) = {good}(G) + {errored}(E)")

    def featurize(self, file_path):
        """Featurize one spreadsheet file.

        Returns whatever ``get_saffu_tensors`` returns; if it raises, the error
        is printed and ``(None, None, None)`` is returned instead.
        """
        try:
            tensors = get_saffu_tensors(
                file_path,
                max_rows=self.max_rows,
                max_cols=self.max_cols,
                pad_length=self.pad_length,
                tokenizer=self.tokenizer,
            )
        except Exception as err:
            # Best effort: report and let the caller skip this file
            print(f"Featurization failed for {file_path}: {err}")
            return None, None, None
        return tensors

    def __len__(self):
        """Number of successfully featurized files."""
        return len(self.x_tok)

    def __getitem__(self, index):
        """Return the tensors and source path stored at ``index`` as a dict."""
        sample = {}
        sample["x_tok"] = self.x_tok[index]
        sample["x_masks"] = self.x_masks[index]
        sample["y_tok"] = self.y_tok[index]
        sample["file_paths"] = self.file_paths[index]
        return sample

    def get_imbalance(self) -> float:
        """Return the ratio of non-bold to bold cells over all ``y_tok`` tensors.

        A cell counts as bold when index 6 of the last axis equals 1 and as
        non-bold when it equals 0.

        Raises:
            ValueError: if no bold cell exists in the dataset.
        """

        def count_cells(flag):
            # Total number of cells whose index-6 entry equals `flag`
            return sum(
                (labels[:, :, 6] == flag).sum().item() for labels in self.y_tok
            )

        bold_cells = count_cells(1)

        # Without bold cells the ratio is undefined
        if not bold_cells:
            raise ValueError(
                "No bold cells found in the dataset. Cannot calculate imbalance ratio."
            )

        return count_cells(0) / bold_cells

In [None]:
# Build the in-memory dataset. Construction is eager: LoaderSAFFU.__init__
# featurizes every file in train_files before returning (the recorded run
# below took about 4.5 minutes for 400 files).
train_loader = LoaderSAFFU(
    file_paths=train_files,
    tokenizer=tokenizer,
    max_rows=100,  # Optional, defaults to 100
    max_cols=100,  # Optional, defaults to 100
    pad_length=32,  # Optional, defaults to 32
)

Processing files:  94%|█████████▍| 377/400 [04:04<00:22,  1.02it/s]



Processing files: 100%|██████████| 400/400 [04:27<00:00,  1.50it/s]


400(P) = 400(G) + 0(E)





In [None]:
# Cells to inspect, as zero-based (row, col) pairs. The trailing comment gives
# the matching spreadsheet A1 reference (col 0 -> A, row 0 -> 1) and what the
# cell is expected to contain.
# NOTE(review): the descriptions presumably refer to one hand-built test
# workbook — confirm which file they describe.
positions = [
    (0, 0),  # A1: header with slashes (Val/1)
    (1, 2),  # C2: "test" (plain text)
    (6, 2),  # C7: scientific notation
    (5, 0),  # A6: purple fill (styled number)
    (7, 3),  # D8: numeric + formatting
    (2, 4),  # E3: float (36.285)
    (18, 4),  # E19
    (17, 4),  # E18
    (3, 5),  # F4: float with decimals (43.519)
    (4, 6),  # G5: "Test2" (text)
    (6, 6),  # G7: "Test3" (string next to number)
    (13, 7),  # H14: orange cell with "Hey"
    (14, 7),  # H15: red-bordered time cell
    (21, 0),  # A22: DateTime (italic)
    (24, 1),  # B25: DateTime with minutes
    (27, 6),  # G28: "al" (short text)
    (28, 7),  # H29: "Click Here" (string hyperlink)
    (27, 7),  # H28: number as text?
    (20, 6),  # G21: "#NAME?" (Excel error)
    (21, 8),  # I22: float (numeric cell)
    (14, 8),  # I15: styled float with currency
    (25, 8),  # I26: styled number (footer-aligned)
]


def inspect_cell(x_tok, x_masks, y_tok, row, col, tokenizer):
    """Print a human-readable dump of one cell of a featurized sheet.

    Args:
        x_tok: indexable by ``[row, col]``; the selected entry is converted
            with ``.tolist()`` and treated as the cell's token ids.
        x_masks: indexed like ``x_tok``; printed as the attention mask.
        y_tok: indexed like ``x_tok``; printed as the metadata vector.
        row: row index of the cell.
        col: column index of the cell.
        tokenizer: object whose ``decode`` turns the token ids into text.
    """
    # Gather everything first so nothing is printed if a lookup fails
    ids = x_tok[row, col].tolist()
    text = tokenizer.decode(ids)
    attention = x_masks[row, col].tolist()
    metadata = y_tok[row, col].tolist()

    report = (
        f"\n🔹 Cell ({row}, {col})",
        f"→ Decoded Text     : {text}",
        f"→ Token IDs        : {ids}",
        f"→ Attention Mask   : {attention}",
        f"→ Metadata Vector  : {metadata}",
    )
    print("\n".join(report))


# Loop through chosen positions.
# `x_tok`, `x_masks` and `y_tok` were previously read as bare globals that no
# cell in this notebook defines, so this cell raised NameError on a fresh
# kernel (Restart & Run All). Bind them explicitly from the dataset instead.
# NOTE(review): sample 0 is an assumption — set `inspect_index` to the file
# that the comments in `positions` describe.
inspect_index = 0
inspect_sample = train_loader[inspect_index]
x_tok = inspect_sample["x_tok"]
x_masks = inspect_sample["x_masks"]
y_tok = inspect_sample["y_tok"]
print(f"Inspecting: {inspect_sample['file_paths']}")

for row, col in positions:
    inspect_cell(x_tok, x_masks, y_tok, row, col, tokenizer)

In [None]:
import torch

# Select the compute device. The original hard-coded "mps:0", which fails on
# any machine without Apple's Metal backend; keep MPS when it is available and
# fall back to the CPU otherwise.
# NOTE(review): the initializer cell further down hard-codes devstr = "mps" —
# keep the two in sync when running without MPS.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps:0")
else:
    device = torch.device("cpu")
    print(f"MPS backend unavailable; falling back to {device}")

# Build the encoder/decoder stack around the tokenizer and move it to the device
model = SAFFUDecoder(SAFFUEncoder(tokenizer)).to(device)

# Save the freshly built ("init" stage) model unless a state file for this
# data_set/model_size/stage already exists; reload=True forces the save.
# NOTE(review): the existence check presumably mirrors the path save_model
# writes to — confirm against save_model.
stage = "init"
reload = False
if reload or (
    not os.path.exists(f"./models_to_test/{data_set}-{model_size}-{stage}.state")
):
    save_model(model, data_set, model_size, stage)

In [None]:
# Data set selection; several corpora can be combined with "+".
# NOTE(review): this rebinds `data_set` and `model_size`, which the Setup Vars
# cell set to "500_train" and "micro" — confirm the override is intended.
data_set = "babylm_10M"  # babylm_10M+babylm_100M+BWB
model_size = "small"  # "medium" # "tiny" # "micro" # "big" #

# Size of each available corpus, in millions of word-tokens
training_sizes = dict(babylm_10M=10, babylm_100M=100, BWB=1000)

# dev data is 1/10 of total available
devsample = 10

# Combined size (millions of word-tokens) of the corpora named in data_set
dataset_size = sum(training_sizes[name] for name in data_set.split("+"))

# roughly 5 million word-tokens per split
downsample = dataset_size // 5

eta = 0.125  # 2**-3

# Empty containers for texts and documents (train / dev / test)
texts = []
dtexts = []
ttexts = []
docs = []
ddocs = []
tdocs = []

In [None]:
# Configure and run the SAFFU initializer on the tokenizer training data.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: 'int' object is not iterable" while "Initializing embedding
# matrix V" is at 0/10. The cause is not visible from this notebook —
# presumably the structure of `docs`/`ddocs` does not match what
# SAFFUInitializer.initialize expects; verify against its implementation.

# Exclude the encoder's _V weight from gradient updates
model.encoder._V.weight.requires_grad = False

seed = 691
ignore_space = False
ignore_case = False
initialize = True
verbose = True
co_vecs = 1 * (2**0 + 0.99999)  # evaluates to ~1.99999; int(co_vecs) == 1 in the file name below
identity_ratio = 2 ** (-1)  # 0.5
icf = True
log_label = False
nlabels = 1 * (2**0)  # 1
centroids = False
label_iterations = 1 * (2**7)  # None

max_epochs = 2**-1  # 0.5, i.e. a fractional value
patience = 0 * (2**0)  # 0
reload = False

# Custom
# These override values set in earlier cells: `docs`/`ddocs` were empty lists
# and `downsample` was derived from the data-set size.
devstr = "mps"
docs = tok_train_data
downsample = 10
ddocs = []

# Model-file prefix: data_file without its ".json" suffix, followed by the
# tokenizer-config fields and the run settings above.
initial_file = "".join(
    [
        data_file[:-5] + "-",
        f"b_{tokenizer.config._bits}-hb_{tokenizer.config._hidden}-",
        f"we_{int(tokenizer.config._wave_encode)}-oa_{tokenizer.config._o_agg}-ra_{tokenizer.config._r_agg}-ba_{tokenizer.config._b_agg}-",
        f"mr_{int(tokenizer.config._mask_r)}-mb_{int(tokenizer.config._mask_b)}-md_{tokenizer.config._model_documents}-",
        f"is_{int(ignore_space)}-ic_{int(ignore_case)}-ws_{int(initialize)}-wv_{int(co_vecs)}-ds_{downsample}-seed_{seed}",
    ]
)

# The first four arguments are positional; their order must match the
# SAFFUInitializer signature (defined outside this notebook).
initializer = SAFFUInitializer(
    ignore_case,
    ignore_space,
    devstr,
    co_vecs,
    identity_ratio=identity_ratio,
    label_iterations=label_iterations,
    log_label=log_label,
    nlabels=nlabels,
    centroids=centroids,
    icf=icf,
)
# Run the initialization on `model`; `devsample` comes from the data-set
# configuration cell.
eval_docs = initializer.initialize(
    model,
    docs,
    ddocs,
    downsample,
    seed,
    max_epochs,
    patience,
    devsample=devsample,
    model_file=initial_file,
    reload=reload,
    verbose=verbose,
)

Initializing embedding matrix V:   0%|          | 0/10 [00:00<?, ?it/s]

TypeError: 'int' object is not iterable