In [1]:
import os
import pickle
from transformers import AutoTokenizer
import pandas as pd
import torch
from dotenv import load_dotenv

# Load environment variables via python-dotenv before anything reads os.environ
# (the Hugging Face login cell below reads HF_ACCESS_TOKEN from the environment).
load_dotenv()

True

In [6]:
## Set TOKENIZER_ROOT to the folder that contains the individual tokenizer folders (same instructions as before).
# Relevant tokenizer folder options:
#   babylm_full_bpe_8k      - Tokenizer for 10M models, vocab size 8k
#   babylm_full_bpe_100M_8k - Tokenizer for 100M models, vocab size 8k
# A model's relevant details can be found in the Model Table in the database (in rundata.xlsx).

TOKENIZER_ROOT = r"data"

def load_tokenizer(data_dir, tokenizer_root=None):
    """
    Load the tokenizer stored in one of the tokenizer folders.

    The folder must contain a ``meta.pkl`` file whose ``custom_tokenizer``
    entry is truthy; the tokenizer is then read from that same folder with
    ``AutoTokenizer.from_pretrained(..., use_fast=False)``.

    Args:
        data_dir (str): Name of the tokenizer folder, relative to the
            tokenizer root (e.g. "babylm_full_bpe_100M_8k").
        tokenizer_root (str, optional): Folder that contains the tokenizer
            folders. When omitted, the module-level ``TOKENIZER_ROOT`` is used.

    Returns:
        tokenizer: The loaded tokenizer. An EOS token and a pad token are
            set if they were missing, and ``padding_side`` is set to "left".

    Raises:
        NotImplementedError: If ``meta.pkl`` is missing from the tokenizer
            folder, or if it does not describe a custom tokenizer
            (stoi/itos vocabularies are not supported).

    """
    root = TOKENIZER_ROOT if tokenizer_root is None else tokenizer_root
    tokenizer_dir = os.path.join(root, data_dir)
    meta_path = os.path.join(tokenizer_dir, "meta.pkl")

    if not os.path.exists(meta_path):
        # Report the exact path so a wrong root or folder name is easy to spot.
        message = f"No meta.pkl found at {meta_path}"
        print(message)
        raise NotImplementedError(message)

    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point this function at tokenizer folders from a trusted source.
    with open(meta_path, "rb") as f:
        meta = pickle.load(f)

    if not meta.get("custom_tokenizer", False):
        if meta.get("stoi", False):
            raise NotImplementedError("stoi/itos not supported yet")
        raise NotImplementedError("No stoi/itos found")

    print(f"Loading custom tokenizer from {tokenizer_dir}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False)

    # NOTE(review): if "</s>" is not already in the vocabulary this registers a
    # new token id - confirm the model's embedding table covers it.
    if not tokenizer.eos_token:
        tokenizer.add_special_tokens({"eos_token": "</s>"})
    # Reuse EOS for padding when the tokenizer defines no pad token.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left"  # Add if needed?
    return tokenizer

    


In [None]:
# The model repository is private: authenticate with the Hugging Face Hub using the
# access token stored in the HF_ACCESS_TOKEN environment variable (shared separately).
from huggingface_hub import login as hf_login

hf_login(token=os.getenv("HF_ACCESS_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/abishekthamma/.cache/huggingface/token
Login successful


In [8]:
# To find a model name, open rundata.xlsx and look at the "output_folder_name"
# column of the "Run Details" sheet.
model_details_df = pd.read_excel(
    "results/rundata.xlsx",
    sheet_name="Run Details",
)

model_details_df.head()

Unnamed: 0,run_id,output_folder_name,n_layer,n_head,block_size,n_embd,batch_size,learning_rate,seed,masking,...,blimp_exists,wandb_exists,reading_time_exists,model_surprisal_data_exists,runtime,rough_sbu_estimate,epochs,wandb_runid,num_iterations,gradient_accumulation_steps
0,5444724,out-babylm_full_bpe-4x4-nomask-5444724,4,4,128,256,32.0,0.001,1337,False,...,True,True,True,True,0:15:38,155,100.824615,5o1o8tm8,44000.0,8.0
1,5445338,out-babylm_wocdes_full_bpe-4x4-nomask-5445338,4,4,128,256,32.0,0.0005,1337,False,...,True,True,True,True,0:15:32,155,100.824615,hffttavi,44000.0,8.0
2,5492054,out-babylm_full_bpe-8x8-nomask-5492054,8,8,512,512,32.0,0.0005,1337,False,...,True,True,True,True,1:03:36,559,403.298462,ddnlzu6p,44000.0,8.0
3,5492134,out-babylm_full_bpe-6x6-nomask-5492134,6,6,256,384,32.0,0.0005,1337,False,...,True,True,True,True,0:23:14,227,201.649231,ub67nse1,44000.0,8.0
4,5496426,out-babylm_full_bpe_8k-8x8-nomask-5496426,8,8,512,512,32.0,0.0005,1337,False,...,True,True,True,True,0:56:01,524,403.298462,7paxvcyz,44000.0,8.0


In [9]:
# Collect the repository ids that the "fmtmodels" author has on the Hugging Face Hub.
from huggingface_hub import list_models

hf_models_list = [
    repo_info.id
    for repo_info in list_models(author="fmtmodels")
]
hf_models_list[:10]

['fmtmodels/out-babylm_full_bpe_8k-6x6-mask_ee2000_em01-6849723',
 'fmtmodels/out-babylm_full_bpe_8k-6x6-mask_ee004_em10-6683311',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465077',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465082',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465084',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465085',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465086',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465087',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465089',
 'fmtmodels/out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465090']

#### NOTE
- Use `hf_models_list` and `model_details_df` together to find the relevant model name.
- Not all models have been pushed to the Hub yet.

In [10]:
# The checkpoint is fetched from the Hub repository "<HF_REPO_ROOT>/<model_name>".
HF_REPO_ROOT = "fmtmodels"
model_name = "out-babylm_full_bpe_100M_8k-6x6-mask_ee002_em10-8465084"

# AutoModel currently doesn't work for these checkpoints, so import the GPT class
# from model_HF and call its own from_pretrained method instead.
from model_HF import GPT

model = GPT.from_pretrained("/".join((HF_REPO_ROOT, model_name)))

Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
number of parameters: 13.69M


In [11]:
# Sample usage (only a small difference from the previous script).
from rich import print  # NOTE: replaces the built-in print from this cell onwards

# Use the GPU when one is available; otherwise fall back to the CPU so the
# example still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = load_tokenizer("babylm_full_bpe_100M_8k")  # This model requires the 100M tokenizer
model.eval()
model.to(device)

sample_sentence = "The quick brown fox jumps over the lazy dog"
input_ids = tokenizer.encode(sample_sentence, return_tensors="pt").to(device)
print("Input IDs Shape: ", input_ids.shape)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    # If you want Loss, pass expected tokens with target = tokens_to_predict
    outputs_m1 = model(input_ids, hidden_states=True)


print("Logits: ", outputs_m1["logits"].shape)
print("Loss", outputs_m1["loss"])
print("Hidden States: ", [{f"Layer {i}": outputs_m1["hidden_states"][i].shape} for i in range(len(outputs_m1["hidden_states"]))])