In [8]:
import os
import sys
import torch
import pickle
from transformers import GPT2Tokenizer, AutoTokenizer
import numpy as np

In [9]:
model_name_list = ['out-babylm_full_bpe_8k-6x6-mask_log001-6617787',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047459_s42',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047460_s2347',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047461_s9',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047462_s616',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047464_s46674',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047466_s6747',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047467_s869',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047468_s466',
 'out-babylm_full_bpe_8k-6x6-nomask-curr_log-7047469_s11111']

tokenizers_root = r"/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data"
out_root = r'/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump'

data_folder = r'babylm_full_bpe_8k'
model_name = 'out-babylm_full_bpe_8k-6x6-mask_lin-5734459_s1337'

#model_name = model_list[0]

out_dir = os.path.join(out_root, model_name)
data_dir = os.path.join(tokenizers_root, data_folder)

device = "cuda"

In [15]:
def load_model(out_dir, device):
    """
    Loads a pre-trained GPT model from a checkpoint file.

    Args:
        out_dir (str): The directory where the checkpoint file is located.
        device (torch.device): The device to load the model onto.

    Returns:
        GPT: The loaded GPT model.

    Raises:
        FileNotFoundError: If the checkpoint file is not found.
    """
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    print(f"Loading model from {ckpt_path}")
    #NANOGPT_ROOT = str(Path(__file__).parents[4])
    NANOGPT_ROOT = r'/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT' #Edit later to be dynamic
    sys.path.append(NANOGPT_ROOT)
    from model import GPT, GPTConfig

    checkpoint = torch.load(ckpt_path, map_location=device)

    # Backward compatibility for new model args for QKV and FFW Adjustments
    if checkpoint["model_args"].get("wm_decay_length", None) is None:
        #wm_decay_length = block_size
        checkpoint["model_args"]["wm_decay_length"] = checkpoint["model_args"]["block_size"]
    # Setting head size as 3 times n_embd if not set already
    if checkpoint['model_args'].get('head_size_qkv', None) is None:
        checkpoint['model_args']['head_size_qkv'] = checkpoint['model_args']['n_embd']

    if checkpoint["model_args"].get("ffw_dim", None) is None:
        checkpoint["model_args"]["ffw_dim"] = 4 * checkpoint["model_args"]["n_embd"]

    #print(checkpoint['model_args'])
    gptconf = GPTConfig(**checkpoint['model_args'])

    load_model = GPT(gptconf)

    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

    load_model.load_state_dict(state_dict)
    load_model.eval()

    load_model = load_model.to(device)

    return load_model

def load_tokenizer(data_dir):
    """
    Load tokenizer for natural stories evaluation.

    Args:
        data_dir (str): The directory path where the tokenizer data is stored.

    Returns:
        tokenizer (Tokenizer): The loaded tokenizer object.

    Raises:
        NotImplementedError: If stoi/itos is not supported or found.

    """
    meta_path = os.path.join(data_dir, 'meta.pkl')
    load_meta = os.path.exists(meta_path)
    if load_meta:
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
        if meta.get("custom_tokenizer", False):
            print(f"Loading custom tokenizer from {data_dir}")
            tokenizer = AutoTokenizer.from_pretrained(data_dir, use_fast=False)
        else:
            if meta.get("stoi", False):
                raise NotImplementedError("stoi/itos not supported yet")
            else:
                raise NotImplementedError("No stoi/itos found")
    else:
        print("No meta.pkl found, using default GPT-2 tokenizer")
        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

    if not tokenizer.eos_token:
        tokenizer.add_special_tokens({"eos_token": "</s>"})
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left" #Add if needed?
    return tokenizer

def load_model_tokenizer(out_dir, data_dir, device):
    model = load_model(out_dir, device)
    tokenizer = load_tokenizer(data_dir)
    return model, tokenizer

def load_dataset(data_dir, val_only=True):
    
    val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    if val_only:
        return None, val_data
    else:
        train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
        return train_data, val_data

def get_perplexity(model, data, device="cuda", stride=1):

    """
    Calculate the perplexity of a given dataset.

    Args:
        model (GPT): The GPT model to evaluate.
        data (np.memmap): The dataset to evaluate.
        device (str): The device to run the model on.
        stride (int): The stride to use when evaluating the model.

    Returns:
        float: The perplexity of the dataset.

    """
    model.eval()
    model = model.to(device)
    nlls = [] #Negative log likelihoods
    with torch.no_grad():
        for i in range(0, data.shape[0] - 1, stride):
            x = torch.tensor(data[i:i + stride]).to(device)
            y = torch.tensor(data[i + 1:i + stride + 1]).to(device)
            logits, _ = model(x)
            nlls.append(torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1), reduction='none').cpu().numpy())
    nlls = np.concatenate(nlls)
    return np.exp(np.mean(nlls))
    

    
model, tokenizer = load_model_tokenizer(out_dir, data_dir, device)

    


Loading model from /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump/out-babylm_full_bpe_8k-6x6-mask_lin-5734459_s1337/ckpt.pt
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
number of parameters: 13.69M
Loading custom tokenizer from /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k


In [26]:
model, tokenizer = load_model_tokenizer(out_dir, data_dir, device)
model
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
type(model)


Loading model from /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump/out-babylm_full_bpe_8k-6x6-mask_lin-5734459_s1337/ckpt.pt
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
Setting flash to False because wm_mask is enabled
number of parameters: 13.69M
Loading custom tokenizer from /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k


transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [18]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "openai-community/gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")#

import torch
from tqdm import tqdm

max_length = model.config.n_positions
stride = 64
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)


Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 4479/4495 [08:33<00:01,  8.72it/s]


tensor(24.3362, device='cuda:0')
