In [1]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = "left" 
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.generation_config.pad_token_id = tokenizer.eos_token_id


In [2]:
input_text = "The Sun was bright that day,"
encoded_input = tokenizer(input_text, return_tensors='pt', padding='max_length', max_length=42)
encoded_input

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256,   464,  3825,   373,  6016,   326,
          1110,    11]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
#output = model(**encoded_input)
output = model.generate(**encoded_input, do_sample=False, max_length=75)
output

tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256,   464,  3825,   373,  6016,   326,
          1110,    11,   290,   262,  4252,   373, 22751,    13,   198,   198,
             1,    40,  1101,  7926,    11,   475,   314,  1101,   407,  1016,
           284,   307,  1498,   284,   766,   345,   757,   526,   198,   198,
             1,    40,  1101,  7926,    11]])

In [4]:
tokenizer.decode(output[0])

'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The Sun was bright that day, and the sun was shining.\n\n"I\'m sorry, but I\'m not going to be able to see you again."\n\n"I\'m sorry,'

In [5]:
# Manually load some logging conf
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import DictConfig, OmegaConf, open_dict
import qtransform
import os
import logging
import yaml

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "debug=True",
        "dataset.name=openwebtext",
        "+export=True",
        "run.epochs=100",
        "run.max_iters=300",
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2"
    ]
@qtransform.with_config(args, logging.DEBUG)
def get_dataloader(cfg):
    log = logging.getLogger("__name__")
    
    from qtransform.dataset import get_data, get_loader, DatasetWrapper
    data_wrapper: DatasetWrapper = get_data(cfg.dataset)
    data_wrapper.load_dataset()
    
    dataset_train = data_wrapper.dataset_info.train
    dataset_eval = data_wrapper.dataset_info.eval
    if cfg.dataset.sizes.train >= 1.0:
        log.warning(f'Training on the entirety of the dataset without leaving some data for testing.')
    #check if batch_size batches are going to be performed
    from torch.utils.data import Dataset
    def check_dataset_size(name: str, dataset: Dataset):
        batch_size = cfg.dataset.dataloader.batch_size
        #model which is not an llm is loaded
        if cfg.dataset.args.get('block_size') is None:
            log.info(f'Model for dataset {name} presumably is not an LLM as the block size has not been specified')
            return
        block_size = cfg.dataset.args.block_size
        if batch_size * block_size > len(dataset):
            log.warning(f'The product of batch_size {batch_size} and block_size {block_size} is larger than the dataset {name}, causing the dataloader to skip batches. Maybe check the split size?')
    check_dataset_size("train", dataset_train)
    train_dataloader = get_loader(dataloader_cfg = cfg.dataset.dataloader, data = dataset_train)
    if dataset_eval is not None:
        check_dataset_size("eval", dataset_eval)
        eval_dataloader = get_loader(dataloader_cfg = cfg.dataset.dataloader, data = dataset_eval)
    else:
        eval_dataloader = None

    #update tokenizer config with metadata to save it in model checkpoints
    data_wrapper.tokenizer.load_metadata(filepath=os.path.join(data_wrapper.tokenized_dir, cfg.dataset.tokenizer.meta_file))
    with open_dict(cfg.dataset.tokenizer):
        cfg.dataset.tokenizer["meta"] = data_wrapper.tokenizer.meta
        
    return train_dataloader, eval_dataloader

train_d, eval_d = get_dataloader()


[ [36m2024-02-23 10:29:34,581 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=UNKNOWN_NAME[0m
[ [36m2024-02-23 10:29:34,583 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
[ [36m2024-02-23 10:29:34,783 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mHydra compose config is: {'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_co

In [6]:
from brevitas.graph import gptq, quantize
import torch
from tqdm import tqdm

#_ = quantize.preprocess_for_quantize(model, trace_model=True)
#model = quantize.quantize(model)
model = quantize.layerwise_quantize(model)

# dont update weights during farward pass, only use gptq_mode.update()
with torch.no_grad():
    with gptq.gptq_mode(model) as gptq_mode:
        gptq_model = gptq_mode.model
        print(gptq_mode.num_layers)
        for i in tqdm(range(gptq_mode.num_layers)):
            for j, batch in enumerate(train_d):
                for seq in batch:
                    #t = t.cuda()
                    gptq_model(seq)
                gptq_mode.update()
                # five batches for calibration?
                if j > 5:
                    break

1


  return super().rename(names)
100%|██████████| 1/1 [02:43<00:00, 163.53s/it]


In [7]:
output = model.generate(**encoded_input, do_sample=False, max_length=75)
output

tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256,   464,  3825,   373,  6016,   326,
          1110,    11,   290,   262,  4252,   373, 22751,  6016,   326,  1110,
            13,   198,   198,     1,    40,   373,   287,   262,  3504,   286,
           262,  1755,    11,   290,   314,  2497,   257,   582,   351,   257,
          2485,    13,   679,   373,  5762]])

In [8]:
tokenizer.decode(output[0])

'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The Sun was bright that day, and the sun was shining bright that day.\n\n"I was in the middle of the night, and I saw a man with a gun. He was wearing'