In [7]:
import logging
# Notebook-level logger. get_optim and _train (defined in later cells) log through
# this global instead of creating their own logger.
log = logging.getLogger(__name__)

In [1]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
# Load the pretrained 'gpt2' tokenizer.
# NOTE(review): `tokenizer` is not referenced by any of the other cells shown here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Pad on the left-hand side of each sequence.
tokenizer.padding_side = "left" 
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [2]:
# Manually load some logging conf
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import DictConfig, OmegaConf, open_dict
import qtransform
import os
import logging
import yaml

# "key=value" override strings that are handed to qtransform.with_config by the
# decorated helper functions in this notebook. The composed configuration is
# echoed in the log output of the cells that call those helpers.
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "debug=True",
        "dataset.name=openwebtext",
        "+export=True",
        "run.epochs=1",
        "run.max_iters=300",
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2"
    ]
@qtransform.with_config(args, logging.DEBUG)
def get_dataloader(cfg):
    """Build the train and eval dataloaders described by ``cfg.dataset``.

    The notebook calls this function without arguments; ``cfg`` is expected to
    be provided by the ``qtransform.with_config`` decorator.

    Args:
        cfg: Composed configuration. Reads ``cfg.dataset`` (``sizes.train``,
            ``dataloader``, ``args`` and ``tokenizer.meta_file``).

    Returns:
        tuple: ``(train_dataloader, eval_dataloader)``. ``eval_dataloader`` is
        ``None`` when the loaded dataset exposes no eval split.

    Side effects:
        Loads the dataset, loads the tokenizer metadata file from the
        wrapper's ``tokenized_dir`` and stores the metadata under
        ``cfg.dataset.tokenizer["meta"]``.
    """
    # Fixed: the original passed the literal string "__name__", which created a
    # logger that was actually called "__name__" instead of the module's name.
    log = logging.getLogger(__name__)

    from torch.utils.data import Dataset
    from qtransform.dataset import get_data, get_loader, DatasetWrapper

    data_wrapper: DatasetWrapper = get_data(cfg.dataset)
    data_wrapper.load_dataset()

    dataset_train = data_wrapper.dataset_info.train
    dataset_eval = data_wrapper.dataset_info.eval
    if cfg.dataset.sizes.train >= 1.0:
        log.warning('Training on the entirety of the dataset without leaving some data for testing.')

    def check_dataset_size(name: str, dataset: Dataset):
        """Warn when ``batch_size * block_size`` exceeds the length of ``dataset``."""
        batch_size = cfg.dataset.dataloader.batch_size
        # no block size configured: a model which is not an LLM is loaded
        if cfg.dataset.args.get('block_size') is None:
            log.info(f'Model for dataset {name} presumably is not an LLM as the block size has not been specified')
            return
        block_size = cfg.dataset.args.block_size
        if batch_size * block_size > len(dataset):
            log.warning(f'The product of batch_size {batch_size} and block_size {block_size} is larger than the dataset {name}, causing the dataloader to skip batches. Maybe check the split size?')

    check_dataset_size("train", dataset_train)
    train_dataloader = get_loader(dataloader_cfg = cfg.dataset.dataloader, data = dataset_train)

    # the eval split is optional
    eval_dataloader = None
    if dataset_eval is not None:
        check_dataset_size("eval", dataset_eval)
        eval_dataloader = get_loader(dataloader_cfg = cfg.dataset.dataloader, data = dataset_eval)

    # update tokenizer config with metadata to save it in model checkpoints
    meta_path = os.path.join(data_wrapper.tokenized_dir, cfg.dataset.tokenizer.meta_file)
    data_wrapper.tokenizer.load_metadata(filepath=meta_path)
    # open_dict is used so the new "meta" key can be written into the config node
    with open_dict(cfg.dataset.tokenizer):
        cfg.dataset.tokenizer["meta"] = data_wrapper.tokenizer.meta

    return train_dataloader, eval_dataloader

# Build both loaders. The decorated function is called without arguments; its
# `cfg` parameter is expected to be supplied by the with_config decorator.
train_d, eval_d = get_dataloader()

[ [36m2024-02-23 12:08:59,837 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=UNKNOWN_NAME[0m
[ [36m2024-02-23 12:08:59,839 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
[ [36m2024-02-23 12:09:00,032 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mHydra compose config is: {'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_co

In [3]:

@qtransform.with_config(args, logging.DEBUG)
def get_optim(cfg, model=None):
    """Create the optimizer and scheduler configured under ``cfg.optim``.

    Args:
        cfg: Composed configuration. The notebook calls this function without
            arguments, so ``cfg`` is expected to come from the
            ``qtransform.with_config`` decorator.
        model: Model handed to ``qtransform.optim.get_optim``. Optional for
            backward compatibility: when omitted, the notebook-level global
            ``model`` is used, as the original implementation did implicitly.

    Returns:
        tuple: ``(optimizer, scheduler)``.

    Raises:
        NameError: If no ``model`` is passed and no notebook-level ``model``
            has been created yet.
    """
    # Aliased so the library factory does not shadow this function's own name.
    from qtransform.optim import get_optim as build_optimizer, get_scheduler

    if model is None:
        # Fall back to the notebook global, but fail with an actionable message
        # when the cell that creates the model has not been run yet.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "name 'model' is not defined: run the cell that creates the model "
                "before calling get_optim, or pass model=..."
            ) from None

    log.debug(f"optim config: {cfg.optim}")
    #optimizer = optim.Adadelta(model.parameters(), lr=cfg.optim.learning_rate)
    optimizer = build_optimizer(model=model, optim_cfg=cfg.optim)
    log.debug(f'Configured optimizer ({type(optimizer)}): {optimizer}')
    scheduler = get_scheduler(optimizer=optimizer, scheduler_cfg = cfg.optim.scheduler)
    log.debug(f'Scheduler: {scheduler}')
    return optimizer, scheduler

In [5]:
from qtransform.run.train import train
# NOTE(review): this re-assigns `args` with exactly the same override strings as
# the list defined in the data-loading cell earlier in this notebook.
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "debug=True",
        "dataset.name=openwebtext",
        "+export=True",
        "run.epochs=1",
        "run.max_iters=300",
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2"
    ]
@qtransform.with_config(args, logging.DEBUG)
def _train(cfg, model, device, train_dataloader, eval_dataloader, optimizer,scheduler, timestamp):
    """Train ``model``, going through the quantizer when ``cfg.quantization.quantize`` is set.

    Args:
        cfg: Composed configuration; ``cfg.get('quantization')`` selects the
            quantized or the plain training path.
        model: Model to train.
        device, train_dataloader, eval_dataloader, optimizer, scheduler, timestamp:
            Passed through unchanged to ``qtransform.run.train.train`` (directly,
            or via ``quantizer.train_qat`` on the quantized path).

    Returns:
        The value produced by the training call (named ``last_checkpoint``
        here). The original computed this value but never returned it.
    """
    # lets go
    quant_cfg = cfg.get('quantization')
    if quant_cfg and quant_cfg.quantize:
        log.debug('Running quantized model')
        from qtransform.quantization import get_quantizer
        quantizer, model_quant_cfg = get_quantizer(quant_cfg, model=model)
        model, replace_layers_later = quantizer.get_quantized_model(model_quant_cfg, inplace=True)
        # TODO make this a decorator so it can return stuff
        train_args = [cfg, device, train_dataloader, eval_dataloader, optimizer,scheduler, timestamp]
        last_checkpoint = quantizer.train_qat(model, train, train_args)
        #quantize last layers (batchnorm). params of the last saved checkpoint do not entirely reflect the current model anymore
        if replace_layers_later is not None:
            # NOTE(review): the call above unpacks two values from get_quantized_model,
            # while this one binds the result to a single name - confirm the return
            # shape. `model` is not used after this point inside this function.
            model = quantizer.get_quantized_model(replace_layers_later)
    else:
        #if hasattr(log,"trace"): log.trace(model)
        last_checkpoint = train(cfg=cfg, device=device, model=model, train_data_loader=train_dataloader, eval_data_loader=eval_dataloader, optimizer=optimizer, scheduler=scheduler, timestamp=timestamp)
    # maybe subsequent jobs can be managed by hydra in the future?
    # when this paradigm comes up more frequently we have to make this a thing ....
    log.debug("Finished training model")
    return last_checkpoint

In [10]:
from qtransform import device_singleton
# Device taken from qtransform's device_singleton; get_model (defined below in
# this cell) reads this notebook-level global when moving the model.
device = device_singleton.device
@qtransform.with_config(args, logging.DEBUG)
def get_model(cfg):
    """Instantiate the model described by ``cfg.model`` and prepare it for training.

    The model is created through ``qtransform.model.get_model``, has ``.train()``
    called on it and is moved to the notebook-level ``device``.

    Returns:
        The prepared model instance.
    """
    # Imported under an alias so that it is distinguishable from this function.
    from qtransform.model import get_model as build_model

    net = build_model(cfg.model)
    net.train()
    # Only parameters (type torch.nn.parameter.Parameter) are moved to the device,
    # not non-named Tensors. This is a problem if a layer uses a non-named Tensor
    # during the forward pass.
    net.to(device=device)
    return net

# The model has to exist first: get_optim reads the notebook-level `model`.
model = get_model()
optimizer, scheduler = get_optim()
# NOTE(review): _train is declared as (cfg, model, device, train_dataloader,
# eval_dataloader, optimizer, scheduler, timestamp), but only `model` is passed
# here, and no `timestamp` is defined in the cells shown. Confirm how
# qtransform.with_config forwards arguments and supply the remaining ones
# (e.g. device, train_d, eval_d, optimizer, scheduler). The recorded output of
# this cell ends in an UnboundLocalError.
_train(model)

[ [36m2024-02-23 12:24:06,170 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
[ [36m2024-02-23 12:24:06,358 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mHydra compose config is: {'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': T

UnboundLocalError: local variable 'model' referenced before assignment