In [1]:
from typing import Optional
import torch

class LayerWiseDummyOptimizer(torch.optim.Optimizer):
    """Placeholder optimizer whose ``step`` and ``zero_grad`` do nothing.

    It exists so that code expecting an ``Optimizer`` instance can be handed
    one while the real parameter updates happen elsewhere.

    Args:
        optimizer_dict: Optional object stored verbatim on ``self.optimizer_dict``;
            this class never reads it.
        *args: Accepted and ignored.
        **kwargs: Only ``lr`` is consulted. When given (and not ``None``) it becomes
            the learning rate of the single dummy param group, so anything that
            reads ``param_groups`` sees the caller's rate; otherwise ``1e-03`` is used.
    """

    def __init__(self, optimizer_dict=None, *args, **kwargs):
        # The base class refuses an empty parameter list, so register one throwaway tensor.
        dummy_tensor = torch.randn(1, 1)
        self.optimizer_dict = optimizer_dict
        lr = kwargs.get("lr")
        if lr is None:
            lr = 1e-03
        super().__init__([dummy_tensor], {"lr": lr})

    def zero_grad(self, set_to_none: bool = True) -> None:
        """Do nothing; there are no real gradients owned by this optimizer."""
        return None

    def step(self, closure=None) -> Optional[float]:
        """Do nothing and return ``None``; ``closure`` is never called."""
        return None

class LayerWiseDummyScheduler(torch.optim.lr_scheduler.LRScheduler):
    """Placeholder LR scheduler bound to a fresh ``LayerWiseDummyOptimizer``.

    It reports the dummy optimizer's current learning rates unchanged, so
    calling ``step`` never alters them.

    Args:
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        optimizer = LayerWiseDummyOptimizer()
        last_epoch = -1
        # `verbose` is intentionally not forwarded: the argument is deprecated in
        # PyTorch (passing any explicit value warns) and is absent from the
        # constructor in recent releases, where a third positional argument fails.
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the learning rate currently stored in each param group."""
        return [group["lr"] for group in self.optimizer.param_groups]

    def _get_closed_form_lr(self):
        """Return the base learning rates recorded by the parent class."""
        return self.base_lrs

In [2]:
import torch.nn
import bitsandbytes as bnb
from transformers import get_constant_schedule
from functools import partial

from galore_torch import GaLoreAdamW8bit
        
def load_galore_optimizer(model, lr, galore_config):
    """Attach one optimizer and constant scheduler to every trainable parameter.

    Each trainable parameter gets its own optimizer, stepped from a
    post-accumulate-grad hook registered on that parameter. Weights of
    ``torch.nn.Linear`` modules whose qualified name contains any entry of
    ``galore_config["target_modules_list"]`` use ``GaLoreAdamW8bit``; every
    other trainable parameter uses ``bnb.optim.Adam8bit``.

    Args:
        model: Module whose ``named_modules()`` / ``parameters()`` are scanned.
        lr: Learning rate handed to every per-parameter optimizer.
        galore_config: Mapping that must contain ``"target_modules_list"``
            (substrings matched against module names). All remaining entries
            are forwarded as options of the GaLore param group.

    Returns:
        ``(LayerWiseDummyOptimizer(), LayerWiseDummyScheduler())`` — placeholders,
        since the real stepping is performed by the registered hooks.

    Raises:
        KeyError: If ``galore_config`` has no ``"target_modules_list"`` entry.

    NOTE(review): the hook calls ``optimizer.step()`` every time it runs with a
    non-``None`` gradient, so this presumably does not combine with gradient
    accumulation over several backward passes — confirm before raising
    ``gradient_accumulation_steps`` above 1.
    """
    target_keys = galore_config["target_modules_list"]
    # The module filter is a selection aid, not an optimizer hyper-parameter:
    # keep it out of the param group handed to GaLore.
    galore_group_options = {
        key: value for key, value in galore_config.items() if key != "target_modules_list"
    }

    # Reference the layer type through `torch.nn` so this function does not rely
    # on an `nn` alias that only a later notebook cell happens to define.
    galore_params = [
        module.weight
        for module_name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
        and any(target_key in module_name for target_key in target_keys)
    ]
    id_galore_params = {id(p) for p in galore_params}

    def optimizer_hook(p, optimizer, scheduler):
        """Apply and clear the update for ``p`` as soon as its gradient exists."""
        if p.grad is not None:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    for p in model.parameters():
        if not p.requires_grad:
            continue
        # GaLore for the selected linear weights, plain 8-bit Adam for the rest.
        if id(p) in id_galore_params:
            optimizer = GaLoreAdamW8bit([dict(params=[p], **galore_group_options)], lr=lr)
        else:
            optimizer = bnb.optim.Adam8bit([p], lr=lr)
        scheduler = get_constant_schedule(optimizer)

        p.register_post_accumulate_grad_hook(
            partial(optimizer_hook, optimizer=optimizer, scheduler=scheduler)
        )

    # Return dummies; stepping is done by the hooks.
    return LayerWiseDummyOptimizer(), LayerWiseDummyScheduler()

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, set_seed, get_constant_schedule
from trl import SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM
from datasets import load_dataset
import torch, torch.nn as nn, uuid, wandb

# --- Run configuration ---------------------------------------------------------
# `use_galore` selects between the hook-based optimizers from `load_galore_optimizer`
# and the trainer's own defaults (see the `if use_galore:` branch below).
use_galore = True
# NOTE(review): within this cell `use_galore_layerwise` is only interpolated into
# `run_id`; no branch reads it.
use_galore_layerwise = True
lr = 1e-5
# Passed as `rank` in `galore_config` below.
rank = 1024
modelpath = "../models/llama2-7b"
set_seed(42)
# A random UUID suffix keeps output directories and wandb run names unique per run.
run_id = f"use_galore-1-{use_galore}-layerwise-{use_galore_layerwise}-rank{rank}-{str(uuid.uuid4())}"

# --- Model and tokenizer -------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    torch_dtype=torch.bfloat16,
    attn_implementation = "flash_attention_2",  
    device_map = "auto",
    use_cache = False,
)
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast = False)

# Presumably installs the chat template whose "<|im_start|>" markers the data
# collator below keys on — confirm against the trl documentation.
model, tokenizer = setup_chat_format(model, tokenizer)
# Fall back to the unknown token for padding when there is no pad token or it is
# the same token as EOS.
if tokenizer.pad_token in [None, tokenizer.eos_token]: 
    tokenizer.pad_token = tokenizer.unk_token

# --- Data ----------------------------------------------------------------------
# The "train" and "test" splits of this dataset are used by the trainer below.
dataset = load_dataset("g-ronimo/oasst2_top4k_en")

# --- Trainer arguments ---------------------------------------------------------
training_arguments = TrainingArguments(
    output_dir = f"out_{run_id}",
    evaluation_strategy = "steps",
    label_names = ["labels"],
    per_device_train_batch_size = 16,
    gradient_accumulation_steps = 1,
    save_steps = 250,
    eval_steps = 250,
    logging_steps = 1, 
    learning_rate = lr,
    num_train_epochs = 3,
    lr_scheduler_type = "constant",
    gradient_checkpointing = True,
    group_by_length = False,
)

# --- Optimizer selection -------------------------------------------------------
if use_galore:
    # `target_modules_list` holds substrings matched against module names by
    # `load_galore_optimizer`; the other entries are GaLore options.
    galore_config = dict(
        target_modules_list=["attn", "mlp"], 
        rank=rank, 
        update_proj_gap=200, 
        scale=2, 
        proj_type="std"
    )
    # Returns a (dummy optimizer, dummy scheduler) pair; the real updates are
    # performed by hooks registered on the parameters.
    optimizers = load_galore_optimizer(model, lr, galore_config)
else:
    # Presumably lets the trainer build its default optimizer and scheduler — confirm.
    optimizers = (None, None)

# --- Trainer -------------------------------------------------------------------
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset['test'],
    # Templates mark where user and assistant turns begin; presumably only the
    # assistant part contributes to the loss — confirm against the trl docs.
    data_collator = DataCollatorForCompletionOnlyLM(
        instruction_template = "<|im_start|>user", 
        response_template = "<|im_start|>assistant", 
        tokenizer = tokenizer, 
        mlm = False),
    max_seq_length = 256,
    dataset_kwargs = dict(add_special_tokens = False),
    optimizers =  optimizers,
    args = training_arguments,
)

# --- Experiment tracking -------------------------------------------------------
# Start the wandb run under `run_id` and ask it to log code from paths ending in
# ".py" or ".ipynb".
wandb.init(
    project = "galore-7B", 
    name = run_id,
).log_code(include_fn=lambda path: path.endswith(".py") or path.endswith(".ipynb"))

# --- Train and save ------------------------------------------------------------
trainer.train()

trainer.save_model(f"out_{run_id}/model_trained")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mg-ronimo[0m. Use [1m`wandb login --relogin`[0m to force relogin


Answer as an expert in medical malpractice law in the UK, presenting your arguments to another lawyer. Identify if this constitutes a breach of duty and a settlement value broken down by how the different factors affect the amount. Justify it step by step.
Our client’s mother approached our specialist erb’s palsy solicitors after realising that her son, Peter, may be eligible to claim compensation for the injuries he sustained during birth.
During her pregnancy with Peter, his mother had attended all of the scans and they showed a baby of normal size. However, during her labour, it became apparent that Peter was quite a big baby and, due to this, they took his mother to theatre to monitor her progress.
Peter’s head was delivered first, but due to his size, his shoulders became stuck (shoulder dystocia). Multiple attempts were made to deliver his body, including the use of a ventouse (vacuum cup) and forceps. The McRoberts position (a manoeuvre often used in this situation) was not adop

Step,Training Loss,Validation Loss
250,1.1007,1.057425
500,0.9637,1.06259


Material 1:
Caucasian man, age 36, has intermittent joint pain in both knees over a period of several years. Recently (months) also joint pain in right elbow. 

No edema, swelling or joint effusion. 
The joint is intermittent.
The pain in the knees is sharp when active and aching when passive. 
The elbow joint pain is passive, aching.
The pain may worsen in severity and occurrence when eating peanuts. 

Comorbidities (in no especial order):
1. 6-8 previous episodes of acute pancreatitis. (Onset age 33)
2. Intermittent pain in kidney area since before the onset of pancreatitis (Undiagnosed, onset age 15)
3. Sleep apnea.
4. Undiagnosed pain upper jaw molar. 

End of material 1.

Question 1: Deduce five diagnoses in order of likelihood, in the following format
Diagnosis: 
 This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.
Material 1:
Caucasian man, age 36, has intermittent joint pain in both knees over a period of sev

KeyboardInterrupt: 