# QLoRA for Llama 3 (runs with the "qlora3" kernel)
- Source: https://medium.com/@avishekpaul31/fine-tuning-llama-3-8b-instruct-qlora-using-low-cost-resources-89075e0dfa04
- Source: https://github.com/AvisP/LM_Finetune/blob/main/llama-3-finetune-qlora.ipynb

## Setup for kernel (qlora3)

In [None]:
# Workaround for a packaging error: pin setuptools to version 69.5.1.
# NOTE(review): unlike the other install cells this line has no leading "!";
# it presumably relies on IPython's automagic `pip` — confirm it runs in the
# intended kernel environment.
pip install setuptools==69.5.1

In [None]:
# Install the training stack into the kernel environment.
# NOTE(review): bitsandbytes and trl are each installed twice in this cell,
# and transformers is first upgraded (-U) and later pinned to 4.40.2; the
# later pin is the version that ends up installed. Consider consolidating
# once the working combination is confirmed.
!pip install torch==2.0.1
!pip install bitsandbytes
!pip install -U transformers[torch] datasets
!pip install -q bitsandbytes trl peft accelerate
!pip install flash-attn --no-build-isolation
!pip install transformers==4.40.2 # Only exactly this version works!!
!pip install trl
!pip install autoawq
!pip install huggingface_hub

In [None]:
# Additional packages installed for the alternative model choices.
# NOTE(review): presumably sentencepiece is needed by the Llama-2/Mistral
# tokenizers and mistral_inference by the Mistral option — confirm that these
# installs do not change the torch/transformers versions pinned above.
!pip install sentencepiece
!pip install mistral_inference


In [None]:
# Show which trl package/version is present in the active conda environment.
!conda list | grep trl

## Qlora Training Procedure

### Set GPU and import Libraries

In [None]:
import os

# Both variables are set before torch is imported in the next cell, so they
# are already in the environment when CUDA gets initialised.

# Allocator option passed to PyTorch through the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
# Make only the GPU with index 3 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# Echo the value as a quick check that it is set for shell commands too.
!echo $CUDA_VISIBLE_DEVICES

In [None]:
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

#notebook_login()


### Set Model

#### Llama-3-8B-Instruct (pruned local checkpoint)

In [None]:

# Base checkpoint to fine-tune: a local directory on disk.
model_id = "/home/thsch026/masterarbeit/models/generated/prune/shortened-llm/Meta-LLama-3-8B-instruct"

# Name of the fine-tuned model; it is also the last component of the
# directory the training results are written to.
trained_model_id = "Meta-Llama-3-8B-Instruct_prune_qlora"
output_dir = f"/home/thsch026/masterarbeit/models/generated/qlora/{trained_model_id}"

#### LLama 3 8 B Instruct pruned

In [None]:
# Base checkpoint to fine-tune, given as a Hub model id.
# NOTE(review): the heading above this cell names a pruned Llama 3 model, but
# the id selected here is Llama-2-7b-chat-hf, identical to the dedicated
# Llama-2 cell further down — confirm which model this cell should select.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Name of the fine-tuned model; it is also the last component of the
# directory the training results are written to.
trained_model_id = "Llama-2-7b-chat-hf_qlora"
output_dir = f"/home/thsch026/masterarbeit/models/generated/qlora/{trained_model_id}"

#### Mistral 7B

In [None]:
# Base checkpoint to fine-tune, given as a Hub model id.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Name of the fine-tuned model; it is also the last component of the
# directory the training results are written to.
trained_model_id = "Mistral-7B-Instruct-v0.2_qlora"
output_dir = f"/home/thsch026/masterarbeit/models/generated/qlora/{trained_model_id}"

#### Llama-2-7b-chat-hf

In [None]:
# Base checkpoint to fine-tune, given as a Hub model id.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Name of the fine-tuned model; it is also the last component of the
# directory the training results are written to.
trained_model_id = "Llama-2-7b-chat-hf_qlora"
output_dir = f"/home/thsch026/masterarbeit/models/generated/qlora/{trained_model_id}"

### Load Model and choose GPU

In [None]:
# Load the base model and its tokenizer from `model_id`.
# NOTE(review): no dtype or device_map is passed here, and `model` is assigned
# again (float16, device_map="auto") in the generation check cell further
# down, while the cells in between only use `tokenizer`. This first model load
# looks redundant and memory-heavy — confirm whether it can be dropped.
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Load Dataset and trim for training

In [None]:
from datasets import load_dataset

# Load the UltraChat 200k dataset by its Hub id; the cells below read its
# "train_sft" and "test_sft" splits.
raw_datasets = load_dataset("HuggingFaceH4/ultrachat_200k")

In [None]:
from datasets import DatasetDict

# Debug-sized subset: only the first 10,000 rows of each split are kept.
# To train on the whole dataset once debugging is done, use the full
# "train_sft" / "test_sft" splits here instead of selecting `indices`.
indices = range(0, 10000)

# Slicing the range by the split length clamps it, so a split with fewer than
# len(indices) rows is used whole instead of select() failing on
# out-of-range indices.
dataset_dict = {
    name: raw_datasets[source].select(indices[: len(raw_datasets[source])])
    for name, source in (("train", "train_sft"), ("test", "test_sft"))
}

# Rebind `raw_datasets` to the reduced train/test pair and display it.
raw_datasets = DatasetDict(dataset_dict)
raw_datasets

In [None]:
# Show the first training conversation, one message per line, with the role
# left-aligned in a 20-character column.
example = raw_datasets["train"][0]
messages = example["messages"]
for message in messages:
    print(f"{message['role']:20}:  {message['content']}")

In [None]:
# Display the tokenizer's end-of-sequence token id as the cell output.
tokenizer.eos_token_id

In [None]:
# Token ids that end generation: the regular EOS token plus, when the
# tokenizer actually defines it, Llama 3's end-of-turn token "<|eot_id|>".
# For tokenizers without that token (e.g. the Llama-2 and Mistral options in
# this notebook) convert_tokens_to_ids falls back to the unknown-token id or
# None, which must not be used as a stop token.
terminators = [tokenizer.eos_token_id]
_eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
if (
    _eot_token_id is not None
    and _eot_token_id != tokenizer.unk_token_id
    and _eot_token_id not in terminators
):
    terminators.append(_eot_token_id)

In [None]:
import re
import random
from multiprocessing import cpu_count

def apply_chat_template(example, tokenizer):
    """Render one conversation to plain text with the tokenizer's chat template.

    Args:
        example: Mapping with a "messages" entry holding the conversation.
        tokenizer: Object providing ``apply_chat_template(messages, tokenize=...)``.

    Returns:
        The same ``example`` object, with the rendered string stored under
        the "text" key.
    """
    # No system message is injected; the conversation is rendered as given.
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    example["text"] = rendered
    return example

In [None]:
# Render every conversation to text. All original columns are removed, so
# only the new "text" column remains in each split.
column_names = list(raw_datasets["train"].features)
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
    num_proc=cpu_count(),  # one worker process per CPU core
    desc="Applying chat template",
)

In [None]:
# Reuse the EOS token id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# When the tokenizer reports a maximum length above 100,000, cap it at 2048
# so that a usable sequence length is available for training.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

In [None]:
# create the splits
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]

for index in random.sample(range(len(raw_datasets["train"])), 3):
  print(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}")
  print("#####################################")

In [None]:
# Quick generation check with the base model before fine-tuning.
# The model is (re)loaded in float16 and placed automatically on the
# visible device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# NOTE(review): some chat templates reject a "system" role — confirm this
# prompt works for every model option offered in this notebook.
messages = [
    {"role": "system", "content": "You are a polite chatbot who responds clear and gentle."},
    {"role": "user", "content": "Who are you?"},
]

# Render the conversation with the generation prompt appended and move the
# resulting token ids to the model's device.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Token ids that end generation: the regular EOS token plus, when the
# tokenizer actually defines it, Llama 3's end-of-turn token "<|eot_id|>".
# For tokenizers without that token convert_tokens_to_ids falls back to the
# unknown-token id or None, which must not be used as a stop token.
terminators = [tokenizer.eos_token_id]
_eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
if (
    _eot_token_id is not None
    and _eot_token_id != tokenizer.unk_token_id
    and _eot_token_id not in terminators
):
    terminators.append(_eot_token_id)

# Disable the memory-efficient and flash scaled-dot-product attention
# backends; see https://github.com/Lightning-AI/litgpt/issues/327
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Sample up to 128 new tokens with nucleus sampling.
outputs = model.generate(
    input_ids,
    max_new_tokens=128,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

# Drop the prompt tokens so that only the newly generated part is decoded.
response = outputs[0][input_ids.shape[-1]:]

print(tokenizer.decode(response, skip_special_tokens=True))

## Prepare Training

In [None]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments
from transformers import BitsAndBytesConfig

In [None]:

# Active configuration: 8-bit quantization of the base model weights.
# NOTE(review): the notebook is titled QLoRA, which conventionally refers to
# the 4-bit NF4 setup kept commented out below — confirm that 8-bit is the
# intended setting. Also confirm the threshold of 200.0, which looks far above
# the default documented for llm_int8_threshold.
quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                        llm_int8_threshold=200.0)

# Alternative: 4-bit quantization (NF4 with double quantization, bfloat16
# compute dtype). Uncomment to use it instead of the 8-bit config above.
#quantization_config = BitsAndBytesConfig(
#            load_in_4bit=True,
#            bnb_4bit_use_double_quant=True,
#            bnb_4bit_quant_type="nf4",
#            bnb_4bit_compute_dtype=torch.bfloat16,)

# Optional manual load of the quantized model (left disabled; the trainer
# cell receives the model id and model_kwargs instead).
#model = AutoModelForCausalLM.from_pretrained(model_id,
#                                             quantization_config=quantization_config,
#                                             device_map="auto")

In [None]:
# Keyword arguments used to instantiate the model that will be trained
# (passed to the trainer as model_init_kwargs).
model_kwargs = {
    # Use the FlashAttention-2 implementation; requires a GPU and a flash-attn
    # installation that support it (it drastically speeds up model computations).
    "attn_implementation": "flash_attention_2",
    "torch_dtype": "auto",
    # The cache is switched off because training uses gradient checkpointing.
    "use_cache": False,
    "device_map": "auto",
    "quantization_config": quantization_config,
}

In [None]:

# Training hyperparameters (based on the reference config).
training_args = TrainingArguments(
    # Both mixed-precision flags are off. On GPUs that support bf16 set
    # bf16=True, otherwise fp16=True.
    fp16=False,
    bf16=False,
    # Evaluate on the eval split at the end of every epoch.
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=1,
    # Gradient checkpointing is on; this is why use_cache is disabled in
    # model_kwargs.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    # NOTE(review): pushing to the Hub needs credentials, but notebook_login()
    # is commented out in the import cell — confirm that a token is already
    # available, or disable push_to_hub.
    push_to_hub=True,
    hub_model_id=trained_model_id,
    # hub_strategy="every_save",
    # report_to="tensorboard",
    report_to="none",  # for skipping wandb logging
    save_strategy="epoch",
    save_total_limit=None,
    seed=42,
)

# LoRA adapter settings (based on the reference config): adapters are
# attached to the four attention projection modules listed below.
peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# The trainer receives the model id string together with model_init_kwargs,
# not the module-level `model` object, so the model it trains is instantiated
# by the trainer itself. Training reads the "text" column, and packing is
# enabled with the tokenizer's maximum length as sequence length.
trainer = SFTTrainer(
        model=model_id,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        peft_config=peft_config,
        max_seq_length=tokenizer.model_max_length,
    )

# Release cached GPU memory left over from a previous unsuccessful run.
# NOTE(review): memory held by objects that are still referenced (such as the
# module-level `model`) is presumably not released by this call — confirm.
torch.cuda.empty_cache()

## Start Training

In [None]:
# Run the fine-tuning; the value returned by train() is kept in `train_result`.
train_result = trainer.train()

## Save results

In [None]:
# Store the tokenizer next to the fine-tuned weights.
tokenizer.save_pretrained(output_dir)
# Save the model held by the trainer. The trainer was created from the model
# id string and therefore built its own model instance; the module-level
# `model` is the separately loaded float16 base model from the generation
# check, so saving that object would write untrained weights.
trainer.save_model(output_dir)