In [None]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -Uq datasets bitsandbytes accelerate einops transformers peft trl sentencepiece comet-ml>=3.43.2

In [None]:
import os
from getpass import getpass

# Credentials are never stored in the notebook. Each one is read from the
# environment variable of the same name; when that is unset or empty, the user
# is prompted without the value being echoed or saved in the cell output.
# NOTE(review): any key or token that was previously committed in this cell
# must be treated as leaked and revoked.
COMET_API_KEY = os.environ.get("COMET_API_KEY") or getpass("Comet API key: ")
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

In [None]:
# Experiment tracking: open a Comet experiment for this run in the
# "llm-engineers-handbook" project. `exp` is closed with exp.end() once
# training has finished.
import comet_ml
# Alternative login call, left disabled; the API key is passed to start() instead.
# comet_ml.login(COMET_API_KEY)
exp = comet_ml.start(project_name="llm-engineers-handbook", api_key=COMET_API_KEY)

In [None]:
import os
import torch
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset, concatenate_datasets

In [None]:
from huggingface_hub import login

# Authenticate this session against the Hugging Face Hub with the configured token.
login(token=HF_TOKEN)

In [None]:
# Sequence length shared by both model-loading calls and the SFT trainer below.
max_seq_length = 2048

In [None]:
# Options for loading the base checkpoint through Unsloth, kept in one place
# so they can be read (and tweaked) separately from the call itself.
base_model_options = dict(
    model_name="meta-llama/Meta-Llama-3.1-8b",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    device_map="sequential",
    trust_remote_code=True,
    use_gradient_checkpointing="unsloth",
)

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

In [None]:
# Projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 32, alpha 32, no dropout).
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0,
    target_modules=lora_target_modules,
    use_gradient_checkpointing=True,
    use_rslora=False,
)


In [None]:
# Two instruction datasets: the full llmtwin set and the first 10,000 rows of
# FineTome-Alpaca-100k.
dataset1 = load_dataset("mlabonne/llmtwin")
dataset2 = load_dataset("mlabonne/FineTome-Alpaca-100k", split="train[:10000]")

# Stack them into one dataset, then drop the two columns that are not used
# for fine-tuning.
combined_rows = concatenate_datasets([dataset1["train"], dataset2])
dataset = combined_rows.remove_columns(["source", "score"])

In [None]:
# Preview ten rows of the combined dataset. The sample is seeded so the same
# rows are shown on every Restart & Run All.
dataset.to_pandas().sample(10, random_state=432)

In [None]:
# Preview ten rows of the llmtwin training split. The sample is seeded so the
# same rows are shown on every Restart & Run All.
dataset1["train"].to_pandas().sample(10, random_state=432)

In [None]:
from unsloth.chat_templates import get_chat_template

def apply_basic_chat_template(example):
    """Convert one instruction/output record into a three-turn conversation.

    Args:
        example: Mapping with an "instruction" and an "output" entry.

    Returns:
        A dict with a single "messages" key holding, in order, the fixed
        system turn, the user turn (the instruction) and the assistant turn
        (the output). Each turn is a {"role", "content"} dict.
    """
    system_prompt = "You are a helpful assistant. Below is an instruction that describes a task. Write a response that appropriately completes the request."
    turns = (
        ("system", system_prompt),
        ("user", example["instruction"]),
        ("assistant", example["output"]),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Replace the raw instruction/output columns with a structured "messages" column.
raw_text_columns = ["instruction", "output"]
dataset = dataset.map(apply_basic_chat_template, remove_columns=raw_text_columns)

# Attach the "llama-3" chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3", map_eos_token=True)

def formatting_prompts_func(examples):
    """Render a batch of conversations to plain training strings.

    Args:
        examples: Batched mapping whose "messages" entry is a list of
            conversations (each a list of role/content turns).

    Returns:
        A dict with a "text" list holding one rendered string per
        conversation, produced by the notebook-level tokenizer's chat
        template without tokenizing and without a generation prompt.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Render every conversation to a single "text" string, then drop the
# structured "messages" column that is no longer needed.
dataset = dataset.map(formatting_prompts_func, batched=True).remove_columns(["messages"])

# Preview ten rendered rows. The sample is seeded so the same rows are shown
# on every Restart & Run All.
dataset.to_pandas().sample(10, random_state=432)

In [None]:
# Hold out 5% of the rows for evaluation. The shuffle is seeded (same value as
# the trainer seed) so the held-out set, and therefore the reported eval loss,
# is comparable from one run of the notebook to the next.
dataset = dataset.train_test_split(test_size=0.05, seed=432)
dataset

In [None]:
# Mixed precision: bf16 when the helper reports support, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the supervised fine-tuning run.
# report_to is not set here, so the library default applies.
training_args = SFTConfig(
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=2,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    output_dir="model_output",
    seed=432,
)

# Trainer over the rendered "text" column, with packing enabled and the
# held-out split used for evaluation.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=training_args,
)

In [None]:
# Run fine-tuning. The Comet experiment is closed in `finally` so that it is
# ended even when training raises or is interrupted; the completion banner is
# printed only when training actually finished.
try:
    trainer.train()
    print("\n------------------Training Complete--------------------------\n")
finally:
    exp.end()

In [None]:
# Upload the model (save_method "merged_16bit") together with the tokenizer
# to the Hugging Face Hub repository below.
merged_repo_id = "srivatsaHFhub/llama3.1_fineTomeAlpaca_modified"
model.push_to_hub_merged(
    merged_repo_id,
    tokenizer,
    save_method="merged_16bit",
    token=HF_TOKEN,
)

In [None]:
def gpu_status():
    """Print a two-line memory summary for CUDA device 0.

    The first line shows the device name and its total memory; the second
    shows the value of torch.cuda.max_memory_reserved(). Both figures are
    converted from bytes by dividing by 1024**3 and rounded to three decimals.

    Returns:
        None. The summary is written to stdout.
    """
    bytes_per_gib = 1024 ** 3
    device = torch.cuda.get_device_properties(0)
    reserved_gib = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
    total_gib = round(device.total_memory / bytes_per_gib, 3)
    print(f"GPU = {device.name}. Max memory = {total_gib} GB.")
    print(f"{reserved_gib} GB of memory reserved.")

# Print the GPU summary at this point in the run (after training and the Hub upload).
gpu_status()

In [None]:
# Switch the fine-tuned model to Unsloth's inference mode before generate() is used.
FastLanguageModel.for_inference(model)

def generate(prompt, max_new_tokens=512, temp=1):
    """Stream a sampled chat completion for `prompt` to stdout.

    The prompt is wrapped with the fixed system message, rendered with the
    tokenizer's chat template, and passed to the notebook-level `model`.
    Both `model` and `tokenizer` are looked up at call time, so the function
    follows whichever model is currently bound to those names.

    Args:
        prompt: User message to answer.
        max_new_tokens: Upper bound on the number of generated tokens.
        temp: Sampling temperature passed to `model.generate`.

    Returns:
        None. The completion is printed incrementally by a `TextStreamer`.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Below is an instruction that describes a task. Write a response that appropriately completes the request."},
        {"role": "user", "content": prompt},
    ]
    # Only template/tokenizer options belong in this call. The former
    # `torch_dtype` argument is not one of them and had no meaning for
    # integer token ids, so it has been dropped.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    model.eval()
    model.generate(
        input_ids=inputs,
        streamer=text_streamer,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        # temperature and min_p only take effect when sampling is enabled;
        # request it explicitly instead of relying on whatever generation
        # config the loaded checkpoint happens to ship with.
        do_sample=True,
        temperature=temp,
        min_p=0.1,
    )

In [None]:
# Sample generation 1: a question about supervised fine-tuning, capped at 1024 new tokens.
generate("Can you tell me about supervised fine tuning ?",  max_new_tokens=1024)

In [None]:
# Sample generation 2: a question about direct preference optimization.
# NOTE(review): max_new_tokens equals max_seq_length (2048), so prompt plus
# completion can exceed the length the model was loaded with. Confirm this is
# intended.
generate("How does direct preference optimization work ?", max_new_tokens=2048)

In [None]:
import gc

# Release the fine-tuned model before the merged checkpoint is loaded in the
# next cell. Without dropping the references, collecting garbage and emptying
# the CUDA cache cannot free the weights. The trainer was constructed with the
# model and holds its own reference to it, so both names are removed.
# pop() with a default keeps the cell re-runnable once they are already gone.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

gc.collect()
torch.cuda.empty_cache()

# Print the GPU summary again after the cleanup.
gpu_status()

In [None]:
# Options for loading the merged checkpoint that was pushed to the Hub; this
# time the weights are not loaded in 4-bit.
merged_model_options = dict(
    model_name="srivatsaHFhub/llama3.1_fineTomeAlpaca_modified",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
    device_map="sequential",
    trust_remote_code=True,
    use_gradient_checkpointing="unsloth",
)

# Rebinds the notebook-level model and tokenizer that generate() reads.
model, tokenizer = FastLanguageModel.from_pretrained(**merged_model_options)

In [None]:
# Switch the reloaded model to Unsloth's inference mode, then run one sample
# generation with the default limit of 512 new tokens.
FastLanguageModel.for_inference(model)
generate("How does temperature affect LLM output ?")

**These generations are not very precise or to the point, and they contain repeated information.**