In [None]:
!pip install -q -U bitsandbytes
!git clone https://github.com/huggingface/transformers.git
%cd transformers
!pip install .
%cd ../
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install --upgrade datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

import os

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Hub access token is read from the environment instead of being pasted into
# the notebook. When HF_TOKEN is unset this is None and `from_pretrained`
# falls back to its own default credential lookup.
hf_token = os.environ.get("HF_TOKEN")

# 4-bit NF4 quantization with double quantization; matmuls are computed in
# bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
# device_map={"": 0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=hf_token,
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the loaded model before handing it to PEFT.
model.gradient_checkpointing_enable()
# Rebind `model` to whatever PEFT's k-bit preparation helper returns; every
# later cell works on this rebound object. Re-running this cell feeds the
# already-prepared model back into the helper.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()`` once and prints a single summary line
    with the trainable element count, the total element count, and the
    trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``
            and ``requires_grad``.

    Raises:
        ZeroDivisionError: If the model has no parameter elements at all.
    """
    # (element count, is-trainable) for every parameter, gathered in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]

    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}")

In [None]:
# Print the name of every submodule in the model, one per line. Presumably
# used to find the layer names passed as LoRA `target_modules` — confirm.
for name, module in model.named_modules():
    print(name)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal-LM fine-tuning: rank-8 adapters with
# alpha 32 on the q/k/v projection modules, 5% adapter dropout, and no bias
# terms trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to the PEFT-wrapped model, then report how much of it trains.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
from datasets import load_dataset
# Load all splits of raw WikiText-2 and tokenize the "text" column in batches.
# No truncation or padding arguments are passed to the tokenizer here.
data = load_dataset("wikitext", "wikitext-2-raw-v1").map(
    lambda samples: tokenizer(samples["text"]),
    batched=True,
)

# Row counts of the train and test splits, one per line.
print(len(data["train"]), len(data["test"]), sep="\n")

In [None]:
# Log in to Weights & Biases from the shell. `**` looks like a redacted API-key
# placeholder. NOTE(review): prefer supplying the key through the
# WANDB_API_KEY environment variable rather than pasting it into the notebook.
!wandb login **

In [None]:
import torch
from tqdm import tqdm
import torch.nn as nn

def evaluate_ppl(model, tokenizer, device="cuda:0", seqlen=2048):
    """Compute perplexity of ``model`` on the WikiText-2 (raw) test split.

    The test texts are joined with blank lines, tokenized as one long
    sequence, and cut into consecutive non-overlapping windows of ``seqlen``
    tokens; a trailing partial window is dropped. Each window is scored with
    next-token cross-entropy and the result is the exponential of the average
    window loss.

    The model is put in eval mode for the duration of the call so that
    train-only behaviour (e.g. dropout) does not perturb the score, and its
    previous train/eval mode is restored afterwards, even on error.

    Args:
        model: Causal LM; expected to be a ``torch.nn.Module`` whose forward
            call on a ``(1, seqlen)`` id tensor returns an object with a
            ``.logits`` attribute.
        tokenizer: Callable that accepts a string plus ``return_tensors="pt"``
            and returns an object with ``.input_ids``.
        device: Device the token ids are moved to before scoring.
        seqlen: Window length in tokens. Defaults to 2048, the value that was
            previously hard-coded.

    Returns:
        float: Perplexity over the evaluated windows.

    Raises:
        ValueError: If ``seqlen`` is smaller than 2, or the tokenized test
            set is shorter than one full window.
    """
    if seqlen < 2:
        raise ValueError("seqlen must be at least 2 to have a next-token target")

    test_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    token_ids = tokenizer(
        "\n\n".join(test_dataset["text"]), return_tensors="pt"
    ).input_ids.to(device)

    nsamples = token_ids.numel() // seqlen
    if nsamples == 0:
        raise ValueError(
            f"tokenized test set has {token_ids.numel()} tokens, "
            f"fewer than one window of {seqlen}"
        )

    # Earlier versions stored the window length on the model; keep doing so in
    # case other code reads `model.seqlen` after this call.
    model.seqlen = seqlen

    loss_fct = nn.CrossEntropyLoss()  # loop-invariant, so built once
    nlls = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in tqdm(range(nsamples), desc="Evaluating..."):
                batch = token_ids[:, i * seqlen : (i + 1) * seqlen]
                lm_logits = model(batch).logits

                # Position t predicts token t + 1: drop the last logit and
                # the first label so the two line up.
                shift_logits = lm_logits[:, :-1, :].contiguous().float()
                shift_labels = batch[:, 1:]

                loss = loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.reshape(-1),
                )
                # Mean window loss scaled by the window length; the same
                # factor is divided out again below.
                nlls.append(loss.float() * seqlen)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))

    return ppl.item()

In [None]:
# Baseline measurement: score the model with evaluate_ppl before the training
# cell runs. At this point `model` is already the LoRA-wrapped model from
# get_peft_model.
perplexity_score = evaluate_ppl(model, tokenizer)
print(f"Perplexity on test set before fine-tuning: {perplexity_score:.4f}")

In [None]:
import transformers

# Release cached CUDA memory before training starts.
torch.cuda.empty_cache()

# Reuse the EOS token as the padding token (presumably so the collator can
# pad variable-length rows — confirm), then switch the model to train mode.
tokenizer.pad_token = tokenizer.eos_token
model.train()

# Hyperparameters: 2 sequences per device x 6 accumulation steps, 8 epochs,
# lr 1e-4 with 100 warmup steps, gradient norm clipped at 0.8, paged 8-bit
# AdamW, fp16 mixed precision, logs every 500 steps, outputs under "outputs".
# A cosine schedule (lr_scheduler_type="cosine") was tried and left disabled.
# NOTE(review): fp16=True is used here while bnb_config sets a bfloat16
# compute dtype — confirm this mix is intended for the target GPU.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    num_train_epochs=8,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=6,
    learning_rate=1e-4,
    warmup_steps=100,
    max_grad_norm=0.8,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=500,
)

# mlm=False: plain causal language-modeling collation, no masked-LM masking.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=collator,
)

# The cache is switched off during training to silence warnings and switched
# back on afterwards for inference.
model.config.use_cache = False
trainer.train()
model.config.use_cache = True

In [None]:
# Persist the trained model to ./finetuned_model. `model` is the PEFT wrapper
# returned by get_peft_model, so this presumably writes only the adapter
# weights and config rather than the full base model — confirm.
model.save_pretrained("./finetuned_model")

In [None]:
# Re-score the model with evaluate_ppl now that training has run, for
# comparison with the pre-training figure. Overwrites `perplexity_score`.
perplexity_score = evaluate_ppl(model, tokenizer)
print(f"Perplexity on test set after fine-tuning: {perplexity_score:.4f}")

In [None]:
# Merge the adapter into the underlying model and save the result together
# with the tokenizer in ./finetuned_final_model, the folder uploaded later.
# NOTE(review): the base model was loaded 4-bit quantized, so the merge
# happens on quantized weights — confirm the resulting checkpoint format and
# quality are what is wanted before publishing.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./finetuned_final_model")
tokenizer.save_pretrained("./finetuned_final_model")

In [None]:
from huggingface_hub import HfApi
import os

# Hub token is taken from the HF_TOKEN environment variable rather than being
# written into the notebook. If it is unset, HfApi receives None and uses its
# own default credential lookup.
HF_TOKEN = os.environ.get("HF_TOKEN")
repo_id = "zbyzby/Llama3.2-3B-Instruct-QLoRA-finetuned"
folder_path = "./finetuned_final_model"
api = HfApi(token=HF_TOKEN)

# Push the whole folder in one call instead of one upload (and one commit)
# per file. This also avoids building repo paths with os.path, which yields
# backslash-separated paths on Windows.
print(f"Uploading: {folder_path} -> {repo_id}")
api.upload_folder(
    folder_path=folder_path,
    repo_id=repo_id,
)
print("All files uploaded.")