In [None]:
!pip install -r requirements.txt

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
import os
import numpy as np

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    # pipeline, 
    # logging
)
from peft import LoraConfig

from trl import SFTTrainer

import wandb

import gc

import evaluate

import nltk
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt" resource (presumably required by sent_tokenize,
# which is imported just above).
nltk.download("punkt")

# Export the allocator / attention settings as process environment variables
# in a single call.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
        "USE_FLASH_ATTENTION": "1",
    }
)

# Sanity check: report whether this torch build can see a CUDA device.
print(f'Is CUDA available on torch? {torch.cuda.is_available()}.')

In [None]:
def display_cuda_memory(device=0):
    """Print CUDA memory statistics for one device, expressed in GB.

    Prints the values of ``torch.cuda.memory_allocated``,
    ``torch.cuda.memory_reserved`` and ``torch.cuda.max_memory_reserved`` for
    ``device``, each converted from bytes to GB (1 GB = 1024**3 bytes) and
    framed by two separator lines.

    Args:
        device: Device passed unchanged to the ``torch.cuda`` memory queries.
            Defaults to ``0``, the index this function previously hard-coded,
            so existing zero-argument calls behave as before.

    Returns:
        None. The report is written to standard output.
    """
    bytes_per_gb = 1024 ** 3
    separator = "\n--------------------------------------------------\n"
    # (label, query) pairs, in the order they are printed.
    queries = (
        ("torch.cuda.memory_allocated", torch.cuda.memory_allocated),
        ("torch.cuda.memory_reserved", torch.cuda.memory_reserved),
        ("torch.cuda.max_memory_reserved", torch.cuda.max_memory_reserved),
    )

    print(separator)
    for label, query in queries:
        print("%s: %fGB" % (label, query(device) / bytes_per_gb))
    print(separator)

In [None]:
# Force a garbage-collection pass. Because this call is the last expression in
# the cell, its return value is shown as the cell output.
gc.collect()

In [None]:
# Map each split name to its JSON file, then load all three splits in one call.
data_files = dict(
    train='train_data.json',
    validation='val_data.json',
    test='test_data.json',
)
data = load_dataset("json", data_files=data_files)
# Last expression of the cell: display the loaded dataset object.
data

In [None]:
# Device string passed as `device_map` when the model is loaded.
device_type = "cuda:0"
# Checkpoint identifier used to load both the model and the tokenizer.
base_model = "lmsys/vicuna-7b-v1.5"

In [None]:
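# Disabled: optional 4-bit (NF4) quantization config.
# NOTE(review): re-enabling this also requires importing BitsAndBytesConfig
# (it is not in the import cell) and uncommenting
# `quantization_config=quant_config` in the model-loading cell.
# compute_dtype = getattr(torch, "float16")
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=False
# )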

In [None]:
# Load the base causal-LM checkpoint and place it on `device_type`.
# NOTE(review): no dtype or quantization config is passed, so the library
# default precision is used — confirm the model fits in GPU memory.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # quantization_config=quant_config,
    device_map=device_type
)
# Switch off the config's `use_cache` flag (presumably because the generation
# cache is not wanted during training — TODO confirm).
model.config.use_cache = False
# Set the config's `pretraining_tp` value to 1.
model.config.pretraining_tp = 1

In [None]:
# Report GPU memory usage right after the model has been loaded.
display_cuda_memory()

In [None]:
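# Disabled experiment: save the model as a sharded checkpoint (200MB shards) in
# a temporary directory and load those shards straight back into `model`.
# import tempfile
# from transformers.modeling_utils import load_sharded_checkpoint

# with tempfile.TemporaryDirectory() as tmp_dir:
#     model.save_pretrained(tmp_dir, max_shard_size="200MB")
#     load_sharded_checkpoint(model, tmp_dir)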

In [None]:
# Load the tokenizer that belongs to `base_model`.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository — confirm it is actually needed for this model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad sequences on their right-hand side.
tokenizer.padding_side = "right"

In [None]:
# LoRA adapter settings: rank 32, alpha 32, dropout 0.1, no bias, causal-LM task.
# NOTE(review): `peft_config=peft_params` is commented out where the trainer is
# constructed, so this config is currently not applied.
peft_params = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Load the ROUGE metric through `evaluate`; `compute_metrics` calls
# `metric.compute` on the decoded predictions and references.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Every string is stripped of surrounding whitespace, split into sentences
    with ``sent_tokenize`` and re-joined with one sentence per line, because
    rougeLSum expects a newline after each sentence.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` pair of lists holding the reformatted strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(texts):
        # Join the sentences of each text with newlines.
        return ["\n".join(sent_tokenize(text)) for text in texts]

    return (
        _one_sentence_per_line(stripped_preds),
        _one_sentence_per_line(stripped_labels),
    )

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean prediction length for an evaluation pass.

    Args:
        eval_preds: A ``(preds, labels)`` pair. ``preds`` may be wrapped in a
            tuple, in which case only its first element is used. ``preds`` may
            hold token ids, or 3-D per-token scores whose last axis is reduced
            with ``argmax`` to obtain token ids. The value -100 in either
            array is treated as padding.

    Returns:
        dict: every score returned by ``metric.compute`` multiplied by 100 and
        rounded to 4 decimals, plus ``"gen_len"``, the mean number of
        non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)

    # A plain Trainer hands over raw logits for a causal LM (presumably shaped
    # (batch, seq, vocab) — TODO confirm); batch_decode needs token ids, so
    # take the highest-scoring token at each position.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # Replace -100 in predictions AND labels: it marks padding / ignored
    # positions and is not a valid token id, so it cannot be decoded.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Count the non-padding tokens of each prediction row.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
# Log into Weights & Biases without embedding the API key in the notebook:
# the key is read from the WANDB_API_KEY environment variable. When that
# variable is unset, `key=None` is passed and wandb falls back to its own
# credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Set the WANDB_PROJECT environment variable for this kernel session.
%env WANDB_PROJECT=vicuna-7b-v1.5-full-fine-tuning-window

In [None]:
# define function to format data to prompt instruction format
def prompt_instruction_format(sample):
    """Render dataset rows as instruction-style training prompts.

    Accepts either one row (``sample['question']`` and ``sample['answer']``
    are single values) or a batch of rows (both are lists/tuples of equal
    length). A batch yields one prompt per row, instead of interpolating the
    whole list into a single prompt.

    Args:
        sample: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        list[str]: One prompt per row; a single-element list for a single row.
    """
    questions = sample['question']
    answers = sample['answer']
    if isinstance(questions, (list, tuple)):
        # Batched input: pair each question with its answer.
        return [_render_prompt(q, a) for q, a in zip(questions, answers)]
    return [_render_prompt(questions, answers)]


def _render_prompt(question, answer):
    """Fill the prompt template for one question/answer pair."""
    # The template text, including its leading spaces, is training data; keep
    # it exactly as is.
    return f"""### Instruction:
    You are a friendly and patient professional who cares about dogs. Use the given Input below to write the Response.
    If you have not seen a similar input to Input, politely respond that it is not within your knowledge as a Response.

    ### Input:
    {question}

    ### Response:
    {answer}
    """

In [None]:
# Hyper-parameters for the training run: 3 epochs, batch size 1 per device,
# no gradient accumulation, AdamW (torch implementation), full precision
# (fp16/bf16 both off), logging to Weights & Biases.
# NOTE(review): `warmup_ratio` is set together with lr_scheduler_type="constant";
# check whether "constant_with_warmup" was intended, otherwise the warmup
# setting presumably has no effect.
# NOTE(review): `output_dir` is named "lora_fine_tune" although the LoRA config
# is not passed to the trainer — confirm the intended name.
training_params = TrainingArguments(
    output_dir="lora_fine_tune",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    # optim="paged_adamw_32bit",
    optim="adamw_torch",
    save_steps=25,
    logging_steps=3,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # -1 leaves the number of steps to be derived from num_train_epochs.
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb",
    disable_tqdm=False,
    seed=55,
)

In [None]:
# Report GPU memory usage before the trainer is constructed.
display_cuda_memory()

In [None]:
# Build the supervised fine-tuning trainer on the training split, turning each
# row into a prompt with `prompt_instruction_format` (no sequence packing).
# NOTE(review): `eval_dataset` and `peft_config` are commented out, so no
# validation split or LoRA config is passed; `compute_metrics` is presumably
# never invoked without an evaluation set — TODO confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=data['train'],
    # eval_dataset=data['validation'],
    # peft_config=peft_params,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
    formatting_func=prompt_instruction_format,
    compute_metrics=compute_metrics,
)

In [None]:
# Report GPU memory usage after the trainer has been constructed.
display_cuda_memory()

In [None]:
# Free unreferenced Python objects first, then release PyTorch's unused cached
# CUDA memory, so training starts with as much free GPU memory as possible.
gc.collect()

torch.cuda.empty_cache()

# Train the model
trainer.train()

In [None]:
# Report GPU memory usage once training has finished.
display_cuda_memory()

In [None]:
# Close the Weights & Biases run that was logged to during training.
wandb.finish()