Llama 2 QLoRA fine-tuning for campus Q&A

In [None]:
import torch
# Report the first CUDA device and its total memory (in GiB), or say that
# no GPU is visible to torch.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"gpu: {gpu_name}")
    print(f"mem: {total_bytes / 1024**3:.1f}gb")
else:
    print("gpu: none")
    print("no gpu")

In [None]:
%pip install -q transformers datasets accelerate peft bitsandbytes trl sentencepiece

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required so the meta-llama checkpoint used later
# in this notebook can be downloaded — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import json

# Load the Howard campus Q&A data: one JSON object per line.
# - encoding is explicit so decoding does not depend on the platform's
#   default locale;
# - blank lines (e.g. stray empty lines at the end of the file) are skipped
#   instead of crashing json.loads.
with open('campus_qa.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

def format_instruction(sample):
    """Render one Q&A record as a single training string.

    Args:
        sample: Mapping with an 'instruction' entry (the question) and an
            'output' entry (the answer).

    Returns:
        The text "### Human: <instruction>", a blank line, then
        "### Assistant: <output>".

    Raises:
        KeyError: If either entry is missing from ``sample``.
    """
    human_turn = f"### Human: {sample['instruction']}"
    assistant_turn = f"### Assistant: {sample['output']}"
    # The two turns are separated by exactly one blank line.
    return "\n\n".join((human_turn, assistant_turn))

# One {"text": ...} record per loaded example, in the original order.
formatted_data = [dict(text=format_instruction(record)) for record in data]
example_count = len(formatted_data)
print(f"{example_count} examples loaded")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit loading for QLoRA: NF4 quantization with double quantization.
# The compute dtype is float16 so it matches the fp16 mixed-precision
# training configured for the Trainer in this notebook; bfloat16 is not
# natively supported on pre-Ampere GPUs such as the T4 this was run on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"loading {model_name}...")
# trust_remote_code is deliberately not enabled: Llama 2 is implemented
# natively in transformers, so there is no reason to allow code from the
# model repository to execute.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token and pad on the right.
# NOTE(review): presumably the checkpoint's tokenizer defines no pad token
# of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("loaded")

In [None]:
# Get the quantized base model ready for k-bit training before the LoRA
# adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA rank notes from earlier runs:
#   r=8  -> seemed too small
#   r=32 -> slower and didn't help much
#   r=16 -> seems like the sweet spot (used below)
lora_rank = 16
lora_targets = ["q_proj", "v_proj", "k_proj", "o_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_rank,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_targets,
)

# Wrap the model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset

# Turn the list of {"text": ...} records into a datasets.Dataset.
dataset = Dataset.from_list(formatted_data)

training_args = TrainingArguments(
    # where checkpoints go / hub behaviour
    output_dir="./llama2-campus",
    save_strategy="epoch",
    push_to_hub=False,
    # schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=10,
    logging_steps=5,
    # memory: batch_size=2 was oom, so use 1 per device and accumulate
    # gradients over 4 steps instead
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    fp16=True,
)

# Supervised fine-tuning on the "text" column, capped at 512 tokens per
# sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)

print("training...")

In [None]:
# Single source of truth for the output directory of the finished run.
final_dir = "llama2-campus-final"

trainer.train()

# Save the trained model and the tokenizer side by side in final_dir.
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

# took about 25min on T4
print(f"done. saved to {final_dir}")

In [None]:
from transformers import pipeline

# Quick smoke test of the fine-tuned model using the training prompt format.
# - max_new_tokens bounds only the generated answer; max_length would also
#   count the prompt tokens, so longer questions would leave less (or no)
#   room for the reply.
# - return_full_text=False makes the pipeline return just the completion,
#   so the prompt does not have to be stripped off by string splitting.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    return_full_text=False,
)

test_q = "what are the library hours?"
prompt = f"### Human: {test_q}\n\n### Assistant:"
result = pipe(prompt)[0]['generated_text']
# The model may keep going and invent a follow-up "### Human:" turn; keep
# only the first assistant answer.
answer = result.split('### Human:')[0].strip()
print(answer)

In [None]:
# Zip the llama2-campus-final/ directory into a single archive (shell command).
!zip -r llama2-campus-model.zip llama2-campus-final/
# Hand the archive to google.colab's files.download.
# NOTE(review): google.colab is presumably only importable when running
# inside Google Colab — confirm before running elsewhere.
from google.colab import files
files.download('llama2-campus-model.zip')

In [None]:
# The saved model can be loaded locally with transformers.pipeline; use the
# same "### Human: ...\n\n### Assistant:" prompt format as above.