In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model output
# folder used later in this notebook both live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Work from the shared project folder so that relative paths such as ./models
# and ./logs resolve inside it. The folder name "Train SFT " ends with a space.
os.chdir('/content/drive/MyDrive/CSCI 544 Project/Train SFT ') # switch the working directory to the project folder
print("Current directory changed to:", os.getcwd())

Current directory changed to: /content/drive/.shortcut-targets-by-id/1HRF3kCndHCL6FW9EbuYfx-vCSYuQ1aVA/CSCI 544 Project/Train SFT 


In [None]:
# Uncomment on a fresh Colab runtime to install the required packages.
# !pip install datasets
# !pip install trl
# !pip install peft

# Environments

 - HF_TOKEN = "your Hugging Face API key", stored as a Colab notebook secret.
 - Install packages:
  - trl
  - datasets
  - peft
 - Access to your Google Drive.

# Datasets

In [None]:
import json
import random
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from datasets import load_dataset

In [None]:
# DO NOT RUN. Example open-source dataset, used only to rule out issues within trl.
#dataset = load_dataset("stanfordnlp/imdb", split="train")

In [None]:
# OUR DATA.
# Step 1: Load the JSON file into the Colab notebook
data_path = "/content/drive/MyDrive/CSCI 544 Project/Train SFT /datasets/data.json"  # Update this path if necessary

# Read with an explicit encoding: JSON is UTF-8 by specification, and relying on
# the platform default could mis-decode non-ASCII characters in movie titles.
with open(data_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten every record into a prompt/completion pair. The ground-truth titles
# are joined with ", " into a single completion string.
formatted_data = [
    {"prompt": entry["prompt"], "completion": ", ".join(entry["ground_truth"])}
    for entry in data.values()
]

# Step 2: Perform train-validation-test split, 70-15-15
# (30% is held out first, then split in half; the fixed seed keeps it reproducible.)
train_data, temp_data = train_test_split(formatted_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Convert the splits into Hugging Face datasets
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

# Combine the datasets into a DatasetDict for better organization
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Step 3: Sanity-check the data handed to SFTTrainer
# Each row carries the 'prompt' and 'completion' keys used by the formatting functions.
print("3 sample data from training set:")
# Never index past the end of a very small training split.
for i in range(min(3, dataset["train"].num_rows)):
    print(dataset["train"][i])

print("Train data size:", dataset["train"].num_rows, "rows.")
print("Validation data size:", dataset["validation"].num_rows, "rows.")
print("Test data size:", dataset["test"].num_rows, "rows.")

3 sample data from training set:
{'prompt': "I am a male, aged 19, from Charlottesville, VA, working as student.I have previously watched and liked the movies: ['Truth About Cats & Dogs, The (1996) (comedy, romance)', 'Birdcage, The (1996) (comedy)', 'Jerry Maguire (1996) (drama, romance)', 'Twelve Monkeys (1995) (drama, sci_fi)'].\nI have watched and HATED the movies: ['Multiplicity (1996) (comedy)', 'Time to Kill, A (1996) (drama)', 'Down Periscope (1996) (comedy)', 'Twister (1996) (action, adventure, thriller)', 'Ransom (1996) (drama, thriller)'].Please provide recommendations for movies released before April 22nd, 1998, based on my history.\nBased on my profile, recommend the top 5 movies I am most likely to watch next.\nPlease provide the output in a list of strings format, containing only the movie titles.\nMake sure to strictly adhere to the output format given below. Strictly do not generate any additional information other than the movie names.\nFormat:  ['movie_name', 'movie_

# Training
- SFTConfig
  - TrainingArguments
- SFTTrainer
- LoraConfig
  - QLoRA

- Reference:
  - [tutorial_1](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2024-with-trl.ipynb)
  -



#### TODO
- No access to llama-3.2

## Shared setting

In [None]:
from trl import SFTConfig, SFTTrainer
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig

In [None]:
# Hugging Face hub id of the base checkpoint. It is used for both training runs
# and for the baseline pipeline/tokenizer in the evaluation section.
model_id = "meta-llama/Llama-3.2-1B-Instruct" # alternative: meta-llama/Llama-3.2-11B-Vision-Instruct

In [None]:
# Training arguments, see reference.
# SFTConfig inherits from TrainingArguments; the options are grouped by purpose.
training_args = SFTConfig(
    # --- data ---
    max_seq_length=512,                 # max sequence length of the input data

    # --- optimisation ---
    num_train_epochs=3,                 # number of training epochs
    learning_rate=5e-5,                 # learning rate
    per_device_train_batch_size=1,      # batch size per device during training
    gradient_accumulation_steps=8,      # micro-batches accumulated per optimizer update
    gradient_checkpointing=True,        # use gradient checkpointing to save memory
    fp16=True,                          # fp16 mixed-precision training

    # --- checkpoints and logging ---
    output_dir="./models/yfy",          # directory to save and repository id
    save_strategy="epoch",              # save checkpoint every epoch
    logging_dir="./logs",               # directory for logging
    logging_steps=20,                   # log every 20 steps
    report_to=[],                       # Turn off wandb

    # --- disabled alternatives kept for reference ---
    # bf16=True,                        # use bfloat16 precision
    # tf32=True,                        # use tf32 precision
    # lr_scheduler_type="constant",     # use constant learning rate scheduler
    # warmup_ratio=0.03,                # warmup ratio based on QLoRA paper
    # max_grad_norm=0.3,                # max gradient norm based on QLoRA paper
    # optim="adamw_torch_fused",        # use fused adamw optimizer
    # push_to_hub=True,                 # push model to hub
    # report_to="tensorboard",          # report metrics to tensorboard
    # logging_steps=10,                 # log every 10 steps
)

In [None]:
def formatting_prompts_func(example):
    """Render a batch of rows as plain "### Prompt / ### Completion" training texts.

    Args:
        example: mapping with parallel sequences under the 'prompt' and
            'completion' keys (one entry per row of the batch).

    Returns:
        A list with one formatted string per entry of example['completion'].
    """
    row_count = len(example['completion'])
    return [
        f"### Prompt: {example['prompt'][row]}\n ### Completion: {example['completion'][row]}"
        for row in range(row_count)
    ]

In [None]:
# adding
def role_formatting_prompts_func(example):
    """Render a batch of rows as role-tagged (user / assistant) training texts.

    The completion is wrapped in square brackets inside the assistant turn.

    Args:
        example: mapping with parallel sequences under the 'prompt' and
            'completion' keys (one entry per row of the batch).

    Returns:
        A list with one formatted string per entry of example['completion'].
    """
    row_count = len(example['completion'])
    return [
        f"### role: user, content: {example['prompt'][row]}\n ### role: assistant, content: [{example['completion'][row]}]"
        for row in range(row_count)
    ]

In [None]:
# CHANGE TO YOUR OWN PATH
# Every model saved by this notebook is written below this per-user Drive folder.
name = input("Enter your name: ")
path = "/content/drive/MyDrive/CSCI 544 Project/Train SFT /models/{}/".format(name)

Enter your name: yfy


## Standard SFT

In [None]:
# Load the pretrained base checkpoint with default settings; this copy is
# trained without any PEFT adapter in the "Standard SFT" run.
model = AutoModelForCausalLM.from_pretrained(model_id)          # model for sft

KeyboardInterrupt: 

In [None]:
# Define the trainer for standard SFT (no peft_config is passed to it).
trainer = SFTTrainer(
    model=model,                    # dedicated model instance; the LoRA run loads its own copy
    train_dataset=dataset['train'], # append .select(range(20)) for a quick 20-sample run
    args=training_args,
    formatting_func=formatting_prompts_func
)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Map:   0%|          | 0/611 [00:00<?, ? examples/s]

In [None]:
# Train on the full training split.
# NOTE: the recorded run of this cell ended with a CUDA out-of-memory error.
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.17 GiB. GPU 0 has a total capacity of 14.75 GiB of which 523.06 MiB is free. Process 2699 has 14.23 GiB memory in use. Of the allocated memory 14.06 GiB is allocated by PyTorch, and 57.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the trained model below the per-user output folder.
# NOTE(review): the folder name says "gpt2" (and "stf"), but model_id points to a
# Llama checkpoint - confirm the intended name before relying on it.
trainer.save_model(path + "stf-std-gpt2-full-train-data")

In [None]:
# Test run on 20 training samples.
# Presumably requires rebuilding `trainer` with train_dataset=dataset['train'].select(range(20)) first - TODO confirm.
trainer.train()

Step,Training Loss
1,2.0536
2,1.9506
3,1.8094
4,1.6528
5,1.5504
6,1.4232
7,1.3402
8,1.4153
9,1.3937


TrainOutput(global_step=9, training_loss=1.6210273106892903, metrics={'train_runtime': 144.7646, 'train_samples_per_second': 0.414, 'train_steps_per_second': 0.062, 'total_flos': 9802533888000.0, 'train_loss': 1.6210273106892903, 'epoch': 3.0})

## SFT with QLora

In [None]:
# Load a second, independent copy of the pretrained base checkpoint for the LoRA run.
model_lora = AutoModelForCausalLM.from_pretrained(model_id)    # model for lora

In [None]:
# save_directory = "/content/drive/MyDrive/CSCI 544 Project/Train SFT /models/meta-llama/Llama-3.2-1B-Instruct"

# # Save the model
# model_lora.save_pretrained(save_directory)

In [None]:
# LoRA adapter config.
# NOTE(review): this section is titled QLoRA, but model_lora is loaded without any
# quantization config, so as written this is plain LoRA - confirm the intent.
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [None]:
# LoRA trainer: same training arguments as the standard run, plus the adapter
# config and the role-tagged prompt format.
lora_trainer = SFTTrainer(
    model=model_lora,
    train_dataset=dataset['train'], # append .select(range(20)) for a quick 20-sample run
    peft_config=peft_config,
    args=training_args,
    formatting_func=role_formatting_prompts_func
)

Map:   0%|          | 0/611 [00:00<?, ? examples/s]

In [None]:
# Train the LoRA adapter on the full training split (611 rows).
lora_trainer.train()

Step,Training Loss
20,1.0674
40,0.498
60,0.4314
80,0.4135
100,0.3637
120,0.3562
140,0.3473
160,0.3498
180,0.3219
200,0.3196


TrainOutput(global_step=228, training_loss=0.4309795540675782, metrics={'train_runtime': 670.0828, 'train_samples_per_second': 2.735, 'train_steps_per_second': 0.34, 'total_flos': 3755123903410176.0, 'train_loss': 0.4309795540675782, 'epoch': 2.985270049099836})

In [None]:
# Save the LoRA-trained model below the per-user output folder.
lora_trainer.save_model(path + "112024-sft-lora-role-Llama-3.2-1B-Instruct-full-train-data")

# Evaluation



In [None]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from datasets import load_dataset
from random import randint
from tqdm import tqdm

In [None]:
# Change the model name (after merging) to whichever checkpoint you want to try.
# NOTE(review): the LoRA run above saves to a folder containing "-lora-role-",
# while this path has "-lora-" only - confirm that this folder exists.
test_model_id = "./models/yfy/112024-sft-lora-Llama-3.2-1B-Instruct-full-train-data"

In [None]:
# Load the fine-tuned checkpoint in half precision. device_map=None switches off
# automatic device placement, so the model is moved to the GPU explicitly.
test_model = AutoModelForCausalLM.from_pretrained(
    test_model_id,
    device_map=None,
    torch_dtype=torch.float16,
)
test_model = test_model.to("cuda")

# Tokenizer is read from the same checkpoint folder.
test_tokenizer = AutoTokenizer.from_pretrained(test_model_id)

# Wrap model and tokenizer in a text-generation pipeline.
test_pipe = pipeline(task="text-generation", model=test_model, tokenizer=test_tokenizer)

NameError: name 'test_model_id' is not defined

In [None]:
# Tokenizer of the base checkpoint (model_id).
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Disabled: pipeline around the in-memory LoRA model.
# pipe = pipeline("text-generation", model=model_lora, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Baseline pipeline built directly from the hub id in model_id (not from a
# fine-tuned checkpoint), in bfloat16 with automatic device placement.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Load our test dataset
#eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

# Test the baseline pipeline on one validation sample, sent as a chat message.
idx = 30
validation_row = dataset['validation'][idx]
prompt = validation_row['prompt']
message = [{"role": "user", "content": prompt}]
outputs = pipe(message, max_new_tokens=256)

print(f"Query:\n{prompt}")
print(f"Original Answer:\n{validation_row['completion']}")
#print(f"Generated Answer:\n{outputs}")
# The reply is read from the last entry of the returned message list.
generated_answer = outputs[0]['generated_text'][-1]['content']
print(f"Generated Answer:\n{generated_answer}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Query:
I am a female, aged 35, from Berkeley, CA, working as programmer.I have previously watched and liked the movies: ["Singin' in the Rain (1952) (musical, romance)", 'Lawrence of Arabia (1962) (adventure, war)', 'Shawshank Redemption, The (1994) (drama)', 'Pulp Fiction (1994) (crime, drama)', 'Paradise Lost: The Child Murders at Robin Hood Hills (1996) (documentary)'].Please provide recommendations for movies released before April 22nd, 1998, based on my history.
Based on my profile, recommend the top 5 movies I am most likely to watch next.
Please provide the output in a list of strings format, containing only the movie titles.
Make sure to strictly adhere to the output format given below. Strictly do not generate any additional information other than the movie names.
Format:  ['movie_name', 'movie_name', ... 'movie_name']
Make sure to limit the recommendations to movies available in the MovieLens dataset.
Original Answer:
Bridge on the River Kwai, The (1957), American in Paris, A

In [None]:
# Load our test dataset
#eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

# Test the fine-tuned pipeline on one validation sample (raw prompt, no sampling).
idx = 8
validation_row = dataset['validation'][idx]
prompt = validation_row['prompt']
outputs = test_pipe(prompt, max_new_tokens=100, do_sample=False)

print(f"Query:\n{prompt}")
print(f"Original Answer:\n{validation_row['completion']}")
#print(f"Generated Answer:\n{outputs}")
# Drop the first len(prompt) characters (presumably the echoed prompt) and trim whitespace.
generated_answer = outputs[0]['generated_text'][len(prompt):].strip()
print(f"Generated Answer:\n{generated_answer}")

NameError: name 'test_pipe' is not defined

In [None]:
# Credentials: never hard-code the Hugging Face key in the notebook.
# SECURITY: any token that was previously committed in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
# Lookup order: an already exported HUGGINGFACE_API_KEY, then HF_TOKEN from the
# environment, then the HF_TOKEN entry of the Colab "Secrets" panel.
hf_api_key = os.environ.get("HUGGINGFACE_API_KEY") or os.environ.get("HF_TOKEN")
if not hf_api_key:
    from google.colab import userdata  # only needed when the environment has no key
    hf_api_key = userdata.get("HF_TOKEN")
# Set before importing the project packages (presumably they read this variable
# at import time - verify against packages/evaluation.py).
os.environ["HUGGINGFACE_API_KEY"] = hf_api_key
from packages.evaluation import read_embeddings_from_csv, evaluate_recommendations_for_all_users, distill_top_k_movies
from packages.models import map_movies_to_dataset

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# CSV file with the movie embeddings; passed to evaluate() as embedding_path.
embedding_path = "/content/drive/MyDrive/CSCI 544 Project/Train SFT /datasets/movie_embeddings.csv"

In [None]:
# How to evaluate???
def _split_ground_truth_titles(completion):
    """Split a ", "-joined completion string back into individual movie titles.

    Titles can contain commas themselves (e.g. "Bridge on the River Kwai, The (1957)"),
    so a plain split on "," cuts them into fragments. When the string contains
    "(YYYY)" release years, only the commas that directly follow such a year are
    treated as separators. Strings without any year fall back to the plain comma
    split. Assumes every title ends with its "(YYYY)" year when years are present -
    TODO confirm against the dataset.

    Args:
        completion: the ground-truth titles joined with ", ".

    Returns:
        List of whitespace-stripped title strings.
    """
    import re  # local import keeps this cell self-contained

    if re.search(r"\(\d{4}\)", completion):
        parts = re.split(r"(?<=\(\d{4}\))\s*,\s*", completion)
    else:
        parts = completion.split(",")
    return [part.strip() for part in parts]


def evaluate(sample, pipline, top_k, embedding_path, max_new_tokens=256):
    """
    Evaluate the recommendation quality of an LLM pipeline.

    Args:
        sample: indexable collection of rows with 'prompt' and 'completion' keys.
        pipline: callable invoked as pipline(messages, max_new_tokens=...); the reply
            text is read from result[0]['generated_text'][-1]['content'].
            (The misspelled parameter name is kept for backward compatibility.)
        top_k: number of recommendations per user; shorter ground-truth lists are
            padded with '' up to this length.
        embedding_path: CSV file passed to read_embeddings_from_csv.
        max_new_tokens: generation budget forwarded to the pipeline.

    Returns:
        The metrics returned by evaluate_recommendations_for_all_users.
    """
    # load movie embeddings
    movie_id_list, movie_titles_list, movie_embeddings_2d_list = read_embeddings_from_csv(embedding_path)
    movie_embeddings_2d_list = torch.FloatTensor(movie_embeddings_2d_list)#.to(device)

    # get inference output from llm, embed output and map to recommendation space.
    recommended_movies = dict()
    for i in tqdm(range(len(sample))):
        message = [{'role': 'user', 'content': sample[i]['prompt']}]
        llm_outputs = pipline(message, max_new_tokens=max_new_tokens)
        # TODO: change this to meet the new format. It's inside evaluation.py
        # (This note used to be a string literal trailing the statement, which is a SyntaxError.)
        movies_titles = distill_top_k_movies(llm_outputs[0]['generated_text'][-1]['content'], top_k)
        recommended_movies[i] = map_movies_to_dataset(movies_titles, movie_id_list, movie_embeddings_2d_list, pooling='mean')

    # get ground truth list
    ground_truth_movies = dict()
    for i in range(len(sample)):
        movies = _split_ground_truth_titles(sample[i]['completion'])

        # pad when the ground truth has fewer than top_k movies
        if len(movies) < top_k:
            movies = movies + [''] * (top_k - len(movies))
        ground_truth_movies[i] = movies

    # evaluate
    metrics = evaluate_recommendations_for_all_users(ground_truth_movies, recommended_movies, top_k)

    return metrics

In [None]:
# work
# Evaluation settings for a small run on the validation split.
number_of_eval_samples = 10
embedding_path = "/content/drive/MyDrive/CSCI 544 Project/Train SFT /datasets/movie_embeddings.csv"
top_k = 5

# Run the baseline pipeline over the first validation rows and score its output.
eval_subset = dataset['validation'].select(range(number_of_eval_samples))
metrics = evaluate(eval_subset, pipe, top_k, embedding_path, max_new_tokens=256)

print("Average Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

100%|██████████| 10/10 [38:28<00:00, 230.89s/it]

Average Evaluation Metrics:
MAP: 0.0000
NDCG@k: 0.0000
Precision@k: 0.0000
Recall@k: 0.0000





# Issues

- Processed dataset: what is the desired completion format? Should we include the surrounding "[ ]"?
  - Does every user have five movies to watch?

- How should we evaluate the recommendations?
