In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone https://github.com/fernandososter/RAG.git

Cloning into 'RAG'...
remote: Enumerating objects: 20537, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (51/51), done.[K


In [None]:
!pip install -r /content/RAG/QLoRA-Youtube/requirements.txt


In [None]:
import os
import pickle

import pandas as pd
import scipy as sp
import numpy as np
import torch
import matplotlib.pyplot as plt
import wandb
from google.colab import userdata
from huggingface_hub import login
from peft import PeftModel
from peft import LoraConfig
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from transformers import EarlyStoppingCallback
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
#from item import Item


%matplotlib inline

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
  if gpu_info.find('Tesla T4') >= 0:
    print("Success - Connected to a T4")
  else:
    print("NOT CONNECTED TO A T4")

In [None]:
# Read both secrets from the Colab userdata store (nothing is hard-coded in the notebook).
hf_token = userdata.get('HF_TOKEN')
wandb_api_key = userdata.get('WANDB_API_KEY')

# Authenticate against the Hugging Face Hub and also store the token as a git credential.
login(hf_token, add_to_git_credential=True)

# Expose the W&B key through the environment before calling wandb.login().
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

In [None]:
# --- Paths -------------------------------------------------------------------
PROJECT_PATH = '/content/RAG/QLoRA-Youtube/'
MODEL_CACHE_PATH = '/content/drive/MyDrive/modelos/'

# --- Model / project identity ------------------------------------------------
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "QLoRA-Youtube"
HF_USER = "fsoster"

QUANT_4_BIT = True

# --- ANSI colour codes for console output ------------------------------------
GREEN, YELLOW, RED, RESET = "\033[92m", "\033[93m", "\033[91m", "\033[0m"
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}

# --- Run naming: the Hub repo id is derived from project name + run name ------
RUN_NAME = "2025-11-11_13.04.39"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

# --- Weights & Biases ---------------------------------------------------------
LOG_TO_WANDB = True

os.environ.update({
    "WANDB_PROJECT": PROJECT_NAME,
    "WANDB_LOG_MODEL": "checkpoint" if LOG_TO_WANDB else "end",
    "WANDB_WATCH": "gradients",
})

# Start the W&B run up front so everything that follows is attached to it.
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)



In [None]:
from typing import Optional
class Item:
    """A video title and its view count, rendered as a question/answer prompt.

    The full prompt has the form::

        How many views for this video?

        <title>

        Views are <rounded view_count>

    Attributes:
        title: Video title inserted into the prompt.
        view_count: Number of views; rounded to the nearest integer in the prompt.
        prompt: Full prompt including the answer.
        token_count: Number of tokens in ``prompt`` (no special tokens added).
    """

    # Text that introduces the answer; everything after it is the label.
    PREFIX = "Views are "
    QUESTION = "How many views for this video?"
    # Loaded once, when the class body executes, and shared by every instance.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

    prompt: Optional[str] = None
    title: str
    view_count: float
    token_count: int

    def __init__(self, title, view_count):
        """Store the fields and build the prompt immediately."""
        self.title = title
        self.view_count = view_count
        self.makePrompt(self.title)

    def makePrompt(self, text):
        """Build ``self.prompt`` from ``text`` and record its token count.

        Args:
            text: The body placed between the question and the answer line.
        """
        self.prompt = f"{self.QUESTION}\n\n{text}\n\n"
        self.prompt += f"{self.PREFIX}{round(self.view_count)}"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        """Return the prompt with the answer removed, ending right after ``PREFIX``.

        The split is made at the LAST occurrence of ``PREFIX``: the answer line is
        always the final one, so a title that itself contains "Views are " no
        longer truncates the prompt.
        """
        return self.prompt.rsplit(self.PREFIX, 1)[0] + self.PREFIX

In [None]:
# PROJECT_PATH already ends with a slash; os.path.join avoids the doubled separator
# that the previous f-string produced.
df = pd.read_csv(os.path.join(PROJECT_PATH, "dataset", "youtube_video.csv"))
df.head()

In [None]:
# Turn every (title, view_count) row into an Item.
columns = ["title", "view_count"]
items = [Item(**record) for record in df[columns].to_dict(orient="records")]

# Positional 30/70 split: the first 30% of rows become the test set, the rest train.
split_at = round(len(items) * 0.3)
test, train = items[:split_at], items[split_at:]

In [None]:
# Disabled code: the whole cell is a bare string literal, so none of it executes.
# The text inside sketches a pickle-based cache of the train/test split: load both
# lists from dataset/train.pkl and dataset/test.pkl when the train file exists,
# otherwise rebuild them from `df` and write both files.
# NOTE(review): if this is re-enabled, remember that pickle.load must only be used
# on files you trust.
'''
pkl_train_file = 'dataset/train.pkl'
pkl_test_file = 'dataset/test.pkl'

if os.path.exists(pkl_train_file):
  with open(pkl_train_file, "rb") as f:
        train = pickle.load(f)

  with open(pkl_test_file, "rb") as f:
        test = pickle.load(f)

else:
    columns = ["title", "view_count"]
    items = [Item(**item) for item in df[columns].to_dict(orient="records")]
    test = items[:round(len(items)*0.3)]
    train = items[round(len(items)*0.3):]
    with open(pkl_train_file, 'wb') as file:
        pickle.dump(train, file)

    with open(pkl_test_file, 'wb') as file:
        pickle.dump(test, file)
'''

In [None]:
# Show the sizes of both splits as (train, test).
(len(train), len(test))

In [None]:

# Quantization used when loading the base model: 4-bit NF4 with double quantization
# (the QLoRA setup) or plain 8-bit.
if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  # BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option; the keyword that used
  # to be passed here was silently discarded as an unused kwarg, so it is dropped.
  quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Fix the RNG seeds before any model work.
set_seed(42)
prompt = train[0].prompt

# Tokenizer: reuse EOS as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model, with weights cached under MODEL_CACHE_PATH.
model_load_kwargs = dict(
    quantization_config=quant_config,
    device_map="auto",
    cache_dir=MODEL_CACHE_PATH,
)
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **model_load_kwargs)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:.1f} GB")

In [None]:
# LoRA hyper-parameters
LORA_R = 32                                                 # adapter rank
LORA_ALPHA = 64                                             # scaling factor
LORA_DROPOUT = 0.1
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]   # attention projections

lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    bias="none",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
)

In [None]:
# Training setup
EPOCHS = 1
BATCH_SIZE = 16                     # A100 GPU can go up to 16
GRADIENT_ACCUMULATION_STEPS = 2
MAX_SEQUENCE_LENGTH = 182           # Max token length per input

# Optimization
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

# Checkpointing & logging
SAVE_STEPS = 200        # Checkpoint every 200 steps
STEPS = 20              # Log and evaluate every 20 steps
save_total_limit = 10   # Keep latest 10 only

# LOG_TO_WANDB comes from the configuration cell. It is deliberately NOT re-assigned
# here: a second `LOG_TO_WANDB = True` would silently override a user who disabled it.

HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# NOTE(review): bf16=True needs a GPU with bfloat16 support, while the GPU-check cell
# expects a Tesla T4 - confirm which hardware this run targets.
train_parameters = SFTConfig(
    # Output & Run
    output_dir=PROJECT_RUN_NAME,
    run_name=RUN_NAME,
    dataset_text_field="text",
    max_seq_length=MAX_SEQUENCE_LENGTH,

    # Training
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_steps=-1,
    group_by_length=True,

    # Evaluation
    eval_strategy="steps",
    eval_steps=STEPS,
    per_device_eval_batch_size=1,

    # Optimization
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    optim=OPTIMIZER,
    weight_decay=0.001,
    max_grad_norm=0.3,

    # Precision
    fp16=False,
    bf16=True,

    # Logging & Saving
    logging_steps=STEPS,            # See loss after each {STEP} batches
    save_strategy="steps",
    save_steps=SAVE_STEPS,          # Model checkpointed locally
    save_total_limit=save_total_limit,
    # "none" (the string) disables every integration; None would fall back to the
    # library default, which reports to all installed integrations, W&B included.
    report_to="wandb" if LOG_TO_WANDB else "none",

    # Hub
    push_to_hub=True,
    hub_strategy="end",  # Only push once, at the end
    load_best_model_at_end=True, # Loads the best eval_loss checkpoint
    metric_for_best_model="eval_loss", # Monitors eval_loss
    greater_is_better=False, # Lower eval_loss = better model
)


In [None]:
# The latest version of trl is showing a warning about labels - please ignore this warning

# NOTE(review): `train_data` and `val_data` are not defined in any cell visible in this
# notebook. They must be Hugging Face datasets exposing a "text" column (SFTConfig is
# given dataset_text_field="text"), presumably built from the prompts of the `train` /
# `test` Item lists - confirm and add that cell before running this one.

# Compute the loss only on the tokens that follow the answer prefix ("Views are "),
# so the model is trained to produce the view count rather than to echo the question.
collator = DataCollatorForCompletionOnlyLM(Item.PREFIX, tokenizer=tokenizer)

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=lora_parameters,    # QLoRA config
    args=train_parameters,          # SFTConfig
    data_collator=collator,
    # Stop when eval_loss has not improved for 5 consecutive evaluations
    # (evaluations, not optimizer steps).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
fine_tuning.train()

# With hub_strategy="end" nothing is uploaded while training runs, so push explicitly.
# load_best_model_at_end=True means the weights uploaded here are those of the best
# eval_loss checkpoint.
fine_tuning.push_to_hub()
print(f"‚úÖ Best model pushed to HF Hub: {HUB_MODEL_NAME}")

In [None]:
# Generate a short completion for every item and keep all of them.
# The model is fed item.test_prompt() - the prompt WITHOUT the answer - so it has to
# predict the view count instead of continuing a text that already contains it.
responses = []
for item in train:
    inputs = tokenizer.encode(item.test_prompt(), return_tensors="pt").to("cuda")
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = base_model.generate(inputs, max_new_tokens=4, attention_mask=attention_mask, num_return_sequences=1)
    response = tokenizer.decode(outputs[0])
    responses.append(response)

In [None]:
# Reload tokenizer and base model, then attach the fine-tuned adapter from the Hub.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# NOTE(review): device_map="cpu" together with a bitsandbytes quantization config may
# not be supported by every library version - confirm this load path works as intended.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="cpu",
    # Same cache directory as the first load, so the weights are reused instead of
    # being downloaded a second time.
    cache_dir=MODEL_CACHE_PATH,
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id


# Wrap the base model with the adapter weights stored under FINETUNED_MODEL.
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# Bare expression: the notebook displays the repr of the adapter-wrapped model.
fine_tuned_model