In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive so the project folder becomes visible under /content/drive.
drive.mount('/content/drive')

# Location of the project folder, relative to the Drive mount point.
dl_project_path ='MyDrive/Semester 3/DL/Code'
env_path = '/content/drive/' + dl_project_path

# Register the project folder on the module search path (only once, so
# re-running this cell does not add duplicate entries).
if env_path not in sys.path:
    sys.path.append(env_path)

In [None]:
# Installation of HuggingFace datasets
!pip install datasets
!pip install transformers
!pip install bitsandbytes
!pip install --upgrade peft
!pip install safetensors
!pip install evaluate
!pip install unsloth

# Model, tokenizer and device

In [None]:
import os
import random
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CyclicLR

from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

from transformers import LlamaForCausalLM, AutoTokenizer, get_scheduler, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from bitsandbytes.optim import AdamW8bit, PagedAdamW8bit
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training

from utils import sample_selector, tokenization, dataset_concatenator, AttributeCollate, AlignmentTrainer, save_checkpoint, load_checkpoint, set_datasets

# Device: use CUDA when it is available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print('GPU available')
print(f'Device: {device}')

# Seeds: fix the Python, NumPy, PyTorch and PyTorch-CUDA generators to the
# same value, in that order, so runs are repeatable.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(42)


# TODO: choose the run directory and whether to resume from its latest checkpoint
training_path = "custom_model_samestart_adaptive_v3/"
resume = False

# Checkpoints are written to and read from the same run directory.
save_path = os.path.join(env_path, training_path)
load_path = save_path

model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

start_epoch = 0
if resume:
    # Look only at `epoch_<n>` entries. Counting every entry of the directory
    # breaks as soon as it holds anything else, and an existing but empty
    # directory used to produce `epoch_-1` and a model path that does not exist.
    saved_epochs = []
    if os.path.isdir(load_path):
        for entry in os.listdir(load_path):
            suffix = entry[len("epoch_"):]
            if entry.startswith("epoch_") and suffix.isdecimal():
                saved_epochs.append(int(suffix))

    if saved_epochs:
        # Continue with the epoch that follows the most recent checkpoint and
        # load the LoRA weights stored in that checkpoint's folder.
        start_epoch = max(saved_epochs) + 1
        epoch_folder = os.path.join(load_path, f"epoch_{start_epoch-1}")
        model_name = f"{epoch_folder}/lora_model"
        print(f"Resuming from epoch {start_epoch}")
    else:
        # Nothing to resume from: keep the base model and start at epoch 0.
        print("No checkpoints found")
else:
    print("Starting from scratch")

# Load the model together with its tokenizer. `model_name` is either the base
# checkpoint or the LoRA folder chosen by the resume logic.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=dtype,
    load_in_4bit=True
)

# print_trainable_parameters() writes its own report and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line.
# The hasattr guard covers `resume = True` with no checkpoint found, where the
# plain base model is loaded and may not expose this PEFT helper.
if resume and hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()

# Pad with the end-of-sequence token, and keep the special tokens handy for
# building prompts.
tokenizer.pad_token = tokenizer.eos_token
bos_token = tokenizer.bos_token
eos_token = tokenizer.eos_token

# Dataset

## Instruction dictionary

In [None]:
# Fixed prompt fragments: system preamble (prefixed with BOS), user marker
# and response marker.
system_instruction = bos_token + 'System Prompt: Answer the following user instruction based on the provided alignment attributes. Alignment attributes: '
user_instruction = 'User instruction: '
response_instruction = 'Response: '

instruct_dictionary = {
    'system': system_instruction,
    'user': user_instruction,
    'response': response_instruction,
}

# Pre-tokenize every fragment once. For each role this stores:
#   "<role>_labels"  : a tensor of -100 with one entry per token
#                      (presumably so these tokens are ignored by the loss)
#   "<role>_<field>" : one long tensor per field returned by the tokenizer
#                      (IDs and attention mask)
instructions = list(instruct_dictionary)
instruct_dictionary_tokenized = {}

for instruction, fragment in instruct_dictionary.items():
    tokens = tokenizer(fragment, padding = False, add_special_tokens=False)
    n_tokens = len(tokens["input_ids"])

    instruct_dictionary_tokenized[f"{instruction}_labels"] = torch.Tensor([-100] * n_tokens).long()
    instruct_dictionary_tokenized.update(
        {f"{instruction}_{key}": torch.Tensor(value).long() for key, value in tokens.items()}
    )

## HelpSteer

In [None]:
# Settings handed to the project helper `set_datasets`.
max_length = 1500
attributes = ['helpfulness', 'coherence', 'verbosity', 'correctness', 'complexity']

# Dataset folder inside the project directory.
# NOTE(review): the name `dir` hides Python's builtin dir() for the rest of the notebook.
dir = os.path.join(env_path, 'dataset')

# Build the train / validation / test splits.
train_dataset, val_dataset, test_dataset = set_datasets(
    dir, tokenizer, instruct_dictionary_tokenized, max_length, attributes
)

# Report the size of each split, then show the training split's summary.
for _label, _split in (
    ('Training dataset length: ', train_dataset),
    ('Length of val dataset:', val_dataset),
    ('Length of test dataset:', test_dataset),
):
    print(_label, len(_split))

print(train_dataset)

In [None]:
# Optional (debugging): uncomment to run on a 100-example subset of each split.
# train_dataset = train_dataset.select(range(100))
# val_dataset = val_dataset.select(range(100))
# test_dataset = test_dataset.select(range(100))

In [None]:
# Training setup: LoRA adapters (fresh runs only), attribute sampler, data
# loaders and optimizers. A fresh run and a resumed run differ only in where
# the attributes, probabilities and optimizers come from; everything else is
# built once below.
num_epochs = 10
batch_size = 32
bos_token_id = tokenizer.bos_token_id
eos_token_id = tokenizer.eos_token_id

# Hyper-parameters of AlignmentTrainer, defined once so that a resumed run
# keeps the settings of the run that produced its checkpoint. The resume branch
# used to hard-code beta=0.9, T=1 while fresh runs used beta=0.3, T=0.03.
# NOTE(review): confirm the fresh-run values are the intended ones.
aligner_alpha = 0.5
aligner_beta = 0.3
aligner_T = 0.03

if start_epoch == 0:  # Start fresh
    # Lora config
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],   # recommended to use all modules
        lora_alpha = 16,
        lora_dropout = 0,
        bias = 'none',
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
        use_rslora = False,
        loftq_config = None,
    )

    # Prints its own report and returns None, so it is not wrapped in print().
    model.print_trainable_parameters()

    # TODO: set number of attributes and initial probabilities
    initial_attributes = ['helpfulness', 'coherence', 'verbosity']
    # Exactly uniform start. The previous literal [0.333, 0.333, 0.333] sums to
    # 0.999, which a sampler requiring normalized probabilities would reject.
    initial_probs = [1.0 / len(initial_attributes)] * len(initial_attributes)
    probs = initial_probs
    num_attributes_per_batch = 1

    training_dict = {"epoch": start_epoch,
                     "attributes": initial_attributes,
                     "probs": initial_probs,
                     "num_attributes_per_batch": num_attributes_per_batch} # Dictionary for checkpoints

    # Init optimizers
    if num_attributes_per_batch == len(initial_attributes):
        # Every batch carries all attributes: a single optimizer under the key "all".
        optimizers = {"all": AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)}
    else:
        # One optimizer per attribute, each over the same model parameters.
        optimizers = {attribute: AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)
                      for attribute in initial_attributes}

    # Schedulers
    # lr_schedulers = {key: CyclicLR(optimizers[key], base_lr=1e-6, max_lr=5e-6, step_size_up=2000) for key in optimizers}

else:
    # Load from checkpoint
    previous_epoch = start_epoch - 1  # Assuming checkpoint directory structure
    training_dict, optimizers = load_checkpoint(load_path, previous_epoch, model)

    # Restore the sampling configuration saved with the checkpoint.
    initial_attributes = training_dict['attributes']
    probs = training_dict['probs']
    num_attributes_per_batch = training_dict['num_attributes_per_batch']

# Attribute sampler, built from the current attributes and probabilities.
AttrAligner = AlignmentTrainer(alpha=aligner_alpha, beta=aligner_beta, T=aligner_T,
                               attributes=initial_attributes, attribute_probs=probs,
                               num_attributes_per_batch=num_attributes_per_batch)

# Collate function shared by the train and validation loaders.
collate_fn = AttributeCollate(attributes = initial_attributes,
                              attribute_probs = probs,
                              num_attributes_per_batch = num_attributes_per_batch,
                              dict_instruct = instruct_dictionary_tokenized,
                              bos_id = bos_token_id,
                              eos_id = eos_token_id)

train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# BATCH TEST: decode the second example of one training batch and show the
# per-example tensor shapes.
batch = next(iter(train_loader))

print(type(batch))
example = batch['input_ids'][1]
print(tokenizer.decode(example))
print(batch['input_ids'][1].shape)
print(batch['attention_mask'][1].shape)
print(batch['labels'][1].shape)

print(f"Training will resume from epoch {start_epoch}.")

In [None]:
# ------TRAIN LOOP------
from tqdm.auto import tqdm
import evaluate
from torch.amp import autocast, GradScaler

left_training_steps = (num_epochs - start_epoch) * len(train_loader)
progress_bar = tqdm(range(left_training_steps))
scaler = GradScaler()

# The run is in per-attribute mode when there is no shared "all" optimizer.
# This is decided once from the optimizer keys. The end-of-epoch check used to
# read `batch_attribute` left over from the last validation batch, which is
# stale or undefined when the validation loader yields nothing.
per_attribute_mode = "all" not in optimizers

for epoch in range(start_epoch, num_epochs):

    tot_train_loss = 0
    tot_val_loss = 0

    # Per-key running loss sums and batch counters (training)
    train_losses = {key: 0 for key in optimizers}
    train_counters = {key: 0 for key in optimizers}

    # Per-key running loss sums and batch counters (validation)
    val_losses = {key: 0 for key in optimizers}
    val_counters = {key: 0 for key in optimizers}

    # ------ EPOCH LOOP------
    probs_str = [np.round(p, 3) for p in collate_fn.attribute_probs]
    print(f"epoch: {epoch}, attributes probabilities: {probs_str}")
    model.train()
    for batch in train_loader:
        # 'attributes' is used as a list of keys, not a tensor, so it is
        # removed before the remaining entries are moved to the device.
        batch_attribute = batch.pop('attributes')
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass and loss under mixed precision
        with autocast(device_type='cuda', dtype=dtype):
            outputs = model(**batch)
            loss = outputs.loss

        # Read the scalar once; it feeds both the total and the per-attribute sum.
        loss_value = loss.item()
        tot_train_loss += loss_value

        # Pick the optimizer that matches this batch
        if len(batch_attribute) == len(initial_attributes): # All attributes
            optimizer = optimizers["all"]
            # lr_scheduler = lr_schedulers["all"]
        else: # Attribute - dependent: keyed by the batch's first attribute
            optimizer = optimizers[batch_attribute[0]]
            # lr_scheduler = lr_schedulers[batch_attribute[0]]
            train_losses[batch_attribute[0]] += loss_value
            train_counters[batch_attribute[0]] += 1

        # Backward pass and optimizer step through the gradient scaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        # lr_scheduler.step()

        progress_bar.update(1)

    # ------ VALIDATION LOOP------
    model.eval()
    for batch in val_loader:
        batch_attribute = batch.pop('attributes')
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad(), autocast(device_type='cuda', dtype=dtype):
            outputs = model(**batch)
            loss = outputs.loss

        loss_value = loss.item()
        tot_val_loss += loss_value

        if len(batch_attribute) != len(initial_attributes): # Independent attributes
            val_losses[batch_attribute[0]] += loss_value
            val_counters[batch_attribute[0]] += 1

    if per_attribute_mode:
        # Turn the per-attribute sums into means; keys that saw no batch stay at 0.
        for key in train_losses:
            if train_counters[key] != 0:
                train_losses[key] /= train_counters[key]

        for key in val_losses:
            if val_counters[key] != 0:
                val_losses[key] /= val_counters[key]

        # Re-weight the attribute sampling for the next epoch
        new_probs = AttrAligner.compute_new_probs(train_losses, val_losses)
        collate_fn.set_attribute_probs(new_probs)

        training_dict["probs"] = new_probs # Update probabilities

    # Overall mean loss per batch; max(..., 1) avoids dividing by zero on an empty loader
    train_losses["all"] = tot_train_loss / max(len(train_loader), 1)
    val_losses["all"] = tot_val_loss / max(len(val_loader), 1)

    # Epoch end
    print(f"Epoch {epoch}/{num_epochs - 1} - " + ", ".join([f"{attr}: Train Loss: {train_losses[attr]:.4f}, Val Loss: {val_losses[attr]:.4f}" for attr in train_losses]))

    # Record the epoch that was just completed; the dict's "epoch" entry was
    # otherwise never updated inside this loop.
    training_dict["epoch"] = epoch
    save_checkpoint(save_path, epoch, model, training_dict, optimizers, train_losses, val_losses)

progress_bar.close()



# Model Inference

In [None]:
# Normal tokenization and loading

# Load the tokenizer
tokenizer2 = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer2.pad_token = tokenizer2.eos_token

def tokenize_prompt(data):
    """Tokenize one record's prompt with `tokenizer2`, padded/truncated to 1000 tokens.

    This function used to be named `tokenizer2` as well, which replaced the
    tokenizer loaded above, and its body called the other `tokenizer`, so the
    freshly loaded tokenizer was never used.

    Assumes each record has a 'prompt' text field -- TODO confirm against the
    columns produced by `set_datasets`.
    """
    return tokenizer2(data['prompt'], max_length = 1000, padding = 'max_length', truncation=True)

small_set = train_dataset
train_set2 = small_set.map(tokenize_prompt)
# Keep only the tokenizer outputs (input_ids, attention_mask) and expose them as torch tensors
train_set2 = train_set2.select_columns(['input_ids', 'attention_mask'])
train_set2.set_format(type="torch", columns=['input_ids', 'attention_mask'])

train_loader2 = torch.utils.data.DataLoader(train_set2, batch_size=1, shuffle=True)

In [None]:
# BATCH TEST
test_loader = DataLoader(test_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)

# Preview a batch from the loader that was just built. The original pulled the
# batch from `train_loader`, so the test loader was never actually checked.
batch = next(iter(test_loader))

print(type(batch))
example = batch['input_ids'][1]
print(tokenizer.decode(example))
print(batch['input_ids'][1].shape)
print(batch['attention_mask'][1].shape)
print(batch['labels'][1].shape)



In [None]:
model.eval()
with torch.no_grad():
    for batch in test_loader:
        # Forward only the tensors generation needs. The batch also carries
        # 'attributes' (used as a list of keys in the training loop, so it has
        # no .to()) and 'labels' (a training target, not a generation input).
        gen_inputs = {k: batch[k].to(device) for k in ('input_ids', 'attention_mask')}
        outputs = model.generate(**gen_inputs, max_new_tokens=1000)

# `outputs` holds the generations of the last batch; decode its first sequence.
#input = tokenizer.decode(batch['input_ids'][0], skip_special_tokens=True)
out = tokenizer.decode(outputs[0], skip_special_tokens=True)
out