In [None]:
# Colab bootstrap: mount Google Drive and make the project folder importable.
import os
import sys

from google.colab import drive

drive.mount('/content/drive')

# Location of the project inside the mounted Drive.
dl_project_path ='MyDrive/Semester 3/DL/Code'
env_path = '/content/drive/' + dl_project_path

# Register the project folder on the import path exactly once, so re-running
# this cell does not pile up duplicate entries.
if sys.path.count(env_path) == 0:
    sys.path.append(env_path)

In [None]:
# Installation of HuggingFace datasets
!pip install datasets
!pip install transformers
!pip install bitsandbytes
!pip install --upgrade peft
!pip install safetensors
!pip install evaluate
!pip install unsloth

# Model, tokenizer and device

In [None]:
import os
import random
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CyclicLR

from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

from transformers import LlamaForCausalLM, AutoTokenizer, get_scheduler, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from bitsandbytes.optim import AdamW8bit, PagedAdamW8bit
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training

from utils import sample_selector, tokenization, dataset_concatenator, AttributeCollate, AlignmentTrainer, save_checkpoint, load_checkpoint, set_datasets

# Device: use the GPU whenever PyTorch can see one, otherwise fall back to CPU.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print('GPU available')
device = 'cuda' if cuda_ready else 'cpu'
print(f'Device: {device}')

# Seeds: fix every random number generator used here (Python, NumPy, PyTorch
# CPU and PyTorch CUDA) to the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(42)


# TODO: choose the training run to evaluate and whether to load its latest checkpoint.
training_path = "custom_model_samestart_adaptive_v3/"
resume = True

save_path = os.path.join(env_path, training_path)
load_path = save_path

# Base model; replaced below by the latest LoRA checkpoint when resuming.
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Defaults for the "nothing to resume" case. `epoch_folder` is always defined
# (None when no checkpoint is used) so later cells can test it instead of
# hitting a NameError.
start_epoch = 0
epoch_folder = None

if resume:
    # Find the checkpoints by name ("epoch_<N>" directories) instead of
    # counting directory entries, so extra files in the run folder cannot
    # shift the epoch number or produce a negative one.
    epoch_prefix = "epoch_"
    saved_epochs = []
    if os.path.isdir(load_path):
        for entry in os.listdir(load_path):
            suffix = entry[len(epoch_prefix):]
            if (entry.startswith(epoch_prefix)
                    and suffix.isdecimal()
                    and os.path.isdir(os.path.join(load_path, entry))):
                saved_epochs.append((int(suffix), entry))

    if saved_epochs:
        last_epoch, last_entry = max(saved_epochs)
        # Training would continue with the epoch after the last saved one.
        start_epoch = last_epoch + 1
        epoch_folder = os.path.join(load_path, last_entry)
        model_name = f"{epoch_folder}/lora_model"
        print(f"Resuming from epoch {start_epoch}")
    else:
        print("No checkpoints found")
else:
    print("Starting from scratch")

# Load the model and its tokenizer (4-bit weights, 2048-token context).
# `model_name` is either the base model or the LoRA checkpoint selected above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=dtype,
    load_in_4bit=True
)

# print_trainable_parameters() writes its report itself and returns None, so it
# is called directly (wrapping it in print() would emit an extra "None").
# The hasattr guard covers the case where `resume` is set but no checkpoint was
# found, in which case the loaded object may not provide this method.
if resume and hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()

# Reuse the end-of-sequence token for padding and keep the special tokens handy
# for building prompts.
tokenizer.pad_token = tokenizer.eos_token
bos_token = tokenizer.bos_token
eos_token = tokenizer.eos_token

# Dataset

## Instruction dictionary

In [None]:
# Instruction dictionary: the fixed text fragments that frame every prompt.
system_instruction = bos_token + 'System Prompt: Answer the following user instruction based on the provided alignment attributes. Alignment attributes: '
user_instruction = 'User instruction: '
response_instruction = 'Response: '

instruct_dictionary = {'system': system_instruction, 'user': user_instruction, 'response': response_instruction}

# Tokenize the instruction dictionary once. For every fragment this stores
# "<name>_labels" plus one "<name>_<field>" tensor per field returned by the
# tokenizer (the original comments describe these as the IDs and attention mask).
instruct_dictionary_tokenized = {}
instructions = list(instruct_dictionary)

for instruction, text in instruct_dictionary.items():

    tokens = tokenizer(text, padding = False, add_special_tokens=False)

    # Labels: one -100 per fragment token. -100 is the default ignore_index of
    # PyTorch's cross-entropy loss, so presumably these positions are excluded
    # from the training loss — confirm against the trainer.
    # Tensors are created directly as int64 instead of going through a float32
    # torch.Tensor and casting back with .long().
    instruct_dictionary_tokenized[f"{instruction}_labels"] = torch.tensor(
        [-100] * len(tokens["input_ids"]), dtype=torch.long
    )
    for key, value in tokens.items():
        instruct_dictionary_tokenized[f"{instruction}_{key}"] = torch.tensor(value, dtype=torch.long)

## HelpSteer

In [None]:
# Build the train / validation / test splits from the dataset folder inside the
# project directory. The folder variable is named `dataset_dir` so that it does
# not shadow Python's built-in dir().
dataset_dir = os.path.join(env_path, 'dataset')
max_length = 1500
attributes = ['helpfulness', 'coherence', 'verbosity', 'correctness', 'complexity']
train_dataset, val_dataset, test_dataset = set_datasets(dataset_dir, tokenizer, instruct_dictionary_tokenized, max_length, attributes)

# Quick sanity check of the split sizes.
print('Training dataset length: ', len(train_dataset))
print('Length of val dataset:', len(val_dataset))
print('Length of test dataset:', len(test_dataset))

print(train_dataset)

# Test Loss

In [None]:
from tqdm.auto import tqdm
import evaluate
from torch.amp import autocast, GradScaler

In [None]:
# Collects the test losses computed below, keyed by evaluation setting
# ('all', 'helpfulness', 'coherence', 'verbosity').
test_loss = dict()

## All attributes

In [None]:
# All attributes: every batch is built with all three attributes, equally weighted.
batch_size = 32
bos_token_id, eos_token_id = tokenizer.bos_token_id, tokenizer.eos_token_id
initial_attributes = ['helpfulness', 'coherence', 'verbosity']
# NOTE(review): these weights sum to 0.99, not 1 — confirm that AttributeCollate
# normalises them or does not require an exact sum.
initial_probs = [0.33] * len(initial_attributes)
num_attributes_per_batch = 3

collate_fn = AttributeCollate(
    attributes=initial_attributes,
    attribute_probs=initial_probs,
    num_attributes_per_batch=num_attributes_per_batch,
    dict_instruct=instruct_dictionary_tokenized,
    bos_id=bos_token_id,
    eos_id=eos_token_id,
)

# Fixed order (no shuffling) for evaluation.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

In [None]:
# ------ TEST LOOP ------
# Average test loss with all attributes enabled, stored under test_loss['all'].
running_loss = 0
model.eval()
for batch in test_loader:
    # Drop the 'attributes' entry before the remaining values are moved to the
    # device and passed to the model as keyword arguments.
    batch.pop('attributes')
    batch = {k: v.to(device) for k, v in batch.items()}
    # Autocast follows the selected `device` instead of a hard-coded 'cuda'.
    with torch.no_grad(), autocast(device_type=device, dtype=dtype):
        outputs = model(**batch)
        loss = outputs.loss

    running_loss += loss.item()

# NOTE(review): this is an unweighted mean of per-batch losses, so a smaller
# final batch counts as much as a full one.
test_loss['all'] = running_loss/len(test_loader)

## Helpfulness

In [None]:
# Helpfulness: same attribute list, but the whole weight is on 'helpfulness'
# and a single attribute is used per batch.
batch_size = 32
bos_token_id, eos_token_id = tokenizer.bos_token_id, tokenizer.eos_token_id
initial_attributes = ['helpfulness', 'coherence', 'verbosity']
# One-hot weights: [1, 0, 0]
initial_probs = [int(name == 'helpfulness') for name in initial_attributes]
num_attributes_per_batch = 1

collate_fn = AttributeCollate(
    attributes=initial_attributes,
    attribute_probs=initial_probs,
    num_attributes_per_batch=num_attributes_per_batch,
    dict_instruct=instruct_dictionary_tokenized,
    bos_id=bos_token_id,
    eos_id=eos_token_id,
)

# Fixed order (no shuffling) for evaluation.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

In [None]:
# ------ TEST LOOP ------
# Average test loss for the helpfulness-only setting, stored under
# test_loss['helpfulness'].
running_loss = 0
model.eval()
for batch in test_loader:
    # Drop the 'attributes' entry before the remaining values are moved to the
    # device and passed to the model as keyword arguments.
    batch.pop('attributes')
    batch = {k: v.to(device) for k, v in batch.items()}
    # Autocast follows the selected `device` instead of a hard-coded 'cuda'.
    with torch.no_grad(), autocast(device_type=device, dtype=dtype):
        outputs = model(**batch)
        loss = outputs.loss

    running_loss += loss.item()

# NOTE(review): this is an unweighted mean of per-batch losses, so a smaller
# final batch counts as much as a full one.
test_loss['helpfulness'] = running_loss/len(test_loader)

## Coherence

In [None]:
# Coherence: same attribute list, but the whole weight is on 'coherence'
# and a single attribute is used per batch.
batch_size = 32
bos_token_id, eos_token_id = tokenizer.bos_token_id, tokenizer.eos_token_id
initial_attributes = ['helpfulness', 'coherence', 'verbosity']
# One-hot weights: [0, 1, 0]
initial_probs = [int(name == 'coherence') for name in initial_attributes]
num_attributes_per_batch = 1

collate_fn = AttributeCollate(
    attributes=initial_attributes,
    attribute_probs=initial_probs,
    num_attributes_per_batch=num_attributes_per_batch,
    dict_instruct=instruct_dictionary_tokenized,
    bos_id=bos_token_id,
    eos_id=eos_token_id,
)

# Fixed order (no shuffling) for evaluation.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

In [None]:
# ------ TEST LOOP ------
# Average test loss for the coherence-only setting, stored under
# test_loss['coherence'].
running_loss = 0
model.eval()
for batch in test_loader:
    # Drop the 'attributes' entry before the remaining values are moved to the
    # device and passed to the model as keyword arguments.
    batch.pop('attributes')
    batch = {k: v.to(device) for k, v in batch.items()}
    # Autocast follows the selected `device` instead of a hard-coded 'cuda'.
    with torch.no_grad(), autocast(device_type=device, dtype=dtype):
        outputs = model(**batch)
        loss = outputs.loss

    running_loss += loss.item()

# NOTE(review): this is an unweighted mean of per-batch losses, so a smaller
# final batch counts as much as a full one.
test_loss['coherence'] = running_loss/len(test_loader)

## Verbosity

In [None]:
# Verbosity: same attribute list, but the whole weight is on 'verbosity'
# and a single attribute is used per batch.
batch_size = 32
bos_token_id, eos_token_id = tokenizer.bos_token_id, tokenizer.eos_token_id
initial_attributes = ['helpfulness', 'coherence', 'verbosity']
# One-hot weights: [0, 0, 1]
initial_probs = [int(name == 'verbosity') for name in initial_attributes]
num_attributes_per_batch = 1

collate_fn = AttributeCollate(
    attributes=initial_attributes,
    attribute_probs=initial_probs,
    num_attributes_per_batch=num_attributes_per_batch,
    dict_instruct=instruct_dictionary_tokenized,
    bos_id=bos_token_id,
    eos_id=eos_token_id,
)

# Fixed order (no shuffling) for evaluation.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

In [None]:
# ------ TEST LOOP ------
# Average test loss for the verbosity-only setting, stored under
# test_loss['verbosity'].
running_loss = 0
model.eval()
for batch in test_loader:
    # Drop the 'attributes' entry before the remaining values are moved to the
    # device and passed to the model as keyword arguments.
    batch.pop('attributes')
    batch = {k: v.to(device) for k, v in batch.items()}
    # Autocast follows the selected `device` instead of a hard-coded 'cuda'.
    with torch.no_grad(), autocast(device_type=device, dtype=dtype):
        outputs = model(**batch)
        loss = outputs.loss

    running_loss += loss.item()

# NOTE(review): this is an unweighted mean of per-batch losses, so a smaller
# final batch counts as much as a full one.
test_loss['verbosity'] = running_loss/len(test_loader)

## Save losses

In [None]:
# Summary of the four test losses, one per line.
print(
    f"General loss: {test_loss['all']}",
    f"Helpfulness loss: {test_loss['helpfulness']}",
    f"Coherence loss: {test_loss['coherence']}",
    f"Verbosity loss: {test_loss['verbosity']}",
    sep="\n",
)

In [None]:
# Store the collected test losses next to the checkpoint that was evaluated.
# `epoch_folder` is only set when a checkpoint was actually resumed, so it is
# looked up defensively: without one there is no checkpoint folder to write to,
# and the cell reports that instead of failing with a NameError after the whole
# evaluation has already run.
checkpoint_dir = globals().get('epoch_folder')
if checkpoint_dir:
    torch.save(test_loss, os.path.join(checkpoint_dir, 'test_loss.pth'))
else:
    print("No checkpoint folder available; test losses were not saved")