In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import wandb
wandb.login(key="2bcea92e3d3f472ffce763d1296a2ff409c4662a")
from tqdm import tqdm


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
import argparse
import random
import numpy as np
from functools import partial
from torch.cuda.amp import autocast, GradScaler


import torch
torch.cuda.empty_cache()
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW

from torch.utils.data import DataLoader
from datasets import load_dataset
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

In [4]:
def seed_everything(seed=2003):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

def calculate_DPO_loss(model_prefered_logprob, model_disprefered_logprob,
                       ref_prefered_logprob, ref_disprefered_logprob,
                       beta=0.5):

    prefered_relative_logprob = model_prefered_logprob - ref_prefered_logprob
    disprefered_relative_logprob = model_disprefered_logprob - ref_disprefered_logprob

    reward_accuracies = (prefered_relative_logprob > disprefered_relative_logprob).float().mean(dim=-1)
    reward_margins = (prefered_relative_logprob - disprefered_relative_logprob).mean(dim=-1)

    loss = -F.logsigmoid(beta * (prefered_relative_logprob - disprefered_relative_logprob)).mean(dim=-1)

    return loss, prefered_relative_logprob.mean(dim=-1), disprefered_relative_logprob.mean(dim=-1), reward_accuracies, reward_margins

def get_log_prob(logits, labels):
    log_probs = F.log_softmax(logits, dim=-1)
    return torch.gather(log_probs, -1, labels.unsqueeze(-1)).squeeze(-1).mean(-1)

def collate_fn(batch, tokenizer, max_length, device):
    prompts = ['Instruct: ' + item['prompt'] + '\n' for item in batch]
    chosen_responses = ['Output: ' + item['chosen'] for item in batch]
    rejected_responses = ['Output: ' + item['rejected'] for item in batch]

    prompt_ids = tokenizer.batch_encode_plus(prompts, padding=True, return_tensors="pt", max_length=max_length, truncation=True)['input_ids'].to(device)
    prefered_ids = tokenizer.batch_encode_plus(chosen_responses, padding=True, return_tensors="pt", max_length=max_length, truncation=True)['input_ids'].to(device)
    disprefered_ids = tokenizer.batch_encode_plus(rejected_responses, padding=True, return_tensors="pt", max_length=max_length, truncation=True)['input_ids'].to(device)

    prompt_prefered_ids = torch.cat([prompt_ids, prefered_ids], dim=-1)
    prompt_disprefered_ids = torch.cat([prompt_ids, disprefered_ids], dim=-1)

    prompt_prefered_mask = torch.cat([torch.ones_like(prompt_ids), torch.zeros_like(prefered_ids)], dim=-1)
    prompt_disprefered_mask = torch.cat([torch.ones_like(prompt_ids), torch.zeros_like(disprefered_ids)], dim=-1)

    return {'prompt_prefered_ids': prompt_prefered_ids,
            'prompt_disprefered_ids': prompt_disprefered_ids,
            'prompt_prefered_mask': prompt_prefered_mask,
            'prompt_disprefered_mask': prompt_disprefered_mask}

def train(model, ref_model, tokenizer, optimizer, train_dataloader, epochs=1, beta=0.1):
    model.train()
    ref_model.eval()
    model, train_dataloader = accelerator.prepare(model, train_dataloader)
    batch_count = 0
    for batch in train_dataloader:
       # Check and print tensor devices for batch tensors
      batch_count+=1
      print(f"Batch {batch_count} of {len(train_dataloader)}")

      # Move model tensors to the GPU (handled by Accelerator)
      prompt_prefered_ids = batch['prompt_prefered_ids']
      prompt_disprefered_ids = batch['prompt_disprefered_ids']
      prompt_prefered_mask = batch['prompt_prefered_mask']
      prompt_disprefered_mask = batch['prompt_disprefered_mask']



      # Forward pass with main model (on GPU)
      model_prefered_log_prob = get_log_prob(model(prompt_prefered_ids.clone(), attention_mask=prompt_prefered_mask.clone()).logits.to(device), prompt_prefered_ids.clone())
      model_disprefered_log_prob = get_log_prob(model(prompt_disprefered_ids.clone(), attention_mask=prompt_disprefered_mask.clone()).logits.to(device), prompt_disprefered_ids.clone())
      print("Before loss calculation:")

      # Move tensors to CPU for ref_model forward pass if required
      prompt_prefered_ids_cpu = prompt_prefered_ids.to("cpu")
      prompt_disprefered_ids_cpu = prompt_disprefered_ids.to("cpu")
      prompt_prefered_mask_cpu = prompt_prefered_mask.to("cpu")
      prompt_disprefered_mask_cpu = prompt_disprefered_mask.to("cpu")

      # Forward pass with ref_model (on CPU)
      with torch.no_grad():
          ref_prefered_log_prob = get_log_prob(
              ref_model(prompt_prefered_ids_cpu.clone(), attention_mask=prompt_prefered_mask_cpu.clone()).logits,
              prompt_prefered_ids_cpu.clone()
          )
          ref_disprefered_log_prob = get_log_prob(
              ref_model(prompt_disprefered_ids_cpu.clone(), attention_mask=prompt_disprefered_mask_cpu.clone()).logits,
              prompt_disprefered_ids_cpu.clone()
          )

      # Move ref_model outputs to GPU for loss calculation
      ref_prefered_log_prob = ref_prefered_log_prob.to(device)
      ref_disprefered_log_prob = ref_disprefered_log_prob.to(device)



      # Calculate DPO loss on GPU
      loss, prefered_relative_logprob, disprefered_relative_logprob, reward_accuracies, reward_margins = calculate_DPO_loss(
          model_prefered_log_prob, model_disprefered_log_prob,
          ref_prefered_log_prob, ref_disprefered_log_prob,
          beta=beta
      )

    # Backpropagation and optimization
    torch.autograd.set_detect_anomaly(True)
    torch.autograd.set_detect_anomaly(True)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    del loss, prefered_relative_logprob, disprefered_relative_logprob, reward_accuracies, reward_margins
    del model_prefered_log_prob, model_disprefered_log_prob, ref_prefered_log_prob, ref_disprefered_log_prob
    del prompt_prefered_ids, prompt_disprefered_ids, prompt_prefered_mask, prompt_disprefered_mask

    torch.cuda.empty_cache()

In [5]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install --upgrade transformers accelerate bitsandbytes

In [6]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.5: Fast Mistral patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [7]:
from accelerate import Accelerator
accelerator = Accelerator()
device = accelerator.device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer.pad_token = tokenizer.eos_token
model = model.to(device)
ref_model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-6)

dataset = load_dataset("jondurbin/truthy-dpo-v0.1", split="train[:50]")
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=partial(collate_fn, tokenizer=tokenizer, max_length=512, device=device))

train(model, ref_model, tokenizer, optimizer, train_dataloader, epochs=10, beta=0.1)



README.md:   0%|          | 0.00/904 [00:00<?, ?B/s]

truthy-dpo.parquet:   0%|          | 0.00/653k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1016 [00:00<?, ? examples/s]

Batch 1 of 50
Before loss calculation:
Batch 2 of 50
Before loss calculation:
Batch 3 of 50
Before loss calculation:
Batch 4 of 50
Before loss calculation:
Batch 5 of 50
Before loss calculation:
Batch 6 of 50
Before loss calculation:
Batch 7 of 50
Before loss calculation:
Batch 8 of 50
Before loss calculation:
Batch 9 of 50
Before loss calculation:
Batch 10 of 50
Before loss calculation:
Batch 11 of 50
Before loss calculation:
Batch 12 of 50
Before loss calculation:
Batch 13 of 50
Before loss calculation:
Batch 14 of 50
Before loss calculation:
Batch 15 of 50
Before loss calculation:
Batch 16 of 50
Before loss calculation:
Batch 17 of 50
Before loss calculation:
Batch 18 of 50
Before loss calculation:
Batch 19 of 50
Before loss calculation:
Batch 20 of 50
Before loss calculation:
Batch 21 of 50
Before loss calculation:
Batch 22 of 50
Before loss calculation:
Batch 23 of 50
Before loss calculation:
Batch 24 of 50
Before loss calculation:
Batch 25 of 50
Before loss calculation:
Batch 26 

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 236, 3072]], which is output 0 of MulBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!