In [14]:
import os
os.environ["HF_Home"] = "cache_dir"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import torch
cache_dir = "../cache"
from transformers import AutoTokenizer, AutoModelForCausalLM
with open("../.env") as f:
    token = f.read().strip()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token=token, use_fast=True, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", token=token, cache_dir=cache_dir, torch_dtype=torch.bfloat16, device_map="cuda").eval()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-30 12:46:33.711523: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.52s/it]


In [3]:
# Generate a prompt
prompt = "Write me some code"
inputs = tokenizer(prompt, return_tensors="pt")
# Generate text
outputs = model.generate(input_ids=inputs["input_ids"].cuda(), attention_mask=inputs["attention_mask"].cuda(), max_length=100, do_sample=True, temperature=0.7)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Write me some code
If you're a programmer, you probably find yourself writing code all the time. It's what you do. You might not realize it, but writing code is also a great way to help other programmers. If you have a solution to a common problem, it can be useful to share it with other programmers. That's where the Write me some code series comes in. In this series, we'll be writing code to solve common problems that other programmers might face. Whether


In [4]:
from datasets import load_dataset

# Load all helpfulness/harmless subsets (share the same schema)
dataset = load_dataset("Anthropic/hh-rlhf", cache_dir=cache_dir)
# # For both train and test, convert the dataset from chosen/rejected to text/binary label
# for split in dataset.keys():
#     dataset[split] = dataset[split].map(lambda x: {"text": x["chosen"], "label": 1}, remove_columns=["chosen"])
#     dataset[split] = dataset[split].map(lambda x: {"text": x["rejected"], "label": 0}, remove_columns=["rejected"])

In [5]:
def tokenize_batch(batch):
    input_ids = []
    attention_masks = []
    loss_masks = []
    for text in batch:
        cur = text.strip().split("\n\n")
        cur_tokenized_inputs_ids = []
        cur_tokenized_attention_mask = []
        loss_mask = []
        for i, item in enumerate(cur):
            # Tokenize the text
            inputs = tokenizer(item, max_length=512, truncation=True)
            # Add the tokenized text to the list with the separator token
            cur_tokenized_inputs_ids.append(inputs["input_ids"] + [tokenizer.eos_token_id])
            cur_tokenized_attention_mask.append(inputs["attention_mask"] + [1])
            # If the item is the last one, loss mask will be ones while the rest will be zeros
            if i == len(cur) - 1:
                loss_mask.append([1]*len(inputs["input_ids"]))
            else:
                loss_mask.append([0]*len(inputs["input_ids"]))
        
        # Flatten the lists
        cur_tokenized_inputs_ids = [item for sublist in cur_tokenized_inputs_ids for item in sublist]
        cur_tokenized_attention_mask = [item for sublist in cur_tokenized_attention_mask for item in sublist]
        loss_mask = [item for sublist in loss_mask for item in sublist]

        input_ids.append(cur_tokenized_inputs_ids)
        attention_masks.append(cur_tokenized_attention_mask)
        loss_masks.append(loss_mask)
    # Cut sequences to max length
    input_ids = [item[:512] for item in input_ids]
    attention_masks = [item[:512] for item in attention_masks]
    loss_masks = [item[:512] for item in loss_masks]

    # Pad the tokenized text on the right
    max_len = max([len(item) for item in input_ids])
    print(max_len)
    input_ids = [item + [tokenizer.eos_token_id]*(max_len - len(item)) for item in input_ids]
    attention_masks = [item + [0]*(max_len - len(item)) for item in attention_masks]
    loss_masks = [item + [0]*(max_len - len(item)) for item in loss_masks]
    # Convert the lists to tensors
    input_ids = torch.tensor(input_ids).cuda()
    attention_masks = torch.tensor(attention_masks).cuda()
    loss_masks = torch.tensor(loss_masks).cuda()
    return {"input_ids": input_ids, "attention_mask": attention_masks, "loss_mask": loss_masks}

In [6]:
# Iterate over all text in the dataset
batch_size = 16
for batch in dataset["train"].batch(batch_size):
    with torch.no_grad():
        # Combine positive and negative examples
        combined = []
        for item in batch["chosen"]:
            combined.append(item)
        for item in batch["rejected"]:
            combined.append(item)

        # Tokenize the batch
        tokenized = tokenize_batch(combined)
        # Split the batch into positive and negative
        tokenized_positive = {"input_ids": tokenized["input_ids"][:batch_size], "attention_mask": tokenized["attention_mask"][:batch_size], "loss_mask": tokenized["loss_mask"][:batch_size]}
        tokenized_negative = {"input_ids": tokenized["input_ids"][batch_size:], "attention_mask": tokenized["attention_mask"][batch_size:], "loss_mask": tokenized["loss_mask"][batch_size:]}
        # # Pad the batch on the left
        # max_len = max([len(item["input_ids"]) for item in tokenized])
        # for item in tokenized:
        #     item["input_ids"] += [tokenizer.pad_token_id]*(max_len - len(item["input_ids"]))
        #     item["attention_mask"] += [0]*(max_len - len(item["attention_mask"]))
        #     item["loss_mask"] += [0]*(max_len - len(item["loss_mask"]))
        # # Convert the batch to tensors
        # input_ids = torch.tensor([item["input_ids"] for item in tokenized]).cuda()
        # attention_mask = torch.tensor([item["attention_mask"] for item in tokenized]).cuda()
        # loss_mask = torch.tensor([item["loss_mask"] for item in tokenized]).cuda()
        # Generate the output for both positive and negative examples
        outputs_positive = model(**tokenized_positive)
        outputs_negative = model(**tokenized_negative)
        break

404


In [7]:
outputs_positive

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 6.8750,  8.7500, 12.8750,  ..., -4.4688, -4.4688, -4.4688],
         [ 3.8594,  1.8203,  3.1406,  ..., -7.3750, -7.3750, -7.3750],
         [-2.7188, -1.8828, -1.7734,  ..., -8.8750, -8.8750, -8.8750],
         ...,
         [ 5.9375,  4.0312,  3.1250,  ..., -3.5312, -3.5312, -3.5312],
         [ 5.8125,  3.8906,  3.0156,  ..., -3.4844, -3.4844, -3.4844],
         [ 5.7188,  3.8125,  2.9531,  ..., -3.4531, -3.4531, -3.4531]],

        [[ 6.8750,  8.7500, 12.8750,  ..., -4.4688, -4.4688, -4.4688],
         [ 3.8594,  1.8203,  3.1406,  ..., -7.3750, -7.3750, -7.3750],
         [-2.7188, -1.8828, -1.7734,  ..., -8.8750, -8.8750, -8.8750],
         ...,
         [ 4.1875,  1.9062,  0.6641,  ..., -2.2188, -2.2188, -2.2188],
         [ 4.4375,  2.0781,  0.9297,  ..., -2.5156, -2.5156, -2.5156],
         [ 4.4688,  2.0312,  0.9453,  ..., -2.4688, -2.4688, -2.4688]],

        [[ 6.8750,  8.7500, 12.8750,  ..., -4.4688, -4.4688, -4.4688],
    

In [8]:
import torch.nn.functional as F
def dpo_loss(pi_logps, ref_logps, yw_idxs, yl_idxs, beta):
    """
    pi_logps: policy logprobs, shape (B,)
    ref_logps: reference model logprobs, shape (B,)
    yw_idxs: preferred completion indices in [0, B-1], shape (T,)
    yl_idxs: dispreferred completion indices in [0, B-1], shape (T,)
    beta: temperature controlling strength of KL penalty
    Each pair of (yw_idxs[i], yl_idxs[i]) represents the
    indices of a single preference pair.
    """
    pi_yw_logps, pi_yl_logps = pi_logps[yw_idxs], pi_logps[yl_idxs]
    ref_yw_logps, ref_yl_logps = ref_logps[yw_idxs], ref_logps[yl_idxs]
    pi_logratios = pi_yw_logps - pi_yl_logps
    ref_logratios = ref_yw_logps - ref_yl_logps
    losses = -F.logsigmoid(beta * (pi_logratios - ref_logratios))
    rewards = beta * (pi_logps - ref_logps).detach()
    return losses, rewards

In [9]:
outputs_positive.logits.shape

torch.Size([16, 404, 128256])

In [10]:
tokenized_positive["input_ids"].shape

torch.Size([16, 404])

In [11]:
outputs_positive.logits.flatten(0, 1).shape

torch.Size([6464, 128256])

In [12]:
tokenized_positive["input_ids"].flatten(0, 1).shape

torch.Size([6464])

In [13]:
outputs_positive.logits.flatten(0, 1)[:1][:, tokenized_positive["input_ids"].flatten(0, 1)[:1]]

tensor([[-4.0312]], device='cuda:0', dtype=torch.bfloat16)

In [16]:
outputs_positive.logits.flatten(0, 1)[:1].mT.index_select(1, tokenized_positive["input_ids"].flatten(0, 1)[:1])

../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1255: indexSelectSmallIndex: block: [118,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
dpo_loss(outputs_positive.logits[0, :-1], outputs_negative.logits[0, :-1], tokenized_positive["input_ids"][0, 1:, None], tokenized_negative["input_ids"][0, 1:, None], 0.1)

../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [32,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [33,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [34,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [35,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [36,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [66,0,0], thread: [37,0,0] Assertion `-size

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
