In [1]:
import sys
import os
import jaxtyping
from pathlib import Path

import os
import sys
import time
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import Callable

import einops
import numpy as np
import torch as t
import torch.nn as nn
import wandb
import tqdm
import tabulate
from eindex import eindex
from jaxtyping import Float, Int
from rich import print as rprint
from rich.table import Table
from tabulate import tabulate
from torch import Tensor
from transformer_lens import HookedTransformer, utils
from transformer_lens.hook_points import HookPoint
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

device = t.device("mps" if t.backends.mps.is_available() else "cuda" if t.cuda.is_available() else "cpu")

In [2]:
@dataclass
class RLHFArgs:
    # random
    seed: int = 1

    # logging
    wandb_project_name: str = "rlhf_transformers"
    wandb_entity: str | None = None

    # macro-training 
    total_phases: int = 200
    batch_size: int = 32 #enforce batch_size % num_minibatches == 0
    num_minibatches: int = 4
    batches_per_learning_phase: int = 2

    # optimization hyperparameters
    base_lr: float = 2e-5
    head_lr: float = 5e-4
    max_grad_norm: float = 1.0
    warmup_steps: int = 20 #enforce warmup_steps < total_phases
    final_scale: float = 0.1

    # PPO objective function coefficients
    clip_coef: float = 0.2
    vf_coef: float = 0.15
    ent_coef: float = 0.001

    # model and sampling with prefix
    base_model: str = "gpt2-small"
    gen_len: int = 50
    temperature: float = 1.0
    top_k: int = 10
    prefix: str = "This movie was really"
    prepend_bos: bool = True

    # RLHF-specific arguments
    kl_coef: float = 2.5
    reward_fn: Callable = lambda x: 0.0
    normalize_reward: bool = True

    def __post_init__(self):
        self.minibatch_size = self.batch_size // self.num_minibatches

# Setup: working with the transformer

Right after the last layernorm before we unembed our tokens, we add a hook function (our value head) which computes a **value estimate** for the generated sequence. The hook function is a simple 2-layer neural network which computs the value estimate during the forward pass and stores it externally.

Why do we choose this location? After the layernorm essentially normalizes the reward, and before the unembedding because we take in the enumerated tokens as input. It is also towards the end because (supposedly) it contains the most information after accumulating through the residual stream.

In [3]:
class TransformerWithValueHead(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = HookedTransformer.from_pretrained(base_model)
        
        d_model = self.base_model.cfg.d_model
        self.value_head = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, 1))

    def forward(self, input_ids):
        value_head_output = None

        # resid_post: [batch seq d_model] so
        # value_head_ouput: [batch seq]
        def calc_and_store_value_head_output(resid_post, hook):
            # nonlocal: for variables inside nested functions
            nonlocal value_head_output
            value_head_output = self.value_head(resid_post).squeeze(-1)

        # run_with_hooks injects parameters
        logits = self.base_model.run_with_hooks(
            input_ids,
            return_type = "logits",
            # "normalized" to represent being after the LayerNorm
            fwd_hooks = [(utils.get_act_name("normalized"), calc_and_store_value_head_output)])
        
        return logits, value_head_output
    
model = TransformerWithValueHead("gpt2-small").to(device)

Loaded pretrained model gpt2-small into HookedTransformer


Defaulting `stop_at_eos = False` is interesting. From an interpretability perspective, `stop_at_eos = False`  helps with seeing hallucations. From a training perspective, it helps measure how well the model learned to stop and enables models to learn from full length text, not truncated text.

In [4]:
# prepend_bos: appending a BOS token at the start of a sequence, which marks the start
def get_samples(base_model, prompt, batch_size, gen_len, temperature, top_k, prepend_bos):
    # returns one tokenized prompt, squeeze to extract pure tokens
    input_ids = base_model.to_tokens(prompt, prepend_bos = prepend_bos).squeeze(0)

    output_ids = base_model.generate(
        # [tokens] becomes [batch_size tokens]
        # repeats input_ids once batch_size times
        input_ids.repeat(batch_size, 1), 
        max_new_tokens = gen_len, 
        stop_at_eos = False,
        temperature = temperature,
        top_k = top_k, 
        verbose = False
    )

    # samples: [batch_size sequence]
    samples = base_model.to_string(output_ids)

    # .clone() to prevent modification to internal output_ids
    return output_ids.clone(), samples

In [5]:
sample_ids, samples = get_samples(
    model.base_model,
    prompt = "This movie was really",
    batch_size = 5,
    gen_len = 15,
    temperature = 0.8,
    top_k = 15,
    prepend_bos = False
)

table = Table("Token IDs", "Samples", show_lines = True)
for ids, sample in zip(sample_ids, samples):
    # ids.tolist(): convert Tensor into Python list
    # repr(sample): printable representation (adds single quotes)
    table.add_row(str(ids.tolist()), repr(sample))

rprint(table)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# .half(): uses float16 precision for faster inference on GPUs
cls_model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb").half().to(device)
cls_tokenizer = AutoTokenizer.from_pretrained("lvwerra/distilbert-imdb")

def reward_fn_sentiment_imdb(gen_sample, direction: str = "pos"):
    # "pt" for pytorch tensors, padding + truncation to ensure same length generation
    tokens = cls_tokenizer(gen_sample, return_tensors = "pt", padding = True, truncation = True)["input_ids"].to(device)
    # logits: [batch_size, 2] for pos/neg classification
    logits = cls_model(tokens).logits
    # positive_cls: [batch_size] contains relevant class after softmaxing to get probabilities
    positive_cls = logits.softmax(-1)[:, 1 if (direction == "pos") else 0]
    return positive_cls.to(device)

In [7]:
def normalize_reward(reward, eps = 1e-5):
    return (reward - reward.mean()) / (reward.std() + eps)

Using the simple $A(s_t, a_t) = Q(s_t, a_t) - V(s_t)$ formula where $Q(s_t, a_t)$ is based off of the one-step Q estimates. if $t<T$, then our Q estimate is $V(s_{t+1})$, but if $t=T$, then we can use the known reward $r_t$ for the entire sequence.

GAE is an alternative but wouldn't bring a significant improvement since GAE is most helpful in reducing variance in advantage estimation, and our situation is low variance (each step adds a single token to our sequence).

In [8]:
def compute_advantages(values, rewards, prefix_len):
    one_step_est = t.cat([values[:, prefix_len:-1], rewards[:, None]], dim = -1)
    zero_step_est = values[:,  prefix_len-1:-1]
    return one_step_est - zero_step_est

# Memory

Compared to the PPO implementation, there are a few differences in `ReplayMemory`. 
- Don't need an `add` function because we add it all at once instead of one-by-one.
- Don't need multiple environments

And for `ReplayMinibatch`
- Don't need `actions` anymore since there isn't a sense of an "agent" since actions (tokens generated) are contained within the sequences
- Don't need `dones` since we set the sequence to be `gen_len` long
- Sotre `ref_logits` as a part of the KL penalty w.r.t the reference model

In [9]:
@dataclass
class ReplayMinibatch:
    sample_ids: Float[Tensor, "minibatch_size seq_len"]
    logprobs: Float[Tensor, "minibatch_size gen_len"]
    advantages: Float[Tensor, "minibatch_size gen_len"]
    returns: Float[Tensor, "minibatch_size gen_len"]
    ref_logits: Float[Tensor, "minibatch_size seq_len d_vocab"]

class ReplayMemory:
    def __init__(self, args, sample_ids, logprobs, advantages, values, ref_logits):
        self.args = args
        self.sample_ids = sample_ids
        self.logprobs = logprobs
        self.advantages = advantages
        self.values = values
        self.ref_logits = ref_logits

    def get_minibatches(self):
        minibatches = []

        # Detach tensors to avoid retaining computation graph and causing double-backward errors
        sample_ids = self.sample_ids.detach() if hasattr(self.sample_ids, "detach") else self.sample_ids
        logprobs = self.logprobs.detach() if hasattr(self.logprobs, "detach") else self.logprobs
        advantages = self.advantages.detach() if hasattr(self.advantages, "detach") else self.advantages
        values = self.values.detach() if hasattr(self.values, "detach") else self.values
        ref_logits = self.ref_logits.detach() if hasattr(self.ref_logits, "detach") else self.ref_logits

        # since we use 1-step advantage estimation
        # returns = next-step estimate of value function
        returns = advantages + values[:, -self.args.gen_len - 1: -1]

        for _ in range(self.args.batches_per_learning_phase):
            for indices in t.randperm(self.args.batch_size).reshape(self.args.num_minibatches, -1):
                minibatches.append(ReplayMinibatch(
                    sample_ids = sample_ids[indices],
                    logprobs=logprobs[indices],
                    advantages=advantages[indices],
                    returns=returns[indices],
                    ref_logits=ref_logits[indices]
                ))

        return minibatches

In addition to the 3 components of the total PPO objective, we'll add on the KL penalty as a part of the RLHF framework.
- The KL prediction shift penalty is $-\lambda_{KL} D_{KL}(\pi_{PPO}\phantom{.}|| \phantom{.}\pi_{base})$ (and not the other way) because the penalization should be for results that are likely under $\pi_{PPO}$ and unlikely under $\pi_{base}$. Expanding the KL penalty yields: $$\lambda_{KL} \cdot \sum_i \pi_{PPO_i}\log\left(\frac{\pi_{PPO_i}}{\pi_{base_i}}\right)$$
- The `entropy`, `value_fn`, and `clipped_sur_obj` functions are essentially the same from PPO 

In [10]:
# .mean() to aggregate over the batch + stabilize training
def calc_kl_penalty(logits, ref_logits, kl_coef):
    log_probs = logits.log_softmax(-1)
    ref_log_probs = ref_logits.log_softmax(-1)
    probs = log_probs.exp()

    kl_div = (probs * (log_probs - ref_log_probs)).sum(-1)

    return kl_coef * kl_div.mean()

def calc_entropy_bonus(logits, ent_coef):
    log_probs = logits.log_softmax(-1)
    probs = log_probs.exp()

    entropy = -(log_probs * probs).sum(-1)

    return ent_coef * entropy.mean()

# supervised regression loss for the value function
def calc_value_fn_loss(values, mb_returns, vf_coef):
    return 1/2 * vf_coef * (values - mb_returns).pow(2).mean()

def calc_clipped_sur_obj(logprobs, mb_logprobs, mb_advantages, clip_coef, eps = 1e-8):
    logits_diff = logprobs - mb_logprobs
    # ratio of the policies
    ratio = t.exp(logits_diff)

    # normalizing the advantages
    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + eps)

    # standard clip application
    non_clipped = ratio * mb_advantages
    clipped = t.clip(ratio, 1 - clip_coef, 1 + clip_coef) * mb_advantages

    return t.minimum(non_clipped, clipped).mean()

`get_log_probs` ensures that the output is always of size `(minibatch_size, gen_len)`. We only care about the log probs of the tokens generated, not in the prefix.

In [11]:
def get_log_probs(logits, tokens, prefix_len):
    if prefix_len is not None:
        logits = logits[:, prefix_len-1:]
        tokens = tokens[:, prefix_len-1:]
    
    log_probs = logits.log_softmax(-1)
    shaped_log_probs = eindex(log_probs, tokens, "b s [b s+1]")

    return shaped_log_probs

For both the base model and the value head, we define seperate learning rates, which makes sense since the value head is randomly initalized whereas the base model is already built out.

For the scheduler, we use a lienar warmup up to `1.0` then linear decay down to `args.final_scale`.

In [12]:
def get_optimizer(model, base_lr, head_lr):
    return t.optim.AdamW(
        [
           {"params": model.base_model.parameters(), "lr": base_lr},
           {"params": model.value_head.parameters(), "lr": head_lr} 
        ], maximize = True)

def get_optimizer_and_scheduler(args, model):
    def lr_lambda(step):
        if step < args.warmup_steps:
            return step / args.warmup_steps
        else:
            return 1 - (1 - args.final_scale) * (step - args.warmup_steps) / (args.total_phases - args.warmup_steps)
        
    optimizer = get_optimizer(model, args.base_lr, args.head_lr)
    scheduler = t.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lr_lambda)

    return optimizer, scheduler

# Training

In [13]:
class RLHFTrainer:
    def __init__(self, args):
        self.args = args
        self.run_name = f"{args.wandb_project_name}_{args.base_model}_{time.strftime('%Y%m%d-%H%M%S')}"

        self.model = TransformerWithValueHead(args.base_model).to(device).train()
        self.ref_model = HookedTransformer.from_pretrained(args.base_model).to(device).eval()
        self.optimizer, self.scheduler = get_optimizer_and_scheduler(self.args, self.model)
        self.prefix_len = len(self.model.base_model.to_str_tokens(self.args.prefix, prepend_bos = self.args.prepend_bos))

    def compute_rlhf_objective(self, minibatch):
        logits, values = self.model(minibatch.sample_ids)
        log_probs = get_log_probs(logits, minibatch.sample_ids, self.prefix_len)

        gen_len_slice = slice(-self.args.gen_len - 1, -1)

        # ??? understand the slicing and why it differs for minibatch and log_probs
        kl_penalty = calc_kl_penalty(logits[:, gen_len_slice], minibatch.ref_logits[:, gen_len_slice], self.args.kl_coef)
        entropy = calc_entropy_bonus(logits[:, gen_len_slice], self.args.ent_coef)
        value_fn_loss = calc_value_fn_loss(values[:, gen_len_slice], minibatch.returns, self.args.vf_coef)
        clipped_sur_obj = calc_clipped_sur_obj(log_probs, minibatch.logprobs, minibatch.advantages, self.args.clip_coef)

        ppo_obj_fn = clipped_sur_obj - value_fn_loss + entropy
        total_obj_fn = ppo_obj_fn - kl_penalty

        # RL-specific logging
        with t.inference_mode():
            logratio = log_probs - minibatch.logprobs
            ratio = logratio.exp()
            clipfracs = [((ratio - 1.0).abs() > self.args.clip_coef).float().mean().item()]
        wandb.log(
            dict(
                total_steps=self.step,
                lr=self.scheduler.get_last_lr()[0],
                clipped_surrogate_objective=clipped_sur_obj.item(),
                clipfrac=np.mean(clipfracs),
                value_loss=value_fn_loss.item(),
                values=values.mean().item(),
                entropy_bonus=entropy.item(),
                kl_penalty=kl_penalty.item(),
            ),
            step=self.step,
        )

        return total_obj_fn
    
    def rollout_phase(self):
        sample_ids, samples = get_samples(
            base_model = self.model.base_model,
            prompt = self.args.prefix,
            batch_size = self.args.batch_size,
            gen_len = self.args.gen_len,
            temperature = self.args.temperature,
            top_k = self.args.top_k,
            prepend_bos = self.args.prepend_bos)
        
        with t.inference_mode():
            logits, values = self.model(sample_ids)
            ref_logits = self.ref_model(sample_ids)

        log_probs = get_log_probs(logits, sample_ids, self.prefix_len)

        rewards = self.args.reward_fn(samples)
        rewards_mean = rewards.mean().item()
        rewards_normed = normalize_reward(rewards) if self.args.normalize_reward else rewards
        advantages = compute_advantages(values, rewards_normed, self.prefix_len)

        wandb.log({"Mean Reward": rewards_mean}, step = self.step)

        # visualization
        n_log_samples = min(3, self.args.batch_size)
        ref_logprobs = get_log_probs(ref_logits[:n_log_samples], sample_ids[:n_log_samples], self.prefix_len).sum(-1)
        headers = ["Reward", "Ref logprobs", "Sample"]
        table_data = [[str(int(r)), f"{lp:.2f}", repr(s)] for r, lp, s in zip(rewards.tolist(), ref_logprobs, samples)]
        table = tabulate(table_data, headers, tablefmt="simple_grid", maxcolwidths=[None, None, 90])
        print(f"Phase {self.phase+1:03}/{self.args.total_phases:03}, Mean reward: {rewards_mean:.4f}\n{table}\n")

        return ReplayMemory(
            args = self.args,
            sample_ids = sample_ids,
            logprobs = log_probs,
            advantages = advantages,
            values = values,
            ref_logits = ref_logits)
        
    def learning_phase(self, memory):
        for minibatch in tqdm(memory.get_minibatches(), desc = f"Learning phase {self.phase+1}"):
            self.optimizer.zero_grad()
            total_obj_fn = self.compute_rlhf_objective(minibatch)
            total_obj_fn.backward()
            # clip according to max_norm
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm = self.args.max_grad_norm)
            self.optimizer.step()
            self.step += 1

        self.scheduler.step()

    def train(self):
        # ??? why define these here instead of global
        self.step = 0
        self.samples = []

        wandb.init(
            project = self.args.wandb_project_name,
            entity = self.args.wandb_entity,
            name = self.run_name,
            config = self.args,
        )

        for self.phase in tqdm(range(self.args.total_phases), desc = "Training phases"):
            memory = self.rollout_phase()
            self.learning_phase(memory)

        wandb.finish()

In [14]:
# testing with kl_coef = 0.0, it has no incentives to match the ref distribution, only maximize reward
args = RLHFArgs(kl_coef=0.0, total_phases=5, warmup_steps=0, reward_fn=reward_fn_sentiment_imdb)
trainer = RLHFTrainer(args)
trainer.train()

Loaded pretrained model gpt2-small into HookedTransformer
Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  mps


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mdjdumpling[0m ([33mdjdumpling-yale[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_

Training phases:   0%|          | 0/5 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Phase 001/005, Mean reward: 0.8828
┌──────────┬────────────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│   Reward │   Ref logprobs │ Sample                                                                                     │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -121.07 │ '<|endoftext|>This movie was really cool. I was looking through the DVD and I was like,    │
│          │                │ "What the? You guys did it? I\'m just looking for something to make you laugh, I\'m just a │
│          │                │ kid. I just wanna be like, I can do this,'                                                 │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │         -95.07 │ '<|endoftext|>This movie was really good! It was funny and it was a great 

wandb-core(97658) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97675) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97678) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97704) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97710) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97726) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97730) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97739) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97751) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(97774) MallocStackLogging: can't turn off malloc stack logging because 

Phase 002/005, Mean reward: 0.5840
┌──────────┬────────────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│   Reward │   Ref logprobs │ Sample                                                                                     │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -123.44 │ '<|endoftext|>This movie was really just my own experience and it was a huge relief to see │
│          │                │ it. The music is amazing but not by all the artists. I have heard this movie a thousand    │
│          │                │ times but never once have I heard it. It is a very dark and gritty movie that'             │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -101.12 │ "<|endoftext|>This movie was really made for me. I don't think it's the sa

wandb-core(8854) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(8869) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(8896) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(8917) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(8961) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(8991) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(9015) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(9039) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(9077) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(9096) MallocStackLogging: can't turn off malloc stack logging because it was not

Phase 003/005, Mean reward: 0.6479
┌──────────┬────────────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│   Reward │   Ref logprobs │ Sample                                                                                     │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -124.75 │ '<|endoftext|>This movie was really just about the best, and the best, I could have ever   │
│          │                │ imagined. It was all very well and dandy, but it was not for everybody. There was a bit of │
│          │                │ a dark humor, but it never really got into the character. I'                               │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -130.39 │ "<|endoftext|>This movie was really a work in progress with no actual foot

wandb-core(20838) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20861) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20869) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20905) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20918) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20926) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20943) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20964) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(20969) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(21026) MallocStackLogging: can't turn off malloc stack logging because 

Phase 004/005, Mean reward: 0.6699
┌──────────┬────────────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│   Reward │   Ref logprobs │ Sample                                                                                     │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -127.61 │ '<|endoftext|>This movie was really not for the money. I did this because I wanted to be   │
│          │                │ entertained and have people think I was crazy and not just be an idiot. I also felt I      │
│          │                │ needed to make it as a show and this was the place for that. I also wanted'                │
├──────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│        0 │        -126.45 │ "<|endoftext|>This movie was really taken by me. I don't think anyone coul

wandb-core(32315) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32340) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32343) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32366) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32389) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32422) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32441) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32481) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32521) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(32537) MallocStackLogging: can't turn off malloc stack logging because 

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x1771f99d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 3123f1d30, execution_count=14 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 3123f3770, raw_cell="# testing with kl_coef = 0.0, it has no incentives.." transformed_cell="# testing with kl_coef = 0.0, it has no incentives.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/alexwa/Documents/GitHub/rl/rlhf_transformer/rlhf.ipynb#X33sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe