In [1]:
!pip install transformers[torch] torch trl pandas numpy peft google-colab datasets bitsandbytes wandb



In [2]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler



In [3]:
# Define default values for the script
model_name = "danyoung/finance-qa"  # Example model name
tokenizer_name = "danyoung/finance-qa"
reward_model_name =  "grantjw/reward_model_fiqa"
learning_rate = 1.41e-5
output_max_length = 128
mini_batch_size = 1
batch_size = 16
ppo_epochs = 4
gradient_accumulation_steps = 4
adafactor = False
early_stopping = False
target_kl = 0.1
reward_baseline = 0.0
batched_gen = False
save_freq = None
output_dir = "runs/"
seed = 0
steps = 20000
init_kl_coef = 0.2
adap_kl_ctrl = True

In [4]:
config = PPOConfig(
    steps=steps,
    model_name=model_name,
    learning_rate=learning_rate,
    log_with="wandb",  # As per your command
    batch_size=batch_size,
    mini_batch_size=mini_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optimize_cuda_cache=True,  # Keeping the original default
    early_stopping=early_stopping,
    target_kl=target_kl,
    ppo_epochs=ppo_epochs,
    seed=seed,
    init_kl_coef=init_kl_coef,
    adap_kl_ctrl=adap_kl_ctrl,
)



In [5]:
# Load and preprocess dataset
train_dataset = load_dataset("gbharti/wealth-alpaca_lora", split='train[30:40%]')
train_dataset = train_dataset.select(range(1000))
original_columns = train_dataset.column_names
train_dataset

Dataset({
    features: ['instruction', 'output', 'input'],
    num_rows: 1000
})

In [6]:
original_columns = train_dataset.column_names
original_columns

['instruction', 'output', 'input']

In [7]:
# We then define the arguments to pass to the sentiment analysis pipeline.
# We set `return_all_scores` to True to get the sentiment score for each toke
sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": 16,
    "truncation": True,
}

In [8]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
# only for this model.

1

In [9]:
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
   # model.config.pad_token_id = tokenizer.eos_token_id


In [10]:
tokenizer.eos_token_id

2

In [11]:
tokenizer.eos_token

'</s>'

In [12]:
# Below is an example function to build the dataset. In our case, we use the IMDB dataset
# from the `datasets` library. One should customize this function to train the model on
# its own dataset.

def build_dataset(
    tokenizer,
    dataset_name="lvwerra/stack-exchange-paired",
):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """

    num_proc = 24

    def preprocess_function(examples):
        new_examples = {
            "query": [],
            "input_ids": [],
        }
        for question in examples["instruction"]:
            query = "instruction: " + question + "\n\nAnswer: "
            tokenized_question = tokenizer(query, truncation=True)
            new_examples["query"].append(query)
            new_examples["input_ids"].append(tokenized_question["input_ids"])

        return new_examples

    ds = train_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=original_columns,
    )
    ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)

    ds.set_format(type="torch")
    return ds

In [13]:
# We retrieve the dataloader by calling the `build_dataset` function.
dataset = build_dataset(tokenizer)

In [14]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])


In [15]:
# set seed before initializing value head for deterministic eval
set_seed(config.seed)

In [16]:
current_device = Accelerator().local_process_index


In [17]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [18]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    load_in_4bit=True,
    device_map={"": current_device},
    peft_config=lora_config,
    pad_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [19]:
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id


In [20]:
optimizer = None
if adafactor:
    optimizer = Adafactor(
        filter(lambda p: p.requires_grad, model.parameters()),
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
        lr=learning_rate,  # using learning_rate from your provided arguments
    )

In [21]:
# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

[34m[1mwandb[0m: Currently logged in as: [33mgrantjkw[0m ([33mgrant-capstone[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [22]:
# We then build the sentiment analysis pipeline using our reward model, passing the
# model name and the sentiment analysis pipeline arguments. Let's also make sure to
# set the device to the same device as the PPOTrainer.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model_name,
    device_map={"": current_device},
    model_kwargs={"load_in_4bit": True},
    tokenizer=tokenizer,
    return_token_type_ids=False,
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at danyoung/finance-qa and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.

generation_kwargs = {
    # "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": 10000,
}
output_min_length = 32
output_max_length = 128
reward_baseline = 0.0
output_dir = "runs/"
save_freq = None
output_length_sampler = LengthSampler(output_min_length, output_max_length)

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    if epoch >= config.total_ppo_epochs:
        break

    question_tensors = batch["input_ids"]

    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"] - reward_baseline) for output in pipe_outputs]

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    if save_freq and epoch and epoch % save_freq == 0:
        ppo_trainer.save_pretrained(output_dir + f"step_{epoch}")

0it [00:00, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
0it [00:00, ?it/s]


RuntimeError: ignored