In [1]:
!pip install transformers accelerate bitsandbytes peft trl datasets tqdm wandb sentencepiece tokenizers evaluate nltk rouge_score

[0m

In [2]:
from accelerate import Accelerator

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

from huggingface_hub import login

from peft import LoraConfig
from trl import (
    AutoModelForCausalLMWithValueHead,
    PPOConfig,
    PPOTrainer,
)
from trl.core import LengthSampler
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

import evaluate
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

import wandb

import gc

import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
print(f'Is CUDA available on torch? {torch.cuda.is_available()}')

Is CUDA available on torch? True


In [4]:
login(token='<Hugging Face Token>')

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
wandb.login(key='<Wandb Token>') # log into wandb

[34m[1mwandb[0m: Currently logged in as: [33mhonlam-chung[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
current_device = Accelerator().local_process_index

In [7]:
torch.cuda.empty_cache()
gc.collect()

40

In [8]:
lora_alpha =  512
lora_dropout = 0.1
lora_rank = 256

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

In [9]:
policy_id = 'deepaknh/falcon7B_FineTuning_Experiment2_QLORA_7perParam'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    policy_id,
    device_map={"": current_device},
    quantization_config=bnb_config,
    peft_config=peft_config,
)



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]



In [10]:
model_id = "vilsonrodrigues/falcon-7b-sharded"

rl_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, model_max_length=512)

if getattr(rl_tokenizer, "pad_token", None) is None:
    rl_tokenizer.pad_token = rl_tokenizer.eos_token

In [11]:
reward_model_id = 'reward_model_falcon_test_1'

reward_model = reward_model = AutoModelForSequenceClassification.from_pretrained(
    reward_model_id,
    device_map={"" : current_device},
)

In [12]:
reward_model_tokenizer = AutoTokenizer.from_pretrained(reward_model_id)

if reward_model_tokenizer.pad_token is None:
    reward_model_tokenizer.pad_token = reward_model_tokenizer.eos_token
    reward_model_id.config.pad_token_id = reward_model_id.config.eos_token_id

In [13]:
data_files = {
    'train': 'train_augment_data.json',
}

In [14]:
aug_dataset = load_dataset('json', data_files=data_files)

In [15]:
aug_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question'],
        num_rows: 147276
    })
})

In [16]:
def build_dataset(tokenizer, dataset):

    num_proc = 24

    def preprocess_function(examples):
        new_examples = {
            "query": [],
            "input_ids": [],
            "truth": [],
        }

        for question, answer in zip(examples['question'], examples['answer']):
            query = f"Question: {question} \n\nAnswer: "
            tokenized_question = tokenizer(query, truncation=True)
            new_examples["query"].append(query)
            new_examples["truth"].append(answer)
            new_examples["input_ids"].append(tokenized_question["input_ids"])

        return new_examples

    ds = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=['question', 'answer']
    )

    ds.set_format(type="torch")
    return ds

In [17]:
ppo_dataset = build_dataset(rl_tokenizer, aug_dataset)

In [18]:
ppo_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'input_ids', 'truth'],
        num_rows: 147276
    })
})

In [19]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [20]:
lr = 5e-7

config = PPOConfig(
    model_name=policy_id,
    steps=1000,
    learning_rate=lr,
    batch_size=64,
    mini_batch_size=1,
    gradient_accumulation_steps=4,
    optimize_cuda_cache=True,
    ppo_epochs=5,
    target_kl=0.1,
    init_kl_coef=0.2,
    adap_kl_ctrl=True,
    remove_unused_columns=False,
    log_with='wandb',
    seed=42,
)

In [21]:
adam = optim.Adam(policy_model.parameters(), lr=lr)
scheduler = lr_scheduler.ExponentialLR(adam, gamma=0.1)

In [22]:
ppo_trainer = PPOTrainer(
    config,
    model=policy_model, # e.g. flan T5 fine-tuned
    ref_model=None, # e.g., flan T5 base line / without fine-tuning
    tokenizer=rl_tokenizer,
    dataset=ppo_dataset['train'],
    data_collator=collator,
    optimizer=adam,
    lr_scheduler=scheduler,
)

In [23]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0

In [24]:
sent_kwargs = {
    'top_k': None,
    "function_to_apply": "none",
    "batch_size": 32,
    "truncation": True,
}

In [25]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    reward_model,
    device_map={"" : current_device},
    tokenizer=reward_model_tokenizer,
    return_token_type_ids=False,
)

In [26]:
generation_kwargs = {
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": reward_model_tokenizer.pad_token_id,
    "eos_token_id": 100_000,
    "no_repeat_ngram_size": 2,
}

In [27]:
output_min_length = 32
output_max_length = 128
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [28]:
# set evaluation metric
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(preds, labels, tokenizer):
    if isinstance(preds, tuple):
        preds = preds[0]

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(preds, labels)

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {k: round(v * 100, 4) for k, v in rouge_result.items()}
    for k, v in bleu_result.items():
        if isinstance(v, list):
            result[k] = list(map(lambda x: round(x * 100, 4), v))
        else:
            result[k] = round(v * 100, 4)

    return result

In [29]:
store_metrics = []

In [30]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    if epoch >= config.total_ppo_epochs:
        break

    # leverage pre-tokenized dataset
    question_tensors = batch["input_ids"]

    # compute response tensors from our ppo_trainer
    # exclude the prompt from the output
    # ensure it's the correct length
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )

    # batch decode our responses
    batch["response"] = rl_tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
    rewards_dict = {
        'avg_reward': np.mean(rewards),
        'max_reward': np.max(rewards),
        'min_reward': np.min(rewards),
    }

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)
    
    metrics = compute_metrics(batch['response'], batch['truth'], rl_tokenizer)
    metrics.update(rewards_dict)
    store_metrics.append(metrics)

10it [26:50, 161.58s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
16it [43:14, 162.18s/it]


In [31]:
torch.cuda.empty_cache()
gc.collect()

3744

In [32]:
df = pd.DataFrame(store_metrics)

In [33]:
df

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length,avg_reward,max_reward,min_reward
0,31.6466,8.2671,20.5318,25.9335,5.1771,"[26.3479, 6.2949, 2.8204, 1.5357]",100.0,140.1847,409900,292400,-1.265847,5.606047,-5.991078
1,34.0014,11.4225,23.5553,29.287,7.8351,"[28.0655, 8.4677, 4.7867, 3.3129]",100.0,134.1368,378400,282100,0.150411,5.932647,-6.064582
2,33.6652,11.0547,22.6692,28.6415,8.7742,"[28.4762, 9.7114, 5.5853, 3.8372]",100.0,138.4747,404900,292400,-0.161244,6.06037,-6.098593
3,33.4687,10.6026,22.8337,27.8283,7.3155,"[28.2496, 8.5571, 4.3921, 2.6975]",100.0,136.9989,386200,281900,-0.752609,6.027473,-6.210948
4,30.2941,7.9926,20.6846,25.8159,5.8142,"[26.6066, 6.9333, 3.3525, 1.8479]",100.0,115.8302,343900,296900,0.684927,5.968105,-6.128095
5,32.2643,8.9259,21.5265,27.9167,5.9317,"[26.3181, 7.0263, 3.3349, 2.0075]",100.0,150.151,447600,298100,-0.951631,5.99031,-6.277581
6,31.4443,8.6545,20.6687,26.8658,6.087,"[27.3947, 7.0434, 3.4454, 2.0651]",100.0,129.7702,372700,287200,0.675229,5.836922,-5.534246
7,31.4665,10.27,21.7482,26.681,6.8906,"[25.0061, 7.5397, 4.2076, 2.8418]",100.0,151.4983,409500,270300,-0.227136,6.075751,-6.217311
8,32.803,9.2336,22.2786,27.4597,5.7337,"[27.1472, 6.9441, 3.1205, 1.8373]",100.0,128.7696,362100,281200,0.030481,3.617817,-6.11173
9,33.8082,10.4472,22.7851,29.1162,7.365,"[27.7123, 8.8602, 4.4504, 2.6927]",100.0,145.8548,424000,290700,-0.518518,5.822958,-6.250123


In [34]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):

    # leverage pre-tokenized dataset
    question_tensors = batch["input_ids"]

    # compute response tensors from our ppo_trainer
    # exclude the prompt from the output
    # ensure it's the correct length
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )

    batch["response"] = rl_tokenizer.batch_decode(response_tensors, skip_special_tokens=True)
    print(batch['query'][0])
    print(batch['response'][0])
    print(batch['truth'][0])
    break

0it [01:24, ?it/s]

Question: My beloved dog has been showing signs of redness and discharge in their eyes lately. I'm worried it might be bacterial conjunctivitis. Can you tell me about the symptoms and treatments for this condition in dogs? 

Answer: 
 Bacterial conjuntivial infection, also called ocular bacteri conjundiva, is a bacterial infection that affects the eye. Symptoms include a pink or red color to the eyes, with the possibility of discharge or swelling. Treatment often involves antibiotic eye drops or ointments. It's important to seek veterinary attention promptly to ensure proper diagnosis and treatment to prevent any further complications. We understand how concerning it can be to see your
Symptoms of bacterial conjunctivitis in dogs include redness, discharge, and swelling. Treatments often include antibiotic eye drops or ointments. It's important to consult with a veterinarian for proper diagnosis and treatment to ensure your furry friend's eyes are healthy and comfortable.





In [37]:
df.to_csv(f'rlhf-falcon_test.csv')

In [38]:
ppo_trainer.save_pretrained("rlhf-falcon_test")