In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.5.0
!pip install -q sentencepiece

In [None]:
import random

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)
from trl import RewardTrainer

In [None]:
import pandas as pd
from datasets import Dataset

def format(example):
    """Map one raw preference row onto the instruction/chosen/rejected schema.

    NOTE: the name shadows the built-in ``format``; it is kept because the
    notebook passes this function to ``dataset.map`` by that name.

    Args:
        example: Mapping with the keys ``'prompt'``, ``'answer1'`` and
            ``'answer2'``.

    Returns:
        dict: ``instruction`` (the prompt), ``chosen_response`` (taken from
        ``answer2``) and ``rejected_response`` (taken from ``answer1``).
    """
    # answer2 is treated as the preferred answer, answer1 as the rejected one.
    source_keys = {
        "instruction": "prompt",
        "chosen_response": "answer2",
        "rejected_response": "answer1",
    }
    return {target: example[source] for target, source in source_keys.items()}

# Load the raw preference data, dropping the 'Unnamed: 0' column (presumably a
# saved index) and every row that has a missing value.
column_to_drop = 'Unnamed: 0'
generated_examples = (
    pd.read_csv('/content/new_df')
    .drop(columns=[column_to_drop])
    .dropna(axis=0, how='any')
)

# Wrap the cleaned frame in a Hugging Face dataset.
dataset = Dataset.from_pandas(generated_examples)

# Remember the raw column names so the map below can discard them.
original_columns = dataset.column_names

# Rewrite every row into the instruction / chosen / rejected schema.
dataset = dataset.map(format, remove_columns=original_columns)

# Show one formatted sample.
dataset[1]

In [None]:
# Display the formatted dataset object.
dataset

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    AutoTokenizer,
)

# Load tokenizer for the "tiiuae/falcon-7b-instruct" model
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

# Prepare quantization parameters: 4-bit loading on, 8-bit loading off
quantization_config = BitsAndBytesConfig(load_in_8bit=False, load_in_4bit=True)

# Initialize the sequence classification model; it is the model handed to
# RewardTrainer further down, i.e. the reward model being trained.
model = AutoModelForSequenceClassification.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    quantization_config=quantization_config,  # Apply the quantization configuration
    device_map={"": 0},  # Assign the model to device 0
    trust_remote_code=True,  # Trust remote code
    num_labels=1,  # One output logit per input, used as a scalar score
)

# Disable cache in model configuration
model.config.use_cache = False

In [None]:
# If the tokenizer's pad_token is not set, use eos_token as pad_token and update model's pad_token_id.
# A pad token is required because formatting_func pads every sequence to max_length.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


# Define a formatting function for processing examples
def formatting_func(examples):
    """Tokenize one preference pair for reward-model training.

    The instruction is joined to each response with a single line break and
    both texts are tokenized with the notebook-level ``tokenizer``, truncated
    or padded to 512 tokens.

    Args:
        examples: Mapping with the keys ``instruction``, ``chosen_response``
            and ``rejected_response`` (one example, not a batch).

    Returns:
        dict: ``input_ids_chosen``, ``attention_mask_chosen``,
        ``input_ids_rejected`` and ``attention_mask_rejected``, each being
        element 0 of the corresponding tokenizer output.
    """
    instruction = examples["instruction"]
    texts = {
        "chosen": instruction + "\n" + examples["chosen_response"],
        "rejected": instruction + "\n" + examples["rejected_response"],
    }

    # Same tokenizer settings for both texts; the chosen text is encoded first.
    encoded = {
        label: tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        for label, text in texts.items()
    }

    # Index 0 selects the only sequence in each tokenizer output.
    return {
        "input_ids_chosen": encoded["chosen"]["input_ids"][0],
        "attention_mask_chosen": encoded["chosen"]["attention_mask"][0],
        "input_ids_rejected": encoded["rejected"]["input_ids"][0],
        "attention_mask_rejected": encoded["rejected"]["attention_mask"][0],
    }


# Apply the formatting function to the prepared dataset; the tokenized fields
# are added alongside the existing instruction/response columns.
formatted_dataset = dataset.map(formatting_func)

# Split the formatted dataset into training and testing sets.
# No split size or seed is passed, so the library defaults apply.
formatted_dataset = formatted_dataset.train_test_split()

In [None]:
from transformers import TrainingArguments
from peft import LoraConfig
from trl import RewardTrainer

# Prepare training parameters
training_args = TrainingArguments(
    output_dir="./train_logs",  # Output folder
    # max_steps takes precedence over num_train_epochs in TrainingArguments, so
    # the run length is stated once here; the epoch count that used to be passed
    # as well was ignored and only obscured how long training runs.
    max_steps=100,  # Maximum number of training steps
    per_device_train_batch_size=4,  # Batch size per GPU for training
    gradient_accumulation_steps=1,  # Number of steps to accumulate gradients
    learning_rate=1.0e-4,  # Learning rate
    optim="adamw_torch",  # Optimizer
    save_steps=50,  # How often to save checkpoints
    logging_steps=10,  # How often to log training information
    report_to="tensorboard",  # Reporting method (in this case, TensorBoard)
    remove_unused_columns=False,  # Keep the *_chosen / *_rejected columns for the trainer
    evaluation_strategy="steps",  # Evaluate periodically during training
)

# Prepare PEFT parameters
peft_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=16,  # LoRA scaling factor
    # Modules that receive LoRA adapters.
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
    bias="none",  # Bias setting
    task_type="SEQ_CLS",  # Task type (Sequence Classification)
    # The classification head of the Falcon sequence-classification model is
    # named "score"; the previous value "scores" matched no module.
    modules_to_save=["score"],
)

# Prepare RewardTrainer
trainer = RewardTrainer(
    model=model,  # The sequence-classification model trained as the reward model
    tokenizer=tokenizer,  # The tokenizer for processing input data
    args=training_args,  # Training arguments
    train_dataset=formatted_dataset["train"],  # Training dataset
    eval_dataset=formatted_dataset["test"],  # Evaluation dataset
    peft_config=peft_config,  # PEFT configuration
    max_length=512,  # Maximum length of input
)

# Execute training
trainer.train()

# Save the trained reward model
trainer.model.save_pretrained("./reward_model")

In [None]:
import torch


def get_score(model, tokenizer, prompt, response):
    """
    Computes a score for a given prompt and response using a provided model and tokenizer.

    The prompt and response are joined with a single line break before
    tokenization. This is the same layout ``formatting_func`` feeds to the
    model during training, so scoring sees inputs shaped like the training
    data (tokenizing them as a separate text pair did not insert the break).

    NOTE: a later cell of this notebook defines another function named
    ``get_score`` with a different signature; once that cell runs, it replaces
    this one.

    Args:
        model (nn.Module): The model for scoring; must produce a single logit.
        tokenizer: The tokenizer for processing input data.
        prompt (str): The prompt text.
        response (str): The response text.

    Returns:
        float: The computed score (the model's single output logit).
    """
    # Send the inputs to wherever the model's weights live instead of assuming
    # "cuda:0"; fall back to the old target for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cuda:0"

    # Tokenize the input sequence
    inputs = tokenizer.encode_plus(
        prompt + "\n" + response,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    ).to(device)

    # Perform forward pass without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs, return_dict=True)

    # .item() requires the logits to hold exactly one element.
    return outputs.logits.item()

In [None]:
# Index of the example to inspect; the next cell scores this same example.
x = 40
dataset[x]

In [None]:
# Get the prompt and responses for the example
prompt = dataset[x]["instruction"]
rejected_response = dataset[x]["rejected_response"]
chosen_response = dataset[x]["chosen_response"]

# `model` is the sequence-classification model that was handed to RewardTrainer above.
# Get the score for the example with the less preferred response
score_less_pref = get_score(model, tokenizer, prompt, rejected_response)
print(f"Score for less preferred response: {score_less_pref}")

# Get the score for the example with the preferred response
score_pref = get_score(model, tokenizer, prompt, chosen_response)
print(f"Score for preferred response: {score_pref}")

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

import pandas as pd
# Reload the raw CSV and drop the 'Unnamed: 0' column.
# Unlike the first load in this notebook, rows with missing values are kept here.
generated_examples = pd.read_csv('/content/new_df')
column_to_drop = 'Unnamed: 0'
generated_examples.drop(column_to_drop, axis=1, inplace=True)

# Hold out 40% of the rows; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(generated_examples, test_size=0.4, random_state=42)

from datasets import Dataset
# Build a dataset from the training rows, then remove the '__index_level_0__'
# column (presumably the pandas index carried over by from_pandas).
train_dataset = Dataset.from_pandas(train_df)
train_dataset=train_dataset.remove_columns('__index_level_0__')


In [None]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [None]:
# PPO configuration: only the policy model name and the learning rate are set here.
config = PPOConfig(
    model_name="tiiuae/falcon-7b-instruct",
    learning_rate=1.41e-5,
)

# NOTE(review): sent_kwargs is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

In [None]:
import pandas as pd
from datasets import Dataset

def build_dataset(config, df, input_min_text_length=2, input_max_text_length=200):
    """
    Build the PPO query dataset from a DataFrame of prompts.

    The 'Unnamed: 0' column is dropped when present, 60% of the rows are kept
    as the training portion (fixed ``random_state``, so the selection is
    repeatable), and every prompt is tokenized and cut to a length drawn from
    a ``LengthSampler``.

    Args:
        config:
            Object whose ``model_name`` attribute names the tokenizer to load.
        df (`pandas.DataFrame`):
            Source rows; must contain a ``prompt`` column. The frame is left
            unmodified.
        input_min_text_length (`int`):
            Lower bound handed to ``LengthSampler``.
        input_max_text_length (`int`):
            Upper bound handed to ``LengthSampler``.

    Returns:
        ds (`datasets.Dataset`):
            Torch-formatted dataset built from the training rows, with an
            ``input_ids`` column (truncated token ids) and a ``query`` column
            (those ids decoded back to text) added.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Drop on a copy rather than in place: the caller's frame stays intact, and
    # a frame that has already lost the column no longer raises a KeyError.
    generated_examples = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Only the training portion is used; the held-out part is discarded.
    train_df, _ = train_test_split(generated_examples, test_size=0.4, random_state=42)

    ds = Dataset.from_pandas(train_df)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        # A fresh length is sampled for every row.
        sample["input_ids"] = tokenizer.encode(sample["prompt"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [None]:
# Read the raw CSV and turn it into the tokenized PPO query dataset.
generated_examples = pd.read_csv('/content/new_df')
dataset = build_dataset(config,df=generated_examples)

In [None]:
# remove_columns returns a new dataset instead of modifying the existing one,
# so the result has to be assigned back for the column to actually go away.
# The membership check keeps this cell safe to run more than once.
if '__index_level_0__' in dataset.column_names:
    dataset = dataset.remove_columns('__index_level_0__')
dataset

In [None]:
# Policy model (causal LM with a value head). This rebinds `model`, which until
# now referred to the sequence-classification reward model.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
# Second copy loaded from the same checkpoint; it is passed to PPOTrainer as the reference model.
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def collator(data):
    """Turn a list of per-sample dicts into one dict of per-key lists.

    The key set (and its order) is taken from the first sample, so every
    sample must provide those keys and ``data`` must not be empty.

    Args:
        data: Non-empty list of dicts, one per sample.

    Returns:
        dict: Each key of the first sample mapped to the list of that key's
        values across all samples, in sample order.
    """
    columns = {}
    for key in data[0]:
        columns[key] = [sample[key] for sample in data]
    return columns

# Quick demonstration of the collator on a batch that holds a single sample.
test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

In [None]:
# Build the PPO trainer from the config, policy model, reference model,
# tokenizer, query dataset and the custom collator.
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

In [None]:
# Start from the device chosen by the trainer's accelerator.
device = ppo_trainer.accelerator.device
# In a single-process run, use GPU index 0 when CUDA is available, otherwise "cpu".
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

In [None]:
# Generation settings: sampling enabled, top_k 0.0, top_p 1.0, EOS id reused as the pad token id.
# NOTE(review): gen_kwargs is not referenced by the other cells visible here; the
# PPO loop builds its own generation_kwargs — confirm whether this is still needed.
gen_kwargs = {"min_length": -1, "top_k": 0.0, "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id}

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

import pandas as pd
# Reload the raw CSV once more and drop the 'Unnamed: 0' column.
generated_examples = pd.read_csv('/content/new_df')
column_to_drop = 'Unnamed: 0'
generated_examples.drop(column_to_drop, axis=1, inplace=True)

# Keep 60% of the rows as the training portion (test_size=0.4, fixed random_state).
train_df, val_df = train_test_split(generated_examples, test_size=0.4, random_state=42)

from datasets import Dataset
# Rebuild `dataset` from the raw training rows; this replaces the dataset that
# build_dataset produced earlier.
train_dataset = Dataset.from_pandas(train_df)
dataset=train_dataset.remove_columns('__index_level_0__')

In [None]:
# Rename the 'prompt' column to 'review', then cap each review at its first 1000 characters.
dataset = dataset.rename_columns({"prompt": "review"})
dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)

In [None]:
# NOTE(review): txt_in_len, txt_out_len and seed are not used by any other line
# visible in this notebook (the tokenizer call below hard-codes max_length=32).
txt_in_len = 5
txt_out_len = 32
seed = 1

# Tokenize the 'answer2' text (prefixed with a space), truncated or padded to
# 32 tokens; [0] takes the first row of the tokenizer output.
# NOTE(review): the query is built from 'answer2', not from the 'review'
# (prompt) column — confirm this is intended.
dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.encode(" " + x["answer2"], return_tensors="pt", truncation=True, padding="max_length", max_length=32)[0]},
    batched=False,
)
# Decode the token ids back to text to form the 'query' column.
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
# Keep at most the first 20480 rows; the slice is wrapped back into a Dataset below.
dataset = dataset[:20480]
from datasets import Dataset

dataset = Dataset.from_dict(dataset)
# Switch the dataset's output format to "pytorch".
dataset.set_format("pytorch")

In [None]:
# Display the dataset after tokenization.
dataset

In [None]:
# Give the answer columns preference-style names: answer2 -> chosen, answer1 -> rejected.
dataset = dataset.rename_column('answer2', 'chosen')
dataset = dataset.rename_column('answer1', 'rejected')

In [None]:
# Display the dataset after the column renames.
dataset

In [None]:
# Policy model that PPO will update.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
# The reference model anchors PPO's KL penalty, so it has to start out identical
# to the policy: load it from the same checkpoint. The directory written by the
# reward-training cell holds a sequence-classification (SEQ_CLS) adapter and is
# not a stand-in for the starting policy.
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch


import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# PPO hyperparameters.
learning_rate=1.41e-5
max_ppo_epochs=5
batch_size=1
# A mini-batch is a slice of one PPO batch, so it must not be larger than
# batch_size (it was 4 against a batch size of 1). With a single sample per
# batch the optimisation step still processes exactly that one sample.
mini_batch_size=1

config = PPOConfig(
    model_name="tiiuae/falcon-7b-instruct",
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

# Rebuild the trainer with the new configuration, the policy/reference models,
# the tokenized dataset and the list-of-dicts collator.
ppo_trainer = PPOTrainer(config=config,
                         model=model,
                         ref_model=ref_model,
                         tokenizer=tokenizer,
                         dataset=dataset,
                         data_collator=collator)

In [None]:
def get_score(model, tokenizer, responses):
    """
    Score each text with ``model`` and return one scalar tensor per text.

    Every text is tokenized on its own (truncated or padded to 512 tokens),
    run through the model without gradient tracking, and reduced to the mean
    of the first element of the model output.

    NOTE: this definition replaces the earlier
    ``get_score(model, tokenizer, prompt, response)`` from this notebook.

    Args:
        model (nn.Module): The scoring model.
        tokenizer: The tokenizer for processing input data.
        responses: Iterable of texts to score.

    Returns:
        list: One 0-dim tensor per input text, in input order.
    """
    # Send the inputs to wherever the model's weights live instead of assuming
    # "cuda:0"; fall back to the old target for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cuda:0"

    scores = []
    for text in responses:
        encoded = tokenizer.encode_plus(
            text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded)

        # The first output element is averaged down to a single value.
        scores.append(outputs[0].mean())

    return scores

In [None]:
# Bounds for the number of new tokens sampled per generation.
output_min_length = 10
output_max_length = 50
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None,
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 5
}

max_ppo_steps = 10

# Rewards must come from the reward model trained earlier (still held by the
# RewardTrainer), not from the policy that PPO is optimizing: scoring with the
# policy would reward it with the mean of its own language-model logits.
reward_model = trainer.model

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]

    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()

        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

        # The generated sequence starts with the prompt. Cut the prompt off by
        # its length: taking the last max_new_tokens tokens would pull prompt
        # tokens into the response whenever generation stops early.
        summary_tensors.append(summary.squeeze()[prompt_tensor.shape[0]:])

    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    logits = get_score(reward_model, tokenizer, texts)

    # One scalar reward tensor per sample, so the list length always matches
    # the batch (a single stacked tensor only lined up for a batch size of 1).
    reward_tensors = [score.detach().float().cpu() for score in logits]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    # Separator line of 99 dashes (same output as before).
    print('-' * 99)

In [None]:
# Save the PPO-trained policy and its tokenizer into the same directory.
model.save_pretrained("rhlfmodel/")
tokenizer.save_pretrained("rhlfmodel/")

In [None]:
from transformers import pipeline, set_seed
# Directory written by the save cell above.
model_path = "rhlfmodel/"
# Seed the random number generators before sampling.
set_seed(42)
# Text-generation pipeline over the saved model and tokenizer: max_length 30, one sequence per prompt.
pipe = pipeline("text-generation",model=model_path, tokenizer=model_path, max_length=30, num_return_sequences=1)

In [None]:
# Pick row 11 of the dataset as the generation example and display it.
text = dataset[11]
text

In [None]:
# Generate a continuation for the selected row's 'review' text.
pipe(text['review'])