In [1]:
import torch
from tqdm import tqdm
import pandas as pd
import wandb
import numpy as np
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# tqdm.pandas()

from transformers import pipeline, AutoTokenizer,GPT2Tokenizer, GPT2Model, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
import os
from torch.optim.lr_scheduler import StepLR
import time


2024-03-14 04:07:28.604236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Release cached-but-unused GPU memory held by PyTorch before the models are loaded.
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary())

device

device(type='cuda')

In [3]:
# PPO hyper-parameters for the TRL trainer built later in the notebook.
# `config.model_name` is also what the tokenizer and the value-head model are
# loaded from. Commented-out entries are alternatives that were tried and are
# currently disabled.
config = PPOConfig(
#     model_name='google/gemma-2b-it',
    model_name='gpt2',
#     model_name="pgfeldman/Yelp_American",
    learning_rate=1.41e-5,
#     mini_batch_size=1,
#     gradient_accumulation_steps=1,
#     init_kl_coef=0.2,
#     cliprange_value=0.2,
#     vf_coef=0.1,
#     batch_size=32,
    batch_size=128,
#     mini_batch_size=4,
    optimize_device_cache=True,
#     early_stopping=True,
#     target_kl=1,
    ratio_threshold=20.0,
#     log_with="wandb",
)

In [4]:
# Sentence-embedding model; it is handed to sharpened_cosine_similarity_batch()
# to embed generated and target prompts when rewards are computed.
scs_model = SentenceTransformer("sentence-t5-base")

In [6]:
# SECURITY: a Hugging Face access token used to be hard-coded in this cell.
# Secrets must never be stored in a notebook; the token that was committed here
# should be treated as leaked and revoked. Export HF_TOKEN in the environment
# that launches the kernel instead -- it is only needed for gated or private
# checkpoints.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; gated or private checkpoints cannot be downloaded.")

# Left padding keeps the end of each prompt adjacent to the generated tokens.
tokenizer = AutoTokenizer.from_pretrained(config.model_name, padding_side='left')

# Causal LM with a value head (as required by PPOTrainer), loaded in bfloat16
# and moved to the selected device.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    torch_dtype=torch.bfloat16,
).to(device)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
# Reuse the end-of-sequence token as the padding token so that padded
# tokenisation (padding='max_length' in CustomDataset) has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token

# Sampling settings forwarded to every ppo_trainer.generate() call below.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=50,
)

In [8]:
# Load the rewrite examples and discard every row that has a missing value,
# in a single expression so the cell is idempotent on re-run.
rewrite_data_df = pd.read_csv('data/8000_data.csv').dropna()

In [9]:
class CustomDataset(Dataset):
    """Tokenised prompt-recovery queries paired with their target prompts.

    Each example is rendered into a fixed instruction template built from the
    ``original_text`` and ``rewritten_text`` columns, then tokenised once,
    up front, to a fixed length.

    Args:
        X_df: DataFrame with ``original_text`` and ``rewritten_text`` columns.
        y_df: pandas Series (or single DataFrame column) of target prompts,
            positionally aligned with ``X_df``.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors.
        max_length: Length every query is padded/truncated to. Defaults to
            150, the value that used to be hard-coded.

    Raises:
        ValueError: If ``X_df`` and ``y_df`` have different lengths.
    """

    def __init__(self, X_df, y_df, tokenizer, max_length=150):
        if len(X_df) != len(y_df):
            raise ValueError(
                f"X_df has {len(X_df)} rows but y_df has {len(y_df)} labels"
            )
        self.labels = y_df

        # Zipping the two columns avoids the per-row Series that iterrows() builds.
        input_texts = [
            f"Given the input:\n {original}\n and the resulting output:\n {rewritten}.\n Determine the prompt used to generate the output: "
            for original, rewritten in zip(X_df['original_text'], X_df['rewritten_text'])
        ]

        tokenized_inputs = tokenizer(
            input_texts,
            padding='max_length',
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        """Return the number of tokenised examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the token ids, attention mask and target prompt at position ``idx``."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            # Positional lookup: the label index is whatever survived the
            # train/test split, so label-based access would be wrong here.
            'labels': self.labels.iloc[idx],
        }


In [10]:
# The target is the prompt column; every other column is a feature.
y = rewrite_data_df['prompt']
X = rewrite_data_df.drop(columns=['prompt'])

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training dataset consumed by the PPO trainer.
ds = CustomDataset(X_train, y_train, tokenizer)

In [11]:
def collator(data):
    """Regroup a list of per-example dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first example; every example
    is expected to provide all of them. Values are collected as plain Python
    lists, without stacking.
    """
    field_names = data[0]
    return {name: [example[name] for example in data] for name in field_names}


In [12]:
def batch_cosine_similarity(x1, x2):
    """Return the pairwise cosine-similarity matrix between rows of two 2-D tensors.

    Both inputs are L2-normalised along their last dimension, so entry
    ``[i, j]`` of the result is the cosine similarity between ``x1[i]`` and
    ``x2[j]``.
    """
    left_unit, right_unit = (
        torch.nn.functional.normalize(matrix, p=2, dim=-1) for matrix in (x1, x2)
    )
    # Dot products of unit vectors are cosine similarities.
    return torch.mm(left_unit, right_unit.t())

def sharpened_cosine_similarity_batch(scs_model, output_texts, target_texts, sharpen_factor=3):
    """Score each generated text against its own target with a sharpened cosine similarity.

    Texts are embedded with ``scs_model.encode``; the i-th output is compared
    only with the i-th target, and the cosine similarity is raised to
    ``sharpen_factor``.

    Args:
        scs_model: Object whose ``encode(texts, convert_to_tensor=True)``
            returns one embedding row per text as a torch tensor.
        output_texts: Generated texts (a sequence of strings, or one string).
        target_texts: Reference texts, positionally aligned with ``output_texts``.
        sharpen_factor: Exponent applied to every similarity score.

    Returns:
        A list with one tensor of shape ``(1,)`` per text pair, or an empty
        list when no texts are given.

    Raises:
        ValueError: If the two inputs do not contain the same number of texts.
    """
    # Accept a bare string as a batch of one; list(str) would split it into characters.
    if isinstance(output_texts, str):
        output_texts = [output_texts]
    if isinstance(target_texts, str):
        target_texts = [target_texts]
    output_texts = list(output_texts)
    target_texts = list(target_texts)

    if len(output_texts) != len(target_texts):
        raise ValueError(
            f"got {len(output_texts)} output texts but {len(target_texts)} target texts"
        )
    if not output_texts:
        return []

    target_embeddings = scs_model.encode(target_texts, convert_to_tensor=True)
    output_embeddings = scs_model.encode(output_texts, convert_to_tensor=True)

    # Only matching pairs are needed, so compute the row-wise dot product of the
    # unit vectors instead of building the full N x N matrix and reading its diagonal.
    target_unit = torch.nn.functional.normalize(target_embeddings, p=2, dim=-1)
    output_unit = torch.nn.functional.normalize(output_embeddings, p=2, dim=-1)
    cos_sims = (target_unit * output_unit).sum(dim=-1)

    sharpened = cos_sims ** sharpen_factor

    # One (1,)-shaped tensor per example, the same shape the previous version returned.
    return [score.unsqueeze(0) for score in sharpened]



In [13]:
# Build the PPO trainer from the value-head model, the PPO config, the training
# dataset and the list-based collator. The training loop below relies on its
# `dataloader`, `generate`, `step` and `log_stats` members.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=ds,
    tokenizer=tokenizer,
    data_collator=collator
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# PPO training: one optimisation step per batch of queries.
# tqdm wraps the dataloader itself (not an enumerate() generator) so the bar
# knows the total number of batches and can show an ETA.
for batch in tqdm(ppo_trainer.dataloader):
    # The dataset pads every query to a fixed length. Drop those padding
    # positions via the attention mask so the trainer receives only real prompt
    # tokens; otherwise padding would be treated as part of the prompt.
    query_tensors = [
        ids[mask.bool()]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]

    gen_start = time.perf_counter()
    responses = ppo_trainer.generate(query_tensors, **generation_kwargs,
                                    return_prompt=False)
    gen_time = time.perf_counter() - gen_start
    print(f"Generation time: {gen_time:.4f} seconds")

    decode_start = time.perf_counter()
    output_text = tokenizer.batch_decode(responses, skip_special_tokens=True)
    decode_time = time.perf_counter() - decode_start
    print(f"Decoding time: {decode_time:.4f} seconds")

    # Reward = sharpened cosine similarity between each generated prompt and
    # its target prompt (this is not sentiment analysis, despite the old label).
    reward_start = time.perf_counter()
    rewards = sharpened_cosine_similarity_batch(scs_model, output_text, batch['labels'])
    reward_time = time.perf_counter() - reward_start
    print(f"Reward computation time: {reward_time:.4f} seconds")

    # Run the PPO step with the batch rewards
    ppo_start = time.perf_counter()
    stats = ppo_trainer.step(query_tensors, responses, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)
    ppo_time = time.perf_counter() - ppo_start
    print(f"PPO step time: {ppo_time:.4f} seconds")

    # ppo_trainer.save_pretrained("model/gemma_final")

0it [00:00, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Generation time: 21.1789 seconds
Decoding time: 0.0124 seconds
Sentiment analysis time: 0.4181 seconds


1it [07:06, 426.28s/it]

PPO step time: 404.6541 seconds
Generation time: 23.3518 seconds
Decoding time: 0.0118 seconds
Sentiment analysis time: 0.2872 seconds


2it [14:11, 425.83s/it]

PPO step time: 401.8596 seconds
Generation time: 21.4881 seconds
Decoding time: 0.0142 seconds
Sentiment analysis time: 0.2892 seconds


3it [21:19, 426.51s/it]

PPO step time: 405.5195 seconds
Generation time: 21.5807 seconds
Decoding time: 0.0134 seconds
Sentiment analysis time: 0.3177 seconds


4it [28:14, 422.05s/it]

PPO step time: 393.2847 seconds
Generation time: 21.6947 seconds
Decoding time: 0.0111 seconds
Sentiment analysis time: 0.3357 seconds


5it [35:23, 424.76s/it]

PPO step time: 407.5142 seconds
Generation time: 21.5503 seconds
Decoding time: 0.0134 seconds
Sentiment analysis time: 0.3273 seconds


6it [42:29, 424.96s/it]

PPO step time: 403.4681 seconds
Generation time: 21.4484 seconds
Decoding time: 0.0128 seconds
Sentiment analysis time: 0.3279 seconds


# Evaluation

In [None]:
# Qualitative check on held-out data: generate prompts for the first test batch
# and score them with the same reward function used during training.
custom_dataset = CustomDataset(X_test, y_test, tokenizer)
data_loader = DataLoader(custom_dataset, batch_size=5, shuffle=False)

for batch in data_loader:
    # Queries are padded to a fixed length by the dataset; keep only the real
    # tokens (attention mask == 1) so padding is not fed in as prompt content.
    input_ids = [
        ids[mask.bool()]
        for ids, mask in zip(batch['input_ids'], batch['attention_mask'])
    ]
    labels = batch['labels']
    responses = ppo_trainer.generate(input_ids, **generation_kwargs, return_prompt=False)

    decoded_prompts = tokenizer.batch_decode(responses, skip_special_tokens=True)
    target_prompts = labels

    print(decoded_prompts)
    print(target_prompts)

    similarity_scores = sharpened_cosine_similarity_batch(scs_model, decoded_prompts, target_prompts)

    print(similarity_scores)

    # Only the first batch is inspected; remove this to evaluate the whole test set.
    break


In [None]:
output_text = "Convert this into a sea shanty."
target_text = "Rewrite this with shanty style."
# `sharpened_cosine_similarity` is not defined anywhere in this notebook, so the
# original call raised NameError on a fresh kernel. Score the single pair with
# the batch function instead; [0] selects the score of that one pair.
sharpened_cosine_similarity_batch(scs_model, [output_text], [target_text])[0]

In [None]:
# NOTE(review): `token_vecs` and `output_vecs` are not defined in any visible
# cell, so this leftover debugging cell raises NameError on a fresh kernel --
# confirm and remove it, or restore the cell that defined them.
token_vecs.shape, output_vecs.shape

In [None]:
# Single hand-written example row used for the manual generation check below.
# Column 0 is used as the input text and column 2 as the rewritten output when
# the query is built; column 1 is presumably the prompt that produced the
# rewrite -- TODO confirm.
a = np.array([['<p>This like betting your life savings on a game of black jack, win once, and then walk. Brilliant!</p >',
        'Convert this into a sea shanty.',
        "**Sure, here's the shanty:**\n\n(Verse 1)\nGather 'round me, me hearties, and listen to a tale\nOf a game of black jack, where fate's a wail\nWith a full hand's worth of savings, you're ready to play\nSo stake your dough upon the table, and dance to the bay.\n\n(Chorus)\nThis like betting on black jack, a game of high stakes\nWin once and then walk"]],
      dtype=object)

In [None]:
# Show the rewritten text (row 0, column 2) of the example array.
a[0, 2]

In [None]:
# Build a single query from the example row: column 0 as the input text and
# column 2 as the rewritten output.
# NOTE(review): this template is not the one CustomDataset uses for training
# (different closing instruction and blank-line layout) -- confirm whether the
# mismatch is intentional before comparing generations.
input_text = f"Given the input:\n {a[0][0]}\n and the resulting output:\n {a[0][2]}.\n\n Determine the  prompt: "
input_text

In [None]:
# Tokenise the hand-built query and place every resulting tensor on the model's device.
inputs = {
    name: tensor.to(device)
    for name, tensor in tokenizer(input_text, return_tensors="pt").items()
}

# Generate with the model directly; max_length caps prompt plus continuation
# at 200 tokens. Then print the decoded first (only) sequence.
output = model.generate(max_length=200, **inputs)
print(tokenizer.decode(output[0]))

In [None]:
output_text = "."
target_text = "Convert this into a sea shanty."
# `sharpened_cosine_similarity` is not defined anywhere in this notebook, so the
# original call raised NameError on a fresh kernel. Score the single pair with
# the batch function instead; [0] selects the score of that one pair.
sharpened_cosine_similarity_batch(scs_model, [output_text], [target_text])[0]

In [None]:
# Display the tokenised query tensors built for the manual generation check.
inputs

In [None]:
# Environment setup. NOTE(review): this cell sits after the code that needs these
# packages and installs unpinned versions; on a fresh environment it has to be
# run (and the kernel restarted) before the import cell at the top.
%pip install torch tqdm pandas wandb numpy matplotlib scikit-learn transformers datasets trl

In [None]:
# Upgrade transformers to the latest release. NOTE(review): unpinned, so results
# may change between runs; restart the kernel after running this cell.
%pip install transformers -U