In [None]:
# LLM fine-tuning with PPO using TRL (Transformer Reinforcement Learning)

In [None]:
# 0. imports
import torch
from transformers import GPT2Tokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer


# 1. load a pretrained model
# Two separate copies of the "gpt2" checkpoint are loaded with a value head:
# `model` is passed to the trainer as the model to optimise, `model_ref` as the
# reference model (argument roles per the PPOTrainer call below).
# NOTE(review): the positional PPOTrainer(config, model, ref_model, tokenizer)
# call and `.step(queries, responses, rewards)` look like the older TRL PPO API —
# confirm against the installed TRL version.
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding
# (presumably because the GPT-2 tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

# 2. initialize trainer
# Batch and mini-batch size of 1 so the single query/response pair used in
# step 6 makes up a whole batch (assumes `step` expects `batch_size` samples — TODO confirm).
ppo_config = {"mini_batch_size": 1, "batch_size": 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

# 3. encode a query
# `return_tensors="pt"` yields a PyTorch tensor, which is moved to the same
# device as the wrapped pretrained model.
query_txt = "This morning I went to the "
query_tensor = tokenizer.encode(query_txt, return_tensors="pt").to(model.pretrained_model.device)

# 4. generate model response
# Sampling is enabled and at most 20 new tokens are produced; EOS doubles as the
# pad id. `top_k=0.0` / `top_p=1.0` presumably switch off top-k and nucleus
# filtering, and `min_length=-1` presumably removes the minimum-length
# constraint — verify against the transformers generation docs.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 20,
}
# Iterating over `query_tensor` yields its rows, so the trainer receives a list
# of per-query tensors; `return_prompt=False` asks for the response without the prompt.
response_tensor = ppo_trainer.generate([item for item in query_tensor], return_prompt=False, **generation_kwargs)
# Decoded text of the first (only) response; not used further in this cell.
response_txt = tokenizer.decode(response_tensor[0])

# 5. define a reward for response
# (this could be any reward such as human feedback or output from another model)
# Here: a constant scalar reward of 1.0, one tensor per response, on the model's device.
reward = [torch.tensor(1.0, device=model.pretrained_model.device)]

# 6. train model with ppo
# One optimisation step on lists of queries, responses and rewards (one element each);
# the returned value is kept in `train_stats`.
train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

In [None]:

# .. Let's assume we have a trained model using `PPOTrainer` and `AutoModelForCausalLMWithValueHead`

# push the model on the Hub
# NOTE(review): presumably requires being logged in to the Hugging Face Hub — confirm.
model.push_to_hub("my-fine-tuned-model-ppo")

# or save it locally
# Writes to a directory named "my-fine-tuned-model-ppo" (relative path).
model.save_pretrained("my-fine-tuned-model-ppo")

# load the model from the Hub
# NOTE(review): the identifier below is the same string as the local directory
# written by `save_pretrained` above, so this may resolve to the local copy
# rather than the Hub repo — confirm which is intended.
from transformers import AutoModelForCausalLM

# Rebinds `model`: from here on it is whatever `AutoModelForCausalLM` returns,
# no longer the `AutoModelForCausalLMWithValueHead` instance used for PPO.
model = AutoModelForCausalLM.from_pretrained("my-fine-tuned-model-ppo")

In [None]:
# LLM RLHF

In [None]:
# Data Annotation

In [None]:
import pandas as pd

# Toy corpus: five short movie-review snippets stored under the 'text' key
data = dict(
    text=[
        "This movie is great!",
        "I didn't like the ending.",
        "The acting was fantastic.",
        "The plot was confusing.",
        "It's a masterpiece.",
    ],
)

# Wrap the corpus in a single-column DataFrame and show it before annotation
df = pd.DataFrame(data)
print(df)

# Function to annotate sentiment polarity
def annotate_sentiment(text):
    """Label a text snippet as 'positive', 'negative' or 'neutral' by keyword lookup.

    This is a stand-in for real sentiment analysis or manual annotation: the
    label is decided purely by the presence of a few hard-coded keywords.
    Matching is case-insensitive and treats the typographic apostrophe (U+2019)
    like a plain "'", so "GREAT" and "didn’t like" are recognised as well.

    Args:
        text: The string to annotate.

    Returns:
        'positive' if any positive keyword occurs, otherwise 'negative' if any
        negative keyword occurs, otherwise 'neutral'. Positive keywords take
        priority when both kinds are present.
    """
    positive_keywords = ('great', 'fantastic', 'masterpiece')
    negative_keywords = ("didn't like", 'confusing')

    # Normalise once so the keyword checks ignore case and apostrophe style.
    normalized = text.lower().replace('\u2019', "'")

    if any(keyword in normalized for keyword in positive_keywords):
        return 'positive'
    if any(keyword in normalized for keyword in negative_keywords):
        return 'negative'
    return 'neutral'

# Label every row by running the keyword annotator over the 'text' column
df['sentiment'] = df['text'].map(annotate_sentiment)

# Show the DataFrame again, now with the 'sentiment' column attached
print(df)


In [None]:
# Sample Python code for synthetic data generation using Reinforcement Learning Augmented Intelligent Fabrication


In [None]:
import numpy as np

class Environment:
    """Toy environment that emits 2-D Gaussian observations for two classes.

    Attributes:
        data_distribution: Per-class probability that `step` returns reward 1
            (index 0 for class 0, index 1 for class 1).
        observation_space: Length of the observation vector returned by `reset`.
        action_space: Stored as 1 and not read anywhere in this class.
    """

    def __init__(self):
        self.data_distribution = [0.8, 0.2]
        self.observation_space = 2
        # NOTE(review): `step` distinguishes two actions (0 vs. anything else);
        # confirm whether 1 is meant as the action dimensionality or a count.
        self.action_space = 1

    def reset(self):
        """Return a fresh observation drawn from a standard normal distribution."""
        return np.random.randn(self.observation_space)

    def step(self, action):
        """Draw one synthetic data point for the class selected by `action`.

        Action 0 selects class 0 (mean [-1, -1]); any other value selects
        class 1 (mean [1, 1]). Both classes use an identity covariance.

        Returns:
            A tuple `(observation, reward)` where `reward` is 1 with the
            probability stored in `data_distribution` for that class, else 0.
        """
        class_means = ([-1, -1], [1, 1])
        class_index = 0 if action == 0 else 1

        # Draw order matters for reproducibility: the observation first, then the reward.
        observation = np.random.multivariate_normal(class_means[class_index], [[1, 0], [0, 1]])
        reward_probability = self.data_distribution[class_index]
        reward = int(np.random.rand() < reward_probability)
        return observation, reward

class Agent:
    """Epsilon-greedy tabular Q-learning agent with a 2x2 state-action table.

    An observation is mapped to a table row via `np.argmax(observation)`,
    i.e. the index of its largest component.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.95, exploration_rate=0.1):
        """Store the hyper-parameters and start from an all-zero Q-table."""
        self.q_table = np.zeros((2, 2))
        self.exploration_rate = exploration_rate
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate

    def choose_action(self, observation):
        """Pick an action: random with probability `exploration_rate`, else greedy."""
        should_explore = np.random.rand() < self.exploration_rate
        if should_explore:
            return np.random.choice([0, 1])
        state = np.argmax(observation)
        return np.argmax(self.q_table[state])

    def update_q_table(self, observation, action, reward, next_observation):
        """Apply one Q-learning update for the transition just observed."""
        state = np.argmax(observation)
        next_state = np.argmax(next_observation)

        # Bootstrap from the best action value available in the next state.
        next_values = self.q_table[next_state]
        greedy_next = np.argmax(next_values)
        td_target = reward + self.discount_factor * next_values[greedy_next]

        # `current_values` is a view into the table, so the update lands in `q_table`.
        current_values = self.q_table[state]
        td_error = td_target - current_values[action]
        current_values[action] += self.learning_rate * td_error

# Initialize the environment and agent
# Both use their constructor defaults.
env = Environment()
agent = Agent()

# Training loop
# NOTE(review): no random seed is set in this cell, so the learned Q-table and
# the generated data differ from run to run unless a seed was set earlier.
num_episodes = 1000
for episode in range(num_episodes):
    # Each episode starts from a fresh random observation.
    observation = env.reset()
    for _ in range(100):  # Limiting the number of steps per episode
        # Act, observe the environment's response, learn from the transition,
        # then continue from the new observation.
        action = agent.choose_action(observation)
        next_observation, reward = env.step(action)
        agent.update_q_table(observation, action, reward, next_observation)
        observation = next_observation

# Generate synthetic data
# Each sample pairs a fresh `env.reset()` observation with the action the agent
# picks for it; `env.step` is not called here, and `choose_action` still applies
# the agent's exploration rate.
synthetic_data = []
for _ in range(1000):
    observation = env.reset()
    action = agent.choose_action(observation)
    synthetic_data.append((observation, action))

# Example usage of synthetic data
# Print the first ten (observation, action) pairs.
for data_point in synthetic_data[:10]:
    print("Observation:", data_point[0], "Action:", data_point[1])
