In [None]:
pip install trl accelerate

Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers>=4.46.0 (from trl)
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<0.2

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import PPOTrainer, PPOConfig
from trl.models import AutoModelForCausalLMWithValueHead
from trl.core import set_seed
import os

# Set a random seed for reproducibility
set_seed(42)

# Prefer the GPU, but fall back to CPU so the notebook still runs without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify the model checkpoint and create output directory
checkpoint = "HuggingFaceTB/SmolLM2-135M"
output_dir = "./ppo_logs"
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and model, then wrap it with a value head
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Set pad token to eos token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# The model is placed with an explicit .to(device); device_map="auto" is intentionally not
# passed as well, so that there is a single source of truth for where the weights live.
base_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    pad_token_id=tokenizer.pad_token_id
).to(device)

# Wrap the model with a value head for PPO
model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model).to(device)

# Define batch sizes
BATCH_SIZE = 2  # Smaller batch size for easier handling

# Define PPO configuration
ppo_config = PPOConfig(
    learning_rate=1e-5,
    mini_batch_size=1,
    batch_size=BATCH_SIZE
)

# Initialize PPO trainer (no dataset: prompts are fed manually in the training loop)
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
    dataset=None
)

# Reward function (example) - Customize this based on your criteria
def reward_function(prompt, response):
    """Toy reward that grows linearly with the length of `response`.

    Args:
        prompt: The prompt that produced the response. Not used by this
            placeholder implementation.
        response: The generated text (anything supporting ``len``).

    Returns:
        ``len(response) / 100`` as a float.
    """
    # Placeholder criterion: longer responses score higher; replace with a real metric.
    return len(response) / 100

# Training loop
prompts = ["Explain gravity", "How does photosynthesis work?", "What is quantum computing?"]
for epoch in range(10):  # Run for multiple epochs to fine-tune
    # Process prompts in batches
    for i in range(0, len(prompts), BATCH_SIZE):
        batch_prompts = prompts[i:i + BATCH_SIZE]

        # If the last batch is incomplete, pad it with the first prompt
        # (the slice above is a copy, so `prompts` itself is not modified)
        while len(batch_prompts) < BATCH_SIZE:
            batch_prompts.append(prompts[0])

        query_tensors = []
        response_tensors = []
        reward_tensors = []

        # Process each prompt in the batch
        for prompt in batch_prompts:
            # Encode the prompt with attention mask
            query_tensor = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
                return_attention_mask=True
            ).to(device)

            # Generate response with the model
            # NOTE(review): generation is greedy here; TRL's PPO examples sample
            # (do_sample=True) so the policy explores — confirm which is intended.
            response_ids = model.generate(
                query_tensor.input_ids,
                attention_mask=query_tensor.attention_mask,
                max_length=100,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,
            )

            # generate() returns the prompt tokens followed by the continuation. Keep only
            # the newly generated tokens as the response: the PPO step receives the query
            # separately, so leaving the prompt in would feed it to the trainer twice.
            # Using the generated ids directly also avoids a lossy decode -> re-encode
            # round trip.
            prompt_length = query_tensor.input_ids.shape[1]
            response_tensor = response_ids[0][prompt_length:]

            # Decode only the continuation so the reward scores the response, not the prompt
            response = tokenizer.decode(response_tensor, skip_special_tokens=True)

            # Calculate reward
            reward_value = reward_function(prompt, response)
            reward_tensor = torch.tensor([reward_value], device=device)

            # Append to batch lists
            query_tensors.append(query_tensor.input_ids[0])
            response_tensors.append(response_tensor)
            reward_tensors.append(reward_tensor)

        # Run PPO step to update the model with the accumulated batch
        ppo_trainer.step(
            query_tensors,
            response_tensors,
            reward_tensors
        )

    # Save the model periodically (once per qualifying epoch, after all of its batches,
    # instead of re-writing the same checkpoint directory after every batch)
    if epoch % 5 == 0:
        model.save_pretrained(os.path.join(output_dir, f"checkpoint-{epoch}"))

    print(f"Epoch {epoch+1} completed")

# Qualitative check: let the fine-tuned model continue a short prompt
test_prompt = "Gravity is"
test_inputs = tokenizer(
    test_prompt,
    max_length=512,
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors="pt",
)
test_inputs = test_inputs.to(device)

output_ids = model.generate(
    test_inputs["input_ids"],
    attention_mask=test_inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
)
# Decode the first sequence of the generated batch
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output)

# Persist the final weights under the output directory
final_model_dir = os.path.join(output_dir, "final_model")
model.save_pretrained(final_model_dir)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 1 completed




Epoch 2 completed




Epoch 3 completed




Epoch 4 completed




Epoch 5 completed




Epoch 6 completed




Epoch 7 completed




Epoch 8 completed
Epoch 9 completed
Epoch 10 completed
Gravity is a force that pulls objects towards each other. It is the force that keeps us on the ground.

The Earth is a sphere, so the force of gravity is the same on all objects.

The Earth is a sphere, so the force of gravity is the same on all objects.

The Earth is a sphere, so the force of gravity is the same on all objects.

The Earth is a sphere, so the force of gravity is the same on


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, namedtuple
import random
from ale_py import ALEInterface, roms
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy

# Define Gaussian Probability Layer (GPL) for probabilistic "twistronics" effect
class GaussianProbabilityLayer(nn.Module):
    """Adds zero-mean Gaussian noise to its input while the module is in training mode.

    In evaluation mode the input is returned unchanged. The layer has no
    learnable parameters.

    Args:
        std_dev: Standard deviation of the noise added during training.
    """

    def __init__(self, std_dev=0.1):
        super().__init__()
        self.std_dev = std_dev

    def forward(self, x):
        """Return ``x`` plus noise scaled by ``std_dev`` (training) or ``x`` itself (eval)."""
        if not self.training:
            return x
        return x + torch.randn_like(x) * self.std_dev

# Define NoisyLinear for exploration-exploitation tradeoff
class NoisyLinear(nn.Module):
    """Linear layer whose weights and biases are perturbed by noise during training.

    Each parameter is stored as a learnable mean (``*_mu``) and a learnable scale
    (``*_sigma``). The noise itself (``*_epsilon``) lives in non-learnable buffers that
    are only refreshed when ``reset_noise`` is called. In evaluation mode only the
    means are used.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        std_init: Initial noise scale before division by the square root of the fan.
    """

    def __init__(self, in_features, out_features, std_init=0.5):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        weight_shape = (out_features, in_features)

        # Storage is allocated uninitialised here and filled by the two reset_* calls below.
        self.weight_mu = nn.Parameter(torch.FloatTensor(*weight_shape))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(*weight_shape))
        self.register_buffer('weight_epsilon', torch.FloatTensor(*weight_shape))
        self.bias_mu = nn.Parameter(torch.FloatTensor(out_features))
        self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features))
        self.register_buffer('bias_epsilon', torch.FloatTensor(out_features))

        self.std_init = std_init
        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        """Initialise the means uniformly in +-1/sqrt(in_features) and the scales to constants."""
        fan_in_root = np.sqrt(self.in_features)
        bound = 1 / fan_in_root
        self.weight_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(self.std_init / fan_in_root)
        self.bias_mu.data.uniform_(-bound, bound)
        self.bias_sigma.data.fill_(self.std_init / np.sqrt(self.out_features))

    def reset_noise(self):
        """Draw fresh noise: weight noise is the outer product of two scaled noise vectors."""
        noise_in = self._scale_noise(self.in_features)
        noise_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(torch.outer(noise_out, noise_in))
        # The bias noise is an independent draw, not a reuse of noise_out.
        self.bias_epsilon.copy_(self._scale_noise(self.out_features))

    def forward(self, input):
        """Apply the affine map with noisy parameters (training) or mean parameters (eval)."""
        if not self.training:
            return nn.functional.linear(input, self.weight_mu, self.bias_mu)
        noisy_weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
        noisy_bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        return nn.functional.linear(input, noisy_weight, noisy_bias)

    @staticmethod
    def _scale_noise(size):
        """Return ``size`` standard-normal samples transformed by sign(x) * sqrt(|x|)."""
        samples = torch.randn(size)
        return torch.sign(samples) * torch.sqrt(torch.abs(samples))

class DuelingDQN(nn.Module):
    """Convolutional dueling Q-network with noisy fully connected streams.

    The convolutional trunk feeds two heads built from ``NoisyLinear`` layers: a
    scalar state-value stream and a per-action advantage stream. They are combined
    as ``V + (A - mean(A))``.

    Args:
        input_shape: Shape of one observation without the batch dimension; its first
            entry is used as the number of input channels.
        n_actions: Number of discrete actions (width of the advantage head).
    """

    def __init__(self, input_shape, n_actions):
        super(DuelingDQN, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Dynamically determine the output size of the convolutional layers
        conv_out_size = self._get_conv_out(input_shape)

        # Noise layer applied to the flattened conv features in forward().
        # The same instance is also placed inside the value stream below, so in
        # training mode it is applied twice on the value path.
        self.gpl1 = GaussianProbabilityLayer(std_dev=0.1)

        # Value stream
        self.fc_value = nn.Sequential(
            NoisyLinear(conv_out_size, 512),
            nn.ReLU(),
            self.gpl1,
            NoisyLinear(512, 1)
        )

        # Advantage stream
        self.fc_advantage = nn.Sequential(
            NoisyLinear(conv_out_size, 512),
            nn.ReLU(),
            GaussianProbabilityLayer(std_dev=0.05),
            NoisyLinear(512, n_actions)
        )

    def _get_conv_out(self, shape):
        """Return the flattened size of the conv output for one observation of ``shape``."""
        # Shape probe only: no autograd graph is needed for the dummy pass.
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        """Compute Q-values for a batch of observations.

        Args:
            x: Batched input whose first dimension is the batch size.

        Returns:
            Tensor with one row per batch element and one column per action.
        """
        # Flatten everything but the batch dimension. The per-call debug prints were
        # removed: forward runs on every environment step and every replay update.
        conv_out = self.conv(x).reshape(x.size(0), -1)

        conv_out = self.gpl1(conv_out)
        value = self.fc_value(conv_out)
        advantage = self.fc_advantage(conv_out)
        # Subtract the mean advantage so the value/advantage split is identifiable.
        return value + (advantage - advantage.mean(dim=1, keepdim=True))

# Define the Experience tuple for replay buffer
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

# Define the Prioritized Replay Buffer
class PrioritizedReplayBuffer:
    """Fixed-capacity replay buffer that samples transitions in proportion to priority.

    Transitions and their priorities are kept in two parallel deques bounded by
    ``capacity``, so the oldest entries are dropped first. Indices returned by
    ``sample`` are positions in the deque at sampling time; call
    ``update_priorities`` before pushing again, otherwise the positions may shift
    once the buffer is full.

    Args:
        capacity: Maximum number of stored transitions.
        alpha: Exponent applied to priorities when forming sampling probabilities.
        eps: Small constant added to every updated priority so that a transition
            with zero error keeps a non-zero chance of being sampled.
    """

    def __init__(self, capacity, alpha=0.6, eps=1e-6):
        self.alpha = alpha
        self.eps = eps
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store a transition with the current maximum priority (1.0 if the buffer is empty)."""
        max_priority = max(self.priorities, default=1.0)
        self.buffer.append(Experience(state, action, reward, next_state, done))
        self.priorities.append(float(max_priority))

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions with replacement, weighted by priority ** alpha.

        Args:
            batch_size: Number of transitions to draw.
            beta: Exponent of the importance-sampling correction.

        Returns:
            ``None`` if ``batch_size`` is not positive or the buffer holds fewer than
            ``batch_size`` transitions; otherwise the tuple
            ``(states, actions, rewards, next_states, dones, indices, weights)`` where
            states/next_states are stacked tensors, ``indices`` is a list of buffer
            positions and ``weights`` are importance weights normalised so the
            largest is 1.
        """
        if batch_size <= 0 or len(self.buffer) < batch_size:
            return None

        priorities = np.array(list(self.priorities), dtype=np.float32)
        probabilities = priorities ** self.alpha
        total = probabilities.sum()
        if not np.isfinite(total) or total <= 0:
            # Degenerate priorities (e.g. all zero): fall back to uniform sampling
            # rather than dividing by zero and producing NaN probabilities.
            probabilities = np.full(len(priorities), 1.0 / len(priorities), dtype=np.float32)
        else:
            probabilities /= total

        indices = random.choices(range(len(self.buffer)), k=batch_size, weights=probabilities)
        experiences = [self.buffer[idx] for idx in indices]

        weights = (len(self.buffer) * probabilities[indices]) ** (-beta)
        weights /= weights.max()

        states = torch.stack([exp.state for exp in experiences])
        actions = torch.tensor([exp.action for exp in experiences], dtype=torch.long)
        rewards = torch.tensor([exp.reward for exp in experiences], dtype=torch.float)
        next_states = torch.stack([exp.next_state for exp in experiences])
        dones = torch.tensor([exp.done for exp in experiences], dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities at ``indices`` with ``|priority| + eps``.

        Args:
            indices: Buffer positions, as returned by ``sample``.
            priorities: One value per index. May be a tensor, a NumPy array
                (including 0-d, e.g. the result of squeezing a batch of one) or any
                iterable of values convertible with ``float``.
        """
        # Normalise array-likes to a flat Python list so that scalars, 0-d arrays and
        # plain floats are all accepted (the previous code required `.item()`).
        if isinstance(priorities, torch.Tensor):
            priorities = priorities.detach().cpu().reshape(-1).tolist()
        elif isinstance(priorities, np.ndarray):
            priorities = priorities.reshape(-1).tolist()

        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = abs(float(priority)) + self.eps

    def __len__(self):
        return len(self.buffer)

# Preprocess the state from the environment
def preprocess_state(state):
    """Convert an RGB frame into a normalised single-channel 84x84 float tensor.

    Args:
        state: RGB image array accepted by ``cv2.cvtColor``.

    Returns:
        ``torch.float32`` tensor with a leading dimension of size 1 added in front
        of the 84x84 image, with values divided by 255.
    """
    frame = cv2.resize(
        cv2.cvtColor(state, cv2.COLOR_RGB2GRAY),
        (84, 84),
        interpolation=cv2.INTER_AREA,
    )
    return torch.tensor(frame, dtype=torch.float32).unsqueeze(0) / 255.0

# Define the Swarm Member for individual DQN agents
class SwarmMember:
    """One agent of the swarm: a policy/target network pair with its own optimizer.

    Args:
        state_shape: Shape of one observation without the batch dimension.
        n_actions: Number of discrete actions.
        device: Device the networks are moved to.
        id: Identifier of this member within the swarm.

    Attributes set here include the exploration schedule (``epsilon``,
    ``epsilon_min``, ``epsilon_decay``) and a snapshot of the best-performing
    weights seen so far (``personal_best_weights``).
    """

    def __init__(self, state_shape, n_actions, device, id):
        self.id = id
        self.device = device
        self.state_shape = state_shape
        self.n_actions = n_actions

        self.policy_net = DuelingDQN(state_shape, n_actions).to(device)
        self.target_net = DuelingDQN(state_shape, n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # The target network is only used for evaluation, never trained directly.
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.0001)
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9995

        self.personal_best_reward = float('-inf')
        self.personal_best_weights = copy.deepcopy(self.policy_net.state_dict())

    def update_personal_best(self, episode_reward):
        """Snapshot the policy weights if ``episode_reward`` beats the best so far.

        Returns:
            True if a new personal best was recorded, False otherwise.
        """
        if episode_reward > self.personal_best_reward:
            self.personal_best_reward = episode_reward
            self.personal_best_weights = copy.deepcopy(self.policy_net.state_dict())
            return True
        return False

    def get_suggested_action(self, state):
        """Pick an action with an epsilon-greedy rule.

        Args:
            state: Observation tensor, either batched with a batch of one or a
                single unbatched 3-D observation, on any device.

        Returns:
            Integer action index.
        """
        # Epsilon-greedy policy for exploration
        if random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)

        # Callers may hand over an unbatched CPU tensor. The network needs a batch
        # dimension and an input on its own device, otherwise the flatten inside the
        # network mis-sizes the features or the call fails with a device mismatch.
        state = state.to(self.device)
        if state.dim() == 3:
            state = state.unsqueeze(0)
        with torch.no_grad():
            return self.policy_net(state).argmax(dim=1).item()

# Define the Swarm DQN with Qbert environment
class SwarmDQN:
    """A group of DQN agents that share one prioritized replay buffer.

    Args:
        state_shape: Shape of one observation without the batch dimension.
        n_actions: Number of discrete actions.
        swarm_size: Number of ``SwarmMember`` agents to create.
        device: Device used for the networks and for training batches.
        target_update_freq: Number of gradient updates of a member between copies
            of its policy weights into its target network.
        max_grad_norm: Gradient-norm clipping threshold used in ``update_member``.
    """

    def __init__(self, state_shape, n_actions, swarm_size=5, device="cuda" if torch.cuda.is_available() else "cpu",
                 target_update_freq=1000, max_grad_norm=10.0):
        self.device = device
        self.swarm_size = swarm_size
        self.state_shape = state_shape
        self.n_actions = n_actions

        self.swarm = [SwarmMember(state_shape, n_actions, device, i) for i in range(swarm_size)]

        self.memory = PrioritizedReplayBuffer(100000)
        self.batch_size = 32
        self.gamma = 0.99

        self.target_update_freq = target_update_freq
        self.max_grad_norm = max_grad_norm
        # Per-member count of gradient updates, used to schedule target-network syncs.
        self.update_counts = [0] * swarm_size

        self.global_best_reward = float('-inf')
        self.global_best_weights = copy.deepcopy(self.swarm[0].policy_net.state_dict())

    def select_action(self, state, member_idx):
        """Choose an action for member ``member_idx`` given a batched state.

        The member first proposes an action. With probability ``member.epsilon``
        the proposal is kept 70% of the time and replaced by a uniform random action
        otherwise; in the remaining case the greedy action is taken after adding a
        0.1 bonus to the proposed action's Q-value.

        Args:
            state: Observation tensor with a batch dimension of one, on ``self.device``.
            member_idx: Index into ``self.swarm``.

        Returns:
            Integer action index.
        """
        member = self.swarm[member_idx]

        # Hand over the state unchanged. Squeezing it and moving it to the CPU (as was
        # done before) strips the batch dimension and breaks the network call when the
        # member evaluates its policy.
        suggested_action = member.get_suggested_action(state)
        if random.random() < member.epsilon:
            if suggested_action is not None and random.random() < 0.7:
                return suggested_action
            return random.randrange(self.n_actions)
        else:
            with torch.no_grad():
                q_values = member.policy_net(state)
                if suggested_action is not None:
                    q_values[0][suggested_action] += 0.1
                return torch.argmax(q_values).item()

    def calculate_reward(self, raw_reward, suggested_action, taken_action):
        """Return ``raw_reward`` plus a 0.1 bonus when the taken action matches the suggestion."""
        reward = raw_reward
        if suggested_action is not None and suggested_action == taken_action:
            reward += 0.1
        return reward

    def update_member(self, member_idx, batch):
        """Run one Double-DQN gradient step on a member and refresh replay priorities.

        Previously this method only ran forward passes under ``torch.no_grad()``, so
        the optimizer was never used and no member ever learned. It now:

        1. builds the target ``r + gamma * Q_target(s', argmax_a Q_policy(s', a))``
           (zeroed for terminal transitions),
        2. minimises the importance-weighted Huber loss against that target,
        3. writes the absolute TD errors back to the replay buffer,
        4. resamples parameter noise on layers that expose ``reset_noise``,
        5. copies policy weights into the target network every
           ``target_update_freq`` updates.

        Args:
            member_idx: Index into ``self.swarm``.
            batch: Tuple ``(states, actions, rewards, next_states, dones, indices,
                weights)`` as returned by the replay buffer's ``sample``.
        """
        member = self.swarm[member_idx]
        states, actions, rewards, next_states, dones, indices, weights = batch

        # Use the actual batch length instead of assuming self.batch_size, and collapse
        # any extra singleton dimension left over from stacking batched states.
        n = states.size(0)
        spatial = tuple(self.state_shape[1:])
        states = states.reshape(n, -1, *spatial).to(self.device)
        next_states = next_states.reshape(n, -1, *spatial).to(self.device)

        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        dones = dones.to(self.device)
        weights = weights.to(self.device)

        # Targets carry no gradient: action chosen by the policy net, valued by the target net.
        with torch.no_grad():
            next_actions = member.policy_net(next_states).argmax(dim=1, keepdim=True)
            next_q_values = member.target_net(next_states).gather(1, next_actions).squeeze(1)
            target_q_values = rewards + (self.gamma * next_q_values * (1.0 - dones))

        # This forward pass must track gradients.
        current_q_values = member.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        elementwise_loss = nn.functional.smooth_l1_loss(current_q_values, target_q_values, reduction="none")
        loss = (weights * elementwise_loss).mean()

        member.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(member.policy_net.parameters(), self.max_grad_norm)
        member.optimizer.step()

        td_errors = (current_q_values - target_q_values).detach().abs().cpu().numpy().reshape(-1)
        self.memory.update_priorities(indices, td_errors)

        # Noisy layers keep their noise fixed until asked to resample it.
        for module in member.policy_net.modules():
            if hasattr(module, "reset_noise"):
                module.reset_noise()

        self.update_counts[member_idx] += 1
        if self.update_counts[member_idx] % self.target_update_freq == 0:
            member.target_net.load_state_dict(member.policy_net.state_dict())

    def update_global_best(self, member_idx, episode_reward):
        """Snapshot a member's policy weights if ``episode_reward`` beats the swarm's best.

        Returns:
            True if a new global best was recorded, False otherwise.
        """
        if episode_reward > self.global_best_reward:
            self.global_best_reward = episode_reward
            self.global_best_weights = copy.deepcopy(self.swarm[member_idx].policy_net.state_dict())
            return True
        return False

# Train the Swarm Agent for Qbert
def train_swarm_agent(episodes=1000, render_frequency=100, score_threshold=50, hit_threshold=20):
    """Train a ``SwarmDQN`` on Q*bert through the ALE interface.

    Each episode is played by one randomly chosen swarm member. Every chosen action
    is repeated for up to four emulator steps, the last four preprocessed frames
    form the state, and each transition is pushed to the shared replay buffer
    followed by one ``update_member`` call once enough transitions are stored.

    Args:
        episodes: Number of episodes to play.
        render_frequency: Every ``render_frequency``-th episode is shown in an
            OpenCV window (pressing ``q`` ends that episode early).
        score_threshold: Currently unused; kept for interface compatibility.
        hit_threshold: Currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(swarm, episode_rewards, swarm_rewards)`` where ``episode_rewards``
        lists the total reward of every episode and ``swarm_rewards[i]`` lists the
        rewards of the episodes played by member ``i``.
    """
    ale = ALEInterface()
    ale.setInt('random_seed', 123)
    ale.setBool('sound', False)
    # NOTE(review): display_screen=True presumably needs a display; confirm for headless runs.
    ale.setBool('display_screen', True)
    ale.setFloat('repeat_action_probability', 0.0)
    ale.loadROM(roms.get_rom_path("qbert"))

    actions = ale.getMinimalActionSet()
    state_shape = (4, 84, 84)
    swarm = SwarmDQN(state_shape, len(actions))

    episode_rewards = []
    swarm_rewards = [[] for _ in range(swarm.swarm_size)]

    try:
        for episode in tqdm(range(1, episodes + 1)):
            member_idx = random.randrange(swarm.swarm_size)
            member = swarm.swarm[member_idx]

            ale.reset_game()
            total_reward = 0
            done = False

            # Start with four copies of the first frame so the stack is full from step one.
            state_stack = deque([preprocess_state(ale.getScreenRGB()) for _ in range(4)], maxlen=4)

            while not done:
                stacked_state = torch.cat(list(state_stack), dim=0).unsqueeze(0).to(swarm.device)

                action_idx = swarm.select_action(stacked_state, member_idx)
                reward = 0

                # Frame skip: repeat the chosen action and accumulate its reward.
                for _ in range(4):
                    reward += ale.act(actions[action_idx])
                    if ale.game_over():
                        done = True
                        break

                next_state = preprocess_state(ale.getScreenRGB()) if not done else state_stack[-1].clone()
                state_stack.append(next_state)

                stacked_next_state = torch.cat(list(state_stack), dim=0).unsqueeze(0).to(swarm.device)

                # Keep replay data in host memory: the buffer holds up to 100k transitions
                # and would otherwise pin every stored frame stack on the compute device.
                # update_member moves each sampled batch to the device itself.
                swarm.memory.push(
                    stacked_state.cpu(), action_idx, float(reward), stacked_next_state.cpu(), float(done)
                )

                if len(swarm.memory) >= swarm.batch_size:
                    batch = swarm.memory.sample(swarm.batch_size)
                    if batch is not None:
                        swarm.update_member(member_idx, batch)

                total_reward += reward

                if episode % render_frequency == 0:
                    screen = ale.getScreenRGB()
                    cv2.imshow('Qbert', cv2.cvtColor(screen, cv2.COLOR_RGB2BGR))
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

            episode_rewards.append(total_reward)
            swarm_rewards[member_idx].append(total_reward)

            personal_best_updated = member.update_personal_best(total_reward)
            if personal_best_updated:
                swarm.update_global_best(member_idx, total_reward)

            # Exploration decays once per episode, and only for the member that played it.
            member.epsilon = max(member.epsilon_min, member.epsilon * member.epsilon_decay)

            if episode % 10 == 0:
                avg_reward = np.mean(episode_rewards[-10:])
                print(f"Episode {episode}, Average Reward: {avg_reward:.2f}, Member {member_idx} Epsilon: {member.epsilon:.2f}")
    finally:
        # Close any render window even when training stops with an exception.
        cv2.destroyAllWindows()

    return swarm, episode_rewards, swarm_rewards

if __name__ == "__main__":
    try:
        swarm, episode_rewards, swarm_rewards = train_swarm_agent()

        fig, (ax_overall, ax_members) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: reward of every episode in play order
        ax_overall.plot(episode_rewards)
        ax_overall.set_title("Overall Training Progress")
        ax_overall.set_xlabel("Episode")
        ax_overall.set_ylabel("Reward")

        # Right panel: one curve per swarm member (x is that member's own episode count)
        for i, rewards in enumerate(swarm_rewards):
            ax_members.plot(rewards, label=f"Member {i}")
        ax_members.set_title("Individual Swarm Member Performance")
        ax_members.set_xlabel("Episode")
        ax_members.set_ylabel("Reward")
        ax_members.legend()
        plt.show()

    except Exception as e:
        # Keep the one-line summary, but also show where the failure happened; the
        # message alone is not enough to diagnose shape or device errors.
        import traceback

        print("An error occurred:", e)
        traceback.print_exc()

AttributeError: 'NoneType' object has no attribute 'modules'