<a href="https://colab.research.google.com/github/dkleitsas/ppo_cartpole/blob/main/PPO_CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters
learning_rate = 3e-4  # Adam step size, shared by the actor and critic optimizers
gamma = 0.99  # reward discount factor used in the TD residual
lambda_gae = 0.97  # decay applied to the running advantage in the GAE recursion
eps_clip = 0.2  # the probability ratio is clamped to [1 - eps_clip, 1 + eps_clip]
training_iterations = 30  # gradient steps taken on the same batch per PPO.update() call


class Actor(nn.Module):
    """Policy network: maps a 4-feature observation to probabilities over 2 actions.

    Two hidden layers of 64 ReLU units feed a linear output layer whose raw
    logits are normalised with a softmax over the last dimension.
    """

    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 2)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Tensor whose last dimension has size 4.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 2 that sums to 1.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # The output layer stays linear: passing logits through ReLU would clamp
        # them to >= 0, collapsing every all-negative logit pair to a uniform
        # distribution and blocking the gradient for those states.
        logits = self.fc3(x)
        action_probs = self.softmax(logits)
        return action_probs


class Critic(nn.Module):
    """State-value network: maps a 4-feature observation to one scalar estimate.

    Two hidden layers of 64 ReLU units feed a single linear output unit.
    """

    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the value estimate for ``x``.

        Args:
            x: Tensor whose last dimension has size 4.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # The output stays linear: a ReLU here would make negative values
        # unrepresentable and would zero the gradient whenever the
        # pre-activation goes negative, freezing the estimate at 0.
        return self.fc3(x)


class PPO:
    """Clipped-surrogate PPO agent with separate actor and critic networks.

    Transitions are collected in ``self.memory``; ``update()`` turns the stored
    batch into GAE advantages and return targets, then takes
    ``training_iterations`` gradient steps on both networks.
    """

    def __init__(self):
        self.memory = Memory()
        self.actor = Actor().to(device)
        self.critic = Critic().to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=learning_rate)
        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Observation accepted by ``torch.FloatTensor``.

        Returns:
            Tuple ``(action, log_prob)`` of the sampled action index as a Python
            int and its log-probability under the current policy as a float.
        """
        state = torch.FloatTensor(state).to(device)

        # Acting does not need gradients; the stored log-prob is a constant
        # reference for the later probability ratio.
        with torch.no_grad():
            action_probs = self.actor(state)

        dist = Categorical(action_probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action).item()

    def update(self):
        """Run one PPO update on every transition currently held in memory.

        Does nothing when the memory is empty. The memory itself is not
        cleared here; that is left to the caller.
        """
        if not self.memory.states:
            # torch.stack cannot handle an empty list; there is nothing to learn from.
            return

        # Cast to float32 so observations stored as float64 still match the
        # dtype of the linear layers.
        states = torch.stack(self.memory.states, dim=0).float().to(device)
        actions = torch.LongTensor(self.memory.actions).to(device)
        rewards = torch.FloatTensor(self.memory.rewards).to(device)
        old_logprobs = torch.FloatTensor(self.memory.logprobs).to(device)
        dones = torch.FloatTensor(self.memory.dones).to(device)

        # Value estimates used for the advantage targets are constants, so no
        # autograd graph is needed. squeeze(-1) removes only the feature axis;
        # a bare squeeze() would also collapse a single-transition batch to a
        # 0-d tensor, which torch.cat rejects.
        with torch.no_grad():
            values = self.critic(states).squeeze(-1)
        # Append a zero so values[t + 1] exists for the last step; it only
        # contributes when the final stored transition is not marked done.
        values = torch.cat((values, torch.zeros(1, device=device)))

        gaes = torch.zeros_like(rewards)
        returns = torch.zeros_like(rewards)
        advantage = 0.0

        # Backward recursion for generalized advantage estimation. The
        # (1 - done) mask stops both the bootstrap value and the running
        # advantage from leaking across episode boundaries in the flat batch.
        for t in reversed(range(len(rewards))):
            not_done = 1 - dones[t]
            delta = rewards[t] + gamma * not_done * values[t + 1] - values[t]
            advantage = delta + gamma * lambda_gae * not_done * advantage
            gaes[t] = advantage
            returns[t] = advantage + values[t]

        for _ in range(training_iterations):
            new_logprobs = Categorical(self.actor(states)).log_prob(actions)
            new_values = self.critic(states).squeeze(-1)

            # Probability ratio between the current policy and the policy that
            # collected the data.
            ratios = torch.exp(new_logprobs - old_logprobs)

            surr1 = ratios * gaes
            surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * gaes
            # Clipped surrogate objective, negated because the optimizer minimises.
            loss_actor = -torch.min(surr1, surr2).mean()
            # nn.MSELoss already averages over the batch.
            loss_critic = self.MseLoss(new_values, returns)

            self.actor_opt.zero_grad()
            self.critic_opt.zero_grad()
            loss_actor.backward()
            loss_critic.backward()
            self.actor_opt.step()
            self.critic_opt.step()


class Memory:
    """Rollout buffer that keeps one list per transition field.

    Index ``i`` of every list refers to the same environment step.
    """

    def __init__(self):
        # A new buffer starts in the same empty state that clear_memory() produces.
        self.clear_memory()

    def clear_memory(self):
        """Drop all stored transitions by rebinding each field to a new empty list."""
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.dones = [], []

    def push(self, state, action, logprob, reward, done):
        """Append one transition to the end of every field list."""
        buffers = (self.states, self.actions, self.logprobs, self.rewards, self.dones)
        values = (state, action, logprob, reward, done)
        for buffer, value in zip(buffers, values):
            buffer.append(value)


In [None]:
import gym

# Train a PPO agent on CartPole-v1. The policy is updated every 5 episodes and
# the mean episode reward is reported every 10 episodes. Training stops early
# after two consecutive reports with a mean reward of exactly 500.0.
agent = PPO()

env = gym.make("CartPole-v1")

rewards_past_x = []
consecutive_solves = 0

try:
    for episode in range(1000):  # Train for at most 1000 episodes
        # gym >= 0.26 returns (observation, info) from reset(); older releases
        # return the observation alone.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        total_reward = 0

        while not done:
            action, log_prob = agent.select_action(state)

            # gym >= 0.26 returns a 5-tuple with separate terminated/truncated
            # flags; older releases return a 4-tuple with a single done flag.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, info = step_result
                done = bool(terminated or truncated)
            else:
                next_state, reward, done, info = step_result

            # Store the state the action was taken in, not the resulting one.
            agent.memory.push(torch.from_numpy(state), action, log_prob, reward, done)

            state = next_state
            total_reward += reward

        rewards_past_x.append(total_reward)

        if (episode + 1) % 5 == 0:
            agent.update()
            agent.memory.clear_memory()

        if (episode + 1) % 10 == 0:
            average_reward = sum(rewards_past_x) / len(rewards_past_x)
            print(f"Episode {episode + 1}: Average Last 10 Reward = {average_reward}")
            if average_reward == 500.0:
                consecutive_solves += 1
                if consecutive_solves == 2:
                    print("Cart Pole Solved\n Terminating...")
                    break
            else:
                consecutive_solves = 0
            rewards_past_x = []
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

Episode 10: Average Last 10 Reward = 16.3
Episode 20: Average Last 10 Reward = 18.7
Episode 30: Average Last 10 Reward = 35.2
Episode 40: Average Last 10 Reward = 32.7
Episode 50: Average Last 10 Reward = 31.1
Episode 60: Average Last 10 Reward = 34.2
Episode 70: Average Last 10 Reward = 76.5
Episode 80: Average Last 10 Reward = 110.9
Episode 90: Average Last 10 Reward = 155.7
Episode 100: Average Last 10 Reward = 176.0
Episode 110: Average Last 10 Reward = 194.4
Episode 120: Average Last 10 Reward = 139.1
Episode 130: Average Last 10 Reward = 116.9
Episode 140: Average Last 10 Reward = 149.5
Episode 150: Average Last 10 Reward = 191.3
Episode 160: Average Last 10 Reward = 286.1
Episode 170: Average Last 10 Reward = 258.0
Episode 180: Average Last 10 Reward = 283.7
Episode 190: Average Last 10 Reward = 342.6
Episode 200: Average Last 10 Reward = 349.2
Episode 210: Average Last 10 Reward = 448.1
Episode 220: Average Last 10 Reward = 462.9
Episode 230: Average Last 10 Reward = 451.2
Epis