In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical
import os

In [2]:
# Hyperparameters
learning_rate = 0.0003
gamma = 0.99
lmbda = 0.95
eps_clip = 0.2
K_epochs = 4
T_horizon = 2000

In [3]:
# Policy Network
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_pi = nn.Linear(256, action_dim)
        self.fc_v = nn.Linear(256, 1)
        
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = self.fc_pi(x)
        return Categorical(logits=x)
    
    def v(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        v = self.fc_v(x)
        return v


In [4]:
# PPO Agent
class PPO:
    def __init__(self, state_dim, action_dim):
        self.policy = ActorCritic(state_dim, action_dim)
        self.policy_old = ActorCritic(state_dim, action_dim)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.optimizer = self.policy.optimizer
        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        old_states = torch.squeeze(torch.stack(memory.states).detach())
        old_actions = torch.squeeze(torch.stack(memory.actions).detach())
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs).detach())

        for _ in range(K_epochs):
            logprobs, state_values, dist_entropy = self.evaluate(old_states, old_actions)

            ratios = torch.exp(logprobs - old_logprobs.detach())

            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages

            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())

    def evaluate(self, state, action):
        state_value = self.policy.v(state)
        dist = self.policy.pi(state)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        return action_logprobs, torch.squeeze(state_value), dist_entropy

    def save(self, checkpoint_path):
        torch.save(self.policy.state_dict(), checkpoint_path)
        print(f"Model saved to {checkpoint_path}")

    def load(self, checkpoint_path):
        self.policy.load_state_dict(torch.load(checkpoint_path))
        self.policy_old.load_state_dict(self.policy.state_dict())
        print(f"Model loaded from {checkpoint_path}")


In [5]:
# Memory
class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]


In [6]:
# Normalize state
def normalize_state(state):
    return (state - np.mean(state)) / (np.std(state) + 1e-8)


In [7]:
env_name = "LunarLander-v2"
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

memory = Memory()
ppo = PPO(state_dim, action_dim)

  deprecation(
  deprecation(


In [8]:
print_interval = 20
max_episodes = 5000  # Increase the number of episodes
max_timesteps = 300

running_reward = 0
avg_length = 0

In [9]:
for episode in range(1, max_episodes+1):
    state = env.reset()
    state = normalize_state(state)  # Normalize state
    for t in range(max_timesteps):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        dist = ppo.policy_old.pi(state_tensor)
        action = dist.sample()

        next_state, reward, done, _ = env.step(action.item())
        next_state = normalize_state(next_state)  # Normalize next state

        memory.states.append(state_tensor)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        state = next_state

        if done:
            break

    ppo.update(memory)
    memory.clear_memory()
    
    running_reward += reward
    avg_length += t

    if episode % print_interval == 0:
        avg_length = int(avg_length/print_interval)
        running_reward = int((running_reward/print_interval))

        print(f'Episode {episode} \t avg length: {avg_length} \t reward: {running_reward}')
        running_reward = 0
        avg_length = 0
    
    # if episode % 50 == 0:
    #     ppo.save(checkpoint_path)

env.close()


  if not isinstance(terminated, (bool, np.bool8)):


Episode 20 	 avg length: 112 	 reward: -100
Episode 40 	 avg length: 138 	 reward: -100
Episode 60 	 avg length: 175 	 reward: -88
Episode 80 	 avg length: 171 	 reward: -95
Episode 100 	 avg length: 184 	 reward: -80
Episode 120 	 avg length: 224 	 reward: -50
Episode 140 	 avg length: 133 	 reward: -90
Episode 160 	 avg length: 175 	 reward: -65
Episode 180 	 avg length: 186 	 reward: -64
Episode 200 	 avg length: 216 	 reward: -44
Episode 220 	 avg length: 196 	 reward: -54
Episode 240 	 avg length: 219 	 reward: -58
Episode 260 	 avg length: 257 	 reward: -35
Episode 280 	 avg length: 252 	 reward: -29
Episode 300 	 avg length: 229 	 reward: -34
Episode 320 	 avg length: 253 	 reward: -34
Episode 340 	 avg length: 275 	 reward: -30
Episode 360 	 avg length: 192 	 reward: -65
Episode 380 	 avg length: 279 	 reward: -14
Episode 400 	 avg length: 222 	 reward: -55
Episode 420 	 avg length: 255 	 reward: -30
Episode 440 	 avg length: 220 	 reward: -44
Episode 460 	 avg length: 278 	 re

In [10]:
path = 'models/ppo_more_lunar_lander.pth'

In [11]:
ppo.save(path)

Model saved to models/ppo_more_lunar_lander.pth
