In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical
import os

In [2]:
# PPO hyperparameters (module-level constants read by the classes defined below).
learning_rate = 3e-4   # Adam step size used by ActorCritic's optimizer
gamma = 0.99           # discount factor applied when accumulating returns
lmbda = 0.95           # NOTE(review): not referenced in the visible cells; presumably a GAE lambda - confirm
eps_clip = 0.2         # half-width of the clipping interval for the probability ratio
K_epochs = 4           # optimisation passes over each collected rollout
T_horizon = 2000       # NOTE(review): not referenced in the visible cells; presumably a rollout length - confirm

In [3]:
# Policy Network
class ActorCritic(nn.Module):
    """Actor-critic network whose policy and value heads share one trunk.

    Two tanh-activated linear layers of width 256 feed both a policy head
    (``fc_pi``, one logit per action) and a value head (``fc_v``, one scalar).
    The module also owns the Adam optimizer over all of its parameters,
    configured with the module-level ``learning_rate``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        width = 256
        # Layer names and creation order are kept stable so that saved
        # state_dicts and seeded initialisation stay compatible.
        self.fc1 = nn.Linear(state_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc_pi = nn.Linear(width, action_dim)
        self.fc_v = nn.Linear(width, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def _trunk(self, x):
        """Run the two shared tanh layers and return the hidden features."""
        hidden = torch.tanh(self.fc1(x))
        return torch.tanh(self.fc2(hidden))

    def pi(self, x):
        """Return the action distribution (a ``Categorical`` built from logits) for ``x``."""
        logits = self.fc_pi(self._trunk(x))
        return Categorical(logits=logits)

    def v(self, x):
        """Return the value head's output for ``x`` (last dimension of size 1)."""
        return self.fc_v(self._trunk(x))


In [4]:
# PPO Agent
class PPO:
    """Clipped-surrogate PPO agent built on two ``ActorCritic`` networks.

    ``policy`` is the network being optimised. ``policy_old`` is a copy of it
    that is re-synchronised at the end of every ``update`` call and after
    ``load``.
    """

    def __init__(self, state_dim, action_dim):
        """Create both networks, start them from identical weights, and
        reuse the optimizer that ``policy`` constructs for itself."""
        self.policy = ActorCritic(state_dim, action_dim)
        self.policy_old = ActorCritic(state_dim, action_dim)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.optimizer = self.policy.optimizer
        self.MseLoss = nn.MSELoss()

    def _discounted_returns(self, memory):
        """Return the discounted return of every stored step, oldest first.

        Walks the rollout backwards and restarts the running sum at each step
        flagged terminal, so returns never leak across episode boundaries.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (gamma * running)
            returns.append(running)
        # Append-then-reverse is linear; inserting at index 0 on every step
        # would be quadratic in the rollout length.
        returns.reverse()
        return returns

    def update(self, memory):
        """Run ``K_epochs`` optimisation passes over the rollout in ``memory``.

        Args:
            memory: object exposing the parallel lists ``states``, ``actions``,
                ``logprobs`` (tensors, they are combined with ``torch.stack``),
                ``rewards`` and ``is_terminals``.
                NOTE(review): ``torch.squeeze`` is called without a dimension,
                so a rollout holding a single transition would also lose its
                batch dimension - confirm callers never pass one.
        """
        returns = torch.tensor(self._discounted_returns(memory), dtype=torch.float32)
        # Standardise the returns; the small constant guards against a zero std.
        returns = (returns - returns.mean()) / (returns.std() + 1e-7)

        old_states = torch.squeeze(torch.stack(memory.states).detach())
        old_actions = torch.squeeze(torch.stack(memory.actions).detach())
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs).detach())

        for _ in range(K_epochs):
            logprobs, state_values, dist_entropy = self.evaluate(old_states, old_actions)

            # Probability ratio between the current policy and the one that
            # collected the data (old_logprobs is already detached above).
            ratios = torch.exp(logprobs - old_logprobs)

            advantages = returns - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages

            # Clipped surrogate objective + 0.5 * value loss - 0.01 * entropy bonus.
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, returns) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())

    def evaluate(self, state, action):
        """Score ``action`` under the current policy for ``state``.

        Returns:
            A tuple ``(log-probabilities of action, squeezed state values,
            entropy of the action distribution)``.
        """
        state_value = self.policy.v(state)
        dist = self.policy.pi(state)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        return action_logprobs, torch.squeeze(state_value), dist_entropy

    def save(self, checkpoint_path):
        """Write the current policy's ``state_dict`` to ``checkpoint_path``.

        When ``checkpoint_path`` is a filesystem path, its parent directory is
        created if it does not exist yet. File-like objects are passed straight
        to ``torch.save``.
        """
        if isinstance(checkpoint_path, (str, os.PathLike)):
            parent = os.path.dirname(checkpoint_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
        torch.save(self.policy.state_dict(), checkpoint_path)
        print(f"Model saved to {checkpoint_path}")

    def load(self, checkpoint_path):
        """Load weights from ``checkpoint_path`` into both networks.

        Tensors are mapped to the CPU while deserialising so that a checkpoint
        written on a GPU machine can be read where CUDA is unavailable;
        ``load_state_dict`` then copies them onto whatever device the
        parameters live on.
        """
        # SECURITY: torch.load unpickles the file - only load trusted checkpoints.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        self.policy.load_state_dict(state_dict)
        self.policy_old.load_state_dict(self.policy.state_dict())
        print(f"Model loaded from {checkpoint_path}")


In [5]:
# Memory
class Memory:
    """Rollout buffer made of five parallel lists, one entry per stored step."""

    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every list in place, so existing references to them stay valid."""
        for buffer in (self.states, self.actions, self.logprobs,
                       self.rewards, self.is_terminals):
            buffer.clear()


In [6]:
# Normalize state
def normalize_state(state):
    """Standardise one observation using the mean and std of its own elements.

    The statistics are taken over every element of ``state`` (no axis is
    given), and 1e-8 is added to the std so a constant vector does not divide
    by zero.
    """
    centre = np.mean(state)
    spread = np.std(state) + 1e-8
    return (state - centre) / spread


In [7]:
# Build the environment; render_mode is passed straight through to gym.make.
env_name = "LunarLander-v2"
env = gym.make(env_name, render_mode="human")

# Network input/output sizes are read off the environment's spaces.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n

# Agent plus an (initially empty) rollout buffer.
ppo = PPO(state_dim, action_dim)
memory = Memory()

  deprecation(
  deprecation(


In [8]:
# Run settings.
episodes = 10         # number of episodes played by the evaluation loop
max_timesteps = 300   # NOTE(review): not referenced by the visible evaluation loop
print_interval = 20   # NOTE(review): not referenced by the visible evaluation loop

# Running statistics, both starting at zero.
running_reward, avg_length = 0, 0

In [9]:
# Checkpoint location, relative to the notebook's working directory.
path = "models/ppo_more_lunar_lander.pth"

In [10]:
# Restore the saved weights into both of the agent's networks.
ppo.load(checkpoint_path=path)

Model loaded from models/ppo_more_lunar_lander.pth


In [11]:
# Play `episodes` episodes with the loaded agent, sampling actions from
# `policy_old`, and print each episode's total reward.
try:
    for episode in range(episodes):
        reset_result = env.reset()
        # gym >= 0.26 / gymnasium return (observation, info) from reset();
        # older gym returns the observation alone.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            state = reset_result[0]
        else:
            state = reset_result
        state = normalize_state(state)
        done = False
        total_reward = 0
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                dist = ppo.policy_old.pi(state_tensor)
                action = dist.sample()
            step_result = env.step(action.item())
            if len(step_result) == 5:
                # Newer step API: (obs, reward, terminated, truncated, info).
                state, reward, terminated, truncated, _ = step_result
                done = bool(terminated or truncated)
            else:
                # Older step API: (obs, reward, done, info).
                state, reward, done, _ = step_result
            state = normalize_state(state)
            total_reward += reward
        print(f"Episode {episode + 1}: Total Reward: {total_reward}")
finally:
    # Release the environment even if an episode raises.
    env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: Total Reward: 261.28045012592736
Episode 2: Total Reward: 239.1605007664963
Episode 3: Total Reward: 70.95619486131955
Episode 4: Total Reward: 266.8247369715348
Episode 5: Total Reward: 267.5044659699455
Episode 6: Total Reward: 17.76187887019816
Episode 7: Total Reward: 313.2516837605966
Episode 8: Total Reward: 262.7629767590495
Episode 9: Total Reward: 37.256795733321155
Episode 10: Total Reward: 18.25462927445055
