In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

In [2]:
class Policy(nn.Module):
    def __init__(self, n_obs, n_actions):
        super().__init__()
        self.stream = nn.Sequential(
            nn.Linear(n_obs, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            nn.Softmax(dim = -1)
        )

    def forward(self, x):
        return self.stream(x)

In [3]:
env = gym.make("CartPole-v1")
n_obs = env.observation_space.shape[0]
n_actions = env.action_space.n

In [33]:
# Train every episode
def get_episode_Gt(rewards, gamma):
    R = 0
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

def train(env, policy, optimizer, episodes = 500, gamma = 0.99):    
    for episode in range(episodes):
        rewards, log_probs = [], []
        state, _ = env.reset()

        done = False
        while not done: # play episode

            state_tensor = torch.tensor(state, dtype = torch.float32)
            action_probs = policy(state_tensor)
            dist = torch.distributions.Categorical(action_probs)
            action = dist.sample()

            log_prob = dist.log_prob(action)
            log_probs.append(log_prob)

            state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            rewards.append(reward)

        # learn
        # prepare returns
        returns =  get_episode_Gt(rewards, gamma)   
        returns_tensor = torch.tensor(returns, dtype = torch.float32)
        returns_tensor = (returns_tensor - returns_tensor.mean()) / (returns_tensor.std() + 3e-20)

        # prepare log_probs        
        loss = - torch.sum(torch.stack(log_probs) * returns_tensor)

        # train
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 10 == 0:
            print(f"Episode {episode} reward {sum(rewards)}")

          
LR = 0.001

policyNN = Policy(n_obs, n_actions)            
optimizer = optim.Adam(policyNN.parameters(), lr = LR)        
train(env, policy = policyNN, optimizer = optimizer, episodes = 5000, gamma = 0.99)        

Episode 0 reward 26.0
Episode 10 reward 64.0
Episode 20 reward 60.0
Episode 30 reward 43.0
Episode 40 reward 101.0
Episode 50 reward 26.0
Episode 60 reward 24.0
Episode 70 reward 26.0
Episode 80 reward 58.0
Episode 90 reward 60.0
Episode 100 reward 82.0
Episode 110 reward 34.0
Episode 120 reward 28.0
Episode 130 reward 27.0
Episode 140 reward 108.0
Episode 150 reward 124.0
Episode 160 reward 61.0
Episode 170 reward 60.0
Episode 180 reward 128.0
Episode 190 reward 87.0
Episode 200 reward 47.0
Episode 210 reward 87.0
Episode 220 reward 310.0
Episode 230 reward 158.0
Episode 240 reward 262.0
Episode 250 reward 500.0
Episode 260 reward 270.0
Episode 270 reward 500.0
Episode 280 reward 500.0
Episode 290 reward 500.0
Episode 300 reward 432.0
Episode 310 reward 154.0
Episode 320 reward 150.0
Episode 330 reward 248.0
Episode 340 reward 99.0
Episode 350 reward 157.0
Episode 360 reward 500.0
Episode 370 reward 500.0
Episode 380 reward 221.0
Episode 390 reward 137.0
Episode 400 reward 120.0
Episo

In [4]:
# Train batch of episodes
def get_episode_Gt(rewards, gamma):
    R = 0
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

def train(env, policy, optimizer, episodes = 500, gamma = 0.99, batch_size = 512):
    batch_log_probs, batch_returns, total_rewards = [], [], []
    
    for episode in range(episodes):
        rewards, log_probs = [], []
        state, _ = env.reset()

        done = False
        while not done: # play episode

            state_tensor = torch.tensor(state, dtype = torch.float32)
            action_probs = policy(state_tensor)
            dist = torch.distributions.Categorical(action_probs)
            action = dist.sample()

            log_prob = dist.log_prob(action)
            log_probs.append(log_prob)

            state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            rewards.append(reward)

        # calculate returns over episode
        returns =  get_episode_Gt(rewards, gamma)  

        # collect log_probs and rewards
        batch_log_probs.extend(log_probs)
        batch_returns.extend(returns)
        total_rewards.append(sum(rewards))
        
        # learn
        if len(batch_log_probs) >= batch_size:      
            # prepare returns
            
            returns_tensor = torch.tensor(batch_returns, dtype = torch.float32)
            returns_tensor = (returns_tensor - returns_tensor.mean()) / (returns_tensor.std() + 3e-20)

            # stack batch_log_probs into a single tensor and compute loss. Since we need gradient ascent - we use "-". The loss - actually expected cumulative reward here
            loss = - torch.sum(torch.stack(batch_log_probs) * returns_tensor)
    
            # train
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # reset batch
            batch_log_probs, batch_returns = [], []

        if episode % 100 == 0:
            print(f"Episode {episode} mean reward over last episodes {sum(total_rewards[-100:]) / 100.0}")

       
BATCH_SIZE = 512            
LR = 0.001

policyNN = Policy(n_obs, n_actions)            
optimizer = optim.Adam(policyNN.parameters(), lr = LR)        
train(env, policy = policyNN, optimizer = optimizer, episodes = 3000, gamma = 0.99, batch_size = BATCH_SIZE)        

Episode 0 mean reward over last episodes 0.11
Episode 100 mean reward over last episodes 52.3
Episode 200 mean reward over last episodes 156.57
Episode 300 mean reward over last episodes 207.28
Episode 400 mean reward over last episodes 165.35
Episode 500 mean reward over last episodes 188.67
Episode 600 mean reward over last episodes 240.21
Episode 700 mean reward over last episodes 437.65
Episode 800 mean reward over last episodes 319.8
Episode 900 mean reward over last episodes 349.62
Episode 1000 mean reward over last episodes 288.92
Episode 1100 mean reward over last episodes 500.0
Episode 1200 mean reward over last episodes 495.64
Episode 1300 mean reward over last episodes 468.34
Episode 1400 mean reward over last episodes 440.25
Episode 1500 mean reward over last episodes 495.7
Episode 1600 mean reward over last episodes 446.84
Episode 1700 mean reward over last episodes 494.43
Episode 1800 mean reward over last episodes 487.78
Episode 1900 mean reward over last episodes 471.02