In [44]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.distributions.categorical import Categorical


In [45]:
# Seed PyTorch's global RNG so weight initialisation and action sampling repeat across runs.
torch.manual_seed(42)

# Use a CUDA device when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [46]:
# environment parameters
env_name = "CartPole-v1"
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]  # length of the observation vector
acts_dim = env.action_space.n             # number of discrete actions
print(f"{obs_dim} obs | {acts_dim} acts")

# mlp parameters
hidden_sizes = [10, 5]
# Layer widths from input to output: observation vector -> hidden layers -> one
# logit per action. (The original referenced an undefined `n_acts` here; the
# action count is bound to `acts_dim` above.)
sizes = [obs_dim] + hidden_sizes + [acts_dim]

# training parameters
epochs = 50        # number of passes of the training loop
batch_size = 5000  # presumably the number of environment steps to gather per epoch



4 obs | 2 acts


In [51]:
class MLP(nn.Module):
    """Fully connected feed-forward network described by a list of layer widths.

    Args:
        sizes: sequence of layer widths, input first and output last. Each
            adjacent pair ``(sizes[k], sizes[k + 1])`` becomes one ``nn.Linear``.
        activation: module class instantiated after every linear layer except
            the last one.
        output_activation: module class instantiated after the last linear layer.
    """

    def __init__(self, sizes, activation=nn.Tanh, output_activation=nn.Identity):
        super().__init__()
        final_idx = len(sizes) - 2  # position of the last linear layer
        modules = []
        for idx, (fan_in, fan_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            modules.append(nn.Linear(fan_in, fan_out))
            # Hidden layers get `activation`; only the final layer gets `output_activation`.
            nonlinearity = output_activation if idx >= final_idx else activation
            modules.append(nonlinearity())
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Pass ``x`` through the stacked layers in order and return the output."""
        return self.layers(x)


In [71]:
# Making the Policy Network
# The MLP maps an observation to one logit per action.
model = MLP(sizes)


def get_policy(obs):
    """Return the categorical action distribution the network assigns to ``obs``.

    The network output is used as unnormalised log-probabilities (logits).
    """
    return Categorical(logits=model(obs))

def get_action(obs):
    """Sample one action for a single observation and return it as a Python number.

    ``.item()`` only works on a one-element tensor, so this accepts exactly one
    observation; drop the ``.item()`` call to sample for a batch of observations.
    """
    policy = get_policy(obs)
    sampled = policy.sample()
    return sampled.item()

# Smoke test: sample an action for one hand-written observation.
obs = torch.tensor([1.0, 0.0, 5.0, 1.0], dtype=torch.float32)
# get_action requires the observation; calling it with no argument raises TypeError.
act = get_action(obs)
act

0

In [70]:
# Making the Loss Function

def compute_loss(obs, act, weights):
    """Surrogate loss whose gradient, for the right data, is the policy gradient.

    Takes the log-probability the current policy assigns to each action in
    ``act`` given ``obs``, scales it by ``weights``, and returns the negated
    mean so that minimising the loss pushes up the weighted log-probabilities.
    """
    policy = get_policy(obs)
    weighted_logp = policy.log_prob(act) * weights
    return -weighted_logp.mean()
# Smoke test: evaluate the loss on one observation with a unit weight.
# compute_loss needs all three arguments, and Categorical.log_prob only accepts
# a tensor, so the sampled action (a Python number) is wrapped before the call.
loss = compute_loss(
    obs,
    torch.as_tensor(get_action(obs)),
    weights=torch.tensor(1.0),
)
loss

In [None]:

# training loop
# Each epoch gathers whole episodes with the current policy until more than
# `batch_size` steps have been collected, then takes one gradient step on the
# policy-gradient loss.
lr = 1e-2  # Adam step size
optimizer = Adam(model.parameters(), lr=lr)

for i in range(epochs):
    # make some empty lists for logging.
    batch_obs = []          # for observations
    batch_acts = []         # for actions
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables
    obs, _ = env.reset()    # gymnasium's reset returns (observation, info)
    ep_rews = []            # list for rewards accrued throughout ep

    # collect experience by acting in the environment with the current policy
    while True:
        batch_obs.append(obs.copy())

        act = get_action(torch.as_tensor(obs, dtype=torch.float32))
        obs, rew, terminated, truncated, _ = env.step(act)
        # An episode ends either by termination or by truncation (e.g. a time
        # limit); both must stop the rollout before the env is stepped again.
        done = terminated or truncated

        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)

            # the weight for each logprob(a|s) is R(tau)
            batch_weights += [ep_ret] * ep_len

            # start a fresh episode
            obs, _ = env.reset()
            ep_rews = []

            # stop collecting once enough steps are gathered; checking only at
            # episode boundaries keeps every stored step paired with a weight
            if len(batch_obs) > batch_size:
                break

    # take a single policy gradient update step
    optimizer.zero_grad()
    batch_loss = compute_loss(
        # stack into one ndarray first: converting a list of arrays is slow
        obs=torch.as_tensor(np.array(batch_obs), dtype=torch.float32),
        act=torch.as_tensor(batch_acts, dtype=torch.int32),
        weights=torch.as_tensor(batch_weights, dtype=torch.float32),
    )
    batch_loss.backward()
    optimizer.step()

    print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
            (i, batch_loss.item(), np.mean(batch_rets), np.mean(batch_lens)))
