REINFORCE is a policy-based RL algorithm. Compared with value-based RL methods, policy-based methods have some critical advantages. Please refer to Chapter 13 of the book "Reinforcement Learning: An Introduction" (second edition) by Richard S. Sutton and Andrew G. Barto for more details.

In this notebook we implement the REINFORCE algorithm with an artificial neural network. By leveraging deep learning, REINFORCE achieves better performance.

# How REINFORCE Works

1. Collect Episodes
1. Calculate Returns
1. Policy Gradient Update
1. Repeat

In [None]:
import gymnasium as gym

# Create the LunarLander environment. render_mode="human" opens a window and
# draws every step; pass render_mode=None instead for faster, headless training.
env = gym.make("LunarLander-v3", render_mode="human")

# Size of the observation vector and number of discrete actions; both are used
# later to size the policy network's input and output layers.
obs_space = env.observation_space.shape[0]
act_space = env.action_space.n

# Python f-strings interpolate with {name}; a leading "$" would be printed
# literally.
print(f"obs_space is {obs_space}")
print(f"act_space is {act_space}")

# Implementation

We use an artificial neural network to represent the parameterized policy. In our example, the network has a single fully connected hidden layer with 512 units.

In [None]:
import torch
from torch import nn

class PolicyNetwork(nn.Module):
    """Feed-forward policy: one hidden layer of 512 ReLU units producing logits.

    The module owns its loss function (cross-entropy) and optimizer (SGD,
    lr=1e-3) and is moved to ``device`` on construction.

    Args:
        device: Torch device the parameters and every batch are placed on.
        action_space: Number of output logits (one per discrete action).
        dim_in: Width of the flattened input vector.
    """

    def __init__(self, device, action_space, dim_in = 128):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(dim_in, 512),
            nn.ReLU(),
            nn.Linear(512, action_space)
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
        self.device = device
        self.to(device)

    def forward(self, x):
        """Return unnormalised action logits for a batch of inputs ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def train(self, dataloader=True):
        """Run one optimisation pass over ``dataloader``, or switch module mode.

        This method shadows ``nn.Module.train(mode)``, which ``nn.Module.eval()``
        relies on (it calls ``self.train(False)``). To keep that contract
        working, a boolean argument is forwarded to the base implementation.

        Args:
            dataloader: An iterable of ``(X, y)`` batches exposing ``.dataset``,
                or a bool selecting training (``True``) / evaluation
                (``False``) mode.

        Returns:
            ``self`` when called with a bool (same as ``nn.Module.train``),
            otherwise ``None``.
        """
        if isinstance(dataloader, bool):
            return super().train(dataloader)

        size = len(dataloader.dataset)
        # Put *this* module into training mode (a fresh nn.Module() would be a
        # throwaway object and leave self untouched).
        super().train(True)
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(self.device), y.to(self.device)
            pred = self(X)
            loss = self.loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            if batch % 100 == 0:
                loss_value, current = loss.item(), (batch + 1) * len(X)
                # Sample counts are integers, so format them with "d".
                print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")

    def test(self, dataloader):
        """Print accuracy and average loss of the network over ``dataloader``.

        Args:
            dataloader: An iterable of ``(X, y)`` batches exposing ``.dataset``
                and supporting ``len()``.
        """
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        # Switch this module (not a throwaway one) to evaluation mode.
        self.eval()
        test_loss, correct = 0, 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(self.device), y.to(self.device)
                pred = self(X)
                test_loss += self.loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

    def save(self, file = "model.pth"):
        """Write the module's ``state_dict`` to ``file`` and report the path."""
        torch.save(self.state_dict(), file)
        print(f"saved the model in {file}")

In [None]:
import collections
from torch.utils.data import Dataset

class ReplyBuffer(Dataset):
    """Fixed-capacity ring buffer exposed as a map-style ``Dataset``.

    Items are appended until the buffer is full; after that every new item
    overwrites the oldest one.

    Args:
        maxsize: Requested capacity. The effective capacity is
            ``min(56, maxsize)``.

    Raises:
        ValueError: If the effective capacity is smaller than 1.
    """

    def __init__(self, maxsize: int = 128):
        self.buffer = []
        # NOTE(review): min() caps the capacity at 56 whatever the caller asks
        # for; this looks like it may have been meant as a lower bound (max) -
        # confirm the intent before changing it.
        self.maxsize = min(56, maxsize)
        if self.maxsize < 1:
            raise ValueError(f"maxsize must be at least 1, got {maxsize}")
        # Slot the next recorded item will be written to.
        self.index = 0

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.buffer)

    def __getitem__(self, idx):
        """Return the item stored in slot ``idx``."""
        return self.buffer[idx]

    def record(self, item):
        """Store ``item``, overwriting the oldest entry once the buffer is full."""
        if len(self.buffer) < self.maxsize:
            # Still filling up: the list has no slot to assign into yet.
            self.buffer.append(item)
        else:
            self.buffer[self.index] = item
        # Integer modulo wraps the write position back to slot 0.
        self.index = (self.index + 1) % self.maxsize

In [None]:
import numpy as np

def compute_returns(rewards, gamma):
    """Return the discounted return at every step of a reward sequence.

    Each entry is ``rewards[t] + gamma * (return at t + 1)``, with the return
    after the final step taken as 0.

    Args:
        rewards: Per-step rewards of one episode.
        gamma: Discount factor applied to the following step's return.

    Returns:
        A float32 array shaped like ``rewards``.
    """
    discounted = np.zeros_like(rewards, dtype=np.float32)
    future = 0
    step = len(rewards) - 1
    # Walk backwards so each step can reuse the return of the step after it.
    while step >= 0:
        future = rewards[step] + gamma * future
        discounted[step] = future
        step -= 1
    return discounted

In [None]:
import numpy as np
from torch import nn
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the policy. The input width has to equal the length of the
# environment's observation vector, otherwise the first Linear layer rejects
# every state.
policy = PolicyNetwork(device, action_space=act_space, dim_in=obs_space)
print(policy)

# Create data loaders.
# These objects are still created, but the update below is on-policy and
# learns straight from the episode that was just collected, so nothing is
# written to the buffer here.
training_data = ReplyBuffer(512)
train_dataloader = DataLoader(training_data, batch_size=56)

num_episodes = 200
gamma = 0.9
for episode in range(num_episodes):
    # 1. Collect one episode by sampling actions from the current policy.
    state, _ = env.reset()
    done = False
    states, actions, rewards = [], [], []

    while not done:
        state_input = np.asarray(state, dtype=np.float32).reshape(1, -1)
        # The network works on tensors and emits logits; softmax turns them
        # into the action probabilities we sample from. No gradient is needed
        # while acting.
        with torch.no_grad():
            logits = policy(torch.as_tensor(state_input, device=device))
            probs = torch.softmax(logits, dim=-1)
        action = int(torch.multinomial(probs, num_samples=1).item())

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        states.append(state_input[0])
        actions.append(action)
        rewards.append(reward)
        state = next_state

    # 2. Compute the discounted return G_t of every step in the episode.
    returns = compute_returns(rewards, gamma)

    # 3. Policy-gradient update: minimise -G_t * log pi(a_t | s_t).
    batch_states = torch.as_tensor(np.vstack(states), device=device)
    batch_actions = torch.as_tensor(actions, dtype=torch.long, device=device)
    batch_returns = torch.as_tensor(returns, device=device)

    log_probs = torch.log_softmax(policy(batch_states), dim=-1)
    # Pick the log-probability of the action that was actually taken per step.
    taken_log_probs = log_probs.gather(1, batch_actions.unsqueeze(1)).squeeze(1)
    # Averaging over steps keeps the update size independent of episode length.
    loss = -(taken_log_probs * batch_returns).mean()

    policy.optimizer.zero_grad()
    loss.backward()
    policy.optimizer.step()

    print(
        f"episode {episode + 1}/{num_episodes}: "
        f"total reward {sum(rewards):.1f}, loss {loss.item():.4f}"
    )