# REINFORCE Algorithm test

In [1023]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1024]:
import logging
import os
import random
import sys
import warnings
from itertools import accumulate

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

# Put the parent directory at the front of the import path (only once, so
# re-running this cell does not add duplicates).
# NOTE(review): presumably this is where the project-local `dataset` and
# `environment` modules imported below live — confirm against the repo layout.
ROOT_FOLDER = os.path.join(".", "..")
if ROOT_FOLDER not in sys.path:
    sys.path.insert(0, ROOT_FOLDER)


from dataset import RegexDataset
from environment import Environment, EnvSettings

# Keep cell output quiet: ignore all Python warnings and drop log records at
# WARNING level and below.
# NOTE(review): this also hides potentially useful diagnostics (e.g. NumPy /
# PyTorch performance or deprecation warnings) — consider narrowing the filters.
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
# Ask cuDNN for deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
DEVICE

device(type='cpu')

## Utils

In [1025]:
def set_seed(seed: int = 420):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch; when CUDA is available the current CUDA device's generator is
    seeded as well.

    Parameters
    ----------
    seed: value handed to each generator (defaults to 420).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

## Dataset

In [1026]:
# Two sample strings plus the regex r"\d+" (one or more digits).
dataset = RegexDataset(["a2d", "2bb"], r"\d+")
data_iter = dataset.create_iterator()

# Peek at ten samples from the iterator. In the output below each item is a
# 3-tuple starting with the string.
# NOTE(review): the second element looks like a per-character match mask and the
# third like a match count — confirm against RegexDataset.
for i in range(10):
    print(next(data_iter))

('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)


## Environment

In [1027]:
# Environment over the demo dataset, configured with max_steps=3.
env = Environment(dataset, settings=EnvSettings(max_steps=3))

# Displayed as the cell output; used below as the exclusive upper bound when
# sampling random action indices.
env.action_space

11

In [1028]:
# Smoke test: take six uniformly random actions and print what `step` returns
# (a `(next_state, reward, done)` triple, as unpacked later in the training loop).
# NOTE(review): six steps with max_steps=3 span more than one episode; judging by
# the output below the environment keeps accepting actions after `done=True` —
# confirm whether `step` resets internally.
state = env.reset()
for _ in range(6):
    action = np.random.randint(env.action_space)
    print(f"{action=}")
    print(env.step(action))

action=8
(array([0.27272727, 0.        , 0.        ]), 0, False)
action=6
(array([0.27272727, 0.45454545, 0.        ]), 0, False)
action=3
(array([0., 0., 0.]), -10104.0, True)
action=5
(array([0.54545455, 0.        , 0.        ]), 0, False)
action=7
(array([0.54545455, 0.36363636, 0.        ]), 0, False)
action=3
(array([0., 0., 0.]), -10104.0, True)


## REINFORCE

In [1038]:
def calculate_qvals(
    rewards: list[float] | np.ndarray, gamma: float = 1.0, reward_steps: int = 0
) -> np.ndarray:
    rw_steps = reward_steps if reward_steps != 0 else len(rewards)

    return np.array(
        [
            list(
                accumulate(
                    reversed(rewards[i : i + rw_steps]), lambda x, y: gamma * x + y
                )
            )[-1]
            for i in range(len(rewards))
        ]
    )

In [1030]:
class PGN(nn.Module):
    def __init__(
        self,
        input_dim: int = env.state_space,
        output_dim: int = env.action_space,
        hidden_dim: int = 64,
    ) -> None:
        super(PGN, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, 16),
            nn.Tanh(),
            nn.Linear(16, output_dim),
        )

    def forward(self, x):
        return self.net(x)

## Agent

In [1031]:
class Agent:
    """Turns policy logits into action indices, stochastically or greedily."""

    def choose_action(self, action_logits: torch.Tensor):
        """Sample an action index from the distribution ``softmax(action_logits)``.

        Parameters
        ----------
        action_logits: 1-D tensor with one unnormalized score per action. It may
            live on any device and may be attached to an autograd graph.

        Returns
        -------
        The sampled action index (a NumPy integer), drawn with NumPy's global
        random generator.
        """
        # `.numpy()` only works on CPU tensors that do not require grad, so
        # detach and move to the CPU before converting.
        probs = F.softmax(action_logits.detach(), dim=0).cpu().numpy()
        return np.random.choice(len(probs), size=1, p=probs)[0]

    def choose_optimal_action(self, action_logits: torch.Tensor) -> int:
        """Return the index of the highest-scoring action (greedy choice).

        Softmax is monotonic, so the arg-max is taken on the logits directly.
        """
        return int(torch.argmax(action_logits.detach(), dim=0).item())

## Trajectory Buffer

In [None]:
class TrajectoryBuffer:
    """
    Buffer class to store the experience from a unique policy
    """

    def _batch(self, iterable):
        ln = len(iterable)
        for ndx in range(0, ln, self.batch_size):
            yield iterable[ndx : min(ndx + self.batch_size, ln)]

    def __init__(self, batch_size: int = 64):
        self.batch_size = batch_size
        self.clean()

    def clean(self):
        self.states = []
        self.actions = []
        self.discounted_rewards = []

    def store(
        self,
        states_trajectory: np.ndarray,
        trajectory: np.ndarray,
    ):
        """
        Add trajectory values to the buffers and compute the advantage and reward to go

        Parameters:
        -----------
        states_trajectory:  list that contains states
        trajectory: list where each element is a list that contains: reward, action
        """
        assert len(states_trajectory) == len(trajectory)

        if len(states_trajectory) > 0:
            self.states.extend(states_trajectory)
            self.actions.extend(trajectory[:, 1])

            self.discounted_rewards.extend(calculate_qvals(trajectory[:, 0]))

    def get_batches(self, mean_baseline: bool):
        if mean_baseline:
            mean_rewards = np.mean(self.discounted_rewards)
        else:
            mean_rewards = 0

        for states_batch, actions_batch, discounted_rewards_batch in zip(
            self._batch(self.states),
            self._batch(self.actions),
            self._batch(self.discounted_rewards),
        ):
            yield (
                torch.tensor(states_batch, dtype=torch.float32, device=DEVICE),
                torch.tensor(actions_batch, dtype=torch.long, device=DEVICE),
                torch.tensor(
                    np.array(discounted_rewards_batch) - mean_rewards,
                    dtype=torch.float,
                    device=DEVICE,
                ),
            )

    def __len__(self):
        return len(self.states)

## Evaluation

**TODO:** generate one regex and test it on the dataset.

## Training

In [1033]:
def train_loop(
    pgn_net: nn.Module,
    pgn_optimizer: optim.Optimizer,
    agent: Agent,
    buffer: TrajectoryBuffer,
    epochs: int,
    episodes: int,
    mean_baseline: bool = True,
    entropy_beta: float = 1e-3,
):
    """Train a policy network with the REINFORCE policy-gradient algorithm.

    Every epoch (1) clears ``buffer``, (2) plays ``episodes`` complete episodes
    in the module-level ``env`` while sampling actions from the current policy,
    and (3) performs one optimizer step per mini-batch produced by
    ``buffer.get_batches``. A one-line summary is printed after each epoch.

    Parameters
    ----------
    pgn_net: network mapping a state tensor to action logits.
    pgn_optimizer: optimizer bound to ``pgn_net``'s parameters.
    agent: samples an action index from the logits.
    buffer: stores the collected steps and serves the training mini-batches.
    epochs: number of collect-then-update rounds.
    episodes: number of finished episodes collected per epoch.
    mean_baseline: forwarded to ``buffer.get_batches``.
    entropy_beta: weight of the entropy bonus in the loss.

    Notes
    -----
    The random generators are re-seeded via ``set_seed()`` on every call. The
    reported "Mean reward" averages the reward of the final step of each episode.
    """
    set_seed()

    pgn_net.train()
    for epoch in range(1, epochs + 1):
        # ---- 1. collect experience with the current policy -----------------
        buffer.clean()
        state = env.reset()
        done_episodes = 0
        ep_states_buf, ep_rew_act_buf = [], []

        train_rewards = []

        # The context manager guarantees the (currently disabled) progress bar is
        # closed even if collection raises.
        with tqdm(
            total=episodes, desc=f"Epoch #{epoch}", position=0, disable=True
        ) as epoch_loop:
            while done_episodes < episodes:
                state_tensor = torch.tensor(state, dtype=torch.float32, device=DEVICE)

                # No gradients are needed while acting.
                with torch.no_grad():
                    action_logits = pgn_net(state_tensor)

                # Sampling happens in NumPy, so hand over a CPU tensor (this is a
                # no-op when DEVICE is already the CPU).
                action = agent.choose_action(action_logits.cpu())
                next_state, reward, done = env.step(action)

                ep_states_buf.append(state)
                ep_rew_act_buf.append([reward, int(action)])

                # NOTE(review): no explicit env.reset() after `done`; this relies on
                # `step` returning the first state of the next episode — confirm.
                state = next_state

                if done:
                    buffer.store(
                        np.array(ep_states_buf),
                        np.array(ep_rew_act_buf),
                    )

                    ep_states_buf, ep_rew_act_buf = [], []

                    train_rewards.append(reward)

                    done_episodes += 1
                    epoch_loop.update(1)

        # ---- 2. policy-gradient update --------------------------------------
        losses = []
        for state_batch, action_batch, reward_batch in buffer.get_batches(
            mean_baseline
        ):
            pgn_optimizer.zero_grad()

            logits_v = pgn_net(state_batch)
            log_prob_v = F.log_softmax(logits_v, dim=1)

            # REINFORCE objective: log-probability of each taken action weighted
            # by its (optionally baselined) return.
            taken_log_prob_v = log_prob_v.gather(1, action_batch.unsqueeze(1)).squeeze(1)
            loss_policy_v = -(reward_batch * taken_log_prob_v).mean()

            # sum(p * log p) is the *negative* entropy; adding it to the loss
            # therefore rewards higher-entropy (more exploratory) policies.
            prob_v = F.softmax(logits_v, dim=1)
            neg_entropy_v = (prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = entropy_beta * neg_entropy_v
            loss_v = loss_policy_v + entropy_loss_v

            loss_v.backward()

            pgn_optimizer.step()

            losses.append(loss_v.item())

        print(
            f"Epoch {epoch: >3}/{epochs}:\tMean reward: {np.mean(train_rewards):.3f}\tMean Loss: {np.mean(losses):.3f}"
        )

In [1034]:
# Seed the RNGs before anything stochastic is built (e.g. network weight init).
set_seed()
# Rebinds the notebook-global `env` to a three-string dataset for training.
env = Environment(
    RegexDataset(["a2d", "2bb", "cc2"], r"\d+"), settings=EnvSettings(max_steps=3)
)

agent = Agent()
# batch_size=1: every stored step becomes its own mini-batch, i.e. one optimizer
# step per step collected.
buffer = TrajectoryBuffer(batch_size=1)

# Policy network built with PGN's default layer sizes, moved to the chosen device.
pgn_net = PGN().to(DEVICE)
# Alternative optimizer kept for reference:
# pgn_optimizer = optim.SGD(pgn_net.parameters(), lr=1e-3, nesterov=True, momentum=0.99)
pgn_optimizer = optim.Adam(pgn_net.parameters(), lr=1e-3)

In [1035]:
# 20 epochs of 1000 sampled episodes each, using train_loop's default
# mean baseline and entropy weight.
train_loop(pgn_net, pgn_optimizer, agent, buffer, epochs=20, episodes=1000)

Epoch   1/20:	Mean reward: -11583.139	Mean Loss: -4358.246
Epoch   2/20:	Mean reward: -10103.874	Mean Loss: -0.005
Epoch   3/20:	Mean reward: -10103.818	Mean Loss: -0.079
Epoch   4/20:	Mean reward: -10103.186	Mean Loss: -1.620
Epoch   5/20:	Mean reward: -9535.390	Mean Loss: -3491.617
Epoch   6/20:	Mean reward: -29859.216	Mean Loss: 689.568
Epoch   7/20:	Mean reward: -22742.263	Mean Loss: -137014.405
Epoch   8/20:	Mean reward: -1456.564	Mean Loss: -20592.165
Epoch   9/20:	Mean reward: -9942.359	Mean Loss: 127.021
Epoch  10/20:	Mean reward: -6810.539	Mean Loss: -32750.277
Epoch  11/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  12/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  13/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  14/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  15/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  16/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  17/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  18/20:	Mean reward: -2.000	Mean Loss: -0.000
Epoch  

In [1036]:
# env = Environment(RegexDataset(["a2d", "2bb"], r"\d+"), settings=EnvSettings(max_steps=3))

# state = env.reset()
# a1 = env.action_to_idx("2")
# a2 = env.action_to_idx("FIN")
# print(env.step(a1))
# print(env.step(a2))