# REINFORCE Algorithm test

In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import logging
import os
import random
import sys
import warnings
from itertools import accumulate

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

# Put the parent directory on the import path before the project-local imports
# below (presumably where `dataset` and `environment_metrics` live - confirm).
ROOT_FOLDER = os.path.join(".", "..")
if ROOT_FOLDER not in sys.path:
    sys.path.insert(0, ROOT_FOLDER)


from dataset import RegexDataset
from environment_metrics import Environment, EnvSettings

# Kernel-wide noise suppression: ignore every Python warning and drop log
# records of severity WARNING and below.
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
# Request deterministic cuDNN algorithms (only relevant when running on CUDA).
torch.backends.cudnn.deterministic = True
# Use the GPU when one is available, otherwise the CPU; displayed as the cell output.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

## Utils

In [29]:
def set_seed(seed: int = 420):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.

    Parameters
    ----------
    seed: value handed to every random number generator (defaults to 420).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # The explicit CUDA seeding only happens when a CUDA device is usable.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

## Dataset

In [30]:
# Toy dataset: two sample strings and the target pattern r"\d+".
# NOTE(review): each printed tuple looks like (string, per-character match mask,
# match count) - confirm against RegexDataset.
dataset = RegexDataset(["a2d", "2bb"], r"\d+")
data_iter = dataset.create_iterator()

# Peek at ten consecutive samples produced by the iterator.
for i in range(10):
    print(next(data_iter))

('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)


## Environment

In [31]:
# Environment built on the toy dataset; max_steps=10 is passed through EnvSettings.
env = Environment(dataset, settings=EnvSettings(max_steps=10))

# Displayed value: it is the upper bound for random action sampling below and
# the default output size of the policy network.
env.action_space

11

In [32]:
# Displayed value: it is the default input size of the policy network.
env.state_space

10

In [33]:
# Seed every RNG first so the random actions below (and therefore the printed
# transitions) are reproducible after a kernel restart.
set_seed()

state = env.reset()
# NOTE(review): the loop keeps stepping after a transition reports done=True;
# this presumably relies on Environment.step starting a new episode - confirm.
for _ in range(6):
    action = np.random.randint(env.action_space)
    print(f"{action=}")
    print(env.step(action))

action=8
(array([0.27272727, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ]), 0, False)
action=6
(array([0.27272727, 0.45454545, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ]), 0, False)
action=3
(array([0.27272727, 0.45454545, 0.72727273, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ]), 0, False)
action=9
(array([0.27272727, 0.45454545, 0.72727273, 0.18181818, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ]), 0, False)
action=10
(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), -225.0, True)
action=5
(array([0.54545455, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ]), 0, False)


## REINFORCE

In [34]:
def calculate_qvals(
    rewards: list[float] | np.ndarray, gamma: float = 0.9, reward_steps: int = 0
) -> np.ndarray:
    rw_steps = reward_steps if reward_steps != 0 else len(rewards)

    return np.array(
        [
            list(
                accumulate(
                    reversed(rewards[i : i + rw_steps]), lambda x, y: gamma * x + y
                )
            )[-1]
            for i in range(len(rewards))
        ]
    )

In [35]:
class PGN(nn.Module):
    """Policy network: an MLP that maps a state vector to unnormalised action logits.

    Layer widths are ``input_dim -> hidden_dim -> 2 * hidden_dim -> hidden_dim
    -> output_dim`` with ReLU between the linear layers. With the default
    ``hidden_dim=64`` this is 64 -> 128 -> 64.

    Parameters
    ----------
    input_dim: size of the state vector.
    output_dim: number of actions (one logit per action).
    hidden_dim: base width of the hidden layers.

    Note: the ``input_dim`` / ``output_dim`` defaults are read from the
    module-level ``env`` once, when this class definition is executed; they do
    not follow later re-assignments of ``env``.
    """

    def __init__(
        self,
        input_dim: int = env.state_space,
        output_dim: int = env.action_space,
        hidden_dim: int = 64,
    ) -> None:
        super().__init__()

        # Derive every hidden width from `hidden_dim` so that non-default values
        # produce a consistent stack (the widths used to be hard-coded to 64/128).
        wide_dim = 2 * hidden_dim

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, wide_dim),
            nn.ReLU(),
            nn.Linear(wide_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the action logits for state tensor ``x``."""
        return self.net(x)

## Agent

In [36]:
class Agent:
    """Action-selection strategies over a 1-D tensor of action logits."""

    def choose_action(self, action_logits: torch.Tensor):
        """Sample an action index from ``softmax(action_logits)``.

        The logits are detached and moved to the CPU before the NumPy
        conversion, so tensors that live on CUDA or track gradients are accepted.
        """
        probs = F.softmax(action_logits.detach(), dim=0).cpu().numpy()
        return np.random.choice(len(probs), size=1, p=probs)[0]

    def choose_random(self, action_logits):
        """Pick an action index uniformly at random; only the length of the logits is used."""
        return np.random.choice(len(action_logits))

    def choose_optimal_action(self, action_logits: torch.Tensor) -> int:
        """Return the index of the largest logit (greedy action) as a Python int."""
        # Softmax is monotonic, so the arg-max of the logits is the most probable action.
        return int(torch.argmax(action_logits.detach()).item())

## Trajectory Buffer

In [37]:
class TrajectoryBuffer:
    """
    Buffer class to store the experience from a unique policy

    Completed trajectories are appended with `store`; `get_batches` then yields
    the accumulated steps as tensors in chunks of `batch_size`.
    """

    def __init__(self, batch_size: int = 64):
        """
        Parameters:
        -----------
        batch_size: maximum number of steps per yielded batch (must be positive)
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.batch_size = batch_size
        self.clean()

    def _batch(self, iterable):
        """Yield consecutive slices of `iterable`, each at most `batch_size` long."""
        ln = len(iterable)
        for ndx in range(0, ln, self.batch_size):
            yield iterable[ndx : min(ndx + self.batch_size, ln)]

    def clean(self):
        """Drop everything stored so far."""
        self.states = []
        self.actions = []
        self.rewards = []
        self.discounted_rewards = []

    def store(
        self,
        states_trajectory: np.ndarray,
        trajectory: np.ndarray,
    ):
        """
        Add one trajectory to the buffers and compute its discounted returns

        Parameters:
        -----------
        states_trajectory:  array that contains the visited states, one per step
        trajectory: 2-D array where each row contains: reward, action
        """
        assert len(states_trajectory) == len(trajectory)

        # Empty trajectories are ignored.
        if len(states_trajectory) > 0:
            step_rewards = trajectory[:, 0]
            self.states.extend(states_trajectory)
            self.rewards.extend(step_rewards)
            self.actions.extend(trajectory[:, 1])

            self.discounted_rewards.extend(calculate_qvals(step_rewards))

    def get_batches(self, mean_baseline: bool):
        """
        Yield (states, actions, returns) tensor triples on DEVICE

        Parameters:
        -----------
        mean_baseline: when True, the mean of all stored discounted returns is
                       subtracted from every return; otherwise nothing is subtracted
        """
        # Guard the empty buffer: the mean of an empty list is NaN (with a warning).
        if mean_baseline and self.discounted_rewards:
            mean_rewards = np.mean(self.discounted_rewards)
        else:
            mean_rewards = 0

        for states_batch, actions_batch, discounted_rewards_batch in zip(
            self._batch(self.states),
            self._batch(self.actions),
            self._batch(self.discounted_rewards),
        ):
            # Build one ndarray per batch first: creating a tensor straight from
            # a Python list of ndarrays is the slow path.
            yield (
                torch.tensor(
                    np.asarray(states_batch), dtype=torch.float32, device=DEVICE
                ),
                torch.tensor(
                    np.asarray(actions_batch), dtype=torch.long, device=DEVICE
                ),
                torch.tensor(
                    np.asarray(discounted_rewards_batch) - mean_rewards,
                    dtype=torch.float,
                    device=DEVICE,
                ),
            )

    def __len__(self):
        """Number of stored steps."""
        return len(self.states)

## Evaluation

**TODO:** generate a single regex with the trained policy and test it on the dataset.

In [38]:
def evaluate(
    pgn_net: nn.Module,
    env: Environment,
    agent: Agent,
    verbose: bool = True,
) -> tuple[str, float]:
    """Run one greedy episode and render the chosen actions as a regex.

    The network is switched to eval mode, the environment is reset, and for at
    most ``env.settings.max_steps`` steps the agent's greedy action is applied.
    A trailing finish action is dropped before the action list is converted
    with ``env.rpn.to_infix``; if that conversion raises, a string starting
    with ``"Invalid: "`` is returned in place of the regex.

    Parameters
    ----------
    pgn_net: policy network producing action logits for a state.
    env: environment to roll the episode out in.
    agent: provides ``choose_optimal_action``.
    verbose: when True, print the regex and the total reward.

    Returns
    -------
    ``(regex, total_reward)`` where ``total_reward`` sums all step rewards.
    """
    pgn_net.eval()

    regex_actions = []
    total_reward = 0.0
    step_budget = env.settings.max_steps
    state = env.reset()

    with torch.no_grad():
        for _ in range(step_budget):
            logits = pgn_net(torch.tensor(state, dtype=torch.float32, device=DEVICE))
            action = agent.choose_optimal_action(logits)
            regex_actions.append(env.idx_to_action(action))

            state, reward, done = env.step(action)
            total_reward += reward
            if done:
                break

    # The finish action only terminates the episode; it is not part of the expression.
    if regex_actions and regex_actions[-1] == env._finish_action:
        regex_actions.pop()

    try:
        regex = env.rpn.to_infix(regex_actions)
    except Exception:
        regex = f"Invalid: {regex_actions}"

    if verbose:
        print(f"Generated regex: {regex}")
        print(f"Total reward: {total_reward:.2f}")

    return regex, total_reward

## Training

In [39]:
def train_loop(
    pgn_net: nn.Module,
    pgn_optimizer: optim.Optimizer,
    agent: Agent,
    buffer: TrajectoryBuffer,
    epochs: int,
    episodes: int,
    mean_baseline: bool = True,
    entropy_beta: float = 0.6,
):
    """Train ``pgn_net`` with a REINFORCE-style policy-gradient update.

    Every epoch (1) clears ``buffer`` and collects ``episodes`` complete
    episodes by sampling actions with ``agent.choose_action``, then (2) runs one
    optimizer step per batch from ``buffer.get_batches`` on the loss
    ``-(return * log_prob(action)).mean() + entropy_beta * sum(p * log_p).mean()``
    and prints the mean final-step reward and the mean loss.

    Parameters
    ----------
    pgn_net: policy network to train (switched to train mode).
    pgn_optimizer: optimizer over ``pgn_net``'s parameters.
    agent: provides ``choose_action`` for sampling.
    buffer: trajectory storage, cleared at the start of every epoch.
    epochs: number of collect-then-update rounds.
    episodes: number of finished episodes collected per epoch.
    mean_baseline: forwarded to ``buffer.get_batches``.
    entropy_beta: weight of the entropy term in the loss.

    Notes
    -----
    Reads the module-level ``env`` and ``DEVICE`` and calls ``set_seed()`` on
    entry, so every call restarts the random number generators.
    """
    # Lower bound for the softmax temperature: when every logit in a batch is 0
    # the unclamped temperature is 0 and `logits / t` becomes NaN.
    min_temperature = 1e-8

    set_seed()

    pgn_net.train()
    for i in range(1, epochs + 1):
        buffer.clean()
        state = env.reset()
        done_episodes = 0
        ep_states_buf, ep_rew_act_buf = [], []
        train_rewards = []

        # ---- 1. Collect `episodes` finished trajectories with the current policy ----
        # The context manager guarantees the (disabled) progress bar is closed.
        with tqdm(
            total=episodes, desc=f"Epoch #{i}", position=0, disable=True
        ) as epoch_loop:
            while done_episodes < episodes:
                state_tensor = torch.tensor(state, dtype=torch.float32, device=DEVICE)

                with torch.no_grad():
                    action_logits = pgn_net(state_tensor)

                action = agent.choose_action(action_logits)
                next_state, reward, done = env.step(action)

                ep_states_buf.append(state)
                ep_rew_act_buf.append([reward, int(action)])

                # NOTE(review): the environment is not reset here after `done`;
                # this presumably relies on Environment.step starting a new
                # episode by itself - confirm.
                state = next_state
                if done:
                    buffer.store(
                        np.array(ep_states_buf),
                        np.array(ep_rew_act_buf),
                    )
                    ep_states_buf, ep_rew_act_buf = [], []

                    # Only the reward of the final step is tracked for reporting.
                    train_rewards.append(reward)

                    done_episodes += 1
                    epoch_loop.update(1)

        # ---- 2. One gradient step per stored batch ----
        losses = []
        for state_batch, action_batch, reward_batch in buffer.get_batches(
            mean_baseline
        ):
            pgn_optimizer.zero_grad()

            logits_v = pgn_net(state_batch)

            # Temperature scales with the largest |logit| of the batch and
            # shrinks as the epoch index grows.
            mx = torch.max(torch.abs(logits_v))
            t = torch.clamp(1 / i * mx * 10, min=min_temperature)

            # NOTE(review): actions were sampled from softmax(logits) without a
            # temperature, while the log-probabilities below use logits / t -
            # confirm this mismatch is intended.
            log_prob_v = F.log_softmax(logits_v / t, dim=1)

            log_prob_actions_v = (
                reward_batch * log_prob_v[range(len(action_batch)), action_batch]
            )
            loss_policy_v = -log_prob_actions_v.mean()

            # NOTE(review): `prob_v` is untempered but `log_prob_v` is tempered,
            # so this is a cross term rather than the entropy of one
            # distribution - confirm which distribution was meant.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_v = (prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = entropy_beta * entropy_v
            loss_v = loss_policy_v + entropy_loss_v

            loss_v.backward()

            pgn_optimizer.step()

            losses.append(loss_v.item())

        print(
            f"Epoch {i: >3}/{epochs}:\tMean reward: {np.mean(train_rewards):.3f}\tMean Loss: {np.mean(losses):.3f}"
        )

In [40]:
set_seed()

# Training environment: ten sample strings, target pattern r"\d+".
env = Environment(
    RegexDataset(
        ["a2d", "2bb", "cc2", "b6J", "G1S", "pp3", "km9", "mb4", "o70", "pi5"],
        r"\d+",
    ),
    settings=EnvSettings(max_steps=10),
)

agent = Agent()
buffer = TrajectoryBuffer(batch_size=32)

# Size the network from the environment built in this cell. PGN's default
# dimensions were captured from whichever `env` existed when the class
# definition was executed, so relying on them here would be hidden state.
pgn_net = PGN(input_dim=env.state_space, output_dim=env.action_space).to(DEVICE)
pgn_optimizer = optim.Adam(pgn_net.parameters(), lr=1e-2)

In [41]:
# Train for 20 epochs of 1000 episodes each, with the mean-return baseline enabled.
train_loop(
    pgn_net=pgn_net,
    pgn_optimizer=pgn_optimizer,
    agent=agent,
    buffer=buffer,
    epochs=20,
    episodes=1000,
    mean_baseline=True,
)

Epoch   1/20:	Mean reward: -289.910	Mean Loss: -1.559
Epoch   2/20:	Mean reward: -252.800	Mean Loss: -5.864
Epoch   3/20:	Mean reward: -27.240	Mean Loss: -0.896
Epoch   4/20:	Mean reward: -26.050	Mean Loss: -1.440
Epoch   5/20:	Mean reward: -343.030	Mean Loss: -37.497
Epoch   6/20:	Mean reward: -369.475	Mean Loss: -58.492
Epoch   7/20:	Mean reward: -338.175	Mean Loss: -56.140
Epoch   8/20:	Mean reward: -375.000	Mean Loss: -73.897
Epoch   9/20:	Mean reward: -363.025	Mean Loss: -68.925
Epoch  10/20:	Mean reward: -375.000	Mean Loss: -89.170
Epoch  11/20:	Mean reward: -375.000	Mean Loss: -93.330
Epoch  12/20:	Mean reward: -250.175	Mean Loss: -22.738
Epoch  13/20:	Mean reward: -375.000	Mean Loss: -93.738
Epoch  14/20:	Mean reward: -375.000	Mean Loss: -106.856
Epoch  15/20:	Mean reward: -375.000	Mean Loss: -85.721
Epoch  16/20:	Mean reward: -251.200	Mean Loss: -33.859
Epoch  17/20:	Mean reward: -375.000	Mean Loss: -112.499
Epoch  18/20:	Mean reward: -375.000	Mean Loss: -113.119
Epoch  19/20:

In [42]:
# Separate evaluation environment: a single held-out string and a full-match bonus.
eval_dataset = RegexDataset(["a2365d"], r"\d+")
eval_settings = EnvSettings(max_steps=10, full_match_bonus=100)
env_eval = Environment(eval_dataset, settings=eval_settings)

# Greedy roll-out of the trained policy (evaluate also prints its own summary).
best_regex, reward = evaluate(pgn_net, env_eval, agent)
print(f"Final regex: {best_regex}")

Generated regex: 2222266555
Total reward: -375.00
Final regex: 2222266555
