# DQN Algorithm test

In [175]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import logging
import os
import sys
import warnings

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

ROOT_FOLDER = os.path.join(".", "..")
if ROOT_FOLDER not in sys.path:
    sys.path.insert(0, ROOT_FOLDER)


from dataset import RegexDataset
from environment import Environment, EnvSettings

warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
torch.backends.cudnn.deterministic = True
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

## Utils

In [177]:
def set_seed(seed: int = 420):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

## Dataset

In [178]:
dataset = RegexDataset(["a2d", "2bb"], r"\d+")
data_iter = dataset.create_iterator()

for i in range(10):
    print(next(data_iter))

('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)
('a2d', [0, 1, 0], 1)
('a2d', [0, 1, 0], 1)
('2bb', [1, 0, 0], 1)


## Environment

In [179]:
env = Environment(dataset, settings=EnvSettings(max_steps=3))

env.action_space

101

In [180]:
state = env.reset()
for _ in range(6):
    action = np.random.randint(env.action_space)
    print(f"{action=}")
    print(env.step(action))

action=72
(array([0.28712871, 0.        , 0.        ]), 0, False)
action=6
(array([0.28712871, 0.94059406, 0.        ]), 0, False)
action=63
(array([0., 0., 0.]), -100000, True)
action=53
(array([0.47524752, 0.        , 0.        ]), 0, False)
action=19
(array([0.47524752, 0.81188119, 0.        ]), 0, False)
action=81
(array([0., 0., 0.]), -10104.0, True)


## DQN

In [181]:
class DQN(nn.Module):
    def __init__(
        self,
        input_dim: int = env.state_space,
        output_dim: int = env.action_space,
        hidden_dim: int = 32,
    ) -> None:
        super(DQN, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.net(x)

## Agent

In [182]:
class Agent:
    def __init__(self, action_space: int) -> None:
        self.action_space = action_space

    def sample_action(self) -> int:
        return np.random.choice(self.action_space)

    @torch.no_grad()
    def choose_optimal_action(self, state: torch.Tensor, dqn: nn.Module) -> int:
        q_vals_v = dqn(state)
        act_v = torch.argmax(q_vals_v)
        return int(act_v.item())

    def choose_action(self, state: torch.Tensor, dqn: nn.Module, epsilon: float) -> int:
        if np.random.random() < epsilon:
            return self.sample_action()
        return self.choose_optimal_action(state, dqn)

## Trajectory Buffer

In [183]:
class TrajectoryBuffer:
    """
    Buffer class to store the experience from a unique policy
    """

    def _batch(self, iterable, n=1):
        ln = len(iterable)
        for ndx in range(0, ln, n):
            yield iterable[ndx : min(ndx + n, ln)]

    def __init__(self, batch_size: int = 64):
        self.batch_size = 64
        self.clean()

    def clean(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.next_states = []

    def store(
        self,
        state: np.ndarray,
        action: int,
        reward: float,
        done: bool,
        next_state: np.ndarray,
    ):
        """
        Add trajectory values to the buffers
        """
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.dones.append(int(done))
        self.next_states.append(next_state)

    def get_batches(self):
        for (
            states_batch,
            actions_batch,
            rewards_batch,
            dones_batch,
            nest_states_batch,
        ) in zip(
            self._batch(self.states),
            self._batch(self.actions),
            self._batch(self.rewards),
            self._batch(self.dones),
            self._batch(self.next_states),
        ):
            yield (
                torch.tensor(states_batch, dtype=torch.float32, device=DEVICE),
                torch.tensor(actions_batch, dtype=torch.long, device=DEVICE),
                torch.tensor(rewards_batch, dtype=torch.float, device=DEVICE),
                torch.tensor(dones_batch, dtype=torch.bool, device=DEVICE),
                torch.tensor(nest_states_batch, dtype=torch.float, device=DEVICE),
            )

    def __len__(self):
        return len(self.states)

## Evaluation

TBD: generate one Regex and test it on dataset

## Training

In [184]:
def train_loop(
    dqn_net: nn.Module,
    dqn_target_net: nn.Module,
    dqn_optimizer: optim.Optimizer,
    agent: Agent,
    buffer: TrajectoryBuffer,
    epochs: int,
    episodes: int,
    dqn_sync_period: int = 1,
    gamma: float = 0.99,
):
    set_seed()
    buffer.clean()
    dqn_net.train()

    epsilon = 0.99
    epsilon_decay = 0.99

    for i in range(1, epochs + 1):
        state = env.reset()
        done_episodes = 0

        train_rewards = []

        epoch_loop = tqdm(total=episodes, desc=f"Epoch #{i}", position=0, disable=True)

        while done_episodes < episodes:
            state_tensor = torch.tensor(state, dtype=torch.float32, device=DEVICE)
            action = agent.choose_action(state_tensor, dqn_net, epsilon)
            next_state, reward, done = env.step(action)

            buffer.store(
                state,
                action,
                reward,
                done,
                next_state,
            )

            state = next_state

            if done:
                train_rewards.append(reward)
                state = env.reset()

                done_episodes += 1
                epoch_loop.update(1)

        # Update DQN
        if i % dqn_sync_period == 0:
            dqn_target_net.load_state_dict(dqn_net.state_dict())
            # Should we clear buffer here?
            buffer.clean()

        for batch in buffer.get_batches():
            (
                state_batch,
                action_batch,
                reward_batch,
                done_batch,
                next_state_batch,
            ) = batch

            dqn_optimizer.zero_grad()

            state_action_values = (
                dqn_net(state_batch).gather(1, action_batch.unsqueeze(-1)).squeeze(-1)
            )
            with torch.no_grad():
                next_state_values = dqn_target_net(next_state_batch).max(1)[0]
                next_state_values[done_batch] = 0.0
                next_state_values = next_state_values.detach()

            expected_state_action_values = next_state_values * gamma + reward_batch
            loss_v = F.mse_loss(state_action_values, expected_state_action_values)
            loss_v.backward()

            dqn_optimizer.step()

        epsilon *= epsilon_decay
        print(f"Epoch {i: >3}/{epochs}:\tMean reward: {np.mean(train_rewards)}")

In [185]:
set_seed()
env = Environment(RegexDataset(["a2d", "2bb"], r"\d+"), settings=EnvSettings(max_steps=3))

agent = Agent(env.action_space)
buffer = TrajectoryBuffer()

dqn_net = DQN().to(DEVICE)
dqn_target_net = DQN()
dqn_optimizer = optim.Adam(dqn_net.parameters(), lr=1e-2)

In [186]:
train_loop(
    dqn_net, dqn_target_net, dqn_optimizer, agent, buffer, epochs=50, episodes=1000
)

Epoch   1/50:	Mean reward: -22461.08
Epoch   2/50:	Mean reward: -23100.453
Epoch   3/50:	Mean reward: -22250.596
Epoch   4/50:	Mean reward: -21970.906
Epoch   5/50:	Mean reward: -21352.037
Epoch   6/50:	Mean reward: -21571.445
Epoch   7/50:	Mean reward: -21852.127
Epoch   8/50:	Mean reward: -23229.94
Epoch   9/50:	Mean reward: -23509.543
Epoch  10/50:	Mean reward: -22161.096
Epoch  11/50:	Mean reward: -22340.994
Epoch  12/50:	Mean reward: -22580.69
Epoch  13/50:	Mean reward: -24049.219
Epoch  14/50:	Mean reward: -21342.738
Epoch  15/50:	Mean reward: -23409.647
Epoch  16/50:	Mean reward: -22500.483
Epoch  17/50:	Mean reward: -23418.859
Epoch  18/50:	Mean reward: -23429.545
Epoch  19/50:	Mean reward: -21472.156
Epoch  20/50:	Mean reward: -23429.753
Epoch  21/50:	Mean reward: -21041.955
Epoch  22/50:	Mean reward: -22739.79
Epoch  23/50:	Mean reward: -22091.216
Epoch  24/50:	Mean reward: -22670.39
Epoch  25/50:	Mean reward: -22670.203
Epoch  26/50:	Mean reward: -22889.81
Epoch  27/50:	Mean