In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import itertools
import os
import pandas as pd
#from google.colab import drive
import pickle
import random
#drive.mount('/content/drive')

np.random.seed(123)
torch.manual_seed(123)


<torch._C.Generator at 0x2a0a9782a10>

## **Environment Class**

In [2]:
class BanditEnvironment:
    def __init__(self, difficulty, max_steps = 100):
        self.num_actions = 2  # Number of actions (arms/bandits)
        self.difficulty = difficulty
        self.max_steps = max_steps  # Maximum number of steps per episode
        self.reset()

    def reset(self):
        """Reset the environment to its initial state."""

        # Set brenoulli probabilities for difficulties
        if self.difficulty == 'easy':
            bandits_prob = np.random.choice([0.9, 0.1])
            self.bandits_prob = np.array([bandits_prob, 1 - bandits_prob])
        elif self.difficulty == 'medium':
            bandits_prob = np.random.choice([0.75, 0.25])
            self.bandits_prob = np.array([bandits_prob, 1 - bandits_prob])
        elif self.difficulty == 'hard':
            bandits_prob = np.random.choice([0.6, 0.4])
            self.bandits_prob = np.array([bandits_prob, 1 - bandits_prob])
        elif self.difficulty == 'uniform':
            bandits_prob = np.random.uniform()
            self.bandits_prob = np.array([bandits_prob, 1 - bandits_prob])
        else:  # Independent difficulty (random probabilities for each arm)
            self.bandits_prob = np.random.uniform(size=2)

        self.timestep = 0  # Initialize timestep
        self.last_action = None
        self.last_reward = 0
        return self.get_state()

    def step(self, action):
        """Take an action and return the next state, reward, and done flag."""
        self.timestep += 1
        self.last_action = action
        bandit_prob = self.bandits_prob[action]
        reward = 1 if np.random.uniform() < bandit_prob else 0
        self.last_reward = reward
        done = self.timestep >= self.max_steps  # End episode after 100 timesteps
        return self.get_state(), reward, done

    def get_state(self):
        """Return the current state: timestep, last action, and last reward."""
        one_hot_action = np.zeros(self.num_actions)
        if self.last_action is not None:
          one_hot_action[self.last_action] = 1
        return np.concatenate([np.array([self.timestep, self.last_reward], dtype=np.float32), one_hot_action])


In [3]:
def metrics(total_reward, bandits_prob, actions_chosen):

    # Cumulative Regret
    optimal_reward = np.max(bandits_prob) * 100
    cumulative_regret = optimal_reward - total_reward

    return cumulative_regret


In [4]:
class A2C_LSTM_Agent(nn.Module):
    def __init__(self, input_dim, hidden_size, num_actions):
        super(A2C_LSTM_Agent, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_size, batch_first=True)
        self.actor_layer = nn.Linear(hidden_size, num_actions)
        self.critic_layer = nn.Linear(hidden_size, 1)
        nn.init.xavier_uniform_(self.actor_layer.weight, gain=0.01)
        nn.init.xavier_uniform_(self.critic_layer.weight, gain=1.0)

    def forward(self, x, hidden_state):
        # x shape: (batch_size, seq_len, input_dim)
        lstm_out, hidden_state = self.lstm(x, hidden_state)
        lstm_out = lstm_out[:, -1, :]  # Use the output of the last timestep
        action_probs = F.softmax(self.actor_layer(lstm_out), dim=-1)
        value = self.critic_layer(lstm_out)
        return action_probs, value, hidden_state

    def init_hidden_state(self, batch_size=1):
        # Initialize the LSTM hidden and cell states
        return (torch.zeros(1, batch_size, self.hidden_size).to(device),
                torch.zeros(1, batch_size, self.hidden_size).to(device))

In [5]:
def train_agent(TRAIN = True, n_episodes = 25000, gamma = 0.99, learning_rate = 0.001, entropy_decay = False):

    stats = {'Episode Returns': [], 'Optimal Action': [], 'Bandits_Probs': [], 'Actions Taken': [], 'Cumulative Regret': [], 'Actions-Chosen List': []}

    for episode in tqdm(range(1, n_episodes + 1)):

        # Restart Constants, Environmentes and LSTM Hidden-State
        actions_chosen_list = [] if TRAIN == False else None
        actions_chosen = np.zeros(num_actions)
        state = env.reset()
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0).unsqueeze(0)  # Add batch and sequence dimensions
        hidden_state = agent.init_hidden_state()

        total_reward = 0
        done = False

        while not done:

            # Forward pass
            action_probs, value, hidden_state = agent(state, (hidden_state[0].detach(), hidden_state[1].detach()))

            # Sample action from probability distribution
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()
            log_prob = action_dist.log_prob(action)

            # Take action in the environment
            next_state, reward, done = env.step(action.item())
            total_reward += reward

            # Prepare next state
            next_state = torch.FloatTensor(next_state).unsqueeze(0).unsqueeze(0).to(device)

            # Trainning
            if TRAIN == True:

                # Compute TD target and advantage
                with torch.no_grad():
                    _, next_value, _ = agent(next_state, hidden_state) #if not done else (None, torch.tensor(0.0), None)
                    td_target = reward + gamma * next_value
                advantage = td_target - value

                # Compute losses
                actor_loss = -log_prob * advantage.detach()
                critic_loss = F.mse_loss(value, td_target)
                entropy_loss = -action_dist.entropy().mean()
                entropy_coef = max(0.005, 0.1 * (1 - episode / n_episodes)) if entropy_decay == True else 0.005

                loss = actor_loss + 0.05 * critic_loss + entropy_coef * entropy_loss

                # Optimize the network
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            else:
                actions_chosen_list.append(action.item())

            # Move to the next state
            state = next_state

            # Add action
            actions_chosen[action.item()] += 1

        # Cumulative Regret:
        cumulative_regret_value = metrics(total_reward, env.bandits_prob, actions_chosen)

        #Save Episode stats
        stats['Episode Returns'].append(total_reward)
        stats['Optimal Action'].append(np.argmax(env.bandits_prob))
        stats['Bandits_Probs'].append(env.bandits_prob)
        stats['Actions Taken'].append(actions_chosen)
        stats['Actions-Chosen List'].append(actions_chosen_list)
        stats['Cumulative Regret'].append(metrics(total_reward, env.bandits_prob, actions_chosen))

        if len(stats['Cumulative Regret']) >= 100:
            mean_last_100_regret = np.mean(stats['Cumulative Regret'][-100:])
        else:
            mean_last_100_regret = np.mean(stats['Cumulative Regret'])


        if episode % 1000 == 0:
            print(f"Ep {episode}/{n_episodes}, Opt. Action: {np.argmax(env.bandits_prob)}, Reward: {total_reward}, Cumulative-Regret: {cumulative_regret_value}, AVG100-Regret: {mean_last_100_regret}")

    return stats



In [6]:
def save_train_stats(stats, model_name, train_difficulty, discount_factor_gamma, learning_rate, entropy_decay):

    stats = pd.DataFrame(stats)

    stats['model_name'] = model_name
    stats['train_difficulty'] = train_difficulty
    stats['gamma'] = discount_factor_gamma
    stats['lr'] = learning_rate
    stats['entropy_decay'] = entropy_decay
    stats.to_pickle(google_drive_folder + '/' + 'train_stats' + '/' + model_name + '.pkl')

def save_test_stats(stats, model_name, train_difficulty, test_difficulty, discount_factor_gamma, learning_rate, entropy_decay):

    stats = pd.DataFrame(stats)

    stats['model_name'] = model_name
    stats['train_difficulty'] = train_difficulty
    stats['gamma'] = discount_factor_gamma
    stats['lr'] = learning_rate
    stats['test_difficulty'] = test_difficulty
    stats['test_difficulty'] = test_difficulty
    stats['entropy_decay'] = entropy_decay


    model_name = model_name + f'__TestDiff_{test_difficulty}'
    stats.to_pickle(google_drive_folder + '/' + 'test_stats' + '/' + model_name + '.pkl')



In [7]:
# Set Google Drive Folder
google_drive_folder = './Monografia/Exp1-2/'
os.makedirs(google_drive_folder + 'models', exist_ok=True)
os.makedirs(google_drive_folder + 'train_stats', exist_ok=True)
os.makedirs(google_drive_folder + 'test_stats', exist_ok=True)

# Hyperparameters
GAMMA_LIST = [0.7, 0.9, 0.99]
LR_LIST = [0.001, 0.0001]
ENTROPY_DECAY = [False, True]
TRAIN_DIFFICULTY = ['easy']
TEST_DIFFICULTY = ['easy', 'medium', 'hard', 'uniform', 'independent']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_size = 48
max_steps = 100
n_episodes_train = 25000
n_episodes_test= 300

hyperparameters_done = [
    (0.7, 0.0001, True),
    (0.9, 0.0001, True),
    (0.9, 0.0001, False),
    (0.9, 0.001, True),
    (0.7, 0.001, False),
    (0.9, 0.001, False)
]


# Process
for train_difficulty in TRAIN_DIFFICULTY:

  hyperparameters = list(itertools.product(GAMMA_LIST, LR_LIST, ENTROPY_DECAY)) # 100 in paper
  random.shuffle(hyperparameters)

  for remove in hyperparameters_done:
     hyperparameters.remove(remove)

  for i, (gamma, learning_rate, entropy_decay) in enumerate(hyperparameters):

      print(f'\nTRAIN DIFFICULTY {train_difficulty}\nHyperparameter {i+1}: GAMMA {gamma} - LR {learning_rate} - Entropy Decay {entropy_decay}')

      # Environment and agent setup
      env = BanditEnvironment(difficulty = train_difficulty, max_steps = max_steps)
      num_actions = env.num_actions
      input_dim = 1 + num_actions + 1  # Timestep, past action oh, past reward
      agent = A2C_LSTM_Agent(input_dim, hidden_size, num_actions).to(device)
      optimizer = optim.Adam(agent.parameters(), lr=learning_rate)

      # Train
      stats = train_agent(TRAIN = True, n_episodes = n_episodes_train, gamma = gamma, learning_rate = learning_rate, entropy_decay = entropy_decay)

      # Save Model and Train Stats
      agent.eval()
      model_name = f'TrainDiff_{train_difficulty}__LR_{str(learning_rate).replace(".","_")}__GAMMA_{str(gamma).replace(".","_")}__EntropyDecay_{str(entropy_decay)}'
      torch.save(agent.state_dict(), google_drive_folder + 'models/'+ model_name + '.pth')
      save_train_stats(stats, model_name, train_difficulty, gamma, learning_rate, entropy_decay)

      for test_difficulty in TEST_DIFFICULTY:

          print(f'\nTEST {test_difficulty}:')

          # Test and save stats
          stats = train_agent(TRAIN = False, n_episodes = n_episodes_test, gamma = gamma, learning_rate = learning_rate, entropy_decay = entropy_decay)
          save_test_stats(stats, model_name, train_difficulty, test_difficulty, gamma, learning_rate, entropy_decay)




TRAIN DIFFICULTY easy
Hyperparameter 1: GAMMA 0.7 - LR 0.0001 - Entropy Decay False


  4%|▍         | 1000/25000 [14:29<6:17:01,  1.06it/s]

Ep 1000/25000, Opt. Action: 1, Reward: 83, Cumulative-Regret: 7.0, AVG100-Regret: 5.26


  8%|▊         | 2000/25000 [28:24<5:27:59,  1.17it/s]

Ep 2000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 3.89


 12%|█▏        | 3000/25000 [41:36<5:03:47,  1.21it/s]

Ep 3000/25000, Opt. Action: 1, Reward: 94, Cumulative-Regret: -4.0, AVG100-Regret: 0.26


 16%|█▌        | 4000/25000 [56:17<5:07:49,  1.14it/s] 

Ep 4000/25000, Opt. Action: 0, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 1.09


 20%|██        | 5000/25000 [1:09:49<5:38:18,  1.01s/it]

Ep 5000/25000, Opt. Action: 0, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 0.82


 24%|██▍       | 6000/25000 [1:25:12<5:17:44,  1.00s/it]

Ep 6000/25000, Opt. Action: 0, Reward: 85, Cumulative-Regret: 5.0, AVG100-Regret: 0.33


 28%|██▊       | 7000/25000 [1:41:16<4:42:26,  1.06it/s] 

Ep 7000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 1.83


 32%|███▏      | 8000/25000 [1:55:49<3:46:24,  1.25it/s]

Ep 8000/25000, Opt. Action: 0, Reward: 94, Cumulative-Regret: -4.0, AVG100-Regret: 0.88


 36%|███▌      | 9000/25000 [2:09:35<3:39:43,  1.21it/s]

Ep 9000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 0.66


 40%|████      | 10000/25000 [2:23:24<3:22:42,  1.23it/s]

Ep 10000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 0.73


 44%|████▍     | 11000/25000 [2:37:16<3:10:58,  1.22it/s]

Ep 11000/25000, Opt. Action: 1, Reward: 84, Cumulative-Regret: 6.0, AVG100-Regret: 0.74


 48%|████▊     | 12000/25000 [2:50:51<2:53:44,  1.25it/s]

Ep 12000/25000, Opt. Action: 0, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 0.89


 52%|█████▏    | 13000/25000 [3:04:33<2:55:37,  1.14it/s]

Ep 13000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 0.94


 56%|█████▌    | 14000/25000 [3:18:26<2:26:37,  1.25it/s]

Ep 14000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 0.73


 60%|██████    | 15000/25000 [3:32:12<2:15:05,  1.23it/s]

Ep 15000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 0.97


 64%|██████▍   | 16000/25000 [3:45:50<2:02:14,  1.23it/s]

Ep 16000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 1.07


 68%|██████▊   | 17000/25000 [3:59:48<1:58:07,  1.13it/s]

Ep 17000/25000, Opt. Action: 1, Reward: 86, Cumulative-Regret: 4.0, AVG100-Regret: 0.7


 72%|███████▏  | 18000/25000 [4:13:36<1:36:17,  1.21it/s]

Ep 18000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 1.34


 76%|███████▌  | 19000/25000 [4:27:26<1:22:38,  1.21it/s]

Ep 19000/25000, Opt. Action: 0, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 0.86


 80%|████████  | 20000/25000 [4:41:08<1:07:52,  1.23it/s]

Ep 20000/25000, Opt. Action: 1, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 1.06


 84%|████████▍ | 21000/25000 [4:55:02<1:00:16,  1.11it/s]

Ep 21000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 0.65


 88%|████████▊ | 22000/25000 [5:09:06<45:09,  1.11it/s]  

Ep 22000/25000, Opt. Action: 0, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 0.67


 92%|█████████▏| 23000/25000 [5:21:37<23:58,  1.39it/s]

Ep 23000/25000, Opt. Action: 0, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 0.58


 96%|█████████▌| 24000/25000 [5:33:36<12:30,  1.33it/s]

Ep 24000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 0.41


100%|██████████| 25000/25000 [5:45:34<00:00,  1.21it/s]

Ep 25000/25000, Opt. Action: 1, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 0.83






TEST easy:


100%|██████████| 300/300 [00:57<00:00,  5.19it/s]



TEST medium:


100%|██████████| 300/300 [00:56<00:00,  5.26it/s]



TEST hard:


100%|██████████| 300/300 [00:58<00:00,  5.14it/s]



TEST uniform:


100%|██████████| 300/300 [00:59<00:00,  5.04it/s]



TEST independent:


100%|██████████| 300/300 [00:59<00:00,  5.02it/s]



TRAIN DIFFICULTY easy
Hyperparameter 2: GAMMA 0.7 - LR 0.001 - Entropy Decay True


  4%|▍         | 1000/25000 [12:32<5:04:37,  1.31it/s]

Ep 1000/25000, Opt. Action: 0, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 2.65


  8%|▊         | 2000/25000 [24:42<4:29:22,  1.42it/s]

Ep 2000/25000, Opt. Action: 0, Reward: 94, Cumulative-Regret: -4.0, AVG100-Regret: 0.76


 12%|█▏        | 3000/25000 [36:28<4:20:58,  1.40it/s]

Ep 3000/25000, Opt. Action: 0, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 3.18


 16%|█▌        | 4000/25000 [48:14<4:06:37,  1.42it/s]

Ep 4000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 1.35


 20%|██        | 5000/25000 [1:00:03<3:57:06,  1.41it/s]

Ep 5000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 2.86


 24%|██▍       | 6000/25000 [1:12:23<3:53:41,  1.36it/s]

Ep 6000/25000, Opt. Action: 0, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 2.52


 28%|██▊       | 7000/25000 [1:24:42<3:39:53,  1.36it/s]

Ep 7000/25000, Opt. Action: 0, Reward: 82, Cumulative-Regret: 8.0, AVG100-Regret: 2.08


 32%|███▏      | 8000/25000 [1:36:48<3:21:29,  1.41it/s]

Ep 8000/25000, Opt. Action: 0, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 3.19


 36%|███▌      | 9000/25000 [1:48:35<3:08:04,  1.42it/s]

Ep 9000/25000, Opt. Action: 0, Reward: 96, Cumulative-Regret: -6.0, AVG100-Regret: 0.74


 40%|████      | 10000/25000 [2:00:22<2:55:58,  1.42it/s]

Ep 10000/25000, Opt. Action: 0, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 2.68


 44%|████▍     | 11000/25000 [2:12:09<2:44:02,  1.42it/s]

Ep 11000/25000, Opt. Action: 1, Reward: 95, Cumulative-Regret: -5.0, AVG100-Regret: 3.31


 48%|████▊     | 12000/25000 [2:23:53<2:33:52,  1.41it/s]

Ep 12000/25000, Opt. Action: 0, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 2.15


 52%|█████▏    | 13000/25000 [2:35:41<2:29:55,  1.33it/s]

Ep 13000/25000, Opt. Action: 0, Reward: 77, Cumulative-Regret: 13.0, AVG100-Regret: 2.26


 56%|█████▌    | 14000/25000 [2:48:16<2:19:01,  1.32it/s]

Ep 14000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 4.8


 60%|██████    | 15000/25000 [3:00:37<1:56:30,  1.43it/s]

Ep 15000/25000, Opt. Action: 0, Reward: 9, Cumulative-Regret: 81.0, AVG100-Regret: 2.73


 64%|██████▍   | 16000/25000 [3:12:22<1:45:35,  1.42it/s]

Ep 16000/25000, Opt. Action: 1, Reward: 95, Cumulative-Regret: -5.0, AVG100-Regret: 1.96


 68%|██████▊   | 17000/25000 [3:24:05<1:33:23,  1.43it/s]

Ep 17000/25000, Opt. Action: 0, Reward: 83, Cumulative-Regret: 7.0, AVG100-Regret: 0.47


 72%|███████▏  | 18000/25000 [3:36:31<1:27:40,  1.33it/s]

Ep 18000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 0.63


 76%|███████▌  | 19000/25000 [3:48:43<1:10:44,  1.41it/s]

Ep 19000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 0.2


 80%|████████  | 20000/25000 [4:00:29<58:23,  1.43it/s]  

Ep 20000/25000, Opt. Action: 1, Reward: 85, Cumulative-Regret: 5.0, AVG100-Regret: 0.46


 84%|████████▍ | 21000/25000 [4:12:13<46:44,  1.43it/s]

Ep 21000/25000, Opt. Action: 0, Reward: 66, Cumulative-Regret: 24.0, AVG100-Regret: 5.15


 88%|████████▊ | 22000/25000 [4:23:58<35:09,  1.42it/s]

Ep 22000/25000, Opt. Action: 1, Reward: 83, Cumulative-Regret: 7.0, AVG100-Regret: 0.62


 92%|█████████▏| 23000/25000 [4:36:31<25:26,  1.31it/s]

Ep 23000/25000, Opt. Action: 1, Reward: 94, Cumulative-Regret: -4.0, AVG100-Regret: 1.81


 96%|█████████▌| 24000/25000 [4:48:30<11:42,  1.42it/s]

Ep 24000/25000, Opt. Action: 0, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 1.25


100%|██████████| 25000/25000 [5:00:42<00:00,  1.39it/s]

Ep 25000/25000, Opt. Action: 0, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 0.97






TEST easy:


100%|██████████| 300/300 [00:56<00:00,  5.29it/s]



TEST medium:


100%|██████████| 300/300 [00:58<00:00,  5.13it/s]



TEST hard:


100%|██████████| 300/300 [00:58<00:00,  5.16it/s]



TEST uniform:


100%|██████████| 300/300 [00:58<00:00,  5.16it/s]



TEST independent:


100%|██████████| 300/300 [00:59<00:00,  5.05it/s]



TRAIN DIFFICULTY easy
Hyperparameter 3: GAMMA 0.99 - LR 0.0001 - Entropy Decay True


  4%|▍         | 1000/25000 [12:15<4:49:08,  1.38it/s]

Ep 1000/25000, Opt. Action: 1, Reward: 70, Cumulative-Regret: 20.0, AVG100-Regret: 38.24


  8%|▊         | 2000/25000 [25:12<5:05:37,  1.25it/s]

Ep 2000/25000, Opt. Action: 0, Reward: 39, Cumulative-Regret: 51.0, AVG100-Regret: 38.34


 12%|█▏        | 3000/25000 [38:16<4:49:38,  1.27it/s]

Ep 3000/25000, Opt. Action: 0, Reward: 47, Cumulative-Regret: 43.0, AVG100-Regret: 35.09


 16%|█▌        | 4000/25000 [52:10<4:52:28,  1.20it/s]

Ep 4000/25000, Opt. Action: 1, Reward: 43, Cumulative-Regret: 47.0, AVG100-Regret: 39.88


 20%|██        | 5000/25000 [1:06:10<4:39:12,  1.19it/s]

Ep 5000/25000, Opt. Action: 1, Reward: 53, Cumulative-Regret: 37.0, AVG100-Regret: 40.81


 24%|██▍       | 6000/25000 [1:19:57<4:08:46,  1.27it/s]

Ep 6000/25000, Opt. Action: 1, Reward: 30, Cumulative-Regret: 60.0, AVG100-Regret: 38.71


 28%|██▊       | 7000/25000 [1:33:04<3:56:07,  1.27it/s]

Ep 7000/25000, Opt. Action: 0, Reward: 70, Cumulative-Regret: 20.0, AVG100-Regret: 39.67


 32%|███▏      | 8000/25000 [1:46:10<3:55:01,  1.21it/s]

Ep 8000/25000, Opt. Action: 0, Reward: 62, Cumulative-Regret: 28.0, AVG100-Regret: 40.59


 36%|███▌      | 9000/25000 [2:00:11<3:43:07,  1.20it/s]

Ep 9000/25000, Opt. Action: 1, Reward: 67, Cumulative-Regret: 23.0, AVG100-Regret: 41.18


 40%|████      | 10000/25000 [2:14:10<3:29:33,  1.19it/s]

Ep 10000/25000, Opt. Action: 1, Reward: 58, Cumulative-Regret: 32.0, AVG100-Regret: 36.94


 44%|████▍     | 11000/25000 [2:27:20<3:05:35,  1.26it/s]

Ep 11000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 38.67


 48%|████▊     | 12000/25000 [2:40:42<3:01:54,  1.19it/s]

Ep 12000/25000, Opt. Action: 0, Reward: 14, Cumulative-Regret: 76.0, AVG100-Regret: 39.81


 52%|█████▏    | 13000/25000 [2:54:47<2:48:45,  1.19it/s]

Ep 13000/25000, Opt. Action: 0, Reward: 56, Cumulative-Regret: 34.0, AVG100-Regret: 38.56


 56%|█████▌    | 14000/25000 [3:08:09<2:23:48,  1.27it/s]

Ep 14000/25000, Opt. Action: 1, Reward: 21, Cumulative-Regret: 69.0, AVG100-Regret: 37.56


 60%|██████    | 15000/25000 [3:21:50<2:20:29,  1.19it/s]

Ep 15000/25000, Opt. Action: 1, Reward: 71, Cumulative-Regret: 19.0, AVG100-Regret: 39.65


 64%|██████▍   | 16000/25000 [3:35:30<1:58:43,  1.26it/s]

Ep 16000/25000, Opt. Action: 0, Reward: 44, Cumulative-Regret: 46.0, AVG100-Regret: 38.65


 68%|██████▊   | 17000/25000 [3:48:40<1:45:03,  1.27it/s]

Ep 17000/25000, Opt. Action: 0, Reward: 25, Cumulative-Regret: 65.0, AVG100-Regret: 41.8


 72%|███████▏  | 18000/25000 [4:02:36<1:32:37,  1.26it/s]

Ep 18000/25000, Opt. Action: 1, Reward: 74, Cumulative-Regret: 16.0, AVG100-Regret: 39.59


 76%|███████▌  | 19000/25000 [4:16:16<1:24:47,  1.18it/s]

Ep 19000/25000, Opt. Action: 0, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 37.03


 80%|████████  | 20000/25000 [4:29:48<1:06:26,  1.25it/s]

Ep 20000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 42.33


 84%|████████▍ | 21000/25000 [4:43:02<56:17,  1.18it/s]  

Ep 21000/25000, Opt. Action: 1, Reward: 7, Cumulative-Regret: 83.0, AVG100-Regret: 38.21


 88%|████████▊ | 22000/25000 [4:56:51<39:25,  1.27it/s]

Ep 22000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 34.79


 92%|█████████▏| 23000/25000 [5:10:22<28:22,  1.17it/s]

Ep 23000/25000, Opt. Action: 1, Reward: 11, Cumulative-Regret: 79.0, AVG100-Regret: 44.01


 96%|█████████▌| 24000/25000 [5:24:30<13:55,  1.20it/s]

Ep 24000/25000, Opt. Action: 0, Reward: 97, Cumulative-Regret: -7.0, AVG100-Regret: 41.12


100%|██████████| 25000/25000 [5:37:44<00:00,  1.23it/s]

Ep 25000/25000, Opt. Action: 0, Reward: 96, Cumulative-Regret: -6.0, AVG100-Regret: 42.69






TEST easy:


100%|██████████| 300/300 [00:55<00:00,  5.42it/s]



TEST medium:


100%|██████████| 300/300 [00:55<00:00,  5.43it/s]



TEST hard:


100%|██████████| 300/300 [00:54<00:00,  5.46it/s]



TEST uniform:


100%|██████████| 300/300 [00:55<00:00,  5.44it/s]



TEST independent:


100%|██████████| 300/300 [00:55<00:00,  5.40it/s]



TRAIN DIFFICULTY easy
Hyperparameter 4: GAMMA 0.99 - LR 0.001 - Entropy Decay True


  4%|▍         | 1000/25000 [12:09<5:18:04,  1.26it/s]

Ep 1000/25000, Opt. Action: 0, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 36.18


  8%|▊         | 2000/25000 [25:26<5:07:04,  1.25it/s]

Ep 2000/25000, Opt. Action: 0, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 44.55


 12%|█▏        | 3000/25000 [38:51<5:11:37,  1.18it/s]

Ep 3000/25000, Opt. Action: 1, Reward: 10, Cumulative-Regret: 80.0, AVG100-Regret: 33.94


 16%|█▌        | 4000/25000 [53:04<4:58:31,  1.17it/s]

Ep 4000/25000, Opt. Action: 1, Reward: 6, Cumulative-Regret: 84.0, AVG100-Regret: 36.63


 20%|██        | 5000/25000 [1:07:16<4:48:37,  1.15it/s]

Ep 5000/25000, Opt. Action: 1, Reward: 11, Cumulative-Regret: 79.0, AVG100-Regret: 42.6


 24%|██▍       | 6000/25000 [1:20:44<4:10:25,  1.26it/s]

Ep 6000/25000, Opt. Action: 1, Reward: 85, Cumulative-Regret: 5.0, AVG100-Regret: 35.36


 28%|██▊       | 7000/25000 [1:34:02<3:58:33,  1.26it/s]

Ep 7000/25000, Opt. Action: 0, Reward: 7, Cumulative-Regret: 83.0, AVG100-Regret: 34.09


 32%|███▏      | 8000/25000 [1:48:01<4:02:02,  1.17it/s]

Ep 8000/25000, Opt. Action: 0, Reward: 15, Cumulative-Regret: 75.0, AVG100-Regret: 39.48


 36%|███▌      | 9000/25000 [2:02:10<3:33:08,  1.25it/s]

Ep 9000/25000, Opt. Action: 0, Reward: 7, Cumulative-Regret: 83.0, AVG100-Regret: 44.15


 40%|████      | 10000/25000 [2:15:39<3:22:17,  1.24it/s]

Ep 10000/25000, Opt. Action: 1, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 40.31


 44%|████▍     | 11000/25000 [2:29:03<3:07:23,  1.25it/s]

Ep 11000/25000, Opt. Action: 1, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 39.53


 48%|████▊     | 12000/25000 [2:43:05<3:05:05,  1.17it/s]

Ep 12000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 39.82


 52%|█████▏    | 13000/25000 [2:56:56<2:41:41,  1.24it/s]

Ep 13000/25000, Opt. Action: 1, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 36.36


 56%|█████▌    | 14000/25000 [3:10:18<2:27:08,  1.25it/s]

Ep 14000/25000, Opt. Action: 0, Reward: 13, Cumulative-Regret: 77.0, AVG100-Regret: 39.4


 60%|██████    | 15000/25000 [3:23:38<2:11:54,  1.26it/s]

Ep 15000/25000, Opt. Action: 0, Reward: 6, Cumulative-Regret: 84.0, AVG100-Regret: 44.19


 64%|██████▍   | 16000/25000 [3:37:22<2:07:07,  1.18it/s]

Ep 16000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 35.93


 68%|██████▊   | 17000/25000 [3:51:34<1:46:22,  1.25it/s]

Ep 17000/25000, Opt. Action: 0, Reward: 9, Cumulative-Regret: 81.0, AVG100-Regret: 43.09


 72%|███████▏  | 18000/25000 [4:04:54<1:33:18,  1.25it/s]

Ep 18000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 40.01


 76%|███████▌  | 19000/25000 [4:18:15<1:19:50,  1.25it/s]

Ep 19000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 40.41


 80%|████████  | 20000/25000 [4:31:38<1:06:34,  1.25it/s]

Ep 20000/25000, Opt. Action: 1, Reward: 85, Cumulative-Regret: 5.0, AVG100-Regret: 40.27


 84%|████████▍ | 21000/25000 [4:44:51<53:12,  1.25it/s]  

Ep 21000/25000, Opt. Action: 0, Reward: 8, Cumulative-Regret: 82.0, AVG100-Regret: 39.87


 88%|████████▊ | 22000/25000 [4:58:55<41:54,  1.19it/s]

Ep 22000/25000, Opt. Action: 1, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 31.08


 92%|█████████▏| 23000/25000 [5:12:28<26:19,  1.27it/s]

Ep 23000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 38.52


 96%|█████████▌| 24000/25000 [5:28:05<14:46,  1.13it/s]  

Ep 24000/25000, Opt. Action: 1, Reward: 10, Cumulative-Regret: 80.0, AVG100-Regret: 39.32


100%|██████████| 25000/25000 [5:42:21<00:00,  1.22it/s]

Ep 25000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 43.54






TEST easy:


100%|██████████| 300/300 [01:00<00:00,  4.94it/s]



TEST medium:


100%|██████████| 300/300 [01:00<00:00,  4.93it/s]



TEST hard:


100%|██████████| 300/300 [01:01<00:00,  4.88it/s]



TEST uniform:


100%|██████████| 300/300 [01:01<00:00,  4.86it/s]



TEST independent:


100%|██████████| 300/300 [01:01<00:00,  4.89it/s]



TRAIN DIFFICULTY easy
Hyperparameter 5: GAMMA 0.99 - LR 0.001 - Entropy Decay False


  4%|▍         | 1000/25000 [13:11<5:47:25,  1.15it/s]

Ep 1000/25000, Opt. Action: 0, Reward: 11, Cumulative-Regret: 79.0, AVG100-Regret: 40.56


  8%|▊         | 2000/25000 [27:21<5:23:40,  1.18it/s]

Ep 2000/25000, Opt. Action: 1, Reward: 95, Cumulative-Regret: -5.0, AVG100-Regret: 37.11


 12%|█▏        | 3000/25000 [41:33<5:10:27,  1.18it/s]

Ep 3000/25000, Opt. Action: 0, Reward: 12, Cumulative-Regret: 78.0, AVG100-Regret: 49.55


 16%|█▌        | 4000/25000 [55:43<4:56:43,  1.18it/s]

Ep 4000/25000, Opt. Action: 0, Reward: 13, Cumulative-Regret: 77.0, AVG100-Regret: 42.42


 20%|██        | 5000/25000 [1:09:54<4:40:51,  1.19it/s]

Ep 5000/25000, Opt. Action: 0, Reward: 7, Cumulative-Regret: 83.0, AVG100-Regret: 40.45


 24%|██▍       | 6000/25000 [1:24:04<4:29:10,  1.18it/s]

Ep 6000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 43.71


 28%|██▊       | 7000/25000 [1:38:16<4:14:59,  1.18it/s]

Ep 7000/25000, Opt. Action: 0, Reward: 10, Cumulative-Regret: 80.0, AVG100-Regret: 38.66


 32%|███▏      | 8000/25000 [1:52:27<3:59:06,  1.18it/s]

Ep 8000/25000, Opt. Action: 1, Reward: 98, Cumulative-Regret: -8.0, AVG100-Regret: 42.4


 36%|███▌      | 9000/25000 [2:06:46<3:45:47,  1.18it/s]

Ep 9000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 35.77


 40%|████      | 10000/25000 [2:20:59<3:35:24,  1.16it/s]

Ep 10000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 41.59


 44%|████▍     | 11000/25000 [2:35:19<3:17:12,  1.18it/s]

Ep 11000/25000, Opt. Action: 0, Reward: 14, Cumulative-Regret: 76.0, AVG100-Regret: 33.49


 48%|████▊     | 12000/25000 [2:49:27<3:02:47,  1.19it/s]

Ep 12000/25000, Opt. Action: 1, Reward: 96, Cumulative-Regret: -6.0, AVG100-Regret: 43.51


 52%|█████▏    | 13000/25000 [3:03:37<2:49:10,  1.18it/s]

Ep 13000/25000, Opt. Action: 1, Reward: 93, Cumulative-Regret: -3.0, AVG100-Regret: 37.03


 56%|█████▌    | 14000/25000 [3:17:46<2:39:38,  1.15it/s]

Ep 14000/25000, Opt. Action: 1, Reward: 9, Cumulative-Regret: 81.0, AVG100-Regret: 48.1


 60%|██████    | 15000/25000 [3:31:54<2:19:36,  1.19it/s]

Ep 15000/25000, Opt. Action: 1, Reward: 4, Cumulative-Regret: 86.0, AVG100-Regret: 39.88


 64%|██████▍   | 16000/25000 [3:46:06<2:09:02,  1.16it/s]

Ep 16000/25000, Opt. Action: 1, Reward: 12, Cumulative-Regret: 78.0, AVG100-Regret: 35.74


 68%|██████▊   | 17000/25000 [4:00:26<1:52:01,  1.19it/s]

Ep 17000/25000, Opt. Action: 1, Reward: 11, Cumulative-Regret: 79.0, AVG100-Regret: 38.21


 72%|███████▏  | 18000/25000 [4:14:37<1:38:05,  1.19it/s]

Ep 18000/25000, Opt. Action: 0, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 31.96


 76%|███████▌  | 19000/25000 [4:28:57<1:27:23,  1.14it/s]

Ep 19000/25000, Opt. Action: 1, Reward: 8, Cumulative-Regret: 82.0, AVG100-Regret: 42.07


 80%|████████  | 20000/25000 [4:43:15<1:11:28,  1.17it/s]

Ep 20000/25000, Opt. Action: 0, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 42.36


 84%|████████▍ | 21000/25000 [4:57:23<56:21,  1.18it/s]  

Ep 21000/25000, Opt. Action: 0, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 38.21


 88%|████████▊ | 22000/25000 [5:11:49<44:22,  1.13it/s]

Ep 22000/25000, Opt. Action: 1, Reward: 90, Cumulative-Regret: 0.0, AVG100-Regret: 43.17


 92%|█████████▏| 23000/25000 [5:26:02<28:04,  1.19it/s]

Ep 23000/25000, Opt. Action: 0, Reward: 9, Cumulative-Regret: 81.0, AVG100-Regret: 38.43


 96%|█████████▌| 24000/25000 [5:40:09<14:04,  1.18it/s]

Ep 24000/25000, Opt. Action: 0, Reward: 10, Cumulative-Regret: 80.0, AVG100-Regret: 40.37


100%|██████████| 25000/25000 [5:54:17<00:00,  1.18it/s]

Ep 25000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 37.6






TEST easy:


100%|██████████| 300/300 [01:00<00:00,  4.97it/s]



TEST medium:


100%|██████████| 300/300 [01:00<00:00,  5.00it/s]



TEST hard:


100%|██████████| 300/300 [00:59<00:00,  5.02it/s]



TEST uniform:


100%|██████████| 300/300 [00:59<00:00,  5.01it/s]



TEST independent:


100%|██████████| 300/300 [01:00<00:00,  4.97it/s]



TRAIN DIFFICULTY easy
Hyperparameter 6: GAMMA 0.99 - LR 0.0001 - Entropy Decay False


  4%|▍         | 1000/25000 [12:30<5:07:52,  1.30it/s]

Ep 1000/25000, Opt. Action: 0, Reward: 8, Cumulative-Regret: 82.0, AVG100-Regret: 45.62


  8%|▊         | 2000/25000 [26:26<5:22:24,  1.19it/s]

Ep 2000/25000, Opt. Action: 0, Reward: 11, Cumulative-Regret: 79.0, AVG100-Regret: 34.49


 12%|█▏        | 3000/25000 [40:33<5:07:41,  1.19it/s]

Ep 3000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 43.95


 16%|█▌        | 4000/25000 [54:40<5:02:24,  1.16it/s]

Ep 4000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 35.76


 20%|██        | 5000/25000 [1:08:48<4:42:19,  1.18it/s]

Ep 5000/25000, Opt. Action: 0, Reward: 12, Cumulative-Regret: 78.0, AVG100-Regret: 36.49


 24%|██▍       | 6000/25000 [1:22:56<4:26:35,  1.19it/s]

Ep 6000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 31.76


 28%|██▊       | 7000/25000 [1:37:05<4:16:51,  1.17it/s]

Ep 7000/25000, Opt. Action: 0, Reward: 8, Cumulative-Regret: 82.0, AVG100-Regret: 42.91


 32%|███▏      | 8000/25000 [1:51:13<3:59:08,  1.18it/s]

Ep 8000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 36.99


 36%|███▌      | 9000/25000 [2:05:21<3:47:23,  1.17it/s]

Ep 9000/25000, Opt. Action: 1, Reward: 88, Cumulative-Regret: 2.0, AVG100-Regret: 37.86


 40%|████      | 10000/25000 [2:19:44<3:37:41,  1.15it/s]

Ep 10000/25000, Opt. Action: 0, Reward: 3, Cumulative-Regret: 87.0, AVG100-Regret: 40.81


 44%|████▍     | 11000/25000 [2:33:55<3:17:13,  1.18it/s]

Ep 11000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 37.12


 48%|████▊     | 12000/25000 [2:48:10<3:02:24,  1.19it/s]

Ep 12000/25000, Opt. Action: 1, Reward: 96, Cumulative-Regret: -6.0, AVG100-Regret: 39.89


 52%|█████▏    | 13000/25000 [3:02:19<2:49:17,  1.18it/s]

Ep 13000/25000, Opt. Action: 1, Reward: 92, Cumulative-Regret: -2.0, AVG100-Regret: 36.18


 56%|█████▌    | 14000/25000 [3:16:29<2:35:14,  1.18it/s]

Ep 14000/25000, Opt. Action: 0, Reward: 10, Cumulative-Regret: 80.0, AVG100-Regret: 33.63


 60%|██████    | 15000/25000 [3:30:41<2:23:03,  1.17it/s]

Ep 15000/25000, Opt. Action: 0, Reward: 5, Cumulative-Regret: 85.0, AVG100-Regret: 41.66


 64%|██████▍   | 16000/25000 [3:44:49<2:07:47,  1.17it/s]

Ep 16000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 42.25


 68%|██████▊   | 17000/25000 [3:59:00<1:52:35,  1.18it/s]

Ep 17000/25000, Opt. Action: 0, Reward: 12, Cumulative-Regret: 78.0, AVG100-Regret: 40.23


 72%|███████▏  | 18000/25000 [4:13:08<1:37:45,  1.19it/s]

Ep 18000/25000, Opt. Action: 1, Reward: 87, Cumulative-Regret: 3.0, AVG100-Regret: 33.16


 76%|███████▌  | 19000/25000 [4:27:25<1:27:18,  1.15it/s]

Ep 19000/25000, Opt. Action: 1, Reward: 91, Cumulative-Regret: -1.0, AVG100-Regret: 29.53


 80%|████████  | 20000/25000 [4:41:45<1:11:24,  1.17it/s]

Ep 20000/25000, Opt. Action: 1, Reward: 83, Cumulative-Regret: 7.0, AVG100-Regret: 31.44


 84%|████████▍ | 21000/25000 [4:56:02<57:58,  1.15it/s]  

Ep 21000/25000, Opt. Action: 0, Reward: 6, Cumulative-Regret: 84.0, AVG100-Regret: 33.59


 88%|████████▊ | 22000/25000 [5:10:31<43:11,  1.16it/s]  

Ep 22000/25000, Opt. Action: 0, Reward: 8, Cumulative-Regret: 82.0, AVG100-Regret: 37.58


 92%|█████████▏| 23000/25000 [5:24:59<28:52,  1.15it/s]

Ep 23000/25000, Opt. Action: 1, Reward: 89, Cumulative-Regret: 1.0, AVG100-Regret: 33.25


 96%|█████████▌| 24000/25000 [5:39:30<14:31,  1.15it/s]

Ep 24000/25000, Opt. Action: 1, Reward: 84, Cumulative-Regret: 6.0, AVG100-Regret: 34.47


100%|██████████| 25000/25000 [5:53:56<00:00,  1.18it/s]

Ep 25000/25000, Opt. Action: 0, Reward: 7, Cumulative-Regret: 83.0, AVG100-Regret: 36.56






TEST easy:


100%|██████████| 300/300 [01:01<00:00,  4.87it/s]



TEST medium:


100%|██████████| 300/300 [01:01<00:00,  4.89it/s]



TEST hard:


100%|██████████| 300/300 [01:01<00:00,  4.87it/s]



TEST uniform:


100%|██████████| 300/300 [01:01<00:00,  4.86it/s]



TEST independent:


100%|██████████| 300/300 [01:01<00:00,  4.87it/s]
