In [45]:
from maze_env import MazeEnv_v0
from utils.PettingZooEnv_new import PettingZooEnv_new
import supersuit
import numpy as np
from tianshou.env.utils import PettingZooEnv
import tianshou as ts
from tianshou.utils.net.common import Net
import torch
import torch.nn as nn
import torch.nn.functional as F
import gymnasium as gym
import wandb

In [46]:
# --- exploration schedule ---
eps_train_start = 0.95  # epsilon each curriculum stage starts training with
eps_test = 0.0          # epsilon used while evaluating
eps_decay = 0.999       # multiplicative epsilon decay applied after every collect/update
eps_min = 0.15          # floor that the decayed epsilon is clamped to

# --- optimisation ---
lr = 5e-4               # Adam learning rate
epochs = 150            # max epochs per curriculum stage (per newly introduced maze)
batch_size = 512        # sample size passed to policy.update

# --- DQN settings ---
gamma = 0.9               # discount factor
n_step = 3                # number of steps to look ahead
target_update_freq = 100  # update calls between target-network syncs

# --- environments / replay buffer ---
train_num = 10          # number of simultaneous training environments
test_num = 1            # number of simultaneous testing environments
buffer_size = 30000     # replay buffer capacity

# --- collection ---
step_per_epoch = 10000  # environment steps gathered per epoch
step_per_collect = 200  # number of steps to collect before updating
ep_per_collect = 1      # episodes gathered per collect call

maze_width = 6  # maze width (not incl. walls)

# --- mutable run state (updated by the training cells) ---
high_eps_run = False  # True while a temporary high-exploration collect is active
obs_train = True      # interleaving flag (might be broken?)
passed_mazes = True   # whether the policies passed the mazes of the last stage
steps_total = 0       # total environment steps so far
steps_n = 0           # environment steps within the current epoch
episodes_total = 0    # total number of episodes so far

# --- curriculum ---
n_mazes = 0       # mazes currently in play; the multi-agent cells increment it before first use
total_mazes = 16  # total number of random mazes
# for the trivial maze, we use 36 (since it should be 'easier')
test_mazes = []       # per-maze pass/fail flags, kept for printing later
threshold_rew = 0.5   # threshold reward to consider a maze passed (tentative value)
maze_type = "random"  # the type of maze to pass into the environment

"""
logger = ts.utils.WandbLogger(train_interval=1, update_interval=1)
writer = SummaryWriter('log/test_maze')
writer.add_text("run 2", "wandb")
logger.load(writer)
"""

'\nlogger = ts.utils.WandbLogger(train_interval=1, update_interval=1)\nwriter = SummaryWriter(\'log/test_maze\')\nwriter.add_text("run 2", "wandb")\nlogger.load(writer)\n'

In [47]:
# logger initialization
# Log in to Weights & Biases and open a run whose config records the
# hyperparameters from the configuration cell.
wandb.login()
run = wandb.init(
    project="Final_Year_Project",
    config={
        # `eps_train` is only assigned inside the training cells, so reading it
        # here fails on a fresh kernel; record the configured starting value.
        "eps_train": eps_train_start, "eps_test": eps_test,
        "eps_decay": eps_decay, "eps_min": eps_min,
        "learning rate": lr, "epochs": epochs, "batch_size": batch_size,
        "gamma": gamma, "n_step": n_step, "target_update_freq": target_update_freq,
        "buffer_size": buffer_size,
        "step_per_epoch": step_per_epoch, "step_per_collect": step_per_collect, "ep_per_collect": ep_per_collect,
        "maze width": maze_width, "n_mazes": n_mazes, "total_mazes": total_mazes,
        "threshold_rew": threshold_rew, "maze_type": maze_type,
        "non-marl": True
    }
)

  return LooseVersion(v) >= LooseVersion(check)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: dandd0. Use `wandb login --relogin` to force relogin
  from IPython.core.display import HTML, display  # type: ignore


In [48]:
# define some helper functions
def preprocess_maze_env(render_mode=None, size=maze_width):
    """Build one single-agent maze environment wrapped for the collectors.

    Args:
        render_mode: forwarded to ``MazeEnv_v0.env_single``.
        size: forwarded to ``MazeEnv_v0.env_single``; the default is the value
            ``maze_width`` had when this function was defined.

    Returns:
        The maze env after ``pad_observations_v0`` and ``PettingZooEnv_new``.
    """
    raw_env = MazeEnv_v0.env_single(render_mode=render_mode, size=size)
    padded_env = supersuit.multiagent_wrappers.pad_observations_v0(raw_env)
    return PettingZooEnv_new(padded_env)

"""
def preprocess_maze_env(render_mode=None, size=maze_width):
    env = MazeEnv_v0.env(render_mode=render_mode, size=size)
    env = supersuit.multiagent_wrappers.pad_observations_v0(env)
    env = PettingZooEnv_new(env)
    return env
"""

def interleave_training(obs_train):
    """Let exactly one of the two agents explore, then hand the turn over.

    When ``obs_train`` is truthy, agent 0 gets the global ``eps_train`` and
    agent 1 gets epsilon 0; otherwise the roles are swapped.

    Args:
        obs_train: whether it is currently agent 0's turn to explore.

    Returns:
        The flipped flag. The original version only rebound its local
        parameter, so the caller's flag never changed; assign the result back,
        e.g. ``obs_train = interleave_training(obs_train)``.
    """
    # Expects `policy` to expose per-agent sub-policies via `policy.policies`.
    if obs_train:
        policy.policies[agents[0]].set_eps(eps_train)
        policy.policies[agents[1]].set_eps(0)
    else:
        policy.policies[agents[0]].set_eps(0)
        policy.policies[agents[1]].set_eps(eps_train)
    return not obs_train

def set_eps(eps1, eps2=None, single = False):
    """Set the exploration rate on the global ``policy``.

    Args:
        eps1: epsilon for the whole policy (``single=True``) or for agent 0.
        eps2: epsilon for agent 1; ignored when ``single`` is True. When left
            as None it falls back to ``eps1`` so that a one-argument call does
            not store ``None`` as an agent's epsilon.
        single: True when ``policy`` is one policy with its own ``set_eps``;
            False when it exposes per-agent policies via ``policy.policies``.
    """
    if single:
        policy.set_eps(eps1)
        return

    if eps2 is None:
        eps2 = eps1
    policy.policies[agents[0]].set_eps(eps1)
    policy.policies[agents[1]].set_eps(eps2)

# create a CNN for the observer
class CNN(nn.Module):
    """Convolutional Q-network: three 3x3 convolutions followed by an MLP head.

    The network takes a 3-channel square grid of side ``2 * size + 1`` and
    produces 5 output values per sample.

    Args:
        size: maze width without walls. When None (the default) the notebook's
            global ``maze_width`` is used, which is what ``CNN()`` always did.

    Raises:
        ValueError: if the grid is too small to survive three unpadded 3x3
            convolutions.
    """

    def __init__(self, size=None):
        super().__init__()

        if size is None:
            size = maze_width

        # each unpadded 3x3 convolution shrinks the side length by 2
        lin_size = (size * 2 + 1) - 3 * (3 - 1)
        if lin_size < 1:
            raise ValueError(f"maze size {size} is too small for three 3x3 convolutions")

        self.model = nn.Sequential(
            # e.g. maze size 6 -> 13x13 grid with walls
            nn.Conv2d(3, 16, 3), nn.ReLU(inplace=True),   # 13 -> 11
            nn.Conv2d(16, 32, 3), nn.ReLU(inplace=True),  # 11 -> 9
            nn.Conv2d(32, 64, 3), nn.ReLU(inplace=True),  # 9 -> 7
            nn.Flatten(), nn.Linear(64*lin_size*lin_size, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 64), nn.ReLU(inplace=True),
            nn.Linear(64, 5)
        )

    def forward(self, obs, state=None, info=None):
        """Compute the network output for a batch of observations.

        Args:
            obs: a ``torch.Tensor`` (used as-is) or array-like data, which is
                converted to a float tensor on the same device as the model.
            state: passed through unchanged.
            info: accepted but not used. Defaults to None rather than a shared
                mutable dict.

        Returns:
            ``(logits, state)``.
        """
        if not isinstance(obs, torch.Tensor):
            # build the tensor where the weights live so a model moved to GPU still works
            device = next(self.parameters()).device
            obs = torch.tensor(obs, dtype=torch.float, device=device)
        # kept from the original: remembers the most recent batch size
        self.batch = obs.shape[0]
        logits = self.model(obs)
        return logits, state

def watch(gym_reset_kwargs):
    """Render one episode of the current policy in the human-render collector.

    Args:
        gym_reset_kwargs: reset keyword arguments (e.g. ``{"options": {...}}``)
            passed to both ``reset_env`` and ``collect`` of ``human_collector``.

    Raises:
        AssertionError: if ``gym_reset_kwargs`` is None.
    """
    assert gym_reset_kwargs is not None, "Please input reset kwargs i.e. options"
    # set policy to eval mode
    policy.eval()
    try:
        human_collector.reset_env(gym_reset_kwargs=gym_reset_kwargs)
        human_collector.collect(n_episode=1, render=1/120, gym_reset_kwargs=gym_reset_kwargs)
    finally:
        # always go back to training mode, even if the rollout raised or was interrupted
        policy.train()

In [49]:
# get the vectorized training/testing environments
train_envs = ts.env.DummyVectorEnv([preprocess_maze_env for _ in range(train_num)])
test_envs = ts.env.DummyVectorEnv([preprocess_maze_env for _ in range(test_num)])

# set up training with no render environment
env = preprocess_maze_env()

# set up human render environment
# The raw env gets its own name: the factory lambda looks its variable up when
# it is called, and `env_human` is rebound to the vector env by this statement,
# so closing over `env_human` itself would hand back the vector env on any
# later call of the factory.
env_human_raw = preprocess_maze_env(render_mode="human")
env_human = ts.env.DummyVectorEnv([lambda: env_human_raw])

# get agent names
agents = env.agents

# observation spaces/action spaces for the two agents
# (not used by the CNN below; only the commented-out `Net` variants take them)
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# define DQN network (128x3 hidden units linear)
#net_obs = Net(state_shape, action_shape, [128,128,128])
#net_obs = Net(state_shape, action_shape, [512, 512, 512])
net_obs = CNN()
optim_obs = torch.optim.Adam(params=net_obs.parameters(), lr=lr)

#net_exp = Net(state_shape, action_shape, [8])
#optim_exp = torch.optim.Adam(params=net_exp.parameters(), lr=lr)

# set up policy and collectors
agent_observer = ts.policy.DQNPolicy(net_obs, optim_obs, gamma, n_step, target_update_freq)
#agent_explorer = ts.policy.DQNPolicy(net_exp, optim_exp, gamma, n_step, target_update_freq)
#agent_policies = [agent_observer, agent_explorer]
#agent_policies = [ts.policy.RandomPolicy(), ts.policy.RandomPolicy()] # baseline testing
#policy = ts.policy.MultiAgentPolicyManager(agent_policies, env)

# single-agent setup: the one DQN policy is used directly (no policy manager)
policy = agent_observer

# define the training collector (the calc q and step functions)
train_collector = ts.data.Collector(
    policy,
    train_envs,
    ts.data.VectorReplayBuffer(buffer_size, train_num),
    exploration_noise=True
)

# define the testing collector
test_collector = ts.data.Collector(
    policy,
    test_envs,
    exploration_noise=True
)

# collector bound to the single human-render environment
human_collector = ts.data.Collector(policy, env_human, exploration_noise=True)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


In [9]:
# manual training loop
# Curriculum training for the two-agent setup: every outer pass adds one maze
# (n_mazes += 1), trains for up to `epochs` epochs and leaves the epoch loop
# early once every tested maze reaches `threshold_rew`.
# NOTE(review): this cell indexes `policy.policies[...]`, while the setup cell
# binds `policy` to a single DQNPolicy — presumably this cell predates that
# change; confirm before re-running.
# Calling seed() with no argument re-seeds NumPy's global RNG non-deterministically.
np.random.seed()

"""
# collect a bunch of random ones
policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
train_collector.collect(n_step=10000)
"""
# `mazes` itself is not read in the loop body; the stage is tracked by n_mazes
for mazes in range(total_mazes):
    if passed_mazes == True:
        n_mazes += 1
        # reset epsilon again for the new maze
        eps_train = 0.9
    else:
        # the previous stage used up all its epochs without passing the test
        print("Failed to find a solution within suitable time. Stopping training.")
        break

    # set first eps
    policy.policies[agents[0]].set_eps(eps_train)
    policy.policies[agents[1]].set_eps(eps_train)

    for epoch in range(epochs):
        # reset number of steps count
        steps_n = 0

        # training loop
        while steps_n < step_per_epoch:
            # have runs where the exploration rate is very high
            # (1-in-10 chance; the previous epsilon is remembered in eps_prev)
            if np.random.randint(0, 10) == 0:
                eps_prev = eps_train
                eps_train = 0.9
                high_eps_run = True

            policy.policies[agents[0]].set_eps(eps_train)
            policy.policies[agents[1]].set_eps(eps_train)

            # train the model in training environment
            train_collector.reset_env(gym_reset_kwargs={"options":{"n_mazes":n_mazes}})
            result = train_collector.collect(n_episode=ep_per_collect, gym_reset_kwargs={"options":{"n_mazes":n_mazes}})
            steps_n += int(result['n/st'])
            steps_total += int(result['n/st'])

            # one gradient update per collect call
            policy.update(batch_size, train_collector.buffer)

            # decay the training epsilon after each collect/update and clamp it at eps_min
            eps_train *= eps_decay
            eps_train = np.max([eps_train, eps_min])

            """
            # swap training modes of the two agents after 1000 steps (interleaving)
            if (int(result['n/st']) + steps_total) % 1000 < steps_total % 1000:
                #interleave_training(obs_train)
                set_eps(eps_train)
            """

            #  reset high exploration
            # NOTE(review): restoring eps_prev here discards the decay applied just
            # above, so epsilon does not decay on high-exploration iterations. The
            # trivial/random maze cells restore first and decay afterwards.
            if high_eps_run:
                eps_train = eps_prev
                high_eps_run = False

            # log
            if result["n/ep"] > 0:
                log_data = {"train":{
                        "episode": result["n/ep"],
                        "obs_reward": np.mean(result["rews"][:,0]),
                        "exp_reward": np.mean(result["rews"][:,1]),
                        "length": result["len"],
                        "exploration rate": eps_train
                    }
                }
                wandb.log(data=log_data, step=steps_total)

        #print(f"Current training epsilon: {np.round(policy.policies[agents[0]].eps, 4)}")

        # check test results
        # NOTE(review): these two names hold the policy objects (not epsilon
        # values) and are not read again in this cell.
        agent0_eps = policy.policies[agents[0]]
        agent1_aps = policy.policies[agents[1]]
        policy.policies[agents[0]].set_eps(eps_test)
        policy.policies[agents[1]].set_eps(eps_test)

        passed_mazes = True
        test_mazes = []
        for seed in range(n_mazes):
            # test through all previous mazes (reset seeds 1..n_mazes)
            test_collector.reset_env(gym_reset_kwargs={"seed":seed+1})
            test_result = test_collector.collect(n_episode=test_num, gym_reset_kwargs={"seed":seed+1})

            # a maze below the threshold marks the stage as failed; the remaining mazes are still tested
            if np.mean(test_result['rews'][:,0]) < threshold_rew:
                passed_mazes = False
                test_mazes.append(0)
            else:
                test_mazes.append(1)

        # early stop when policy reaches good enough performance
        #if np.mean(test_result['rews'][:,0]) >= reward_threshold:
        #    break

        # log
        # NOTE(review): `test_result` is the result of the last tested seed only,
        # so the metrics logged and printed below describe that single maze.
        log_data = {"test":{
                "obs_reward": np.mean(test_result["rews"][:,0]),
                "exp_reward": np.mean(test_result["rews"][:,1]),
                "length": test_result["len"],
                "obs_reward_std": np.std(test_result["rews"][:,0]),
                "exp_reward_std": np.std(test_result["rews"][:,1])
            }
        }
        wandb.log(data=log_data, step=steps_total)

        print(f"Evaluation Reward at Epoch {epoch+1}. Obs: {np.round(np.mean(test_result['rews'][:,0]), 3)}, Exp: {np.round(np.mean(test_result['rews'][:,1]), 3)}")

        if epoch % 10 == 0:
            print(f"Test Mazes results: {test_mazes}")

        # check if the agent can successfully solve the maze (within some threshold)
        if passed_mazes:
            print(f"Agents solved the current maze and all previous mazes. Current number of mazes: {n_mazes}.")
            print(f"Solved all mazes on epoch {epoch+1}.")
            break

        # every n epochs render the policy for human-based evaluation
        if (epoch % 50) == 0:

            # set policy to eval mode
            policy.eval()
            for maze in range(n_mazes):
                human_collector.reset_env(gym_reset_kwargs={"seed":maze+1})
                #np.random.seed()
                # NOTE(review): unlike the test collector above, no gym_reset_kwargs
                # are passed to collect() here — confirm the seeded maze is the one shown.
                human_collector.collect(n_episode=1, render=1/60)

            # reset back to training mode
            policy.train()

        # reset eps
        policy.policies[agents[0]].set_eps(eps_train)
        policy.policies[agents[1]].set_eps(eps_train)

print('Finished Training.')

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


Evaluation Reward at Epoch 1. Obs: -0.984, Exp: -0.992
Test Mazes results: [0, 0, 0]
Evaluation Reward at Epoch 2. Obs: -0.986, Exp: -0.494
Evaluation Reward at Epoch 3. Obs: -0.985, Exp: -0.988
Evaluation Reward at Epoch 4. Obs: -0.986, Exp: -0.485
Evaluation Reward at Epoch 5. Obs: -0.984, Exp: -0.507
Evaluation Reward at Epoch 6. Obs: -0.985, Exp: -0.501
Evaluation Reward at Epoch 7. Obs: -0.985, Exp: -0.5
Evaluation Reward at Epoch 8. Obs: -0.986, Exp: -0.496
Evaluation Reward at Epoch 9. Obs: -0.985, Exp: -0.488
Evaluation Reward at Epoch 10. Obs: -0.986, Exp: -0.982
Evaluation Reward at Epoch 11. Obs: -0.985, Exp: -0.509
Test Mazes results: [0, 0, 0]
Evaluation Reward at Epoch 12. Obs: -0.986, Exp: -0.481
Evaluation Reward at Epoch 13. Obs: -0.985, Exp: -0.481
Evaluation Reward at Epoch 14. Obs: -0.984, Exp: -0.505
Evaluation Reward at Epoch 15. Obs: -0.997, Exp: -0.498
Evaluation Reward at Epoch 16. Obs: -0.985, Exp: -0.496
Evaluation Reward at Epoch 17. Obs: -0.984, Exp: -0.023

KeyboardInterrupt: 

In [42]:
# Close the active wandb run, passing 0 as its exit code.
wandb.finish(0)

  from IPython.core.display import HTML, display  # type: ignore


In [None]:
#torch.save(policy.policies[agents[0]].state_dict(), "model/dqn_baseline_obs.pt")
#torch.save(policy.policies[agents[1]].state_dict(), "model/dqn_baseline_exp.pt")

---

In [23]:
# manual training loop
# Variant of the two-agent curriculum loop that selects the maze through the
# reset seed (seed = n_mazes) and evaluates only the newest maze each epoch.
# NOTE(review): this cell indexes `policy.policies[...]`, while the setup cell
# binds `policy` to a single DQNPolicy — presumably this cell predates that
# change; confirm before re-running.
# Calling seed() with no argument re-seeds NumPy's global RNG non-deterministically.
np.random.seed()

"""
# collect a bunch of random ones
policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
train_collector.collect(n_step=10000)
"""
# `mazes` itself is not read in the loop body; the stage is tracked by n_mazes
for mazes in range(total_mazes):
    if passed_mazes == True:
        n_mazes += 1
        # reset epsilon again for the new maze
        eps_train = 0.9
    else:
        # the previous stage used up all its epochs without passing the test
        print("Failed to find a solution within suitable time. Stopping training.")
        break

    # set first eps
    policy.policies[agents[0]].set_eps(eps_train)
    policy.policies[agents[1]].set_eps(eps_train)

    for epoch in range(epochs):
        # reset number of steps count
        steps_n = 0

        # training loop
        while steps_n < step_per_epoch:
            # have runs where the exploration rate is very high
            # (1-in-10 chance; the previous epsilon is remembered in eps_prev)
            if np.random.randint(0, 10) == 0:
                eps_prev = eps_train
                eps_train = 0.9
                high_eps_run = True

            policy.policies[agents[0]].set_eps(eps_train)
            policy.policies[agents[1]].set_eps(eps_train)

            # train the model in training environment (only on the newest maze's seed)
            train_collector.reset_env(gym_reset_kwargs={"seed":n_mazes})
            result = train_collector.collect(n_episode=ep_per_collect, gym_reset_kwargs={"seed":n_mazes})
            steps_n += int(result['n/st'])
            steps_total += int(result['n/st'])

            # one gradient update per collect call
            policy.update(batch_size, train_collector.buffer)

            # decay the training epsilon after each collect/update and clamp it at eps_min
            eps_train *= eps_decay
            eps_train = np.max([eps_train, eps_min])

            """
            # swap training modes of the two agents after 1000 steps (interleaving)
            if (int(result['n/st']) + steps_total) % 1000 < steps_total % 1000:
                #interleave_training(obs_train)
                set_eps(eps_train)
            """

            #  reset high exploration
            # NOTE(review): restoring eps_prev here discards the decay applied just
            # above, so epsilon does not decay on high-exploration iterations.
            if high_eps_run:
                eps_train = eps_prev
                high_eps_run = False

            # log
            if result["n/ep"] > 0:
                log_data = {"train":{
                        "episode": result["n/ep"],
                        "obs_reward": np.mean(result["rews"][:,0]),
                        "exp_reward": np.mean(result["rews"][:,1]),
                        "length": result["len"],
                        "exploration rate": eps_train
                    }
                }
                wandb.log(data=log_data, step=steps_total)

        #print(f"Current training epsilon: {np.round(policy.policies[agents[0]].eps, 4)}")

        # check test results
        # NOTE(review): these two names hold the policy objects (not epsilon
        # values) and are not read again in this cell.
        agent0_eps = policy.policies[agents[0]]
        agent1_aps = policy.policies[agents[1]]
        policy.policies[agents[0]].set_eps(eps_test)
        policy.policies[agents[1]].set_eps(eps_test)

        passed_mazes = True
        test_mazes = []
        #for seed in range(n_mazes):
        # test through just new maze
        test_collector.reset_env(gym_reset_kwargs={"seed":n_mazes})
        test_result = test_collector.collect(n_episode=test_num, gym_reset_kwargs={"seed":n_mazes})

        # the stage fails when the newest maze scores below the threshold
        if np.mean(test_result['rews'][:,0]) < threshold_rew:
            passed_mazes = False
            test_mazes.append(0)
        else:
            test_mazes.append(1)

        # early stop when policy reaches good enough performance
        #if np.mean(test_result['rews'][:,0]) >= reward_threshold:
        #    break

        # log
        log_data = {"test":{
                "obs_reward": np.mean(test_result["rews"][:,0]),
                "exp_reward": np.mean(test_result["rews"][:,1]),
                "length": test_result["len"],
                "obs_reward_std": np.std(test_result["rews"][:,0]),
                "exp_reward_std": np.std(test_result["rews"][:,1])
            }
        }
        wandb.log(data=log_data, step=steps_total)

        print(f"Evaluation Reward at Epoch {epoch+1}. Obs: {np.round(np.mean(test_result['rews'][:,0]), 3)}, Exp: {np.round(np.mean(test_result['rews'][:,1]), 3)}")

        if epoch % 10 == 0:
            print(f"Test Mazes results: {test_mazes}")

        # check if the agent can successfully solve the maze (within some threshold)
        # NOTE(review): the message below mentions "all previous mazes", but only
        # the newest maze is tested in this variant.
        if passed_mazes:
            print(f"Agents solved the current maze and all previous mazes. Current number of mazes: {n_mazes}.")
            print(f"Solved all mazes on epoch {epoch+1}.")
            break

        # every n epochs render the policy for human-based evaluation
        if (epoch % 10) == 0:

            # set policy to eval mode
            policy.eval()
            #for maze in range(n_mazes):
            human_collector.reset_env(gym_reset_kwargs={"seed":n_mazes})
            #np.random.seed()
            # NOTE(review): no gym_reset_kwargs are passed to collect() here —
            # confirm the seeded maze is the one shown.
            human_collector.collect(n_episode=1, render=1/60)

            # reset back to training mode
            policy.train()

        # reset eps
        policy.policies[agents[0]].set_eps(eps_train)
        policy.policies[agents[1]].set_eps(eps_train)

print('Finished Training.')

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


Evaluation Reward at Epoch 1. Obs: -0.972, Exp: -0.006
Test Mazes results: [0]
Evaluation Reward at Epoch 2. Obs: -0.972, Exp: -0.513
Evaluation Reward at Epoch 3. Obs: -0.972, Exp: -0.49
Evaluation Reward at Epoch 4. Obs: -0.972, Exp: -0.524
Evaluation Reward at Epoch 5. Obs: -0.972, Exp: -0.039
Evaluation Reward at Epoch 6. Obs: -0.972, Exp: -0.045
Evaluation Reward at Epoch 7. Obs: -0.972, Exp: -0.53
Evaluation Reward at Epoch 8. Obs: -0.972, Exp: -0.045
Evaluation Reward at Epoch 9. Obs: -0.972, Exp: -0.518
Evaluation Reward at Epoch 10. Obs: -0.972, Exp: -0.045
Evaluation Reward at Epoch 11. Obs: -0.972, Exp: -0.507
Test Mazes results: [0]


KeyboardInterrupt: 

---

In [82]:
# Roll out one episode in the human-render collector with reset seed 15.
# The cell's value is the result dict returned by collect().
np.random.seed()
seed=15
human_collector.reset_env(gym_reset_kwargs={"seed":seed})
human_collector.collect(n_episode=1, gym_reset_kwargs={"seed":seed})

{'n/ep': 1,
 'n/st': 356,
 'rews': array([[-0.93239437, -0.51267606]]),
 'lens': array([356]),
 'idxs': array([0]),
 'rew': -0.722535211267605,
 'len': 356.0,
 'rew_std': 0.20985915492957719,
 'len_std': 0.0}

In [46]:
# Manual inspection: build a raw size-6 MazeEnv in human render mode, reset it
# with seed 1 and the "trivial" maze type, and render it.
maze = MazeEnv_v0.MazeEnv(render_mode='human', size=6)
maze.reset(seed=1, options={"maze_type":"trivial"})
maze.render()

In [62]:
# Apply action 1 to the manually created maze (depends on `maze` from the previous cell).
maze.step(1)

---  
# Trivial Maze Sanity Check

In [29]:
# manual training loop
# Single-agent curriculum on "trivial" mazes. Stage `mazes` trains with
# options n_mazes=mazes / random=True and counts as solved once the greedy
# policy passes the test sweep in two consecutive epochs.
# Calling seed() with no argument re-seeds NumPy's global RNG non-deterministically.
np.random.seed()

"""
# collect a bunch of random ones
policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
train_collector.collect(n_step=10000)
"""
for mazes in range(1, total_mazes+1):
    if passed_mazes == True:
        # reset epsilon again for the new maze
        eps_train = 0.9
        # cleared here; set back to True after the first passing test sweep
        passed_mazes = False
        print(f"Current number of mazes: {mazes}")
    else:
        # the previous stage used up all its epochs without being solved
        print("Failed to find a solution within suitable time. Stopping training.")
        break

    # set first eps
    set_eps(eps_train, single=True)

    for epoch in range(epochs):
        # reset number of steps count
        steps_n = 0

        # training loop
        while steps_n < step_per_epoch:
            # have runs where the exploration rate is very high
            # (1-in-10 chance; the previous epsilon is remembered in eps_prev)
            if np.random.randint(0, 10) == 0:
                eps_prev = eps_train
                eps_train = 0.9
                high_eps_run = True

            set_eps(eps_train, single=True)

            # train the model in training environment
            train_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":"trivial", "n_mazes":mazes, "random":True}})
            result = train_collector.collect(n_episode=ep_per_collect, gym_reset_kwargs={"options":{"maze_type":"trivial", "n_mazes":mazes, "random":True}})
            steps_n += int(result['n/st'])
            steps_total += int(result['n/st'])
            episodes_total += int(result['n/ep'])

            # one gradient update per collect call
            policy.update(batch_size, train_collector.buffer)

            #  reset high exploration (restore the remembered epsilon before decaying it)
            if high_eps_run:
                eps_train = eps_prev
                high_eps_run = False

            # decay the training epsilon after each collect/update and clamp it at eps_min
            eps_train *= eps_decay
            eps_train = np.max([eps_train, eps_min])

            # log
            if result["n/ep"] > 0:
                log_data = {"train":{
                        "episode": result["n/ep"],
                        "obs_reward": np.mean(result["rews"]),
                        "length": result["len"],
                        "exploration rate": eps_train,
                        "episodes": episodes_total
                    }
                }
                wandb.log(data=log_data, step=steps_total)

        # check test results
        set_eps(eps_test, single=True)
        policy.eval()

        # passed_before: did this epoch's sweep pass every tested maze?
        passed_before = True
        test_mazes = []
        # NOTE(review): training uses n_mazes=mazes while this sweep passes
        # n_mazes=0..mazes-1 (the random-maze cell sweeps 1..mazes) — confirm
        # which indexing the environment expects.
        for seed in range(mazes):
            # test through all previous mazes
            test_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":"trivial", "n_mazes":seed, "random":False}})
            test_result = test_collector.collect(n_episode=test_num, gym_reset_kwargs={"options":{"maze_type":"trivial", "n_mazes":seed, "random":False}})

            # if any of the previous mazes failed, break out of loop and do the train loop again
            if np.mean(test_result['rews']) < threshold_rew:
                passed_mazes = False
                passed_before = False
                test_mazes.append(0)
                break
            else:
                test_mazes.append(1)

        # log
        # `test_result` and `seed` refer to the last maze tested above (the
        # failing one if the sweep stopped early)
        log_data = {"test":{
                "obs_reward": np.mean(test_result["rews"]),
                "length": test_result["len"],
                "obs_reward_std": np.std(test_result["rews"]),
            }
        }
        wandb.log(data=log_data, step=steps_total)

        print(f"Evaluation Reward at Epoch {epoch+1}. Obs: {np.round(np.mean(test_result['rews']), 3)}, End at Maze: {seed}")

        print(f"Test Mazes results: {test_mazes}")

        # every n epochs render the policy for human-based evaluation
        if (epoch % 10) == 0:
            for seed in range(mazes):
                watch({"options":{"maze_type":"trivial", "n_mazes":seed, "random":False}})
            # reset back to training mode
            policy.train()

        # reset back to training
        set_eps(eps_train, single=True)
        policy.train()

        # check if the agent can successfully solve the maze (within some threshold)
        # passed_mazes is only still True here when the previous epoch passed
        # and this epoch's sweep did not clear it, i.e. two passes in a row
        if passed_mazes:
            print(f"Agents solved the current maze and all previous mazes. Current number of mazes: {mazes}.")
            print(f"Solved all mazes on epoch {epoch+1}.")
            for seed in range(mazes):
                watch({"options":{"maze_type":"trivial", "n_mazes":seed, "random":False}})
            # reset back to training mode
            policy.train()
            break

        # to see if the agent can do it consecutively
        # NOTE(review): if this happens in the final epoch, the epoch loop ends
        # with passed_mazes == True and the outer loop moves on after one pass.
        if passed_before:
            # pass the test maze at least twice
            passed_mazes = True
            print("Passed once")

print('Finished Training.')

Current number of mazes: 1
Evaluation Reward at Epoch 1. Obs: -0.996
Test Mazes results: [0]
Evaluation Reward at Epoch 2. Obs: -0.998
Test Mazes results: [0]
Evaluation Reward at Epoch 3. Obs: -0.996
Test Mazes results: [0]
Evaluation Reward at Epoch 4. Obs: -0.988
Test Mazes results: [0]
Evaluation Reward at Epoch 5. Obs: -0.988
Test Mazes results: [0]
Evaluation Reward at Epoch 6. Obs: -0.988
Test Mazes results: [0]
Evaluation Reward at Epoch 7. Obs: 0.986
Test Mazes results: [1]
Passed once
Evaluation Reward at Epoch 8. Obs: -0.989
Test Mazes results: [0]
Evaluation Reward at Epoch 9. Obs: -0.996
Test Mazes results: [0]
Evaluation Reward at Epoch 10. Obs: -0.995
Test Mazes results: [0]
Evaluation Reward at Epoch 11. Obs: 0.986
Test Mazes results: [1]
Passed once
Evaluation Reward at Epoch 12. Obs: -0.994
Test Mazes results: [0]
Evaluation Reward at Epoch 13. Obs: -0.998
Test Mazes results: [0]
Evaluation Reward at Epoch 14. Obs: -0.998
Test Mazes results: [0]
Evaluation Reward at E

In [30]:
# Close the active wandb run, passing 0 as its exit code.
wandb.finish(0)

In [31]:
# Save the policy's state dict to a relative path.
# NOTE(review): presumably the "model/" directory must already exist — confirm.
torch.save(policy.state_dict(), "model/trivial_maze_baseline_20-6-23.pt")

---
# Random Maze Single Agent Baseline

In [50]:
# manual training loop
# Single-agent curriculum on mazes of type `maze_type`. Stage `mazes` trains
# with options n_mazes=mazes / random=True and counts as solved once the greedy
# policy passes the test sweep over mazes 1..mazes in two consecutive epochs.
# Calling seed() with no argument re-seeds NumPy's global RNG non-deterministically.
np.random.seed()

"""
# collect a bunch of random ones
policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
train_collector.collect(n_step=10000)
"""
# NOTE(review): this range stops at total_mazes - 1, whereas the trivial-maze
# cell iterates up to total_mazes inclusive — confirm which bound is intended.
for mazes in range(1, total_mazes):
    if passed_mazes == True:
        # reset epsilon again for the new maze
        eps_train = eps_train_start
        # cleared here; set back to True after the first passing test sweep
        passed_mazes = False
        print(f"Current number of mazes: {mazes}")
    else:
        # the previous stage used up all its epochs without being solved
        print("Failed to find a solution within suitable time. Stopping training.")
        break

    # reset
    set_eps(eps_train, single=True)
    policy.train()

    for epoch in range(epochs):
        # reset number of steps count
        steps_n = 0

        # training loop
        while steps_n < step_per_epoch:
            # have runs where the exploration rate is very high
            # (1-in-10 chance; the previous epsilon is remembered in eps_prev)
            if np.random.randint(0, 10) == 0:
                eps_prev = eps_train
                eps_train = eps_train_start
                high_eps_run = True

            set_eps(eps_train, single=True)

            # train the model in training environment
            train_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":maze_type, "n_mazes":mazes, "random":True}})
            result = train_collector.collect(n_episode=ep_per_collect, gym_reset_kwargs={"options":{"maze_type":maze_type, "n_mazes":mazes, "random":True}})
            steps_n += int(result['n/st'])
            steps_total += int(result['n/st'])
            episodes_total += int(result['n/ep'])

            # one gradient update per collect call
            policy.update(batch_size, train_collector.buffer)

            #  reset high exploration (restore the remembered epsilon before decaying it)
            if high_eps_run:
                eps_train = eps_prev
                high_eps_run = False

            # decay the training epsilon after each collect/update and clamp it at eps_min
            eps_train *= eps_decay
            eps_train = np.max([eps_train, eps_min])

            # log
            if result["n/ep"] > 0:
                log_data = {"train":{
                        "episode": result["n/ep"],
                        "obs_reward": np.mean(result["rews"]),
                        "length": result["len"],
                        "exploration rate": eps_train,
                        "episodes": episodes_total
                    }
                }
                wandb.log(data=log_data, step=steps_total)

        # check test results
        set_eps(eps_test, single=True)
        policy.eval()

        # passed_before: did this epoch's sweep pass every tested maze?
        passed_before = True
        test_mazes = []
        for seed in range(1, mazes+1):
            # test through all previous mazes
            test_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":maze_type, "n_mazes":seed, "random":False}})
            test_result = test_collector.collect(n_episode=test_num, gym_reset_kwargs={"options":{"maze_type":maze_type, "n_mazes":seed, "random":False}})

            # if any of the previous mazes failed, break out of loop and do the train loop again
            if np.mean(test_result['rews']) < threshold_rew:
                passed_mazes = False
                passed_before = False
                test_mazes.append(0)
                break
            else:
                test_mazes.append(1)

        # log
        # `test_result` and `seed` refer to the last maze tested above (the
        # failing one if the sweep stopped early)
        log_data = {"test":{
                "obs_reward": np.mean(test_result["rews"]),
                "length": test_result["len"],
                "obs_reward_std": np.std(test_result["rews"]),
            }
        }
        wandb.log(data=log_data, step=steps_total)

        print(f"Evaluation Reward at Epoch {epoch+1}. Obs: {np.round(np.mean(test_result['rews']), 3)}, Maze: {seed}")

        print(f"Test Mazes results: {test_mazes}")

        # every n epochs render the policy for human-based evaluation
        if (epoch % 10) == 0:
            for seed in range(1, mazes+1):
                watch({"options":{"maze_type":maze_type, "n_mazes":seed, "random":False}})
            # reset back to training mode
            policy.train()

        # check if the agent can successfully solve the maze (within some threshold)
        # passed_mazes is only still True here when the previous epoch passed
        # and this epoch's sweep did not clear it, i.e. two passes in a row
        if passed_mazes:
            print(f"Agents solved the current maze and all previous mazes. Solved all mazes on epoch {epoch+1}.")

            # watch the results if successful in passing twice
            for seed in range(1, mazes+1):
                watch({"options":{"maze_type":maze_type, "n_mazes":seed, "random":False}})
            # reset back to training mode
            policy.train()
            break

        # to see if the agent can do it consecutively
        # NOTE(review): if this happens in the final epoch, the epoch loop ends
        # with passed_mazes == True and the outer loop moves on after one pass.
        if passed_before:
            # pass the test maze at least twice
            passed_mazes = True
            print("Passed once")

        # reset back to training
        set_eps(eps_train, single=True)
        policy.train()

print('Finished Training.')

Current number of mazes: 1


since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


Evaluation Reward at Epoch 1. Obs: 0.995, Maze: 1
Test Mazes results: [1]
Passed once
Evaluation Reward at Epoch 2. Obs: -1.001, Maze: 1
Test Mazes results: [0]
Evaluation Reward at Epoch 3. Obs: -0.996, Maze: 1
Test Mazes results: [0]
Evaluation Reward at Epoch 4. Obs: -1.001, Maze: 1
Test Mazes results: [0]
Evaluation Reward at Epoch 5. Obs: -1.001, Maze: 1
Test Mazes results: [0]
Evaluation Reward at Epoch 6. Obs: 0.995, Maze: 1
Test Mazes results: [1]
Passed once
Evaluation Reward at Epoch 7. Obs: 0.995, Maze: 1
Test Mazes results: [1]
Agents solved the current maze and all previous mazes. Solved all mazes on epoch 7.
Current number of mazes: 2
Evaluation Reward at Epoch 1. Obs: -1.0, Maze: 2
Test Mazes results: [1, 0]
Evaluation Reward at Epoch 2. Obs: -0.999, Maze: 2
Test Mazes results: [1, 0]
Evaluation Reward at Epoch 3. Obs: -1.0, Maze: 2
Test Mazes results: [1, 0]
Evaluation Reward at Epoch 4. Obs: -1.0, Maze: 2
Test Mazes results: [1, 0]
Evaluation Reward at Epoch 5. Obs: -0

In [51]:
# Close the active wandb run, passing 0 as its exit code.
wandb.finish(0)

  from IPython.core.display import HTML, display  # type: ignore


In [29]:
# Debug: collect ep_per_collect episode(s) with epsilon 1 on the first random
# maze, so the result dict and the replay buffer can be inspected in the
# following cells.
mazes = 1
policy.set_eps(1)

train_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":"random", "n_mazes":mazes, "random":True}})
result = train_collector.collect(n_episode=ep_per_collect, gym_reset_kwargs={"options":{"maze_type":"random", "n_mazes":mazes, "random":True}})

In [30]:
# Display the result dict returned by the debug collect above.
result

{'n/ep': 1,
 'n/st': 711,
 'rews': array([[-0.99225352]]),
 'lens': array([711]),
 'idxs': array([194]),
 'rew': -0.9922535211267594,
 'len': 711.0,
 'rew_std': 0.0,
 'len_std': 0.0}

In [31]:
# Debug: inspect the training collector's replay buffer.
# NOTE(review): `get` is called without arguments here — check the installed
# tianshou version's ReplayBuffer.get signature before re-running.
train_collector.buffer.get()

VectorReplayBuffer(
    info: Batch(
              env_id: array([0, 0, 0, ..., 0, 0, 0]),
          ),
    policy: Batch(),
    obs: Batch(
             agent_id: array(['observer', 'observer', 'observer', ..., None, None, None],
                             dtype=object),
             obs: array([[[[1., 1., 1., ..., 1., 1., 1.],
                           [1., 0., 0., ..., 1., 0., 1.],
                           [1., 1., 1., ..., 1., 0., 1.],
                           ...,
                           [1., 0., 1., ..., 1., 0., 1.],
                           [1., 0., 0., ..., 0., 0., 1.],
                           [1., 1., 1., ..., 1., 1., 1.]],
                  
                          [[0., 0., 0., ..., 0., 0., 0.],
                           [0., 0., 0., ..., 0., 0., 0.],
                           [0., 0., 0., ..., 0., 0., 0.],
                           ...,
                           [0., 0., 0., ..., 0., 0., 0.],
                           [0., 0., 0., ..., 0., 0., 0.],
   