In [1]:
from maze_env import MazeEnv_v0
from env_utils.PettingZooEnv_new import PettingZooEnv_new
import supersuit
import numpy as np
from tianshou.env.utils import PettingZooEnv
import tianshou as ts
from tianshou.utils.net.common import Net
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import gymnasium as gym

In [10]:
eps_train, eps_test = 0.9, 0
eps_decay, eps_min = 0.99, 0.1
lr, epoch, batch_size = 1e-4, 3, 64
gamma, n_step, target_update_freq = 0.9, 3, 320
train_num, test_num = 10, 10
buffer_size = 30000
step_per_epoch, step_per_collect = 10000, 150
maze_width = 6

#logger = ts.utils.TensorboardLogger(SummaryWriter('log/dqn_no_abs_no_interleaving_cnn'))

In [3]:
def preprocess_maze_env(render_mode=None, size=6):
    env = MazeEnv_v0.env(render_mode=render_mode, size=size)
    env = supersuit.multiagent_wrappers.pad_observations_v0(env)
    env = PettingZooEnv_new(env)
    return env

In [91]:
# create a CNN for the observer
class CNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.model = nn.Sequential(
            # assume maze size of 6x6 (13x13 with walls)
            nn.Conv2d(3, 12, 3), nn.ReLU(inplace=True), nn.MaxPool2d(2,2,1), # (13-3)+1 = 11, (11-2+1)/2+1 = 6
            nn.Conv2d(12, 20, 3), nn.ReLU(inplace=True), nn.MaxPool2d(2,2), # 6-3+1=4, (4-2)/2+1 = 2
            nn.Flatten(), nn.Linear(20*2*2, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 64), nn.ReLU(inplace=True),
            nn.Linear(64, 4)
        )
    
    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)
        self.batch = obs.shape[0]
        #logits = self.model(obs.view(batch, -1))
        logits = self.model(obs)
        return logits, state

In [4]:
# get the vectorized training/testing environments
train_envs = ts.env.DummyVectorEnv([preprocess_maze_env for _ in range(train_num)])
test_envs = ts.env.DummyVectorEnv([preprocess_maze_env for _ in range(test_num)])

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


In [5]:
# set up training with no render environment
env = preprocess_maze_env()

In [6]:
# get agent names
agents = env.agents

# observation spaces/action spaces for the two agents
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# define DQN network (128x3 hidden units linear)
#net_obs = Net(state_shape, action_shape, [128,128,128])
net_obs = Net(state_shape, action_shape, [512, 512, 512])
optim_obs = torch.optim.Adam(params=net_obs.parameters(), lr=lr)
net_exp = Net(state_shape, action_shape, [16])
optim_exp = torch.optim.Adam(params=net_exp.parameters(), lr=lr)

# set up policy and collectors
agent_observer = ts.policy.DQNPolicy(net_obs, optim_obs, gamma, n_step, target_update_freq)
agent_explorer = ts.policy.DQNPolicy(net_exp, optim_exp, gamma, n_step, target_update_freq)
agent_policies = [agent_observer, agent_explorer]
#agent_policies = [ts.policy.RandomPolicy(), ts.policy.RandomPolicy()] # baseline testing
policy = ts.policy.MultiAgentPolicyManager(agent_policies, env)


In [7]:
# define the training collector (the calc q and step functions)
train_collector = ts.data.Collector(
    policy, 
    train_envs, 
    ts.data.VectorReplayBuffer(buffer_size, train_num),
    exploration_noise=True
)

# define the testing collector
test_collector = ts.data.Collector(
    policy, 
    test_envs,
    exploration_noise=True
)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


In [8]:
# set up human render environment
env_human = preprocess_maze_env(render_mode="human")
env_human = ts.env.DummyVectorEnv([lambda: env_human])
human_collector = ts.data.Collector(policy, env_human, exploration_noise=True)

In [11]:
# manual training loop
steps_total = 0
steps_n = 0

"""
# collect a bunch of random ones
policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
train_collector.collect(n_step=10000)
"""

for i in range(epoch):
    steps_n = 0 
    
    while steps_n < step_per_epoch:
        # set epislon for greedy training
        policy.policies[agents[0]].set_eps(eps_train)
        policy.policies[agents[1]].set_eps(eps_train)

        # train the model in training environment
        train_collector.reset_env(gym_reset_kwargs={"options":{"n_mazes":i+1}})
        result = train_collector.collect(n_step=step_per_collect, gym_reset_kwargs={"options":{"n_mazes":i+1}})
        steps_n += int(result['n/st'])
        steps_total += int(result['n/st'])
        
        # update the parameters after train_num steps
        policy.update(batch_size, train_collector.buffer)

        # log
        #logger.log_train_data(result, steps_total)
        
        # set the random training epsilon after each steps per collect
        # decay it by specified parameter every
        eps_train *= eps_decay
        eps_train = np.max([eps_train, eps_min])
    
    print(f"Current training epsilon: {np.round(policy.policies[agents[0]].eps, 4)}")
    
    # check test results
    policy.policies[agents[0]].set_eps(eps_test)
    policy.policies[agents[1]].set_eps(eps_test)
    test_collector.reset_env(gym_reset_kwargs={"seed":i+1})
    test_result = test_collector.collect(n_episode=test_num, gym_reset_kwargs={"options":{"maze_type":"trivial"}})

    # early stop when policy reaches good enough performance
    #if np.mean(test_result['rews'][:,0]) >= reward_threshold:
    #    break

    # log
    #logger.log_test_data(test_result, steps_total)
    
    print(f"Evaluation Reward at Epoch {i+1}: {np.mean(test_result['rews'][:,0])}")

    # every 20 epochs render the policy for human-based evalution
    if (i % 1) == 0:

        # set policy to eval mode
        policy.eval()
        #human_collector.reset(gym_reset_kwargs={'seed':13})
        #np.random.seed()
        human_collector.reset_env(gym_reset_kwargs={"seed":i+1})
        human_collector.collect(n_episode=1, render=1/60, gym_reset_kwargs={"seed":i+1})

        # reset back to training mode
        policy.train()
    
print('Finished Training.')
print(result)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)
since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)


Current training epsilon: 0.4636
Evaluation Reward at Epoch 1: -349.0
Current training epsilon: 0.2364
Evaluation Reward at Epoch 2: -355.0
Current training epsilon: 0.1206
Evaluation Reward at Epoch 3: -354.0


RuntimeError: Failed to find default font

In [8]:
maze = MazeEnv_v0.env(size=6, render_mode="human")
maze.reset(seed=1)
maze.render()

In [11]:
maze.step(3)

In [205]:
maze.last()

6

In [215]:
np.sum(np.abs(np.array(maze.exit) - np.array(maze.start)))

18

In [33]:
# save the models
torch.save(policy.policies[agents[0]].state_dict(), "model/obs_dqn_no_abs_no_il_cnn.pt")
torch.save(policy.policies[agents[1]].state_dict(), "model/exp_dqn_no_abs_no_il_linear.pt")

In [108]:
policy.eval()
human_collector.reset_env(gym_reset_kwargs={"seed":7, "options":{"maze_type":"trivial"}})
#np.random.seed()
human_collector.collect(n_step=100, render=1/60)

{'n/ep': 0,
 'n/st': 100,
 'rews': array([], dtype=float64),
 'lens': array([], dtype=int32),
 'idxs': array([], dtype=int32),
 'rew': 0,
 'len': 0,
 'rew_std': 0,
 'len_std': 0}

In [51]:
train_collector.reset_env(gym_reset_kwargs={"options":{"maze_type":"trivial"}})
train_collector.collect(n_step=20, render=1/60)

  gym.logger.warn("You are calling render method without specifying any render mode.")


{'n/ep': 0,
 'n/st': 20,
 'rews': array([], dtype=float64),
 'lens': array([], dtype=int32),
 'idxs': array([], dtype=int32),
 'rew': 0,
 'len': 0,
 'rew_std': 0,
 'len_std': 0}

In [None]:
env = preprocess_maze_env(render_mode="human")
env = ts.env.DummyVectorEnv([lambda: env])

policy.policies[agents[0]].set_eps(1)
policy.policies[agents[1]].set_eps(1)
collector = ts.data.Collector(policy, env)
collector.reset_env(gym_reset_kwargs={'seed':17})
np.random.seed()
collector.collect(n_episode=1, render=1/60)

In [44]:
env = preprocess_maze_env(render_mode="human")

In [12]:
env.close()

NameError: name 'env' is not defined