Simple Tag
https://www.pettingzoo.ml/mpe/simple_tag

> This is a predator-prey environment. Good agents (green) are faster and receive a negative reward for being hit by adversaries (red) (-10 for each collision). Adversaries are slower and are rewarded for hitting good agents (+10 for each collision). Obstacles (large black circles) block the way. By default, there is 1 good agent, 3 adversaries and 2 obstacles.

Baseline agent algorithm with experience replay buffer

In [1]:
import os
import time
import enum
import math
import random
import collections
import statistics
import json

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F

import imageio

class AttrDict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Point the instance namespace at the mapping itself so that
        # ``d.key`` and ``d["key"]`` share one storage.
        self.__dict__ = self

class TimeDelta(object):
    """Break a duration given in seconds into days, hours, minutes and seconds."""

    def __init__(self, delta_time):
        """Convert time difference in seconds to days, hours, minutes, seconds.

        Parameters
        ==========
        delta_time : float
            Time difference in seconds. Negative values are not handled
            specially.
        """
        self.fractional, seconds = math.modf(delta_time)
        seconds = int(seconds)
        minutes, self.seconds = divmod(seconds, 60)
        hours, self.minutes = divmod(minutes, 60)
        self.days, self.hours = divmod(hours, 24)

    def __repr__(self):
        """Return the duration formatted as ``D-HH:MM:SS.mmm``."""
        # Format the integer seconds and the fraction separately: a width of
        # 2 applied to the float ``seconds + fractional`` never pads, because
        # "8.0" is already three characters wide.
        # Cap at 999 so rounding a fraction such as 0.9996 cannot spill into
        # a fourth digit.
        millis = min(int(round(self.fractional * 1000)), 999)
        return f"{self.days}-{self.hours:02}:{self.minutes:02}:{self.seconds:02}.{millis:03}"

from pettingzoo.mpe import simple_tag_v2
from pettingzoo.utils import random_demo

# torch.autograd.set_detect_anomaly(True)

Arguments for instantiating the environment:

- num_good: number of good agents
- num_adversaries: number of adversaries
- num_obstacles: number of obstacles
- max_cycles: number of frames (a step for each agent) until game terminates
- continuous_actions: Whether agent action spaces are discrete(default) or continuous

In [2]:
# env.close()

In [2]:
# Team sizes and obstacle count for this experiment. These names are reused
# further down when the experiment name is built.
num_good = 1
num_adversaries = 3
num_obstacles = 0

# Build the simple_tag environment with discrete actions and 30 cycles per
# episode. `.unwrapped` is presumably taken so that later cells can reach
# `env.world` directly -- TODO confirm.
env = simple_tag_v2.env(
    num_good=num_good,
    num_adversaries=num_adversaries,
    num_obstacles=num_obstacles,
    max_cycles=30,
    continuous_actions=False
).unwrapped

### What are the environment parameters?

Adversaries (red) try to capture non-adversaries (green). The map is a 2D grid and everything is initialized in the region [-1, +1]. There doesn't seem to be any position clipping at the boundary, but non-adversary agents are penalized for going out of bounds.
Agent's observation is a ndarray vector of concatenated data in the following order:

1. current velocity (2,)
2. current position (2,)
3. relative position (2,) of each landmark
4. relative position (2,) of each other agent
5. velocity (2,) of each other non-adversary agent

When there are 3 adversaries and 3 non-adversaries, the adversary observation space is 24-dimensional and the non-adversary observation space is 22-dimensional.

The environment is sequential. Agents move one at a time. Agents are either `adversary_*` for adversary or `agent_*` for non-adversary.

Actions:

- 0 is NOP
- 1 is go left
- 2 is go right
- 3 is go down
- 4 is go up

### How to train the agents?

When loss is increasing for Double DQN, then increase the interval for updating target network.
<https://stackoverflow.com/questions/56964657/cartpole-v0-loss-increasing-using-dqn>

In [3]:
# Reset once up front; presumably required so that `env.agent_selection` and
# `env.world` are populated before the configuration cell reads them.
env.reset()

In [4]:
def get_agent_counts():
    """Count the agents of each team in the module-level ``env``.

    Returns
    =======
    tuple of int
        ``(adversaries, good_agents)``.
    """
    # One pass over the roster: a flag per agent, True for adversaries.
    is_adversary = [bool(member.adversary) for member in env.world.agents]
    n_adversaries = sum(is_adversary)
    return (n_adversaries, len(is_adversary) - n_adversaries)

def process_config(config):
    """Copy every entry of ``config.common`` into both team sections.

    ``config.adversary`` and ``config.agent`` are updated in place; an
    existing key with the same name is overwritten. Nothing is returned.
    """
    for name, setting in config.common.items():
        for team in ("adversary", "agent"):
            getattr(config, team)[name] = setting

# Team sizes as reported by the live environment.
n_adversaries, n_good_agents = get_agent_counts()
# Experiment configuration. Top-level keys are global hyperparameters;
# `common` is merged into the per-team sections by process_config() below.
config = AttrDict(
    # Used by train() as the directory name under models/.
    exp_name = "agents={}-adversary={}-obs={}-network={}".format(num_good, num_adversaries, num_obstacles, 'MLP'),
    # Discount factor applied to the target network's max Q-value.
    discount = 0.99,
    # NOTE(review): not read by the train() in this notebook, which runs its
    # own epsilon decay schedule.
    epsilon = 0.1,
    # Number of training episodes.
    n_episodes=20_000,
    # How many episodes train() accumulates between calls to train_agents().
    batch_size=1,
    # Episodes between copies of the online weights into the target network.
    update_target_interval=32,
    # Episodes between progress reports (train() also renders those episodes).
    report_interval=32,
    # Maximum gradient norm passed to clip_grad_norm_ in train_agents().
    clip_grad_norm=5.0,
    # Learning rate of the SGD optimizer.
    lr=0.002,
    # NOTE(review): not referenced by any code visible in this notebook.
    reward_scale=1,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Settings shared by both teams.
    common=AttrDict(
        hidden_size=32,
        n_actions=env.action_space(env.agent_selection).n,
    ),
    adversary=AttrDict(
        n_agents=n_adversaries,
        observation_shape=env.observation_space("adversary_0").shape

    ),
    agent=AttrDict(
        n_agents=n_good_agents,
        observation_shape=env.observation_space("agent_0").shape
    )
)
# Copy the `common` entries into `config.adversary` and `config.agent`.
process_config(config)
# Bare expression: displays the final configuration as the cell output.
config

{'exp_name': 'agents=1-adversary=3-obs=0-network=MLP',
 'discount': 0.99,
 'epsilon': 0.1,
 'n_episodes': 20000,
 'batch_size': 1,
 'update_target_interval': 32,
 'report_interval': 32,
 'clip_grad_norm': 5.0,
 'lr': 0.002,
 'reward_scale': 1,
 'device': device(type='cuda'),
 'common': {'hidden_size': 32, 'n_actions': 5},
 'adversary': {'n_agents': 3,
  'observation_shape': (12,),
  'hidden_size': 32,
  'n_actions': 5},
 'agent': {'n_agents': 1,
  'observation_shape': (10,),
  'hidden_size': 32,
  'n_actions': 5}}

In [5]:
class Normalizer(object):
    """Rescale a raw observation vector before it is fed to the network.

    The entity counts are read once from ``env.world`` at construction and
    determine where the relative-position section of the vector ends.
    """

    def __init__(self, env):
        world = env.world
        self.n_landmarks = len(world.landmarks)
        self.n_allagents = len(world.agents)
        self.n_good = sum(1 for member in world.agents if not member.adversary)

    @staticmethod
    def normalize_abs_pos(s):
        """Clip an absolute position to [-1.5, 1.5] and scale it to [-1, 1].

        s is a scalar or an ndarray of one dimension.
        """
        limit = 1.5
        return np.clip(s, -limit, limit) / limit

    @staticmethod
    def normalize_rel_pos(s):
        """Clip a relative position to [-3, 3] and scale it to [-1, 1].

        s is a scalar or an ndarray of one dimension.
        """
        limit = 3
        return np.clip(s, -limit, limit) / limit

    def __call__(self, obs):
        """Return a normalized copy of ``obs``; the input is left untouched."""
        velocity_scale = 1.3
        # One (x, y) pair per landmark and per other agent follows the four
        # leading entries (own velocity, own position).
        n_others = self.n_landmarks + self.n_allagents - 1
        rel_begin = 4
        rel_end = rel_begin + 2 * n_others

        scaled = obs.copy()
        # own velocity
        scaled[:2] = scaled[:2] / velocity_scale
        # own absolute position
        scaled[2:4] = self.normalize_abs_pos(scaled[2:4])
        # relative positions of all other entities, handled as one slice
        scaled[rel_begin:rel_end] = self.normalize_rel_pos(scaled[rel_begin:rel_end])
        # whatever remains is treated as velocities of other entities
        scaled[rel_end:] = scaled[rel_end:] / velocity_scale
        return scaled
    
class RewardsShaper(object):
    """Compute a distance-based shaping term from an agent's observation.

    Agent ordering is taken from ``env.world.agents`` at construction; team
    membership is decided from the ``agent``/``adversary`` name prefix.
    """

    def __init__(self, env):
        roster = list(enumerate(env.world.agents))
        self.n_landmarks = len(env.world.landmarks)
        self.name_to_idx = {member.name: pos for pos, member in roster}
        self.idx_to_name = {pos: member.name for pos, member in roster}
        self.goodagent_indices = [
            pos for pos, member in roster if member.name.startswith("agent")
        ]
        self.adversary_indices = [
            pos for pos, member in roster if member.name.startswith("adversary")
        ]
        # rdist - adversary/good-agent distance at which shaping starts.
        self.rdist = 1
        # collision_dist - adversary/good-agent distance that counts as a
        #    collision. Based on PettingZoo numbers.
        self.collision_dist = 0.075 + 0.05

    @staticmethod
    def bound(x):
        """Out-of-bounds penalty for one absolute coordinate ``x``.

        Zero below 0.9, linear on [0.9, 1.0), then exponential and capped
        at 10.
        """
        if x < 0.9:
            penalty = 0
        elif x < 1.0:
            penalty = (x - 0.9) * 10
        else:
            penalty = min(np.exp(2 * x - 2), 10)
        return penalty

    def _distance_to(self, own_idx, other_idx, others_obs):
        """Distance to agent ``other_idx`` read from the relative positions.

        The observing agent is absent from its own observation, so every
        agent listed after it sits one slot earlier.
        """
        slot = other_idx - 1 if own_idx < other_idx else other_idx
        dx, dy = others_obs[2*slot:(2*slot) + 2]
        return math.sqrt(dx**2 + dy**2)

    def __call__(self, agent_name, obs):
        """Return the shaping term for ``agent_name`` given its observation.

        Adversary: for each good agent, adds ``1 - distance / rdist``.

        Good agent: for each adversary, subtracts that same quantity clipped
        to [0, 1], then subtracts the boundary penalty of both coordinates.
        """
        # Skip own velocity, own position and the landmark positions.
        others_obs = obs[4 + (2*self.n_landmarks):]
        own_idx = self.name_to_idx[agent_name]
        total = 0.
        if agent_name.startswith("agent"):
            for adversary_idx in self.adversary_indices:
                gap = self._distance_to(own_idx, adversary_idx, others_obs)
                total -= min(max(1 - (1/self.rdist)*gap, 0), 1)

            # boundary penalty, based on PettingZoo
            pos = obs[2:4]
            total -= self.bound(abs(pos[0]))
            total -= self.bound(abs(pos[1]))

        elif agent_name.startswith("adversary"):
            for goodagent_idx in self.goodagent_indices:
                gap = self._distance_to(own_idx, goodagent_idx, others_obs)
                total += 1 - (1/self.rdist)*gap

        return total

# Module-level singletons used by the network, the episode runner and the
# training step defined in the following cells.
normalize = Normalizer(env) # norm_obs = normalize(obs)
shapereward = RewardsShaper(env) # reward = shapereward(agent_name, obs)
# Loss between the one-step target and the predicted Q-value.
criterion = torch.nn.MSELoss()
# criterion = torch.nn.SmoothL1Loss()

In [6]:
class SimpleTagNet(torch.nn.Module):
    """Three-layer MLP mapping one observation vector to per-action Q-values.

    Layer sizes come from the ``config[agent_type]`` section, so the same
    class can serve either team.
    """

    def __init__(self, config, agent_type):
        super().__init__()
        settings = config[agent_type]
        self.device = config.device
        self.observation_size = math.prod(settings.observation_shape)
        self.n_actions = settings.n_actions
        self.hidden_size = settings.hidden_size
        # Layers are created in forward order so parameter initialisation
        # and the state_dict keys (output_mlp.0, .2, .4) stay the same.
        layers = [
            torch.nn.Linear(self.observation_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(self.hidden_size, self.n_actions),
        ]
        self.output_mlp = torch.nn.Sequential(*layers)

    def forward(self, observation):
        """Compute Q-values for a single raw observation.

        Parameters
        ==========
        observation : ndarray
            The observation vector obtained from the environment. It is
            passed through the module-level ``normalize`` before use.

        Returns
        =======
        torch.Tensor
            Vector of Q-value associated with each action.
        """
        features = torch.tensor(
            normalize(observation), dtype=torch.float, device=self.device
        )
        return self.output_mlp(features)

def choose_action(config, agent_type, Q, epsilon=0.05, is_val=False):
    """Pick an action epsilon-greedily from the Q-values ``Q``.

    With ``is_val`` set, exploration is disabled and no random number is
    drawn. Otherwise a uniformly random action index is returned with
    probability ``epsilon``, and the index of the largest Q-value the rest
    of the time.
    """
    explore = (not is_val) and random.random() < epsilon
    if explore:
        return random.randrange(config[agent_type].n_actions)
    return torch.argmax(Q).item()

In [7]:
def run_episode(config, adversary_net, should_render=False, epsilon=0.05, is_val=False,
                evaluation=False, save_video=False, save_video_dict=None):
    """Run one episode in the module-level ``env`` and record every agent step.

    Adversaries act on the Q-values of ``adversary_net`` through
    choose_action(); good agents always take action 0.

    Parameters
    ==========
    config : AttrDict
        Reads ``config.adversary.n_agents`` and ``config.agent.n_agents``;
        also forwarded to choose_action().
    adversary_net : callable
        Called with the raw observation of an adversary; returns its Q-values.
    should_render : bool
        Render on every agent step and close the environment afterwards.
    epsilon : float
        Exploration rate forwarded to choose_action().
    is_val : bool
        Forwarded to choose_action(), where it disables random actions.
    evaluation : bool
        When False, the shaping term from ``shapereward`` is added to every
        reward read from the environment.
    save_video : bool
        Only used together with ``should_render``: collect ``rgb_array``
        frames and write them as an mp4 at the end of the episode.
    save_video_dict : dict
        Must provide ``'validation_save_dir'`` and ``'image_counter'`` when a
        video is written; they form the output path.

    Returns
    =======
    AttrDict
        Contains episode metrics:
        - steps : number of steps. All agents take an action at each step.
        - reward : episodic rewards indexed by ('adversary', 'agent').
        - step_records : list of quantities produced, indexed by step,
          ('adversary', 'agent'), agent index.
          Each step record has:
            + observation
            + action (None once the agent is done)
            + Q (always None here)
            + reward
            + done
        - loss : contains episodic losses indexed by ('adversary', 'agent').
          Left at 0 here; to be updated by train_agents()
    """
    rendered_video = []
    episode = AttrDict(
        steps=0,
        reward=AttrDict(adversary=0, agent=0),
        step_records=[],
        loss=AttrDict(adversary=0, agent=0)
    )
    n_agents = config.adversary.n_agents + config.agent.n_agents
    step_record = None

    env.reset()
    for agent_step_idx, agent_name in enumerate(env.agent_iter()):
        # Open a new step record each time a full round of agents begins.
        if agent_step_idx % n_agents == 0:
            episode.steps += 1
            step_record = AttrDict(adversary={}, agent={})
            episode.step_records.append(step_record)

        # Observation, reward and done flag for the agent about to act.
        obs_curr, reward, done, _ = env.last()
        if not evaluation:
            # Training only: add the distance-based shaping term.
            reward += shapereward(agent_name, obs_curr)
        if should_render:
            if save_video:
                rendered_image = env.render(mode='rgb_array')
                rendered_video.append(rendered_image)
            # NOTE(review): this renders again even when a frame was already
            # captured just above.
            env.render()
            if agent_name == "adversary_0":
                # Hook for ad-hoc debugging of the first adversary
                # (intentionally a no-op).
                pass
        # Names look like "adversary_2" / "agent_0": team prefix and index.
        agent_type, agent_idx = agent_name.split("_")
        agent_idx = int(agent_idx)
        if done:
            # A finished agent is stepped with None and only its final
            # observation/reward are recorded.
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=None,
                Q=None,
                reward=reward,
                done=done,
            )
            env.step(None)
            # NOTE(review): `continue` skips the accumulation at the bottom
            # of the loop, so this final reward is not added to
            # episode.reward -- confirm this is intended.
            continue
        if agent_type == "agent":
            # Good agents are not trained here: always action 0.
            env.step(0)
            step_record[agent_type][agent_idx] = AttrDict(
                observation=obs_curr,
                action=0,
                Q=None,
                reward=reward,
                done=done,
            )
        else:
            # agent_type == "adversary"
            Q_curr = adversary_net(obs_curr)
            action = choose_action(config, agent_type, Q_curr, epsilon, is_val=is_val)
            env.step(action)
            step_record[agent_type][agent_idx] = AttrDict(
                # inputs to network
                observation=obs_curr,
                # outputs of network / inputs to environment
                action=action,
                # Q_curr is not stored; train_agents() recomputes it.
                Q=None,
                # output of environment
                reward=reward,
                done=done,
            )
        episode.reward[agent_type] += reward

    if should_render:
        env.close()
        if save_video:
            # Write the collected frames as result-<image_counter>.mp4.
            imageio.mimwrite(os.path.join(save_video_dict['validation_save_dir'],'result-{}.mp4'.format(save_video_dict['image_counter'])), rendered_video, fps=30)
    return episode

def train_agents(config, batch, adversary_net, adversary_target_net, adversary_optimizer):
    """Run a one-step Q-learning update for every adversary transition in ``batch``.

    For each recorded adversary step that is not already done, the target is
    ``r`` if the agent is done at the next step, otherwise
    ``r + discount * max_a Q_target(next_observation, a)``. ``r`` is the
    reward stored in the next step's record. The module-level ``criterion``
    measures the gap between that target and the online network's Q-value
    for the action actually taken. One optimizer step is taken per
    transition.

    Parameters
    ==========
    config : AttrDict
        Reads ``device``, ``discount`` and ``clip_grad_norm``.
    batch : list of AttrDict
        Episodes as returned by run_episode(). ``episode.loss.adversary`` of
        each one is incremented in place by the loss of every transition.
    adversary_net : torch.nn.Module
        Online Q-network being optimized.
    adversary_target_net : torch.nn.Module
        Target Q-network; only evaluated, never updated here.
    adversary_optimizer : torch.optim.Optimizer
        Optimizer over the parameters of ``adversary_net``.
    """
    device = config.device
    discount = torch.tensor(config.discount, dtype=torch.float, device=device)
    for episode in batch:
        for step_idx in range(episode.steps):
            adversary_records = episode.step_records[step_idx].adversary
            for agent_idx, curr_record in adversary_records.items():
                if curr_record.done:
                    # agent is done at this step
                    continue
                next_record = episode.step_records[step_idx + 1].adversary[agent_idx]
                r = torch.tensor(next_record.reward, dtype=torch.float, device=device)
                if next_record.done:
                    # agent terminates at next step
                    y = r
                else:
                    # The target network is frozen between syncs. Evaluating
                    # it without autograd stops backward() from building a
                    # graph through it and accumulating gradients on
                    # parameters that no optimizer ever clears.
                    with torch.no_grad():
                        max_target_Q = torch.max(adversary_target_net(next_record.observation))
                    y = r + discount*max_target_Q

                adversary_optimizer.zero_grad()
                Q_u = adversary_net(curr_record.observation)[curr_record.action]
                # (input, target) order; the gradient-free target goes second.
                adversary_loss = criterion(Q_u, y)
                adversary_loss.backward()
                torch.nn.utils.clip_grad_norm_(adversary_net.parameters(), config.clip_grad_norm)
                adversary_optimizer.step()
                episode.loss.adversary += adversary_loss.item()
                
def train(config):
    """Train the adversary Q-network with epsilon-greedy DQN.

    - All adversaries share one Q-network (parameter sharing); a second copy
      serves as target network and is synced every
      ``config.update_target_interval`` episodes.
    - Good agents are not trained; run_episode() drives them.
    - Epsilon starts at 0.9 and is multiplied by 0.9996 after every episode,
      down to a floor of 0.05.
    - Every 500 episodes the current weights are saved, 100 validation
      episodes are run without exploration or reward shaping (every 10th is
      rendered to video), and the weights are additionally saved as
      ``adversary-net-best.pth`` when the mean validation reward improves.

    Parameters
    ==========
    config : AttrDict
        Reads ``exp_name``, ``device``, ``lr``, ``n_episodes``,
        ``batch_size``, ``update_target_interval`` and ``report_interval``;
        also forwarded to SimpleTagNet, run_episode() and train_agents().

    Returns
    =======
    tuple
        ``(adversary_net, logger)``, where ``logger`` holds the per-episode
        losses and rewards for both teams.
    """
    eps_start = 0.9
    eps_end = 0.05
    eps_decay = 0.9996
    epsilon = eps_start

    validation_interval = 500
    n_validation_episodes = 100
    video_interval = 10

    # Start below any reachable reward so the first validation round always
    # produces a "best" checkpoint, even when its mean reward is 0.
    best_reward_evaluation = float("-inf")

    print("Training the agents...")
    os.makedirs("models/{}".format(config.exp_name), exist_ok=True)
    t0 = time.time()
    device = config.device
    adversary_net = SimpleTagNet(config, "adversary").to(device)
    adversary_target_net = SimpleTagNet(config, "adversary").to(device)
    adversary_target_net.eval()
    print("Created the agent nets.")
    adversary_optimizer = torch.optim.SGD(adversary_net.parameters(), lr=config.lr)
    logger = AttrDict(
        episodic_losses=AttrDict(adversary=[], agent=[]),
        episodic_rewards=AttrDict(adversary=[], agent=[])
    )

    def update_targets():
        """Copy the online weights into the target network."""
        adversary_target_net.load_state_dict(adversary_net.state_dict())

    print("Initial update of target nets")
    update_targets()

    batch = []
    print("Beginning the episodes...")
    for episode_idx in range(config.n_episodes):
        is_report_episode = episode_idx % config.report_interval == 0 and episode_idx > 0

        # Run an episode (rendered on report episodes)
        episode = run_episode(config, adversary_net,
                              epsilon=epsilon,
                              should_render=is_report_episode)
        batch.append(episode)
        epsilon = max(epsilon*eps_decay, eps_end)

        # Train once exactly `batch_size` episodes are collected. Keying on
        # the episode index with an `> 0` guard made the first batch one
        # episode larger than every other batch.
        if len(batch) >= config.batch_size:
            train_agents(config, batch, adversary_net,
                         adversary_target_net,
                         adversary_optimizer)
            batch = []

        # Logging the reward and loss.
        # NOTE: the loss of an episode still waiting in `batch` is logged as
        # 0, because it is only filled in once train_agents() has run on it
        # (relevant when batch_size > 1).
        logger.episodic_losses.adversary.append(episode.loss.adversary)
        logger.episodic_losses.agent.append(episode.loss.agent)
        logger.episodic_rewards.adversary.append(episode.reward.adversary)
        logger.episodic_rewards.agent.append(episode.reward.agent)

        if episode_idx % config.update_target_interval == 0 and episode_idx > 0:
            # Sync the target network with the online network
            update_targets()

        if is_report_episode:
            # Progress report over the last `report_interval` episodes
            t1 = time.time()
            tdelta = TimeDelta(round(t1 - t0, 0))
            print(f"on episode {episode_idx}, curr epsilon {epsilon} (time taken so far: {tdelta})")
            mean_loss_adversary = statistics.fmean(logger.episodic_losses.adversary[-config.report_interval:])
            mean_reward_adversary = statistics.fmean(logger.episodic_rewards.adversary[-config.report_interval:])
            mean_reward_agent = statistics.fmean(logger.episodic_rewards.agent[-config.report_interval:])
            print(f"     mean loss: adversary {mean_loss_adversary}")
            print(f"     mean reward: adversary {mean_reward_adversary}, agent {mean_reward_agent}")

        if episode_idx % validation_interval == 0 and episode_idx > 0:
            validation_save_dir = "models/{}/validation_results/epoch-{}".format(config.exp_name, episode_idx)
            os.makedirs(validation_save_dir, exist_ok=True)
            torch.save(
                adversary_net.state_dict(),
                f"models/{config.exp_name}/adversary-net-{episode_idx}.pth"
            )

            validation_total_reward = 0
            adversary_net.eval()
            try:
                with torch.no_grad():
                    for e in range(n_validation_episodes):
                        record_video = e % video_interval == 0
                        d = {
                            'validation_save_dir': validation_save_dir,
                            'image_counter': e
                        }
                        # A separate name keeps the training `episode` intact.
                        val_episode = run_episode(config, adversary_net,
                                                  epsilon=epsilon,
                                                  should_render=record_video,
                                                  save_video=record_video,
                                                  save_video_dict=d,
                                                  is_val=True, evaluation=True)
                        print(val_episode.reward.adversary)
                        validation_total_reward += val_episode.reward.adversary
            finally:
                # Return to training mode only after ALL validation episodes.
                # Doing this inside the loop left eval mode after the first
                # validation episode.
                adversary_net.train()

            validation_total_reward /= n_validation_episodes
            print('Evaluation Reward for episode {} is: {}'.format(episode_idx, validation_total_reward))
            if validation_total_reward > best_reward_evaluation:
                best_reward_evaluation = validation_total_reward
                torch.save(
                    adversary_net.state_dict(),
                    f"models/{config.exp_name}/adversary-net-best.pth"
                )

    return adversary_net, logger


In [8]:
# Train the adversary network; `logger` collects the per-episode losses and
# rewards returned by train().
adversary_net, logger = train(config)

Training the agents...
Created the agent nets.
Initial update of target nets
Beginning the episodes...
on episode 32, curr epsilon 0.8881957186750149 (time taken so far: 0-00:00:8.0)
     mean loss: adversary 131.80569727213975
     mean reward: adversary -3.842050141927477, agent -24.082571788052647
on episode 64, curr epsilon 0.8768970195551845 (time taken so far: 0-00:00:14.0)
     mean loss: adversary 223.42055460313887
     mean reward: adversary -1.0056777204398775, agent -29.777168804668552
on episode 96, curr epsilon 0.8657420506956067 (time taken so far: 0-00:00:19.0)
     mean loss: adversary 143.3687644372973
     mean reward: adversary -2.2239012196432166, agent -19.012946029531424
on episode 128, curr epsilon 0.8547289837098901 (time taken so far: 0-00:00:25.0)
     mean loss: adversary 35.895247541106826
     mean reward: adversary -15.75631182250897, agent -21.305499147674382
on episode 160, curr epsilon 0.843856013470467 (time taken so far: 0-00:00:30.0)
     mean loss:



124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
90.0
0.0
0.0




124




360.0
90.0
0.0
0.0
0.0
0.0
0.0
0.0
90.0
0.0




124




0.0
0.0
60.0
420.0
90.0
60.0
90.0
60.0
0.0
150.0




124




0.0
60.0
0.0
0.0
0.0
60.0
60.0
90.0
0.0
0.0




124




30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0
90.0
0.0




124




60.0
390.0
270.0
0.0
30.0
150.0
0.0
0.0
0.0
0.0




124




0.0
60.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
180.0




124




0.0
90.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




540.0
90.0
0.0
420.0
0.0
0.0
0.0
0.0
0.0
60.0
Evaluation Reward for episode 500 is: 44.7
on episode 512, curr epsilon 0.7330058717129788 (time taken so far: 0-00:01:43.0)
     mean loss: adversary 329.1995131455725
     mean reward: adversary 42.14008064190912, agent -38.73967580654051
on episode 544, curr epsilon 0.7236813358889278 (time taken so far: 0-00:01:49.0)
     mean loss: adversary 241.31795519791146
     mean reward: adversary 31.294894737209027, agent -38.237873936745885
on episode 576, curr epsilon 0.7144754170796777 (time taken so far: 0-00:01:55.0)
     mean loss: adversary 235.62354192602805
     mean reward: adversary 35.16067158834328, agent -34.39376325496161
on episode 608, curr epsilon 0.7053866063633399 (time taken so far: 0-00:02:1.0)
     mean loss: adversary 144.89217204250394
     mean reward: adversary 22.661528792091424, agent -36.64461307206503
on episode 640, curr epsilon 0.6964134140129569 (time taken so far: 0-00:02:7.0)
     mean loss: adversary 250.521



124




0.0
0.0
0.0
0.0
120.0
270.0
0.0
60.0
0.0
120.0




124




120.0
0.0
60.0
0.0
120.0
0.0
30.0
0.0
30.0
60.0




124




90.0
30.0
0.0
30.0
0.0
60.0
0.0
30.0
0.0
0.0




124




30.0
0.0
0.0
240.0
0.0
150.0
90.0
60.0
0.0
60.0




124




210.0
60.0
300.0
0.0
0.0
60.0
150.0
0.0
0.0
60.0




124




0.0
0.0
0.0
180.0
0.0
0.0
0.0
120.0
30.0
0.0




124




0.0
150.0
120.0
0.0
90.0
60.0
270.0
0.0
0.0
0.0




124




0.0
60.0
150.0
150.0
210.0
0.0
0.0
0.0
0.0
0.0




124




270.0
0.0
240.0
0.0
0.0
0.0
210.0
90.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
150.0
270.0
0.0
60.0
0.0
Evaluation Reward for episode 1000 is: 55.8
on episode 1024, curr epsilon 0.5972362366787872 (time taken so far: 0-00:03:30.0)
     mean loss: adversary 371.7893388817893
     mean reward: adversary 21.369563967103847, agent -35.37354930178836
on episode 1056, curr epsilon 0.5896388204789431 (time taken so far: 0-00:03:35.0)
     mean loss: adversary 580.0813712678081
     mean reward: adversary 48.320103850925264, agent -45.337006497813896
on episode 1088, curr epsilon 0.5821380506802529 (time taken so far: 0-00:03:41.0)
     mean loss: adversary 700.3841547261758
     mean reward: adversary 57.75947903033185, agent -44.775958223121215
on episode 1120, curr epsilon 0.5747326978480498 (time taken so far: 0-00:03:47.0)
     mean loss: adversary 622.1675774926248
     mean reward: adversary 50.94281174971336, agent -47.89334486009437
on episode 1152, curr epsilon 0.5674215481872513 (time taken so far: 0-00:03:53.0)
     mean loss: adversary 93



124




0.0
0.0
30.0
0.0
90.0
60.0
210.0
60.0
0.0
270.0




124




0.0
30.0
0.0
0.0
60.0
120.0
0.0
0.0
60.0
0.0




124




0.0
90.0
90.0
0.0
30.0
0.0
0.0
0.0
60.0
0.0




124




120.0
60.0
60.0
0.0
0.0
0.0
30.0
240.0
0.0
0.0




124




0.0
60.0
0.0
30.0
60.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0
180.0
150.0




124




0.0
180.0
90.0
0.0
120.0
90.0
0.0
0.0
120.0
0.0




124




60.0
0.0
0.0
0.0
0.0
60.0
30.0
30.0
240.0
0.0




124




90.0
0.0
0.0
0.0
0.0
330.0
0.0
0.0
0.0
30.0




124




0.0
0.0
0.0
90.0
60.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 1500 is: 39.3
on episode 1504, curr epsilon 0.4928842360762302 (time taken so far: 0-00:05:6.0)
     mean loss: adversary 1301.0815516348557
     mean reward: adversary 75.20768168621548, agent -58.89527241504163
on episode 1536, curr epsilon 0.4866142771389811 (time taken so far: 0-00:05:11.0)
     mean loss: adversary 1530.1105693799313
     mean reward: adversary 104.53980631199653, agent -73.42374394693393
on episode 1568, curr epsilon 0.4804240780767642 (time taken so far: 0-00:05:17.0)
     mean loss: adversary 1657.3633574640528
     mean reward: adversary 122.10488552726663, agent -77.70721869187945
on episode 1600, curr epsilon 0.47431262426767706 (time taken so far: 0-00:05:23.0)
     mean loss: adversary 1536.9012243875889
     mean reward: adversary 77.9265217066321, agent -65.60561833784327
on episode 1632, curr epsilon 0.46827891399677846 (time taken so far: 0-00:05:29.0)
     mean loss: adversary 163



124




0.0
0.0
180.0
150.0
0.0
510.0
90.0
690.0
180.0
630.0




124




60.0
390.0
0.0
420.0
30.0
0.0
60.0
240.0
480.0
30.0




124




0.0
0.0
0.0
720.0
420.0
60.0
0.0
390.0
0.0
240.0




124




0.0
120.0
30.0
0.0
0.0
0.0
0.0
300.0
0.0
150.0




124




0.0
360.0
420.0
390.0
270.0
0.0
270.0
30.0
60.0
0.0




124




0.0
990.0
390.0
30.0
30.0
0.0
0.0
60.0
0.0
0.0




124




150.0
270.0
330.0
0.0
570.0
90.0
0.0
180.0
300.0
0.0




124




0.0
180.0
90.0
0.0
60.0
270.0
30.0
330.0
420.0
90.0




124




390.0
960.0
660.0
420.0
0.0
0.0
0.0
540.0
120.0
150.0




124




720.0
600.0
600.0
0.0
480.0
30.0
360.0
0.0
360.0
0.0
Evaluation Reward for episode 2000 is: 196.2
on episode 2016, curr epsilon 0.40159067973705354 (time taken so far: 0-00:06:51.0)
     mean loss: adversary 2787.4791223316925
     mean reward: adversary 136.7335666281909, agent -86.35864039227924
on episode 2048, curr epsilon 0.3964820621606865 (time taken so far: 0-00:06:57.0)
     mean loss: adversary 3274.465634629496
     mean reward: adversary 149.07444658701405, agent -89.52354079928307
on episode 2080, curr epsilon 0.3914384310863933 (time taken so far: 0-00:07:3.0)
     mean loss: adversary 3407.369266575468
     mean reward: adversary 98.72716953064787, agent -68.58694547140627
on episode 2112, curr epsilon 0.3864589598237064 (time taken so far: 0-00:07:9.0)
     mean loss: adversary 3418.636614822806
     mean reward: adversary 94.34243177866969, agent -69.31701190426796
on episode 2144, curr epsilon 0.3815428321984522 (time taken so far: 0-00:07:15.0)
     mean loss: advers



124




300.0
300.0
390.0
840.0
180.0
240.0
240.0
90.0
540.0
390.0




124




120.0
810.0
330.0
330.0
420.0
210.0
180.0
540.0
60.0
510.0




124




480.0
90.0
630.0
90.0
60.0
150.0
210.0
1350.0
210.0
150.0




124




1140.0
180.0
1350.0
1020.0
180.0
180.0
330.0
120.0
240.0
90.0




124




180.0
120.0
480.0
240.0
420.0
630.0
480.0
330.0
150.0
300.0




124




180.0
480.0
90.0
390.0
150.0
780.0
660.0
450.0
480.0
510.0




124




1110.0
270.0
240.0
390.0
300.0
210.0
360.0
270.0
210.0
210.0




124




450.0
930.0
570.0
390.0
240.0
750.0
420.0
270.0
450.0
240.0




124




300.0
330.0
510.0
600.0
150.0
1140.0
420.0
450.0
600.0
210.0




124




360.0
390.0
360.0
270.0
390.0
360.0
390.0
210.0
150.0
450.0
Evaluation Reward for episode 2500 is: 390.9
on episode 2528, curr epsilon 0.3272068008008388 (time taken so far: 0-00:08:36.0)
     mean loss: adversary 5233.934803429174
     mean reward: adversary 137.61362853130927, agent -82.28129064620603
on episode 2560, curr epsilon 0.32304441731431843 (time taken so far: 0-00:08:42.0)
     mean loss: adversary 4935.679807940882
     mean reward: adversary 135.05396279373744, agent -84.48964804317096
on episode 2592, curr epsilon 0.31893498332715586 (time taken so far: 0-00:08:48.0)
     mean loss: adversary 5426.212608229392
     mean reward: adversary 144.72045367319362, agent -91.14794332164834
on episode 2624, curr epsilon 0.3148778252710719 (time taken so far: 0-00:08:54.0)
     mean loss: adversary 5561.6723415438355
     mean reward: adversary 150.71861365008428, agent -93.67073386254391
on episode 2656, curr epsilon 0.3108722781462201 (time taken so far: 0-00:09:0.0)
     mean 



124




0.0
60.0
150.0
180.0
30.0
0.0
450.0
180.0
330.0
90.0




124




180.0
0.0
0.0
90.0
90.0
60.0
240.0
60.0
30.0
30.0




124




150.0
150.0
90.0
0.0
90.0
210.0
30.0
300.0
90.0
210.0




124




90.0
120.0
60.0
240.0
90.0
30.0
60.0
0.0
150.0
180.0




124




30.0
180.0
30.0
240.0
90.0
0.0
120.0
0.0
0.0
300.0




124




90.0
90.0
90.0
90.0
60.0
0.0
30.0
0.0
180.0
150.0




124




150.0
180.0
0.0
120.0
60.0
90.0
30.0
0.0
0.0
300.0




124




0.0
180.0
570.0
330.0
30.0
0.0
330.0
30.0
120.0
0.0




124




90.0
90.0
90.0
150.0
120.0
0.0
60.0
0.0
60.0
60.0




124




0.0
0.0
90.0
30.0
60.0
210.0
330.0
150.0
270.0
0.0
Evaluation Reward for episode 3000 is: 107.4
on episode 3008, curr epsilon 0.2700356477840573 (time taken so far: 0-00:10:16.0)
     mean loss: adversary 7747.787257004478
     mean reward: adversary 173.33914883175407, agent -96.31268638855784
on episode 3040, curr epsilon 0.2666005360493463 (time taken so far: 0-00:10:22.0)
     mean loss: adversary 8182.527290369084
     mean reward: adversary 161.41193543546177, agent -97.87931114050174
on episode 3072, curr epsilon 0.2632091222216591 (time taken so far: 0-00:10:28.0)
     mean loss: adversary 7970.416515040952
     mean reward: adversary 180.20209953413766, agent -106.29674262329762
on episode 3104, curr epsilon 0.2598608504218202 (time taken so far: 0-00:10:34.0)
     mean loss: adversary 8646.872768377485
     mean reward: adversary 187.59487251883107, agent -104.73449622738615
on episode 3136, curr epsilon 0.256555171841969 (time taken so far: 0-00:10:40.0)
     mean loss: adve



124




0.0
0.0
0.0
0.0
0.0
0.0
150.0
0.0
0.0
0.0




124




0.0
30.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
60.0




124




30.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0
0.0
60.0




124




0.0
0.0
0.0
0.0
30.0
120.0
0.0
0.0
0.0
0.0




124




0.0
120.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0




124




0.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 3500 is: 9.3
on episode 3520, curr epsilon 0.22001880240710908 (time taken so far: 0-00:12:2.0)
     mean loss: adversary 13902.393478923292
     mean reward: adversary 135.74646902240463, agent -88.88475534411045
on episode 3552, curr epsilon 0.2172199527877799 (time taken so far: 0-00:12:8.0)
     mean loss: adversary 13551.39388283055
     mean reward: adversary 170.8142270760938, agent -97.26639829373457
on episode 3584, curr epsilon 0.2144567072127684 (time taken so far: 0-00:12:14.0)
     mean loss: adversary 13530.7100799595
     mean reward: adversary 154.87911266217236, agent -91.3814687061312
on episode 3616, curr epsilon 0.21172861276457478 (time taken so far: 0-00:12:21.0)
     mean loss: adversary 13846.07066301943
     mean reward: adversary 179.0676012502902, agent -103.49267172568167
on episode 3648, curr epsilon 0.20903522228724314 (time taken so far: 0-00:12:27.0)
     mean loss: adversary 13799.64



124




30.0
630.0
120.0
690.0
420.0
210.0
300.0
840.0
720.0
60.0




124




330.0
150.0
30.0
120.0
90.0
60.0
360.0
330.0
630.0
180.0




124




420.0
210.0
60.0
0.0
180.0
390.0
570.0
90.0
450.0
780.0




124




360.0
60.0
150.0
150.0
180.0
180.0
60.0
30.0
180.0
480.0




124




240.0
930.0
150.0
270.0
180.0
660.0
150.0
450.0
420.0
720.0




124




210.0
120.0
420.0
240.0
480.0
1170.0
90.0
150.0
90.0
240.0




124




90.0
240.0
90.0
630.0
300.0
90.0
570.0
180.0
300.0
690.0




124




90.0
300.0
450.0
840.0
150.0
510.0
210.0
270.0
330.0
480.0




124




60.0
150.0
270.0
90.0
630.0
480.0
540.0
810.0
180.0
450.0




124




330.0
750.0
120.0
120.0
90.0
90.0
270.0
0.0
240.0
60.0
Evaluation Reward for episode 4000 is: 312.0
on episode 4032, curr epsilon 0.17926623321736299 (time taken so far: 0-00:13:49.0)
     mean loss: adversary 19191.976967550596
     mean reward: adversary 188.28355917615562, agent -105.58862278994401
on episode 4064, curr epsilon 0.17698579525883532 (time taken so far: 0-00:13:54.0)
     mean loss: adversary 19918.40409395456
     mean reward: adversary 213.97941458686626, agent -115.9930050817246
on episode 4096, curr epsilon 0.17473436665242795 (time taken so far: 0-00:14:0.0)
     mean loss: adversary 20809.039380937462
     mean reward: adversary 168.93244582434642, agent -93.12371453824825
on episode 4128, curr epsilon 0.1725115783714339 (time taken so far: 0-00:14:6.0)
     mean loss: adversary 22101.908625457818
     mean reward: adversary 256.97586151618094, agent -140.48987154435835
on episode 4160, curr epsilon 0.17031706608351888 (time taken so far: 0-00:14:11.0)
     mean 



124




330.0
120.0
120.0
150.0
150.0
180.0
300.0
120.0
690.0
510.0




124




600.0
210.0
150.0
570.0
60.0
90.0
270.0
180.0
300.0
420.0




124




420.0
630.0
180.0
450.0
300.0
180.0
120.0
150.0
270.0
330.0




124




90.0
60.0
180.0
210.0
210.0
240.0
480.0
180.0
720.0
60.0




124




90.0
450.0
810.0
450.0
300.0
210.0
90.0
330.0
300.0
240.0




124




630.0
0.0
330.0
630.0
240.0
120.0
330.0
690.0
420.0
150.0




124




210.0
120.0
840.0
450.0
210.0
240.0
300.0
240.0
90.0
330.0




124




330.0
480.0
360.0
810.0
180.0
420.0
150.0
150.0
330.0
210.0




124




450.0
420.0
90.0
540.0
150.0
150.0
300.0
300.0
210.0
690.0




124




210.0
900.0
150.0
270.0
390.0
660.0
450.0
330.0
480.0
420.0
Evaluation Reward for episode 4500 is: 318.0
on episode 4512, curr epsilon 0.1479439708899059 (time taken so far: 0-00:15:27.0)
     mean loss: adversary 31091.712911680246
     mean reward: adversary 172.21422436744342, agent -93.50682139250554
on episode 4544, curr epsilon 0.1460619820686001 (time taken so far: 0-00:15:32.0)
     mean loss: adversary 30484.281284478227
     mean reward: adversary 142.50789046333617, agent -85.19709851633012
on episode 4576, curr epsilon 0.14420393394526412 (time taken so far: 0-00:15:38.0)
     mean loss: adversary 30387.755029911496
     mean reward: adversary 182.96305098653517, agent -105.13567588959431
on episode 4608, curr epsilon 0.1423695219713199 (time taken so far: 0-00:15:44.0)
     mean loss: adversary 29899.534610598424
     mean reward: adversary 183.00988887827145, agent -103.44431799720633
on episode 4640, curr epsilon 0.14055844547233876 (time taken so far: 0-00:15:50.0)
    



124




60.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
30.0




124




30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0




124




30.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
240.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
30.0
0.0
0.0
60.0
0.0
60.0
0.0
0.0




124




0.0
0.0
0.0
0.0
60.0
60.0
30.0
0.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
0.0
30.0
60.0
0.0




124




0.0
0.0
0.0
0.0
120.0
0.0
0.0
0.0
0.0
60.0




124




120.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 5000 is: 13.5
on episode 5024, curr epsilon 0.12054132691613881 (time taken so far: 0-00:17:10.0)
     mean loss: adversary 35095.25393719695
     mean reward: adversary 156.06449810134603, agent -96.62220679125143
on episode 5056, curr epsilon 0.11900792593739698 (time taken so far: 0-00:17:16.0)
     mean loss: adversary 35418.42795134049
     mean reward: adversary 183.35092242020795, agent -103.45383538631901
on episode 5088, curr epsilon 0.11749403128583571 (time taken so far: 0-00:17:22.0)
     mean loss: adversary 36557.33317387071
     mean reward: adversary 164.89519264430373, agent -99.84384581401882
on episode 5120, curr epsilon 0.11599939482231508 (time taken so far: 0-00:17:28.0)
     mean loss: adversary 35402.853131961005
     mean reward: adversary 178.65311165017957, agent -103.11652508785215
on episode 5152, curr epsilon 0.11452377156426229 (time taken so far: 0-00:17:34.0)
     mean loss: advers



124




30.0
0.0
150.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
120.0
0.0
0.0
0.0
30.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
660.0
0.0




124




0.0
0.0
60.0
0.0
0.0
0.0
0.0
30.0
60.0
0.0




124




60.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
360.0
0.0




124




120.0
30.0
60.0
90.0
150.0
30.0
0.0
60.0
0.0
0.0




124




0.0
60.0
0.0
60.0
0.0
0.0
30.0
60.0
0.0
0.0




124




300.0
0.0
0.0
60.0
30.0
60.0
90.0
0.0
180.0
300.0




124




60.0
0.0
0.0
0.0
0.0
120.0
30.0
30.0
0.0
30.0




124




0.0
0.0
60.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
Evaluation Reward for episode 5500 is: 38.1
on episode 5504, curr epsilon 0.09947976392569516 (time taken so far: 0-00:18:49.0)
     mean loss: adversary 40372.81513438822
     mean reward: adversary 194.7303345393664, agent -98.82802803958562
on episode 5536, curr epsilon 0.09821428617402919 (time taken so far: 0-00:18:55.0)
     mean loss: adversary 45293.82873807111
     mean reward: adversary 115.58760737223798, agent -76.2712195296116
on episode 5568, curr epsilon 0.09696490650982102 (time taken so far: 0-00:19:1.0)
     mean loss: adversary 48049.35569836061
     mean reward: adversary 123.54424192600986, agent -79.1459002836007
on episode 5600, curr epsilon 0.09573142015000011 (time taken so far: 0-00:19:7.0)
     mean loss: adversary 48341.5689175935
     mean reward: adversary 94.91475455093929, agent -74.21109046394761
on episode 5632, curr epsilon 0.09451362491653233 (time taken so far: 0-00:19:12.0)
     mean loss: adversary 47456.2



124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




180.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0
60.0




124




60.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
0.0
0.0
60.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




0.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 6000 is: 8.7
on episode 6016, curr epsilon 0.08105381160703781 (time taken so far: 0-00:20:33.0)
     mean loss: adversary 52507.8376779378
     mean reward: adversary 152.42202377886613, agent -95.7792476387458
on episode 6048, curr epsilon 0.08002272959368441 (time taken so far: 0-00:20:39.0)
     mean loss: adversary 53246.65915132554
     mean reward: adversary 172.81808473287134, agent -96.94006752377197
on episode 6080, curr epsilon 0.07900476392979293 (time taken so far: 0-00:20:45.0)
     mean loss: adversary 54386.75489367741
     mean reward: adversary 116.06419807709858, agent -75.14942433286818
on episode 6112, curr epsilon 0.07799974776285219 (time taken so far: 0-00:20:51.0)
     mean loss: adversary 55140.06761182907
     mean reward: adversary 135.72736369904652, agent -87.20953333874257
on episode 6144, curr epsilon 0.07700751636287444 (time taken so far: 0-00:20:57.0)
     mean loss: adversary 5645



124




510.0
60.0
120.0
330.0
180.0
300.0
210.0
390.0
60.0
120.0




124




150.0
240.0
630.0
360.0
150.0
90.0
210.0
30.0
810.0
300.0




124




510.0
210.0
420.0
180.0
180.0
300.0
240.0
30.0
0.0
150.0




124




0.0
150.0
60.0
0.0
120.0
90.0
420.0
180.0
330.0
120.0




124




180.0
120.0
60.0
330.0
60.0
90.0
240.0
270.0
120.0
390.0




124




180.0
330.0
0.0
240.0
210.0
120.0
390.0
180.0
330.0
60.0




124




0.0
180.0
180.0
270.0
360.0
360.0
180.0
420.0
420.0
180.0




124




60.0
420.0
210.0
120.0
30.0
120.0
0.0
270.0
90.0
300.0




124




150.0
270.0
210.0
0.0
0.0
90.0
60.0
480.0
60.0
240.0




124




180.0
330.0
90.0
60.0
480.0
510.0
60.0
360.0
330.0
240.0
Evaluation Reward for episode 6500 is: 212.1
on episode 6528, curr epsilon 0.0660407716783118 (time taken so far: 0-00:22:17.0)
     mean loss: adversary 51556.689013407886
     mean reward: adversary 198.30601277152428, agent -109.66303888228997
on episode 6560, curr epsilon 0.06520066989314696 (time taken so far: 0-00:22:23.0)
     mean loss: adversary 50576.01617562982
     mean reward: adversary 201.69058068838567, agent -114.38521518048229
on episode 6592, curr epsilon 0.0643712550062648 (time taken so far: 0-00:22:28.0)
     mean loss: adversary 52404.760280143244
     mean reward: adversary 228.0735051328859, agent -112.37382137619103
on episode 6624, curr epsilon 0.06355239107009693 (time taken so far: 0-00:22:34.0)
     mean loss: adversary 60929.4602843281
     mean reward: adversary 202.0106902465403, agent -108.46664146392614
on episode 6656, curr epsilon 0.06274394386645807 (time taken so far: 0-00:22:40.0)
     mean



124




240.0
0.0
150.0
90.0
0.0
150.0
30.0
60.0
120.0
0.0




124




120.0
60.0
60.0
30.0
60.0
30.0
240.0
0.0
0.0
0.0




124




150.0
30.0
30.0
30.0
150.0
90.0
30.0
150.0
0.0
30.0




124




120.0
0.0
0.0
60.0
90.0
60.0
0.0
90.0
60.0
30.0




124




0.0
0.0
150.0
0.0
90.0
0.0
120.0
0.0
90.0
90.0




124




60.0
0.0
180.0
30.0
120.0
120.0
30.0
120.0
0.0
330.0




124




0.0
0.0
30.0
30.0
60.0
150.0
0.0
30.0
0.0
210.0




124




90.0
180.0
60.0
0.0
120.0
60.0
30.0
90.0
0.0
90.0




124




60.0
150.0
210.0
120.0
90.0
30.0
60.0
180.0
90.0
30.0




124




30.0
120.0
0.0
0.0
30.0
120.0
0.0
90.0
30.0
60.0
Evaluation Reward for episode 7000 is: 69.0
on episode 7008, curr epsilon 0.0545018089986662 (time taken so far: 0-00:23:55.0)
     mean loss: adversary 61648.885393540426
     mean reward: adversary 162.8766889982985, agent -94.29058566092984
on episode 7040, curr epsilon 0.05380849385605209 (time taken so far: 0-00:24:1.0)
     mean loss: adversary 60876.72821711682
     mean reward: adversary 169.87723091354934, agent -99.5186645916945
on episode 7072, curr epsilon 0.05312399834522287 (time taken so far: 0-00:24:7.0)
     mean loss: adversary 63385.336052324325
     mean reward: adversary 186.13191701540427, agent -112.01379651868596
on episode 7104, curr epsilon 0.05244821027202605 (time taken so far: 0-00:24:13.0)
     mean loss: adversary 69037.6660707183
     mean reward: adversary 151.79232049958063, agent -93.15656568877317
on episode 7136, curr epsilon 0.051781018869526116 (time taken so far: 0-00:24:18.0)
     mean loss: adver



124




0.0
210.0
30.0
0.0
300.0
0.0
60.0
30.0
90.0
210.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
90.0
0.0
0.0




124




60.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
150.0




124




0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0
60.0
0.0




124




0.0
0.0
0.0
0.0
0.0
60.0
360.0
0.0
150.0
0.0




124




0.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0
0.0




124




0.0
240.0
0.0
60.0
0.0
180.0
90.0
30.0
30.0
0.0




124




0.0
0.0
0.0
510.0
0.0
0.0
0.0
150.0
0.0
120.0




124




0.0
0.0
240.0
90.0
0.0
0.0
0.0
0.0
60.0
0.0




124




0.0
0.0
0.0
0.0
30.0
90.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 7500 is: 40.2
on episode 7520, curr epsilon 0.05 (time taken so far: 0-00:25:41.0)
     mean loss: adversary 79323.43340877295
     mean reward: adversary 162.00607340715175, agent -96.68589624604479
on episode 7552, curr epsilon 0.05 (time taken so far: 0-00:25:47.0)
     mean loss: adversary 77460.1652591606
     mean reward: adversary 149.40279272943215, agent -87.73455676192634
on episode 7584, curr epsilon 0.05 (time taken so far: 0-00:25:53.0)
     mean loss: adversary 77382.82654129388
     mean reward: adversary 159.71016445070595, agent -98.30869801400333
on episode 7616, curr epsilon 0.05 (time taken so far: 0-00:25:59.0)
     mean loss: adversary 76393.10924088355
     mean reward: adversary 211.6108966110641, agent -112.34889735935676
on episode 7648, curr epsilon 0.05 (time taken so far: 0-00:26:5.0)
     mean loss: adversary 75719.54480660673
     mean reward: adversary 210.53934368732084, agent -112.



124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
60.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
90.0
60.0
0.0
0.0
0.0
0.0
0.0
30.0




124




0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




0.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0




124




30.0
0.0
0.0
30.0
30.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
Evaluation Reward for episode 8000 is: 7.2
on episode 8032, curr epsilon 0.05 (time taken so far: 0-00:27:29.0)
     mean loss: adversary 101520.05925465864
     mean reward: adversary 145.21037323512968, agent -81.09320965784453
on episode 8064, curr epsilon 0.05 (time taken so far: 0-00:27:35.0)
     mean loss: adversary 100447.72880018302
     mean reward: adversary 119.54480575267746, agent -81.69844899749894
on episode 8096, curr epsilon 0.05 (time taken so far: 0-00:27:41.0)
     mean loss: adversary 99714.77284690746
     mean reward: adversary 151.2018544259406, agent -94.00744474162613
on episode 8128, curr epsilon 0.05 (time taken so far: 0-00:27:47.0)
     mean loss: adversary 99502.07116154092
     mean reward: adversary 159.8097369454763, agent -97.33695042513554
on episode 8160, curr epsilon 0.05 (time taken so far: 0-00:27:54.0)
     mean loss: adversary 104019.2677788106
     mean reward: adversary 141.72101879717422, agent -88.9



124




0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




30.0
0.0
60.0
0.0
0.0
60.0
0.0
0.0
60.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
120.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
30.0
0.0
0.0
120.0
0.0
0.0




124




120.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
30.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 8500 is: 8.7
on episode 8512, curr epsilon 0.05 (time taken so far: 0-00:29:14.0)
     mean loss: adversary 108795.66717959428
     mean reward: adversary 218.02989839348973, agent -116.60911312089267
on episode 8544, curr epsilon 0.05 (time taken so far: 0-00:29:20.0)
     mean loss: adversary 110643.29321442684
     mean reward: adversary 141.4308582689446, agent -87.82549190678634
on episode 8576, curr epsilon 0.05 (time taken so far: 0-00:29:26.0)
     mean loss: adversary 107462.11138942602
     mean reward: adversary 186.1141307012185, agent -107.76022863100575
on episode 8608, curr epsilon 0.05 (time taken so far: 0-00:29:32.0)
     mean loss: adversary 107566.1660096827
     mean reward: adversary 133.70553903407207, agent -85.31010332596631
on episode 8640, curr epsilon 0.05 (time taken so far: 0-00:29:38.0)
     mean loss: adversary 106874.30414729798
     mean reward: adversary 155.05438545785373, agent -8



124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0




124




30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0




124




0.0
90.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
30.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 9000 is: 6.3
on episode 9024, curr epsilon 0.05 (time taken so far: 0-00:31:3.0)
     mean loss: adversary 111277.98080770615
     mean reward: adversary 145.0050941637249, agent -85.81945761052917
on episode 9056, curr epsilon 0.05 (time taken so far: 0-00:31:9.0)
     mean loss: adversary 112599.19226291138
     mean reward: adversary 162.3166713825179, agent -93.844215454319
on episode 9088, curr epsilon 0.05 (time taken so far: 0-00:31:15.0)
     mean loss: adversary 112094.62945050164
     mean reward: adversary 204.4653888171477, agent -106.99253718586873
on episode 9120, curr epsilon 0.05 (time taken so far: 0-00:31:20.0)
     mean loss: adversary 110458.35014144698
     mean reward: adversary 190.76836188183955, agent -109.93017212679993
on episode 9152, curr epsilon 0.05 (time taken so far: 0-00:31:26.0)
     mean loss: adversary 109660.7463862215
     mean reward: adversary 176.19993750850497, agent -97.880



124




0.0
0.0
0.0
0.0
0.0
0.0
90.0
60.0
0.0
60.0




124




30.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
90.0
30.0




124




30.0
0.0
0.0
120.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




60.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0




124




0.0
60.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0
0.0
Evaluation Reward for episode 9500 is: 9.9
on episode 9504, curr epsilon 0.05 (time taken so far: 0-00:32:43.0)
     mean loss: adversary 101865.89771631075
     mean reward: adversary 171.85853139893374, agent -98.37157858214563
on episode 9536, curr epsilon 0.05 (time taken so far: 0-00:32:49.0)
     mean loss: adversary 108966.16221627983
     mean reward: adversary 157.52075880810358, agent -91.39810597225471
on episode 9568, curr epsilon 0.05 (time taken so far: 0-00:32:54.0)
     mean loss: adversary 106793.92270767203
     mean reward: adversary 200.75074893574777, agent -108.06452475649866
on episode 9600, curr epsilon 0.05 (time taken so far: 0-00:33:0.0)
     mean loss: adversary 105822.1379597345
     mean reward: adversary 90.75534924242962, agent -66.8304466458987
on episode 9632, curr epsilon 0.05 (time taken so far: 0-00:33:6.0)
     mean loss: adversary 105220.78897330648
     mean reward: adversary 172.62236505405568, agent -99



124




0.0
0.0
0.0
0.0
60.0
60.0
0.0
0.0
30.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0




124




0.0
0.0
0.0
0.0
30.0
30.0
0.0
0.0
30.0
0.0




124




0.0
60.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0




124




0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
90.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 10000 is: 7.2
on episode 10016, curr epsilon 0.05 (time taken so far: 0-00:34:28.0)
     mean loss: adversary 100546.33703799313
     mean reward: adversary 251.0644900573425, agent -130.1946069477978
on episode 10048, curr epsilon 0.05 (time taken so far: 0-00:34:34.0)
     mean loss: adversary 107879.4458305323
     mean reward: adversary 198.4473441949699, agent -108.85439416248856
on episode 10080, curr epsilon 0.05 (time taken so far: 0-00:34:40.0)
     mean loss: adversary 104556.6853517067
     mean reward: adversary 184.19533320734112, agent -111.01522971090462
on episode 10112, curr epsilon 0.05 (time taken so far: 0-00:34:46.0)
     mean loss: adversary 104229.9414077863
     mean reward: adversary 224.6632162502796, agent -116.20138935766425
on episode 10144, curr epsilon 0.05 (time taken so far: 0-00:34:51.0)
     mean loss: adversary 114312.80832605259
     mean reward: adversary 184.16450346996305, agen



124




0.0
90.0
30.0
120.0
120.0
480.0
0.0
180.0
90.0
210.0




124




120.0
150.0
180.0
150.0
0.0
300.0
0.0
60.0
90.0
0.0




124




150.0
30.0
420.0
330.0
180.0
60.0
30.0
270.0
330.0
90.0




124




90.0
180.0
60.0
450.0
30.0
120.0
30.0
120.0
270.0
270.0




124




150.0
60.0
210.0
60.0
120.0
60.0
0.0
0.0
150.0
210.0




124




0.0
270.0
120.0
420.0
0.0
0.0
330.0
210.0
150.0
90.0




124




240.0
150.0
150.0
150.0
210.0
120.0
240.0
300.0
90.0
150.0




124




30.0
60.0
30.0
150.0
450.0
60.0
480.0
0.0
240.0
0.0




124




0.0
0.0
30.0
150.0
30.0
0.0
540.0
270.0
90.0
570.0




124




30.0
180.0
30.0
270.0
150.0
120.0
150.0
60.0
60.0
300.0
Evaluation Reward for episode 10500 is: 148.2
on episode 10528, curr epsilon 0.05 (time taken so far: 0-00:36:14.0)
     mean loss: adversary 127614.55538750679
     mean reward: adversary 171.51801878181246, agent -99.07550836889358
on episode 10560, curr epsilon 0.05 (time taken so far: 0-00:36:20.0)
     mean loss: adversary 126701.57885844138
     mean reward: adversary 205.89849995569276, agent -110.5560518769658
on episode 10592, curr epsilon 0.05 (time taken so far: 0-00:36:26.0)
     mean loss: adversary 129630.33731808519
     mean reward: adversary 190.31258826623105, agent -99.8629230123048
on episode 10624, curr epsilon 0.05 (time taken so far: 0-00:36:32.0)
     mean loss: adversary 136593.1758706934
     mean reward: adversary 156.58839385978763, agent -96.95333585953713
on episode 10656, curr epsilon 0.05 (time taken so far: 0-00:36:38.0)
     mean loss: adversary 132067.44666978982
     mean reward: adversary 156.9



124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
30.0




124




0.0
0.0
0.0
0.0
90.0
0.0
0.0
30.0
0.0
0.0




124




0.0
0.0
90.0
0.0
90.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0
60.0




124




120.0
0.0
0.0
30.0
120.0
0.0
60.0
0.0
0.0
30.0




124




0.0
60.0
0.0
0.0
0.0
0.0
120.0
0.0
0.0
60.0




124




0.0
120.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
60.0
60.0
0.0
0.0
0.0
0.0
60.0
210.0




124




90.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0
0.0




124




60.0
0.0
0.0
30.0
0.0
60.0
0.0
30.0
0.0
0.0
Evaluation Reward for episode 11000 is: 20.4
on episode 11008, curr epsilon 0.05 (time taken so far: 0-00:37:53.0)
     mean loss: adversary 158274.79986076563
     mean reward: adversary 198.28113027824836, agent -104.49085576634326
on episode 11040, curr epsilon 0.05 (time taken so far: 0-00:37:59.0)
     mean loss: adversary 156673.71303259625
     mean reward: adversary 118.33286004001059, agent -80.46494301321322
on episode 11072, curr epsilon 0.05 (time taken so far: 0-00:38:5.0)
     mean loss: adversary 157259.89013739693
     mean reward: adversary 142.84564448508524, agent -84.79409194898552
on episode 11104, curr epsilon 0.05 (time taken so far: 0-00:38:11.0)
     mean loss: adversary 153518.29783597053
     mean reward: adversary 149.06254820701034, agent -87.99767561430083
on episode 11136, curr epsilon 0.05 (time taken so far: 0-00:38:17.0)
     mean loss: adversary 153828.2243939839
     mean reward: adversary 183.0766649699446



124




450.0
540.0
330.0
150.0
240.0
270.0
450.0
480.0
780.0
420.0




124




300.0
660.0
960.0
210.0
840.0
210.0
570.0
360.0
210.0
330.0




124




450.0
540.0
510.0
270.0
240.0
750.0
690.0
330.0
690.0
390.0




124




180.0
840.0
90.0
1110.0
300.0
690.0
360.0
780.0
630.0
900.0




124




510.0
810.0
330.0
690.0
480.0
870.0
630.0
660.0
630.0
150.0




124




600.0
540.0
540.0
630.0
720.0
300.0
630.0
720.0
660.0
330.0




124




420.0
540.0
780.0
570.0
960.0
270.0
300.0
420.0
600.0
270.0




124




870.0
330.0
300.0
510.0
600.0
270.0
570.0
480.0
540.0
300.0




124




540.0
810.0
270.0
360.0
480.0
390.0
480.0
570.0
630.0
480.0




124




360.0
690.0
600.0
450.0
600.0
210.0
630.0
270.0
390.0
420.0
Evaluation Reward for episode 11500 is: 504.6
on episode 11520, curr epsilon 0.05 (time taken so far: 0-00:39:38.0)
     mean loss: adversary 154515.60173457215
     mean reward: adversary 164.868347999829, agent -92.80811351233153
on episode 11552, curr epsilon 0.05 (time taken so far: 0-00:39:44.0)
     mean loss: adversary 151385.98910069722
     mean reward: adversary 189.7999828970414, agent -105.46149903292822
on episode 11584, curr epsilon 0.05 (time taken so far: 0-00:39:50.0)
     mean loss: adversary 156095.0360005144
     mean reward: adversary 204.67298305146332, agent -107.47497549013491
on episode 11616, curr epsilon 0.05 (time taken so far: 0-00:39:56.0)
     mean loss: adversary 156253.37086789752
     mean reward: adversary 167.6017997989295, agent -98.50724942357066
on episode 11648, curr epsilon 0.05 (time taken so far: 0-00:40:2.0)
     mean loss: adversary 153590.96402906522
     mean reward: adversary 201



124




0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
60.0




124




0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




60.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
30.0
0.0
0.0
0.0
0.0
60.0
60.0
0.0




124




0.0
120.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
90.0
90.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0




124




0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0




124




0.0
60.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0




124




30.0
0.0
0.0
60.0
60.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 12000 is: 10.5
on episode 12032, curr epsilon 0.05 (time taken so far: 0-00:41:25.0)
     mean loss: adversary 142143.24795015494
     mean reward: adversary 167.9920614237643, agent -105.18129065695614
on episode 12064, curr epsilon 0.05 (time taken so far: 0-00:41:30.0)
     mean loss: adversary 138431.16687158018
     mean reward: adversary 141.72437374496243, agent -91.8019583948045
on episode 12096, curr epsilon 0.05 (time taken so far: 0-00:41:36.0)
     mean loss: adversary 135408.45008291389
     mean reward: adversary 116.32529984969673, agent -78.43387866938454
on episode 12128, curr epsilon 0.05 (time taken so far: 0-00:41:42.0)
     mean loss: adversary 138030.31880635142
     mean reward: adversary 191.94307058430866, agent -115.15483712431222
on episode 12160, curr epsilon 0.05 (time taken so far: 0-00:41:48.0)
     mean loss: adversary 133233.67640895938
     mean reward: adversary 135.1880192153991



124




0.0
0.0
60.0
60.0
0.0
0.0
90.0
30.0
180.0
120.0




124




30.0
0.0
0.0
30.0
0.0
0.0
240.0
90.0
90.0
0.0




124




30.0
120.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




120.0
60.0
30.0
30.0
30.0
90.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
30.0
210.0
0.0
0.0
90.0
0.0
0.0




124




0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0




124




60.0
60.0
120.0
0.0
0.0
0.0
0.0
0.0
0.0
210.0




124




60.0
0.0
0.0
0.0
0.0
60.0
30.0
0.0
0.0
90.0




124




0.0
0.0
60.0
0.0
0.0
60.0
0.0
60.0
0.0
60.0




124




0.0
120.0
0.0
30.0
0.0
0.0
30.0
0.0
30.0
0.0
Evaluation Reward for episode 12500 is: 31.2
on episode 12512, curr epsilon 0.05 (time taken so far: 0-00:43:4.0)
     mean loss: adversary 133320.23154560593
     mean reward: adversary 152.98028975897748, agent -100.45728143894064
on episode 12544, curr epsilon 0.05 (time taken so far: 0-00:43:10.0)
     mean loss: adversary 137407.75984179636
     mean reward: adversary 173.4925739762736, agent -104.37166294995586
on episode 12576, curr epsilon 0.05 (time taken so far: 0-00:43:16.0)
     mean loss: adversary 132270.4080991337
     mean reward: adversary 229.48018111771535, agent -124.33610449012089
on episode 12608, curr epsilon 0.05 (time taken so far: 0-00:43:21.0)
     mean loss: adversary 133914.19849871748
     mean reward: adversary 223.15793220529233, agent -120.61146403794997
on episode 12640, curr epsilon 0.05 (time taken so far: 0-00:43:27.0)
     mean loss: adversary 139403.7135662022
     mean reward: adversary 203.05820462354



124




120.0
90.0
150.0
30.0
0.0
90.0
300.0
60.0
90.0
60.0




124




0.0
120.0
210.0
210.0
240.0
570.0
0.0
0.0
90.0
60.0




124




150.0
150.0
60.0
0.0
180.0
90.0
60.0
0.0
180.0
60.0




124




210.0
0.0
180.0
120.0
90.0
90.0
180.0
30.0
120.0
60.0




124




330.0
90.0
0.0
210.0
150.0
210.0
60.0
0.0
30.0
60.0




124




300.0
240.0
120.0
90.0
210.0
210.0
360.0
150.0
150.0
120.0




124




90.0
0.0
90.0
150.0
0.0
0.0
0.0
120.0
60.0
30.0




124




0.0
90.0
270.0
0.0
0.0
210.0
90.0
60.0
210.0
120.0




124




120.0
90.0
150.0
240.0
0.0
60.0
60.0
180.0
120.0
360.0




124




330.0
390.0
240.0
60.0
240.0
240.0
0.0
0.0
0.0
150.0
Evaluation Reward for episode 13000 is: 122.1
on episode 13024, curr epsilon 0.05 (time taken so far: 0-00:44:48.0)
     mean loss: adversary 147503.5095034668
     mean reward: adversary 130.89481525988876, agent -85.43447942181653
on episode 13056, curr epsilon 0.05 (time taken so far: 0-00:44:54.0)
     mean loss: adversary 146573.78465031364
     mean reward: adversary 193.9319187841607, agent -111.0804842286811
on episode 13088, curr epsilon 0.05 (time taken so far: 0-00:45:0.0)
     mean loss: adversary 147480.89466614497
     mean reward: adversary 175.76102669143853, agent -98.08816469753228
on episode 13120, curr epsilon 0.05 (time taken so far: 0-00:45:6.0)
     mean loss: adversary 139436.83906878124
     mean reward: adversary 217.29161182319316, agent -115.01657124677908
on episode 13152, curr epsilon 0.05 (time taken so far: 0-00:45:12.0)
     mean loss: adversary 142464.17145595927
     mean reward: adversary 234.62137



124




150.0
0.0
0.0
0.0
60.0
270.0
60.0
0.0
0.0
0.0




124




30.0
0.0
60.0
0.0
30.0
0.0
0.0
0.0
30.0
0.0




124




0.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0




124




0.0
60.0
30.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
90.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0




124




120.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
60.0




124




0.0
0.0
0.0
90.0
210.0
180.0
30.0
0.0
0.0
0.0




124




0.0
60.0
0.0
0.0
60.0
0.0
30.0
0.0
60.0
60.0




124




30.0
0.0
0.0
60.0
0.0
60.0
0.0
0.0
60.0
60.0
Evaluation Reward for episode 13500 is: 23.4
on episode 13504, curr epsilon 0.05 (time taken so far: 0-00:46:28.0)
     mean loss: adversary 132035.56764115518
     mean reward: adversary 118.19378365664072, agent -81.19066802486456
on episode 13536, curr epsilon 0.05 (time taken so far: 0-00:46:34.0)
     mean loss: adversary 136384.52668091859
     mean reward: adversary 107.51608901264835, agent -74.64202649650117
on episode 13568, curr epsilon 0.05 (time taken so far: 0-00:46:40.0)
     mean loss: adversary 129684.32800166214
     mean reward: adversary 129.93724271107868, agent -89.58158720364695
on episode 13600, curr epsilon 0.05 (time taken so far: 0-00:46:46.0)
     mean loss: adversary 126106.23414337178
     mean reward: adversary 104.48978148395322, agent -77.54465271826558
on episode 13632, curr epsilon 0.05 (time taken so far: 0-00:46:52.0)
     mean loss: adversary 128926.25452757915
     mean reward: adversary 174.86018683194



124




300.0
180.0
270.0
150.0
360.0
270.0
300.0
330.0
210.0
510.0




124




180.0
90.0
150.0
210.0
90.0
120.0
630.0
420.0
150.0
210.0




124




120.0
150.0
180.0
120.0
360.0
600.0
90.0
60.0
390.0
120.0




124




180.0
210.0
540.0
90.0
180.0
210.0
90.0
570.0
60.0
210.0




124




210.0
180.0
600.0
360.0
90.0
120.0
300.0
210.0
360.0
450.0




124




360.0
360.0
120.0
360.0
390.0
270.0
210.0
150.0
270.0
390.0




124




420.0
150.0
270.0
600.0
360.0
270.0
180.0
180.0
540.0
120.0




124




330.0
120.0
390.0
210.0
30.0
510.0
390.0
270.0
120.0
180.0




124




270.0
150.0
270.0
300.0
60.0
150.0
120.0
390.0
210.0
360.0




124




180.0
570.0
390.0
210.0
0.0
150.0
150.0
150.0
120.0
360.0
Evaluation Reward for episode 14000 is: 256.2
on episode 14016, curr epsilon 0.05 (time taken so far: 0-00:48:13.0)
     mean loss: adversary 113474.77658284426
     mean reward: adversary 207.31235953862296, agent -114.29758113340871
on episode 14048, curr epsilon 0.05 (time taken so far: 0-00:48:19.0)
     mean loss: adversary 110847.30803406547
     mean reward: adversary 202.09391848111443, agent -112.51397548937288
on episode 14080, curr epsilon 0.05 (time taken so far: 0-00:48:25.0)
     mean loss: adversary 110812.65307065414
     mean reward: adversary 156.58112937542984, agent -87.38407334936916
on episode 14112, curr epsilon 0.05 (time taken so far: 0-00:48:30.0)
     mean loss: adversary 111510.2413526212
     mean reward: adversary 124.25390126536759, agent -83.86606879233884
on episode 14144, curr epsilon 0.05 (time taken so far: 0-00:48:36.0)
     mean loss: adversary 108337.32480177138
     mean reward: adversary 



124




0.0
0.0
30.0
30.0
0.0
0.0
120.0
60.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0




124




0.0
0.0
60.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
60.0
180.0
30.0
0.0
0.0
60.0
30.0
0.0
0.0




124




0.0
0.0
0.0
0.0
60.0
0.0
0.0
60.0
60.0
60.0




124




0.0
0.0
0.0
0.0
0.0
60.0
0.0
30.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 14500 is: 11.7
on episode 14528, curr epsilon 0.05 (time taken so far: 0-00:49:57.0)
     mean loss: adversary 126746.39658923028
     mean reward: adversary 97.25003885394275, agent -71.23832611853918
on episode 14560, curr epsilon 0.05 (time taken so far: 0-00:50:3.0)
     mean loss: adversary 126697.0620385918
     mean reward: adversary 149.97770584783603, agent -92.99524937790731
on episode 14592, curr epsilon 0.05 (time taken so far: 0-00:50:9.0)
     mean loss: adversary 125499.49454189876
     mean reward: adversary 219.91340773728464, agent -119.58386146154956
on episode 14624, curr epsilon 0.05 (time taken so far: 0-00:50:15.0)
     mean loss: adversary 121525.24117704987
     mean reward: adversary 225.42195395156142, agent -115.77185802790022
on episode 14656, curr epsilon 0.05 (time taken so far: 0-00:50:21.0)
     mean loss: adversary 129909.46580625587
     mean reward: adversary 230.215469632336, age



124




60.0
90.0
60.0
330.0
120.0
90.0
30.0
240.0
90.0
0.0




124




30.0
30.0
120.0
30.0
270.0
180.0
60.0
330.0
30.0
120.0




124




300.0
60.0
0.0
120.0
30.0
90.0
120.0
0.0
30.0
90.0




124




30.0
150.0
390.0
60.0
30.0
120.0
60.0
180.0
0.0
150.0




124




0.0
60.0
0.0
150.0
150.0
90.0
90.0
180.0
120.0
60.0




124




210.0
0.0
300.0
60.0
240.0
0.0
210.0
240.0
150.0
390.0




124




60.0
120.0
60.0
0.0
120.0
120.0
30.0
120.0
60.0
60.0




124




0.0
0.0
150.0
330.0
90.0
180.0
30.0
0.0
90.0
120.0




124




210.0
360.0
60.0
0.0
300.0
0.0
90.0
300.0
30.0
0.0




124




120.0
90.0
30.0
120.0
90.0
30.0
60.0
150.0
60.0
0.0
Evaluation Reward for episode 15000 is: 108.6
on episode 15008, curr epsilon 0.05 (time taken so far: 0-00:51:38.0)
     mean loss: adversary 146435.73455686864
     mean reward: adversary 144.0344286472152, agent -93.32412225428538
on episode 15040, curr epsilon 0.05 (time taken so far: 0-00:51:44.0)
     mean loss: adversary 143602.48098460396
     mean reward: adversary 151.5056570429947, agent -96.19115861317319
on episode 15072, curr epsilon 0.05 (time taken so far: 0-00:51:50.0)
     mean loss: adversary 146247.38620125057
     mean reward: adversary 128.9529792468808, agent -87.01717690685373
on episode 15104, curr epsilon 0.05 (time taken so far: 0-00:51:55.0)
     mean loss: adversary 146380.9168107942
     mean reward: adversary 206.70207503981666, agent -108.60253333191623
on episode 15136, curr epsilon 0.05 (time taken so far: 0-00:52:1.0)
     mean loss: adversary 160739.09192384186
     mean reward: adversary 140.3122220



124




0.0
30.0
0.0
0.0
0.0
0.0
0.0
330.0
240.0
840.0




124




210.0
60.0
480.0
0.0
390.0
0.0
360.0
0.0
150.0
90.0




124




120.0
60.0
180.0
0.0
0.0
300.0
270.0
0.0
0.0
0.0




124




0.0
0.0
120.0
30.0
30.0
30.0
600.0
0.0
0.0
0.0




124




0.0
60.0
240.0
30.0
330.0
0.0
210.0
60.0
0.0
60.0




124




330.0
0.0
240.0
210.0
0.0
390.0
330.0
0.0
360.0
0.0




124




0.0
0.0
0.0
0.0
120.0
180.0
90.0
0.0
30.0
0.0




124




60.0
180.0
0.0
0.0
120.0
120.0
0.0
0.0
180.0
810.0




124




0.0
390.0
0.0
0.0
150.0
180.0
60.0
270.0
60.0
0.0




124




60.0
0.0
0.0
450.0
270.0
990.0
630.0
120.0
0.0
450.0
Evaluation Reward for episode 15500 is: 137.4
on episode 15520, curr epsilon 0.05 (time taken so far: 0-00:53:23.0)
     mean loss: adversary 152647.9459166684
     mean reward: adversary 154.4617494244206, agent -94.58223650871395
on episode 15552, curr epsilon 0.05 (time taken so far: 0-00:53:29.0)
     mean loss: adversary 153073.2451234863
     mean reward: adversary 175.94214591358948, agent -91.96288567275839
on episode 15584, curr epsilon 0.05 (time taken so far: 0-00:53:35.0)
     mean loss: adversary 146248.1147458841
     mean reward: adversary 191.09473580680725, agent -114.19662591186737
on episode 15616, curr epsilon 0.05 (time taken so far: 0-00:53:41.0)
     mean loss: adversary 148766.09043914726
     mean reward: adversary 196.61681203626117, agent -102.56029503625811
on episode 15648, curr epsilon 0.05 (time taken so far: 0-00:53:47.0)
     mean loss: adversary 149565.45442361612
     mean reward: adversary 92.62037



124




360.0
120.0
330.0
330.0
420.0
240.0
570.0
450.0
330.0
510.0




124




240.0
570.0
1140.0
1080.0
240.0
180.0
180.0
420.0
450.0
510.0




124




30.0
240.0
150.0
480.0
150.0
360.0
540.0
150.0
270.0
180.0




124




150.0
90.0
180.0
270.0
150.0
600.0
510.0
390.0
300.0
390.0




124




90.0
360.0
660.0
420.0
420.0
300.0
180.0
510.0
450.0
660.0




124




300.0
450.0
240.0
240.0
300.0
300.0
330.0
240.0
180.0
210.0




124




660.0
180.0
240.0
210.0
450.0
240.0
630.0
300.0
510.0
480.0




124




840.0
180.0
270.0
360.0
720.0
360.0
270.0
660.0
240.0
150.0




124




300.0
330.0
150.0
120.0
450.0
420.0
810.0
750.0
300.0
180.0




124




390.0
210.0
450.0
420.0
30.0
180.0
510.0
360.0
420.0
300.0
Evaluation Reward for episode 16000 is: 361.2
on episode 16032, curr epsilon 0.05 (time taken so far: 0-00:55:11.0)
     mean loss: adversary 148855.7174232027
     mean reward: adversary 209.6026492159629, agent -112.43601834655566
on episode 16064, curr epsilon 0.05 (time taken so far: 0-00:55:17.0)
     mean loss: adversary 157493.66481561225
     mean reward: adversary 200.53598408420075, agent -109.72272750481288
on episode 16096, curr epsilon 0.05 (time taken so far: 0-00:55:24.0)
     mean loss: adversary 159531.9393179637
     mean reward: adversary 172.14662368376764, agent -98.75947075567703
on episode 16128, curr epsilon 0.05 (time taken so far: 0-00:55:30.0)
     mean loss: adversary 155497.00403238094
     mean reward: adversary 204.55575997512236, agent -116.19641754363404
on episode 16160, curr epsilon 0.05 (time taken so far: 0-00:55:36.0)
     mean loss: adversary 145144.43127643852
     mean reward: adversary 



124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




270.0
0.0
0.0
30.0
0.0
0.0
0.0
0.0
60.0
30.0




124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
0.0
0.0
0.0
0.0
0.0
30.0
60.0
30.0




124




0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0




124




30.0
0.0
0.0
0.0
0.0
0.0
30.0
30.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
30.0
0.0
0.0
0.0
30.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
Evaluation Reward for episode 16500 is: 9.9
on episode 16512, curr epsilon 0.05 (time taken so far: 0-00:56:53.0)
     mean loss: adversary 156893.59338080062
     mean reward: adversary 173.3846043425604, agent -100.29533156809013
on episode 16544, curr epsilon 0.05 (time taken so far: 0-00:56:59.0)
     mean loss: adversary 152010.23426386467
     mean reward: adversary 200.39283609714514, agent -105.95156084775186
on episode 16576, curr epsilon 0.05 (time taken so far: 0-00:57:5.0)
     mean loss: adversary 163532.27275250628
     mean reward: adversary 111.86714447143567, agent -75.57199294974964
on episode 16608, curr epsilon 0.05 (time taken so far: 0-00:57:11.0)
     mean loss: adversary 162742.47756054567
     mean reward: adversary 169.56349673364312, agent -97.4004683467394
on episode 16640, curr epsilon 0.05 (time taken so far: 0-00:57:17.0)
     mean loss: adversary 160347.61965641065
     mean reward: adversary 126.48228027620937, a



124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
0.0
0.0




124




0.0
0.0
0.0
120.0
0.0
0.0
30.0
150.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0
60.0




124




0.0
60.0
0.0
0.0
150.0
0.0
30.0
150.0
0.0
120.0




124




0.0
0.0
0.0
0.0
0.0
0.0
30.0
60.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




90.0
0.0
0.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0




124




0.0
0.0
210.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 17000 is: 14.1
on episode 17024, curr epsilon 0.05 (time taken so far: 0-00:58:39.0)
     mean loss: adversary 144747.13198257034
     mean reward: adversary 205.87357394809564, agent -106.89226484352018
on episode 17056, curr epsilon 0.05 (time taken so far: 0-00:58:45.0)
     mean loss: adversary 140031.17600572403
     mean reward: adversary 208.75630093513314, agent -115.38008113812818
on episode 17088, curr epsilon 0.05 (time taken so far: 0-00:58:51.0)
     mean loss: adversary 139850.48195142922
     mean reward: adversary 201.97512232137157, agent -112.8795044123392
on episode 17120, curr epsilon 0.05 (time taken so far: 0-00:58:57.0)
     mean loss: adversary 148808.4881349202
     mean reward: adversary 191.62635892436808, agent -105.91454743974091
on episode 17152, curr epsilon 0.05 (time taken so far: 0-00:59:3.0)
     mean loss: adversary 145892.09323354685
     mean reward: adversary 212.88421446792844,



124




0.0
120.0
0.0
30.0
0.0
90.0
60.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
60.0
60.0
0.0
0.0
180.0




124




60.0
60.0
0.0
0.0
0.0
30.0
30.0
0.0
0.0
0.0




124




30.0
60.0
0.0
0.0
0.0
0.0
0.0
30.0
60.0
0.0




124




150.0
0.0
0.0
0.0
30.0
120.0
510.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
120.0
0.0




124




0.0
210.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
30.0
30.0
30.0
0.0
30.0
90.0
60.0
30.0
30.0




124




0.0
0.0
0.0
0.0
60.0
60.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 17500 is: 26.7
on episode 17504, curr epsilon 0.05 (time taken so far: 0-01:00:20.0)
     mean loss: adversary 162463.01699250602
     mean reward: adversary 117.5581484767177, agent -79.1822411140078
on episode 17536, curr epsilon 0.05 (time taken so far: 0-01:00:26.0)
     mean loss: adversary 158022.425311618
     mean reward: adversary 153.3653936995451, agent -97.05142613899748
on episode 17568, curr epsilon 0.05 (time taken so far: 0-01:00:33.0)
     mean loss: adversary 154767.44294894644
     mean reward: adversary 139.63328335812653, agent -79.56634605176635
on episode 17600, curr epsilon 0.05 (time taken so far: 0-01:00:39.0)
     mean loss: adversary 155674.84039407267
     mean reward: adversary 206.4517629581073, agent -119.68175851593463
on episode 17632, curr epsilon 0.05 (time taken so far: 0-01:00:45.0)
     mean loss: adversary 152869.3492179541
     mean reward: adversary 213.40729942891494, agent



124




270.0
60.0
120.0
150.0
180.0
0.0
330.0
0.0
60.0
180.0




124




30.0
120.0
90.0
540.0
150.0
30.0
240.0
30.0
180.0
120.0




124




300.0
60.0
240.0
0.0
750.0
150.0
60.0
240.0
150.0
180.0




124




90.0
90.0
270.0
360.0
360.0
120.0
120.0
0.0
300.0
30.0




124




390.0
210.0
210.0
120.0
0.0
180.0
240.0
120.0
30.0
120.0




124




270.0
0.0
30.0
330.0
0.0
270.0
450.0
0.0
270.0
0.0




124




150.0
480.0
0.0
300.0
150.0
240.0
0.0
120.0
450.0
150.0




124




180.0
60.0
90.0
180.0
420.0
120.0
30.0
90.0
240.0
120.0




124




150.0
30.0
90.0
270.0
0.0
240.0
600.0
540.0
570.0
60.0




124




30.0
270.0
90.0
390.0
30.0
120.0
0.0
510.0
60.0
90.0
Evaluation Reward for episode 18000 is: 177.0
on episode 18016, curr epsilon 0.05 (time taken so far: 0-01:02:6.0)
     mean loss: adversary 132282.20295692596
     mean reward: adversary 203.7720207858114, agent -113.70884780634988
on episode 18048, curr epsilon 0.05 (time taken so far: 0-01:02:11.0)
     mean loss: adversary 130480.25800294109
     mean reward: adversary 183.79222181066288, agent -110.77067624936275
on episode 18080, curr epsilon 0.05 (time taken so far: 0-01:02:17.0)
     mean loss: adversary 131101.8307630134
     mean reward: adversary 207.39363937725204, agent -116.09672838582978
on episode 18112, curr epsilon 0.05 (time taken so far: 0-01:02:23.0)
     mean loss: adversary 132535.94640951103
     mean reward: adversary 214.3822302266499, agent -125.89334537689443
on episode 18144, curr epsilon 0.05 (time taken so far: 0-01:02:29.0)
     mean loss: adversary 137743.8159765575
     mean reward: adversary 151.131



124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
60.0
60.0
60.0
0.0
30.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60.0




124




0.0
60.0
0.0
0.0
0.0
0.0
30.0
0.0
0.0
0.0




124




0.0
0.0
0.0
0.0
90.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
0.0
60.0
0.0
0.0
0.0
0.0
30.0
0.0




124




0.0
30.0
0.0
0.0
150.0
60.0
120.0
0.0
0.0
0.0




124




60.0
120.0
0.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0




124




0.0
0.0
90.0
30.0
0.0
0.0
0.0
0.0
0.0
0.0
Evaluation Reward for episode 18500 is: 12.9
on episode 18528, curr epsilon 0.05 (time taken so far: 0-01:03:50.0)
     mean loss: adversary 129488.885907947
     mean reward: adversary 188.35004146560928, agent -110.18872779219183
on episode 18560, curr epsilon 0.05 (time taken so far: 0-01:03:55.0)
     mean loss: adversary 131405.14266428846
     mean reward: adversary 194.59220072235163, agent -111.6736462208907
on episode 18592, curr epsilon 0.05 (time taken so far: 0-01:04:1.0)
     mean loss: adversary 133690.6190023131
     mean reward: adversary 176.4669868447432, agent -101.14706737718372
on episode 18624, curr epsilon 0.05 (time taken so far: 0-01:04:7.0)
     mean loss: adversary 131664.6561431318
     mean reward: adversary 191.3971685312217, agent -113.85374090967062
on episode 18656, curr epsilon 0.05 (time taken so far: 0-01:04:13.0)
     mean loss: adversary 138072.91144166476
     mean reward: adversary 226.3494300068694, agen



124




0.0
420.0
0.0
210.0
210.0
0.0
600.0
30.0
0.0
30.0




124




300.0
300.0
240.0
0.0
30.0
90.0
540.0
0.0
270.0
120.0




124




0.0
0.0
210.0
0.0
150.0
600.0
600.0
360.0
630.0
120.0




124




0.0
60.0
90.0
480.0
30.0
690.0
60.0
90.0
90.0
600.0




124




0.0
60.0
0.0
0.0
240.0
0.0
690.0
0.0
0.0
330.0




124




360.0
60.0
0.0
90.0
120.0
720.0
240.0
0.0
0.0
150.0




124




90.0
240.0
60.0
60.0
0.0
30.0
540.0
510.0
30.0
330.0




124




0.0
330.0
1080.0
360.0
960.0
60.0
450.0
0.0
0.0
420.0




124




660.0
0.0
120.0
150.0
150.0
0.0
0.0
0.0
0.0
60.0




124




660.0
210.0
150.0
390.0
900.0
720.0
480.0
0.0
150.0
660.0
Evaluation Reward for episode 19000 is: 223.2
on episode 19008, curr epsilon 0.05 (time taken so far: 0-01:05:28.0)
     mean loss: adversary 146557.52793894464
     mean reward: adversary 190.90155920372996, agent -107.75120548730732
on episode 19040, curr epsilon 0.05 (time taken so far: 0-01:05:34.0)
     mean loss: adversary 139707.4429034518
     mean reward: adversary 219.90301932765357, agent -125.16915370064956
on episode 19072, curr epsilon 0.05 (time taken so far: 0-01:05:40.0)
     mean loss: adversary 141321.9657401622
     mean reward: adversary 214.76780960478033, agent -120.70937515923784
on episode 19104, curr epsilon 0.05 (time taken so far: 0-01:05:46.0)
     mean loss: adversary 140155.7948644785
     mean reward: adversary 183.7021841738799, agent -107.11538315754777
on episode 19136, curr epsilon 0.05 (time taken so far: 0-01:05:52.0)
     mean loss: adversary 143686.21483141108
     mean reward: adversary 2



124




30.0
270.0
390.0
120.0
30.0
30.0
270.0
300.0
30.0
0.0




124




180.0
0.0
360.0
0.0
390.0
420.0
0.0
0.0
420.0
60.0




124




60.0
120.0
0.0
0.0
510.0
0.0
450.0
90.0
0.0
390.0




124




90.0
0.0
0.0
60.0
90.0
330.0
240.0
30.0
60.0
270.0




124




0.0
300.0
420.0
60.0
120.0
0.0
990.0
180.0
30.0
0.0




124




0.0
60.0
0.0
180.0
210.0
30.0
60.0
180.0
0.0
0.0




124




0.0
60.0
390.0
270.0
30.0
150.0
0.0
0.0
300.0
390.0




124




600.0
0.0
240.0
0.0
390.0
210.0
60.0
120.0
0.0
30.0




124




480.0
0.0
540.0
30.0
120.0
360.0
210.0
30.0
60.0
0.0




124




510.0
360.0
90.0
0.0
210.0
60.0
0.0
510.0
210.0
60.0
Evaluation Reward for episode 19500 is: 159.9
on episode 19520, curr epsilon 0.05 (time taken so far: 0-01:07:14.0)
     mean loss: adversary 121635.54744133458
     mean reward: adversary 210.8546020593821, agent -108.97624689716443
on episode 19552, curr epsilon 0.05 (time taken so far: 0-01:07:19.0)
     mean loss: adversary 123764.81469340666
     mean reward: adversary 212.38744235745673, agent -114.45392456500468
on episode 19584, curr epsilon 0.05 (time taken so far: 0-01:07:25.0)
     mean loss: adversary 128561.91911163245
     mean reward: adversary 166.59717768233276, agent -98.06900555570613
on episode 19616, curr epsilon 0.05 (time taken so far: 0-01:07:31.0)
     mean loss: adversary 125260.36631981433
     mean reward: adversary 203.9262659491029, agent -111.2106351103054
on episode 19648, curr epsilon 0.05 (time taken so far: 0-01:07:37.0)
     mean loss: adversary 122098.84317724747
     mean reward: adversary 190.75

In [None]:
# Preview of the epsilon-greedy exploration schedule: epsilon starts at
# eps_start, is multiplied by eps_decay on every iteration, and is clamped
# from below at eps_end.
eps_start, eps_end, eps_decay = 0.9, 0.05, 0.9996

# eps[k] is epsilon after k decay iterations; eps[0] is the undecayed start.
eps = [eps_start]
for i in range(10_000):
    eps.append(max(eps[-1] * eps_decay, eps_end))

# `epsilon` ends up holding the last value of the schedule.
epsilon = eps[-1]

plt.plot(eps)

In [None]:
# Load a trained adversary network from a saved state-dict checkpoint.
# The network is built on config.device; map_location makes torch.load place
# the checkpoint tensors on that same device, so a checkpoint that was saved
# on a GPU can still be restored where that GPU is not available.
adversary_net = SimpleTagNet(config, "adversary").to(config.device)
adversary_net.load_state_dict(
    torch.load(
        './models/batched-baseline-test/adversary-net-5632.pth',
        map_location=config.device,
    )
)

In [None]:
# save logs
# NOTE(review): `logger` is rebound to a structure holding only empty lists
# right before it is written, so the file produced here contains no recorded
# values. Confirm this reset is intended and is not a leftover that discards
# a populated logger.
logger = AttrDict()
logger.episodic_losses = AttrDict(adversary=[], agent=[])
logger.episodic_rewards = AttrDict(adversary=[], agent=[])

with open("models/batched-baseline-test/log.json", "w") as f:
    f.write(json.dumps(logger))

In [None]:
# Plot the logged loss series (left panel) and reward series (right panel)
# for the adversary and the good agent. The first 50 entries of every series
# are skipped. `logger` is read through attribute access here.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
axes = axes.ravel()
loss_ax, reward_ax = axes

losses = logger.episodic_losses
loss_ax.plot(losses.adversary[50:], label="adversary")
loss_ax.plot(losses.agent[50:], label="good agent")
loss_ax.set_title("loss")
loss_ax.legend()

rewards = logger.episodic_rewards
reward_ax.plot(rewards.adversary[50:], label="adversary")
reward_ax.plot(rewards.agent[50:], label="good agent")
reward_ax.set_title("reward")
reward_ax.legend()

In [None]:
# Write the figure currently bound to `fig` to vis.png in the model directory.
fig.savefig("models/batched-baseline-test/vis.png")

In [None]:
# load logs
# The JSON decoder turns JSON objects into plain dicts, so after this cell
# the series in `logger` are read with ["key"] indexing.
with open("models/batched-baseline-test/log.json", "r") as f:
    logger = json.loads(f.read())

In [None]:
def moving_average(a, n=3):
    """Return the simple moving average of `a` over windows of `n` samples.

    Parameters
    ==========
    a : array_like
        Input values. np.cumsum is called without an axis, so
        multi-dimensional input is treated as flattened.
    n : int
        Window length; must be at least 1.

    Returns
    =======
    numpy.ndarray
        1-D float array with the mean of every full window, i.e.
        ``max(len(a) - n + 1, 0)`` values. It is empty when `a` has fewer
        than `n` elements.

    Raises
    ======
    ValueError
        If `n` is smaller than 1. Without this check a negative `n` would
        silently return meaningless values.
    """
    if n < 1:
        raise ValueError(f"window length n must be at least 1, got {n}")
    ret = np.cumsum(a, dtype=float)
    # Subtracting the cumulative sum shifted by n leaves, at index i >= n,
    # the sum of the n most recent samples ending at i.
    ret[n:] = ret[n:] - ret[:-n]
    # Entries before index n - 1 belong to incomplete windows and are dropped.
    return ret[n - 1:] / n

def f(a, n):
    """Prepend `n` zeros along the last axis of `a`.

    All other axes are left unpadded, so the result has the shape of `a`
    except that its last axis is `n` entries longer. `a` must expose
    `.ndim` (e.g. a numpy array).
    """
    untouched_axes = [(0, 0)] * (a.ndim - 1)
    last_axis = [(n, 0)]
    return np.pad(a, untouched_axes + last_axis, mode='constant', constant_values=0)

# Draw the loss curves (left panel) and a smoothed adversary reward curve
# (right panel). `logger` is indexed by key in this cell.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
axes = axes.ravel()
loss_ax, reward_ax = axes

losses = logger["episodic_losses"]
loss_ax.plot(losses["adversary"][50:], label="adversary")
loss_ax.plot(losses["agent"][50:], label="good agent")
loss_ax.set_title("loss")
loss_ax.legend()

# Adversary rewards are scaled by 10 and then averaged over a 512-entry
# window. f(..., 0) prepends zero padding entries, i.e. it leaves the
# smoothed curve unchanged.
adversary_episodic_rewards = np.array(logger["episodic_rewards"]["adversary"]) * 10
smoothed_rewards = moving_average(adversary_episodic_rewards, n=512)
mean_adversary_episodic_rewards = f(smoothed_rewards, 0)
# Optional overlays, currently disabled:
# reward_ax.plot(adversary_episodic_rewards, label="adversary")
# reward_ax.plot(logger["episodic_rewards"]["agent"][50:], label="good agent")
reward_ax.plot(mean_adversary_episodic_rewards, label="adversary mean")
reward_ax.set_title("reward")
reward_ax.legend()

In [None]:
# Build a simple_tag environment with one good agent, three adversaries, no
# obstacles, 30 cycles per episode and discrete actions, and keep its
# `.unwrapped` object in `env`.
# NOTE(review): presumably `env` is picked up as a global by run_episode —
# confirm against its definition.
env_settings = dict(
    num_good=1,
    num_adversaries=3,
    num_obstacles=0,
    max_cycles=30,
    continuous_actions=False,
)
env = simple_tag_v2.env(**env_settings).unwrapped
def visualize(config, adversary_net):
    """Run one rendered evaluation episode and return its result.

    The network is switched to eval mode (and left in that mode) and the
    episode is executed with gradient tracking disabled.

    Parameters
    ==========
    config
        Passed through unchanged to run_episode.
    adversary_net
        Network handed to run_episode; it must provide `.eval()`
        (presumably a torch.nn.Module — confirm against its definition).

    Returns
    =======
    Whatever run_episode returns for this call.
    """
    adversary_net.eval()
    with torch.no_grad():
        episode = run_episode(
            config,
            adversary_net,
            should_render=True,
            is_val=True,
            evaluation=True,
        )
    return episode

# Roll out one rendered episode and report its step count and rewards.
episode = visualize(config, adversary_net)
print("episode steps", episode.steps)
# Every (key, value) pair of episode.reward is printed as its own item.
reward_pairs = episode.reward.items()
print("episode rewards", *reward_pairs)