In [1]:
from mlagents_envs.registry import default_registry

environment_names = list(default_registry.keys())
for name in environment_names:
   print(name)

Basic
3DBall
3DBallHard
GridWorld
Hallway
VisualHallway
CrawlerDynamicTarget
CrawlerStaticTarget
Bouncer
SoccerTwos
PushBlock
VisualPushBlock
WallJump
Tennis
Reacher
Pyramids
VisualPyramids
Walker
FoodCollector
VisualFoodCollector
StrikersVsGoalie
WormStaticTarget
WormDynamicTarget


In [2]:
env = default_registry["3DBall"].make()
env.reset()

Found path: /tmp/ml-agents-binaries/binaries/3DBall-b1b94a0ae13eef9d91f9d8db1e5770c5/Startup/Startup.x86_64


## Get the Behavior Specs from the Environment

In [3]:
# We will only consider the first Behavior
behavior_name = list(env.behavior_specs)[0]
print(f"Name of the behavior : {behavior_name}")
spec = env.behavior_specs[behavior_name]

Name of the behavior : 3DBall?team=0


## Get the Observation Space from the Behavior Specs

In [4]:
# Examine the number of observations per Agent
print("Number of observations : ", len(spec.observation_specs))

# Is there a visual observation ?
# Visual observation have 3 dimensions: Height, Width and number of channels
vis_obs = any(len(spec.shape) == 3 for spec in spec.observation_specs)
print("Is there a visual observation ?", vis_obs)

Number of observations :  1
Is there a visual observation ? False


## Get the Action Space from the Behavior Specs

In [5]:
# Is the Action continuous or multi-discrete ?
if spec.action_spec.continuous_size > 0:
  print(f"There are {spec.action_spec.continuous_size} continuous actions")
if spec.action_spec.is_discrete():
  print(f"There are {spec.action_spec.discrete_size} discrete actions")


# How many actions are possible ?
#print(f"There are {spec.action_size} action(s)")

# For discrete actions only : How many different options does each action has ?
if spec.action_spec.discrete_size > 0:
  for action, branch_size in enumerate(spec.action_spec.discrete_branches):
    print(f"Action number {action} has {branch_size} different options")


There are 2 continuous actions


Get the steps from the Environment

You can do this with the env.get_steps(behavior_name) method. If there are multiple behaviors in the Environment, you can call this method with each of the behavior's names. Note This will not move the simulation forward.

In [6]:
decision_steps, terminal_steps = env.get_steps(behavior_name)

#### Set actions for each behavior
You can set the actions for the Agents of a Behavior by calling `env.set_actions()` you will need to specify the behavior name and pass a tensor of dimension 2. The first dimension of the action must be equal to the number of Agents that requested a decision during the step.

In [7]:
env.set_actions(behavior_name, spec.action_spec.empty_action(len(decision_steps)))

#### Move the simulation forward
Call `env.step()` to move the simulation forward. The simulation will progress until an Agent requestes a decision or terminates.

In [8]:
env.step()

In [9]:
len(decision_steps)

12

In [10]:
print(spec.action_spec.empty_action(len(decision_steps)))

<mlagents_envs.base_env.ActionTuple object at 0x7f246306ecd0>


#### Show the observations for one of the Agents
`DecisionSteps.obs` is a tuple containing all of the observations for all of the Agents with the provided Behavior name.
Each value in the tuple is an observation tensor containing the observation data for all of the agents.

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline

for index, obs_spec in enumerate(spec.observation_specs):
  if len(obs_spec.shape) == 3:
    print("Here is the first visual observation")
    plt.imshow(decision_steps.obs[index][0,:,:,:])
    plt.show()

for index, obs_spec in enumerate(spec.observation_specs):
  if len(obs_spec.shape) == 1:
    print("First vector observations : ", decision_steps.obs[index][0,:])

First vector observations :  [-0.01467304 -0.01468306 -0.5208206   4.         -0.79952097  0.
  0.          0.        ]


### Run the Environment for a few episodes

In [12]:
for episode in range(3):
  env.reset()
  decision_steps, terminal_steps = env.get_steps(behavior_name)
  tracked_agent = -1 # -1 indicates not yet tracking
  done = False # For the tracked_agent
  episode_rewards = 0 # For the tracked_agent
  while not done:
    # Track the first agent we see if not tracking
    # Note : len(decision_steps) = [number of agents that requested a decision]
    if tracked_agent == -1 and len(decision_steps) >= 1:
      tracked_agent = decision_steps.agent_id[0]

    # Generate an action for all agents
    action = spec.action_spec.random_action(len(decision_steps))

    # Set the actions
    env.set_actions(behavior_name, action)

    # Move the simulation forward
    env.step()

    # Get the new simulation results
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    if tracked_agent in decision_steps: # The agent requested a decision
      print(tracked_agent)
      episode_rewards += decision_steps[tracked_agent].reward
    if tracked_agent in terminal_steps: # The agent terminated its episode
      episode_rewards += terminal_steps[tracked_agent].reward
      done = True
  print(f"Total rewards for episode {episode} is {episode_rewards}")

0
0
0
0
0
0
0
0
0
0
0
0
0
Total rewards for episode 0 is 0.30000001937150955
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Total rewards for episode 1 is 0.700000025331974
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Total rewards for episode 2 is 1.1000000312924385


In [13]:
# from agent import Agent
# import torch
# agent = Agent(state_size=8, action_size=2, seed=0)
# from collections import deque

# def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
#     """Deep Q-Learning.
    
#     Params
#     ======
#         n_episodes (int): maximum number of training episodes
#         max_t (int): maximum number of timesteps per episode
#         eps_start (float): starting value of epsilon, for epsilon-greedy action selection
#         eps_end (float): minimum value of epsilon
#         eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
#     """
#     scores = []                        # list containing scores from each episode
#     scores_window = deque(maxlen=100)  # last 100 scores
#     eps = eps_start                    # initialize epsilon
#     for i_episode in range(1, n_episodes+1):
#         env_info = env.reset()
#         state = env_info.vector_observations[0]
#         score = 0 
#         decision_steps, terminal_steps = env.get_steps(behavior_name)
#         # while not done:
             
#         #     if tracked_agent == -1 and len(decision_steps) >= 1:
#         #         tracked_agent = decision_steps.agent_id[0]
#         #     action = agent.act(state, eps).item()
#         #     env.set_actions(behavior_name, action)
#         #     env.step()

#         #     # Get the new simulation results
#         #     decision_steps, terminal_steps = env.get_steps(behavior_name)
#         #     if tracked_agent in decision_steps: # The agent requested a decision
#         #         rewards += decision_steps[tracked_agent].reward
#         #     if tracked_agent in terminal_steps: # The agent terminated its episode
#         #         rewards += terminal_steps[tracked_agent].reward
#         #         done = True

#         for t in range(max_t):
#             action = agent.act(state, eps).item()
#             env_info = env.step(action)[0]
#             next_state = env_info.vector_observations[0]   # get the next state
#             reward = env_info.rewards[0]                   # get the reward
#             done = env_info.local_done[0]                  # see if episode has finished
#             agent.step(state, action, reward, next_state, done)
#             score += reward                                # update the score
#             state = next_state
#             if done:
#                 break 
#         scores_window.append(score)       # save most recent score
#         scores.append(score)              # save most recent score
#         eps = max(eps_end, eps_decay*eps) # decrease epsilon
#         print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
#         if i_episode % 100 == 0:
#             print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
#         if np.mean(scores_window)>=200.0:
#             print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
#             torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
#             break
#     return scores

# scores = dqn()

In [14]:
env.close()
print("Closed environment")

Closed environment


In [15]:
# -----------------
# This code is used to close an env that might not have been closed before
try:
  env.close()
except:
  pass
# -----------------

from mlagents_envs.registry import default_registry
from mlagents_envs.environment import UnityEnvironment
import matplotlib.pyplot as plt
%matplotlib inline

# Create the GridWorld Environment from the registry
env = default_registry["GridWorld"].make()
print("GridWorld environment created.")

from agent import *
# Create a new Q-Network.
qnet = VisualQNetwork((64, 84, 3), 126, 5)

experiences: Buffer = []
optim = torch.optim.Adam(qnet.parameters(), lr= 0.001)

cumulative_rewards: List[float] = []

# The number of training steps that will be performed
NUM_TRAINING_STEPS = int(os.getenv('QLEARNING_NUM_TRAINING_STEPS', 70))
# The number of experiences to collect per training step
NUM_NEW_EXP = int(os.getenv('QLEARNING_NUM_NEW_EXP', 1000))
# The maximum size of the Buffer
BUFFER_SIZE = int(os.getenv('QLEARNING_BUFFER_SIZE', 10000))

for n in range(NUM_TRAINING_STEPS):
  new_exp,_ = Trainer.generate_trajectories(env, qnet, NUM_NEW_EXP, epsilon=0.1)
  random.shuffle(experiences)
  if len(experiences) > BUFFER_SIZE:
    experiences = experiences[:BUFFER_SIZE]
  experiences.extend(new_exp)
  Trainer.update_q_net(qnet, optim, experiences, 5)
  _, rewards = Trainer.generate_trajectories(env, qnet, 100, epsilon=0)
  cumulative_rewards.append(rewards)
  print("Training step ", n+1, "\treward ", rewards)


env.close()

# Show the training graph
plt.plot(range(NUM_TRAINING_STEPS), cumulative_rewards)

Found path: /tmp/ml-agents-binaries/binaries/GridWorld-b1b94a0ae13eef9d91f9d8db1e5770c5/Startup/Startup.x86_64
GridWorld environment created.


NameError: name 'Buffer' is not defined