In [1]:
def transpose_list(mylist):
    """Transpose a list of sequences.

    Element ``k`` of the result is a list holding the ``k``-th item of every
    sequence in ``mylist``. As with ``zip``, the result is truncated to the
    shortest inner sequence. Every element of ``mylist`` must be iterable.
    """
    return [list(column) for column in zip(*mylist)]


# A flat list fails here: zip(*[1, 2, 3]) is handed the ints 1, 2, 3, which are
# not iterable, hence the TypeError recorded below. transpose_list needs a list
# of sequences (e.g. [[1, 2, 3]]).
transpose_list([1,2,3])

TypeError: zip argument #1 must support iteration

In [6]:
# main function that sets up environments
# perform training loop

import envs
from buffer import ReplayBuffer
from maddpg import MADDPG
import torch
import numpy as np
from tensorboardX import SummaryWriter
import os
from utilities import transpose_list, transpose_to_tensor

# keep training awake
# from workspace_utils import keep_awake

# for saving gif
import imageio

def seeding(seed=1):
    """Seed the global torch and numpy random generators with ``seed``."""
    torch.manual_seed(seed)
    np.random.seed(seed)

def pre_process(entity, batchsize, num_agents=3):
    """Regroup a batch of per-sample records into one tensor per agent.

    Args:
        entity: indexable batch where ``entity[i][j]`` is the data belonging
            to agent ``j`` in sample ``i``.
        batchsize: number of leading samples of ``entity`` to read.
        num_agents: number of agents stored in each sample. Defaults to 3,
            the value that used to be hard-coded, so existing two-argument
            calls behave exactly as before.

    Returns:
        A list of ``num_agents`` tensors; tensor ``j`` stacks ``entity[i][j]``
        for ``i`` in ``range(batchsize)``.
    """
    # The accumulator was previously a local called `list`, which shadowed the
    # builtin for the rest of the function; a comprehension avoids the name.
    return [
        torch.Tensor([entity[i][j] for i in range(batchsize)])
        for j in range(num_agents)
    ]



# Seed numpy and torch (default seed=1) before the environments and models
# below are created.
seeding()
# number of parallel environments (passed to envs.make_parallel_env below)
parallel_envs = 4
# number of training episodes.
# change this to higher number to experiment. say 30000.
number_of_episodes = 16
# number of environment steps taken in each episode
episode_length = 10
# NOTE(review): not read by the visible training loop; presumably the
# minibatch size for the update step -- confirm.
batchsize = 10
# how many episodes to save policy and gif
save_interval = 5000
# running step counter: the training loop adds parallel_envs to it on every
# environment step
t = 0

# amplitude of OU noise
# this slowly decreases to 0: it is multiplied by noise_reduction after
# every step of the training loop
noise = 2
noise_reduction = 0.9999

# how many episodes before update
# NOTE(review): not read by the visible training loop.
episode_per_update = 2 * parallel_envs

# output locations, both relative to the current working directory
log_path = os.getcwd()+"/log"
model_dir= os.getcwd()+"/model_dir"

# create the model directory up front; exist_ok makes re-running the cell safe
os.makedirs(model_dir, exist_ok=True)

# use one torch thread per parallel environment
torch.set_num_threads(parallel_envs)
# this may be a list of all environments
env = envs.make_parallel_env(parallel_envs)

# keep 5000 episodes worth of replay
# (5000 * episode_length is the capacity argument handed to ReplayBuffer)
buffer = ReplayBuffer(int(5000*episode_length))

# initialize policy and critic
# this creates a list of models, each element in the list refers to an agent in the simulation
# [agent_one_ddpg, agent_two_ddpg, ...]
# agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
maddpg = MADDPG()
# tensorboardX writer that logs under log_path
logger = SummaryWriter(log_dir=log_path)
# per-agent reward histories
# NOTE(review): never appended to in the visible code.
agent0_reward = []
agent1_reward = []
agent2_reward = []

# training loop
# show progressbar
# NOTE(review): this import sits mid-cell; it would normally live with the
# other imports at the top of the cell.
import progressbar as pb
widget = ['episode: ', pb.Counter(),'/',str(number_of_episodes),' ', 
          pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ]

# start the bar; the training loop advances it with timer.update(episode)
timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

# use keep_awake to keep workspace from disconnecting
# for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
# notice we jump forward by number of parallel environments
for episode in range(0, number_of_episodes, parallel_envs):
    timer.update(episode)

    # reward accumulator for this batch of episodes: one row per parallel
    # environment, one column per agent (3 agents)
    reward_this_episode = np.zeros((parallel_envs, 3))
    # obs is the observation state space of all the three agents in the 4 parallel env.
    # for the Physical Deception environment with three agents it is of dimension 4x3x14.
    # obs_full is world state irrespective of the agents and its dimension is 4x14.
    all_obs = env.reset()
    obs, obs_full = transpose_list(all_obs)

    # save info or not: true for the first batch of every save_interval
    # episodes and for the final batch of the run
    save_info = (episode % save_interval < parallel_envs or episode == number_of_episodes-parallel_envs)
    frames = []
    tmax = 0

    if save_info:
        frames.append(env.render('rgb_array'))

    for episode_t in range(episode_length):
        # t jumps forward in a multiple of environment
        t += parallel_envs

        # explore = only explore for a certain number of episodes
        # action input needs to be transposed
        # the transpose_to_tensor(obs) changes the data to each agent point of view
        # since we have 4 environments, there are 4 agent 1, 4 agent 2, and 4 agent 3
        # each agent has a state in each environment, total states across 4 environments for agent 1 is 4x14 tensor
        # transpose_to_tensor(obs) = is a list of 3 elements. each element is for 1 agent
        # pick element 1. this is an array of 4x14 elements of agent observation across 4 environments.
        # maddpg.act has a for loop that takes each element of obs and passes it to the agent's actor model
        # to generate an action from each agent actor.
        actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
        # decay the exploration noise after every step
        noise *= noise_reduction
        # there are 4 actions per agent and 3 agents, total of 12 actions
        # stacked shape is (agents, envs, action_dim), i.e. (3, 4, 2) in the recorded run
        actions_array = torch.stack(actions).detach().numpy()

        # move the environment axis to the front: (3, 4, 2) -> (4, 3, 2),
        # one block of per-agent actions for each environment
        actions_for_env = np.rollaxis(actions_array, 1)

        # step forward one frame
        # obs is the observation state space of all the three agents in the 4 parallel env.
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is world state irrespective of the agents and its dimension is 4x14.
        # To gain more understanding, please see the code in the multiagent folder.
        next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

# NOTE(review): the transition bookkeeping below is disabled. While it stays
# commented out nothing is pushed to the replay buffer, reward_this_episode is
# never accumulated, and obs is never advanced to next_obs, so every step acts
# on the observation returned by env.reset(). `samples` is also left undefined
# for any later cell that reads it.
#         # add data to buffer
#         transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones)

#         buffer.push(transition)

#         reward_this_episode += rewards

#         obs, obs_full = next_obs, next_obs_full



#     samples = buffer.sample(5)

# Close out the progress bar. The loop's last update is
# number_of_episodes - parallel_envs, so without this call the bar is left
# short of 100% and never finalised.
timer.finish()

[33mWARN: Could not seed environment <MultiAgentEnv instance>[0m
[33mWARN: Could not seed environment <MultiAgentEnv instance>[0m
[33mWARN: Could not seed environment <MultiAgentEnv instance>[0m
[33mWARN: Could not seed environment <MultiAgentEnv instance>[0m


  return array(a, dtype, copy=False, order=order, subok=True)
episode: 12/16  75% ETA:  0:00:00 |\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\           | 

In [9]:
# Rewards returned by the most recent env.step call; the recorded output is a
# 4x3 array, matching the (parallel_envs, 3) shape of reward_this_episode.
rewards

array([[-1.07381478,  0.22205014,  0.22205014],
       [-0.20003734,  0.37448418,  0.37448418],
       [-0.57595087,  0.57313365,  0.57313365],
       [-1.64609881,  0.46162903,  0.46162903]])

In [2]:
# Apply transpose_to_tensor to each of the seven fields of a sampled batch.
# NOTE(review): `samples` is only assigned by the commented-out
# `samples = buffer.sample(5)` line in the training cell, so this cell fails on
# a fresh kernel. It also rebinds obs / obs_full used by the training loop.
obs, obs_full, action, reward, next_obs, next_obs_full, done = map(transpose_to_tensor, samples)

In [3]:
# Number of entries in the transposed next_obs; recorded as 3 (presumably one
# entry per agent).
len(next_obs)

3

In [5]:
# Shape of the first entry of next_obs; recorded as (5, 14), presumably
# 5 sampled transitions x 14 observation features.
next_obs[0].shape

torch.Size([5, 14])

In [28]:
# all_obs[0] is the first environment's entry, [0] its per-agent observations,
# and the final [0] the first agent's vector; recorded shape is (14,).
all_obs[0,:][0][0].shape

(14,)

In [30]:
# Second item of the first environment's entry (the obs_full part).
# NOTE(review): the recorded output below is a 14-element array, not a shape
# tuple, so the cell was presumably last run without `.shape`; re-run to refresh.
all_obs[0,:][1].shape

array([ 0.85010052, -0.04933641, -0.34143099, -0.76787591,  0.        ,
        0.        , -0.45229252,  0.21949375,  0.        ,  0.        ,
        0.08854489, -1.27879257,  0.        ,  0.        ])

In [45]:
# Stacked actions before the axis swap; recorded as (3, 4, 2), which matches
# 3 agents x 4 environments with (presumably) a 2-dimensional action each.
actions_array.shape

(3, 4, 2)

In [34]:
# Number of per-agent observations held for the first environment; recorded as 3.
len(obs[0])

3

In [35]:
# Shape of one agent's observation in the first environment; recorded as (14,).
obs[0][0].shape

(14,)

In [38]:
# Shape of the first environment's obs_full vector; recorded as (14,).
obs_full[0].shape

(14,)

In [41]:
# transpose_to_tensor(obs) returns one entry per agent; recorded length is 3.
len(transpose_to_tensor(obs))

3

In [43]:
# The first agent's observations across all environments; recorded as a
# (4, 14) tensor, i.e. 4 environments x 14 features.
transpose_to_tensor(obs)[0].shape

torch.Size([4, 14])

In [48]:
# Actions after np.rollaxis(actions_array, 1) has moved the environment axis to
# the front; recorded as (4, 3, 2).
actions_for_env.shape

(4, 3, 2)