# Single agent case, but with a different purpose

There's one key property of self-organising systems: the agents are precisely the same.

The idea in this notebook is to explore whether I can use this sameness to sidestep the multi-agent problem completely: simply focus on making a single agent as general and robust as possible, then transfer all of its knowledge to all the other agents for self-assembly/organisation!

First things first, I'll need a more complex environment...

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn

import tianshou as ts
from tianshou.policy import DQNPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.data import Collector, ReplayBuffer

from selforg_gridworld_env import SelfOrgGridWorld

  import pkg_resources


## Environment

We'll use our custom Gym environment, which should make it easy to interface with other RL libraries (i.e., [Tianshou](https://tianshou.readthedocs.io/en/stable/index.html), in our case).

In [2]:
# Instantiate the environment with 5 agents and 3 targets, then display the
# initial (observation, info) pair returned by reset().
env = SelfOrgGridWorld(size=11, num_agents=5, num_targets=3)
# TODO: decide how to deal with observations. They come back as a dict with
# 'agents' and 'targets' entries; Tianshou has trouble handling the dictionary
# format, but it's actually quite nice...
env.reset()

({'agents': array([[2, 8],
         [9, 2],
         [7, 9],
         [2, 2],
         [7, 6]]),
  'targets': array([[1, 4],
         [5, 4],
         [8, 6]])},
 {'min_distance': 1.0})

In [6]:
# Scratch check: concatenating two coordinate pairs along axis 0 and flattening
# gives a single flat 1-D vector.
np.concatenate([(5, 2), (3, 2)], axis=0).flatten()

array([5, 2, 3, 2])

In [3]:
# Display the action space and the (dict-structured) observation space.
env.action_space, env.observation_space

(Discrete(5),
 Dict('agents': Box(0, 10, (5, 2), int64), 'targets': Box(0, 10, (3, 2), int64)))

Let's see what that looks like.

In [4]:
# Visual sanity check: drive every agent with random actions in a rendered window.
env = SelfOrgGridWorld(size=11, num_agents=5, num_targets=3, render_mode="human")
observation, info = env.reset()

steps_taken = 0
while steps_taken < 50:
    # One independently sampled action per agent
    action = [env.action_space.sample() for _ in range(env.num_agents)]
    observation, reward, terminated, truncated, info = env.step(action)
    steps_taken += 1

    # Start a fresh episode as soon as the current one ends
    if terminated or truncated:
        observation, info = env.reset()

env.close()

## Agent

In [7]:
# We won't use the full potential of the environment just yet:
# start with the simplest configuration, a single agent and a single target.
env = SelfOrgGridWorld(size=11, num_agents=1, num_targets=1)

### Testing Tianshou

In [8]:
# Per-key observation shapes, kept as named variables for inspection
obs_space = env.observation_space
agent_shape, target_shape = obs_space['agents'].shape, obs_space['targets'].shape

# Flattened observation length (agent entries + target entries), wrapped in a 1-tuple
n_features = np.prod(agent_shape) + np.prod(target_shape)
state_shape = (n_features, )
np.prod(4)

4

In [20]:
# Reset the single-agent environment and keep the initial observation dict.
state, info = env.reset()

# The 'agents' entry is a NumPy array, so it can be flattened directly.
state['agents'].flatten()

array([ 8, 10])

In [16]:
# NOTE: this raises AttributeError. The entries of the observation *space* are
# Box objects, which have no .flatten(); only the observation arrays returned by
# reset()/step() can be flattened this way.
env.observation_space['agents'].flatten()

AttributeError: 'Box' object has no attribute 'flatten'

In [9]:
class Net(nn.Module):
    """MLP Q-network mapping a batch of observations to one value per action.

    Args:
        state_shape: Number of input features, given either as an int or as a
            shape tuple (the product of its entries is used).
        action_shape: Number of discrete actions, i.e. the output layer width
            (int or shape tuple, handled like ``state_shape``).
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # Accept both scalars and shape tuples, and hand plain Python ints to nn.Linear.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Compute the network output for a batch of observations.

        Args:
            obs: Batched observations whose first dimension is the batch. Either
                an array-like / tensor, or a dict-like object exposing ``keys()``
                and string indexing (e.g. ``{'agents': ..., 'targets': ...}``).
                Dict-like entries are flattened per sample and concatenated in
                sorted key order.
            state: Passed through untouched and returned alongside the output.
            info: Unused; accepted so the signature stays compatible with callers
                that pass it.

        Returns:
            Tuple ``(logits, state)`` where ``logits`` has shape
            ``(batch, number_of_actions)``.
        """
        device = next(self.parameters()).device

        if hasattr(obs, "keys"):
            # Dict-style observation: flatten every entry per sample, then join
            # them into a single feature vector. Sorting fixes the feature order.
            parts = [
                torch.as_tensor(obs[key], dtype=torch.float, device=device)
                for key in sorted(obs.keys())
            ]
            batch = parts[0].shape[0]
            obs = torch.cat([part.reshape(batch, -1) for part in parts], dim=1)
        else:
            # as_tensor also casts integer tensors to float, which nn.Linear requires.
            obs = torch.as_tensor(obs, dtype=torch.float, device=device)

        batch = obs.shape[0]
        # reshape (unlike view) also works on non-contiguous inputs
        logits = self.model(obs.reshape(batch, -1))
        return logits, state

In [10]:
# Network input width: every agent entry plus every target entry, flattened
obs_space = env.observation_space
state_shape = sum(np.prod(obs_space[key].shape) for key in ('agents', 'targets'))
# One network output per discrete action
action_shape = env.action_space.n

net = Net(state_shape, action_shape)
optim = torch.optim.Adam(params=net.parameters(), lr=1e-3)

# Show the resulting architecture
net

Net(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=128, out_features=5, bias=True)
  )
)

In [11]:
def make_env():
    """Build a fresh single-agent, single-target environment.

    Every vectorised env slot needs its OWN instance: sharing one object between
    the training and test collectors would let each of them reset/step the
    other's episode. Keep the arguments in sync with the single-agent `env`
    created earlier in the notebook.
    """
    return SelfOrgGridWorld(size=11, num_agents=1, num_targets=1)


train_envs = ts.env.DummyVectorEnv([make_env])
test_envs = ts.env.DummyVectorEnv([make_env])

# Policy
policy = DQNPolicy(net, optim, discount_factor=0.9, estimation_step=3, target_update_freq=50)

# Collectors
# exploration_noise=True is what makes the collectors apply the policy's
# epsilon-greedy noise; without it, policy.set_eps(...) has no effect on the
# collected actions.
train_collector = Collector(policy, train_envs, ReplayBuffer(size=20_000), exploration_noise=True)
test_collector = Collector(policy, test_envs, exploration_noise=True)

In [12]:
# pre-collect at least 5000 transitions with random action before training
# Pre-collect 5000 transitions with random actions before training
train_collector.collect(n_step=5000, random=True)

# An environment built directly from its class (rather than through the Gym
# registry) usually has `spec = None`, so read the threshold defensively.
# Without a threshold we never stop early and simply train for the full budget.
env_spec = getattr(env, "spec", None)
reward_threshold = getattr(env_spec, "reward_threshold", None)
if reward_threshold is None:
    reward_threshold = float("inf")

policy.set_eps(0.1)
for i in range(int(1e6)):  # total number of collect/update iterations
    collect_result = train_collector.collect(n_step=10)

    # 'rews' only holds returns of episodes that FINISHED during this collect
    # call; it is empty when none did, so guard the mean against an empty array.
    train_returns = collect_result['rews']
    solved_in_training = np.size(train_returns) > 0 and train_returns.mean() >= reward_threshold

    # Once the collected episodes' mean return reaches the threshold,
    # or every 1000 iterations, evaluate on the test collector
    if solved_in_training or i % 1000 == 0:
        policy.set_eps(0.05)
        result = test_collector.collect(n_episode=100)
        test_return = result['rews'].mean()
        if test_return >= reward_threshold:
            print(f'Finished training! Test mean returns: {test_return}')
            break
        # back to training eps
        policy.set_eps(0.1)

    # Train the policy on a batch of 64 transitions sampled from the buffer
    losses = policy.update(64, train_collector.buffer)

TypeError: Object 7 in Batch(
    targets: 7,
    agents: 2,
) has no len()