In [1]:
import gym
import torch, numpy as np
from torch import nn
import tianshou as ts
from agents import TwoAgentPolicy

In [2]:
# https://github.com/thu-ml/tianshou/issues/192 to enable rendering wrapper
def make_env():
    """Create one penalty-shot environment with flattened observations.

    Returns:
        The ``gym_env:penalty-shot-v0`` environment wrapped in
        ``gym.wrappers.FlattenObservation``.
    """
    penalty_shot_env = gym.make('gym_env:penalty-shot-v0')
    return gym.wrappers.FlattenObservation(penalty_shot_env)

In [3]:
class testPolicy(ts.policy.BasePolicy):
    """Stub policy: always emits zero actions and performs no learning."""

    def forward(self, batch, state, **kwargs):
        """Return one zero action per batch entry together with a fixed state.

        ``state`` and ``kwargs`` are accepted but ignored. The result is a
        ``ts.data.Batch`` whose ``act`` is a ``(len(batch), 1)`` array of
        zeros and whose ``state`` is the constant ``{"h": 1}``.
        """
        zero_actions = np.zeros((len(batch), 1))
        dummy_state = {"h": 1}
        return ts.data.Batch(act=zero_actions, state=dummy_state)

    def learn(self, batch, **kwargs):
        """Do nothing with ``batch`` and return an empty dict."""
        return dict()

In [4]:
# create environment

# Seed every RNG up front so that network initialisation and rollouts are
# repeatable under Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Vectorised environments: 3 for training, 13 for testing.
train_envs = ts.env.DummyVectorEnv([make_env for _ in range(3)])
test_envs = ts.env.DummyVectorEnv([make_env for _ in range(13)])
train_envs.seed(SEED)
test_envs.seed(SEED)

# Standalone instance kept for inspecting the observation/action spaces.
# Created last so the cell ends on an assignment and prints nothing.
env = make_env()



In [5]:
# setup network

class Net(nn.Module):
    """Four-layer MLP mapping flattened observations to one output per action.

    Args:
        state_shape: Shape of a single observation (an int or a sequence of
            ints); its product is the width of the input layer.
        action_shape: Shape or count of actions; its product is the width of
            the output layer.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; nn.Linear is given plain ints.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Compute the network output for a batch of observations.

        Args:
            obs: Batch of observations (array-like or tensor) whose first
                axis is the batch dimension; remaining axes are flattened.
            state: Passed through unchanged.
            info: Unused. Defaults to ``None`` rather than a shared mutable
                dict.

        Returns:
            A ``(logits, state)`` tuple, where ``logits`` has shape
            ``(batch, prod(action_shape))``.
        """
        # as_tensor converts arrays and also casts tensors of another dtype
        # (e.g. float64) to float32, which the Linear layers require.
        obs = torch.as_tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        # reshape (unlike view) also works on non-contiguous inputs.
        logits = self.model(obs.reshape(batch, -1))
        return logits, state

In [6]:
# Use each space's `shape` when it is non-empty; otherwise fall back to `n`.
state_shape = env.observation_space.shape
if not state_shape:
    state_shape = env.observation_space.n

action_shape = env.action_space.shape
if not action_shape:
    action_shape = env.action_space.n

In [7]:
# Two separately initialised networks of identical architecture
# (constructed in order: net1, then net2).
net1, net2 = (Net(state_shape, action_shape) for _ in range(2))

In [8]:
# One Adam optimizer per network, each with learning rate 1e-3.
optim1, optim2 = (
    torch.optim.Adam(network.parameters(), lr=1e-3)
    for network in (net1, net2)
)

In [9]:
# Wrap each network/optimizer pair in its own DQN policy; both policies share
# the same hyperparameters.
p1, p2 = (
    ts.policy.DQNPolicy(
        network,
        optimizer,
        discount_factor=0.9,
        estimation_step=3,
        target_update_freq=320,
    )
    for network, optimizer in ((net1, optim1), (net2, optim2))
)

# Combine the two per-agent policies into the single policy object that the
# collectors and trainer operate on.
policy = TwoAgentPolicy(
    observation_space=env.observation_space,
    action_space=env.action_space,
    policies=(p1, p2),
)

In [10]:
# setup collector

# The replay buffer is split into `buffer_num` equal sub-buffers and the
# collector fills one sub-buffer per environment. Sizing `buffer_num` to the
# number of training environments keeps the whole 20000-transition capacity
# usable; a larger value would leave the extra sub-buffers permanently empty.
train_collector = ts.data.Collector(
    policy,
    train_envs,
    ts.data.VectorReplayBuffer(20000, len(train_envs)),
    exploration_noise=True,
)
# No buffer is passed for the test collector.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [11]:
# training

# NOTE(review): no train_fn/test_fn is passed, so nothing in this cell adjusts
# the policies' exploration epsilon between training and testing — confirm
# that exploration is configured elsewhere (e.g. inside TwoAgentPolicy).
result = ts.trainer.offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
)
# `result` is the summary dict returned by the trainer; print it in full.
print(f'Finished training! Use {result}')

Epoch #1: 10008it [00:09, 1077.97it/s, env_step=10008, len=0, loss=0.012, n/ep=0, n/st=12, rew=0.00]


Epoch #1: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #2: 10008it [00:09, 1067.96it/s, env_step=20016, len=0, loss=0.010, n/ep=0, n/st=12, rew=0.00]


Epoch #2: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #3: 10008it [00:09, 1061.58it/s, env_step=30024, len=0, loss=0.004, n/ep=0, n/st=12, rew=0.00]


Epoch #3: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #4: 10008it [00:09, 1048.38it/s, env_step=40032, len=0, loss=0.000, n/ep=0, n/st=12, rew=0.00]


Epoch #4: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #5: 10008it [00:09, 1037.11it/s, env_step=50040, len=0, loss=0.001, n/ep=0, n/st=12, rew=0.00]


Epoch #5: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #6: 10008it [00:09, 1041.17it/s, env_step=60048, len=0, loss=0.001, n/ep=0, n/st=12, rew=0.00]


Epoch #6: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #7: 10008it [00:09, 1024.51it/s, env_step=70056, len=0, loss=0.000, n/ep=0, n/st=12, rew=0.00]


Epoch #7: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #8: 10008it [00:09, 1043.23it/s, env_step=80064, len=0, loss=0.003, n/ep=0, n/st=12, rew=0.00]


Epoch #8: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #9: 10008it [00:09, 1058.68it/s, env_step=90072, len=0, loss=0.000, n/ep=0, n/st=12, rew=0.00]


Epoch #9: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0


Epoch #10: 10008it [00:09, 1035.17it/s, env_step=100080, len=0, loss=0.000, n/ep=0, n/st=12, rew=0.00]


Epoch #10: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #0
Finished training! Use {'test_step': 99000, 'test_episode': 1100, 'test_time': '10.67s', 'test_speed': '9278.36 step/s', 'best_reward': 1.0, 'best_result': '1.00 ± 0.00', 'duration': '106.08s', 'train_time/model': '58.24s', 'train_step': 100080, 'train_episode': 1113, 'train_time/collector': '37.17s', 'train_speed': '1048.96 step/s'}
