In [1]:
import gym
import tianshou as ts


ENV_N = 3
ENV_NAME = 'CartPole-v1'

env = gym.make(ENV_NAME)
envs = [gym.make(ENV_NAME) for _ in range(ENV_N)]


### initiate multiple envs

In [2]:
train_envs = [gym.make(ENV_NAME) for _ in range(ENV_N)] # each individual agent per env
test_envs = [gym.make(ENV_NAME) for _ in range(ENV_N)] ## TODO: think? do we need one test env for all policies or each policies need one test env

In [3]:
### define policy networks and optimization criteria

import torch, numpy as np
from torch import nn

class Net(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(*[
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape))
        ])
    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        logits = self.model(obs.view(batch, -1))
        return logits, state

state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

In [4]:
nets = [Net(state_shape, action_shape) for _ in range(ENV_N)]

policies = [ts.policy.DQNPolicy(nets[i],  # net
                                torch.optim.Adam(nets[i].parameters(), lr=1e-3),  # optim
                                discount_factor=0.9, 
                                estimation_step=3, 
                                target_update_freq=320) for i in range(ENV_N)]
assert len(policies) == len(train_envs)

## collector

The collector is a key concept in Tianshou. It allows the policy to interact with different types of environments conveniently. In each step, the collector will let the policy perform (at least) a specified number of steps or episodes and store the data in a replay buffer.

In [5]:
train_collectors = [ts.data.Collector(policies[i], 
                                      train_envs[i], 
                                      ts.data.ReplayBuffer(size=1000)) for i in range(ENV_N)]
test_collectors = [ts.data.Collector(policies[i], test_envs[i]) for i in range(ENV_N)] ## TODO: think? do we need one test env for all policies or each policies need one test env


## training

In [8]:
results = [[] for _ in range(ENV_N)]
for j in range(ENV_N):
    print(f"Training policy {j} on env {j}")
    results[j] = ts.trainer.offpolicy_trainer(
        policies[j], train_collectors[j], test_collectors[j],
        max_epoch=2, step_per_epoch=1000, collect_per_step=10,
        episode_per_test=100, batch_size=64,
        train_fn=lambda e: policies[j].set_eps(0.1),
        test_fn=lambda e: policies[j].set_eps(0.05),
        stop_fn=lambda x: x >= env.spec.reward_threshold,
        writer=None)
    print(f'Finished training! Use {results[j]["duration"]}')

Epoch #1:   2%|2         | 20/1000 [00:00<00:10, 94.15it/s, len=0.00, loss=0.533388, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=13.83, v/st=1951.27]Training policy 0 on env 0
Epoch #1: 1001it [00:20, 49.39it/s, len=0.00, loss=0.015264, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=11.47, v/st=1910.05]
Epoch #2:   2%|2         | 20/1000 [00:00<00:09, 105.23it/s, len=0.00, loss=0.014602, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=11.31, v/st=1899.52]Epoch #1: test_reward: 144.440000, best_reward: 144.440000 in #1
Epoch #2: 1001it [00:09, 105.44it/s, len=0.00, loss=0.022661, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=10.36, v/st=1966.98]
Epoch #1:   2%|2         | 22/1000 [00:00<00:08, 111.84it/s, len=0.00, loss=3.582766, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=191.03, v/st=2012.80]Epoch #2: test_reward: 173.010000, best_reward: 173.010000 in #2
Finished training! Use 41.99s
Training policy 1 on env 1
Epoch #1: 1001it [00:09, 107.24it/s, len=186.00, loss=0.566913, n/ep=1.00, n/st=10.00, rew=186.00, v/ep=13.65, v/

### reward from above training
- 173
- 236
- 161

In [17]:
# retrive data from reply buffer after training
train_collector_0 = train_collectors[0]

In [18]:
train_collector_0.data.cat(train_collectors[1].data)

Batch(
    policy: Batch(
                _state: Batch(),
            ),
    obs_next: array([[-0.0366928 , -0.34702743, -0.00845681,  0.33000577]]),
    act: array([0]),
    obs: array([[-0.0366928 , -0.34702743, -0.00845681,  0.33000577]]),
    rew: array([1.]),
    done: array([False]),
    state: Batch(),
    info: Batch(),
)

In [13]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('log/dqn')

In [14]:
result

{'train_step': 11940,
 'train_episode': 584.0,
 'train_time/collector': '6.14s',
 'train_time/model': '4.87s',
 'train_speed': '1084.19 step/s',
 'test_step': 200658,
 'test_episode': 1200.0,
 'test_time': '76.90s',
 'test_speed': '2609.21 step/s',
 'best_reward': 196.88,
 'duration': '87.92s'}

# save policy

In [15]:
torch.save(policy.state_dict(), 'dqn.pth')

In [4]:
torch.save(policy.state_dict(), 'dqn-best.pth')