In [49]:
import gym
import tianshou as ts


# Number of environment instances; later cells also build this many networks,
# policies and collectors.
ENV_N = 10
# Gym id that every environment in this notebook is created from.
ENV_NAME = 'CartPole-v1'

# Single reference environment; later cells read its observation/action spaces
# and its spec's reward threshold.
env = gym.make(ENV_NAME)
# List of ENV_N separately constructed environments.
envs = [gym.make(ENV_NAME) for _ in range(ENV_N)]


### Initialize multiple environments

In [50]:
# One training environment per agent; they are paired index-wise with the
# policies when the collectors are built.
train_envs = [gym.make(ENV_NAME) for _ in range(ENV_N)]
# One evaluation environment per agent.
# TODO: decide whether a single shared test environment would suffice for all
# policies, or whether each policy really needs its own.
test_envs = [gym.make(ENV_NAME) for _ in range(ENV_N)]

In [51]:
### define policy networks and optimization criteria

import torch, numpy as np
from torch import nn

class Net(nn.Module):
    """Multi-layer perceptron mapping a flattened observation to one logit per action.

    The network has three hidden layers of 128 units, each followed by a ReLU,
    and a final linear layer of ``prod(action_shape)`` units.

    Args:
        state_shape: Observation shape (tuple of ints) or a plain int; its
            product is the input width of the first layer.
        action_shape: Action shape (tuple of ints) or a plain int; its product
            is the width of the output layer.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; hand nn.Linear plain Python ints.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Compute the logits for a batch of observations.

        Args:
            obs: Batch of observations (array-like or tensor) whose first
                dimension is the batch size; everything after it is flattened.
            state: Passed through unchanged and returned as the second item.
            info: Accepted but not used by this network.

        Returns:
            A ``(logits, state)`` tuple; ``logits`` has one row per observation.
        """
        # Convert arrays and non-float32 tensors alike; a float32 tensor is
        # returned as-is, so the common path costs nothing extra.
        obs = torch.as_tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        # reshape (unlike view) also accepts inputs whose memory layout cannot
        # be flattened without a copy.
        logits = self.model(obs.reshape(batch, -1))
        return logits, state

# Use the space's `shape` when it is truthy; otherwise (e.g. an empty tuple)
# fall back to the space's `n` attribute.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
# Stand-alone network and Adam optimizer (learning rate 1e-3) built from those
# shapes. NOTE(review): the later cells visible here use `nets` / `global_net`
# instead, so `net` and `optim` look unused — confirm before removing.
net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

In [52]:
# One independently initialised network per environment.
nets = [Net(state_shape, action_shape) for _ in range(ENV_N)]

# One DQN policy per network, each driven by its own Adam optimizer.
policies = [
    ts.policy.DQNPolicy(
        q_net,
        torch.optim.Adam(q_net.parameters(), lr=1e-3),
        discount_factor=0.9,
        estimation_step=3,
        target_update_freq=320,
    )
    for q_net in nets
]

# Policies and training environments are paired index-wise later on.
assert len(policies) == len(train_envs)

## collector

The collector is a key concept in Tianshou. It allows the policy to interact with different types of environments conveniently. In each step, the collector will let the policy perform (at least) a specified number of steps or episodes and store the data in a replay buffer.

In [53]:
# Training collectors: policy i is paired with training environment i and gets
# its own replay buffer of size 1000.
train_collectors = [
    ts.data.Collector(agent, train_env, ts.data.ReplayBuffer(size=1000))
    for agent, train_env in zip(policies, train_envs)
]
# Evaluation collectors: policy i is paired with test environment i; no replay
# buffer is passed.
# TODO: decide whether a single shared test environment would suffice for all
# policies, or whether each policy really needs its own.
test_collectors = [
    ts.data.Collector(agent, test_env)
    for agent, test_env in zip(policies, test_envs)
]


## training

In [54]:
# One slot per policy; each empty-list placeholder is replaced by the trainer's
# result once that policy's run finishes.
results = [[] for _ in range(ENV_N)]
for j in range(ENV_N):
    print(f"Training policy {j} on env {j}")
    # The lambdas look up `j` when they are called; they are only handed to the
    # trainer call of this same iteration, so they act on policies[j].
    results[j] = ts.trainer.offpolicy_trainer(
        policies[j],
        train_collectors[j],
        test_collectors[j],
        max_epoch=2,
        step_per_epoch=1000,
        collect_per_step=10,
        episode_per_test=100,
        batch_size=64,
        # Exploration rate: 0.1 while training, 0.05 while testing.
        train_fn=lambda e: policies[j].set_eps(0.1),
        test_fn=lambda e: policies[j].set_eps(0.05),
        # Stop once the value passed by the trainer reaches the environment's
        # reward threshold.
        stop_fn=lambda x: x >= env.spec.reward_threshold,
        writer=None,
    )
    print(f'Finished training! Use {results[j]["duration"]}')

Epoch #1:   2%|2         | 23/1000 [00:00<00:08, 109.56it/s, len=11.00, loss=3.230363, n/ep=1.00, n/st=10.00, rew=11.00, v/ep=199.54, v/st=2017.19]Training policy 0 on env 0
Epoch #1: 1001it [00:09, 110.13it/s, len=0.00, loss=0.282627, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=17.82, v/st=2052.60]
Epoch #2:   2%|2         | 20/1000 [00:00<00:09, 106.61it/s, len=0.00, loss=0.326034, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=21.72, v/st=2023.19]Epoch #1: test_reward: 33.320000, best_reward: 33.320000 in #1
Epoch #2: 1001it [00:09, 109.93it/s, len=164.00, loss=0.046963, n/ep=1.00, n/st=10.00, rew=164.00, v/ep=12.30, v/st=2055.77]
Epoch #1:   2%|2         | 22/1000 [00:00<00:08, 115.56it/s, len=10.00, loss=3.614060, n/ep=1.00, n/st=10.00, rew=10.00, v/ep=208.41, v/st=1990.51]Epoch #2: test_reward: 147.650000, best_reward: 147.650000 in #2
Finished training! Use 24.55s
Training policy 1 on env 1
Epoch #1: 1001it [00:08, 115.18it/s, len=0.00, loss=0.155781, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=6.35

### Initialize a new (global) policy that does not interact with any environment

In [55]:
# Network for the global policy, built with the same shapes as the local ones.
global_net = Net(state_shape, action_shape)

# DQN policy configured with the same hyper-parameters as the per-environment
# policies, with its own Adam optimizer.
global_policy = ts.policy.DQNPolicy(
    global_net,
    torch.optim.Adam(global_net.parameters(), lr=1e-3),
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
)

In [62]:
# Fit the global policy on the experience already stored by the local agents:
# one pass over each of the ENV_N training collectors' replay buffers, in order.
# Unused settings left over from an earlier draft:
# num_G_step = 1000
# batch_size = 128
for j in range(ENV_N):
    # Batch size 0: take the whole content of the replay buffer (the recorded
    # output shows 1000 rewards per buffer).
    batch, indice = train_collectors[j].buffer.sample(0)
    print(batch.rew.shape)
    # Let the global policy pre-process the sampled batch before learning.
    batch = global_policy.process_fn(batch, train_collectors[j].buffer, indice)
    # A single learn() call per buffer.
    global_policy.learn(batch)



(1000,)
(1000,)
(1000,)
(1000,)
(1000,)
(1000,)
(1000,)
(1000,)
(1000,)
(1000,)


### now evaluate the global policy

In [64]:
# Separate environment instance for evaluating the global policy.
test_env_global = gym.make(ENV_NAME)

In [65]:
# Collector pairing the global policy with its evaluation environment; no
# replay buffer is passed.
test_collector_global = ts.data.Collector(global_policy, test_env_global)

In [66]:
# Run 5 evaluation episodes with rendering enabled.
# NOTE(review): `render=1 / 35` is presumably the pause in seconds between
# rendered frames — confirm against the Tianshou Collector documentation.
result = test_collector_global.collect(n_episode=5, render=1 / 35 )

In [67]:
# Show the statistics returned by the evaluation run above.
print(result)

{'n/ep': 5.0, 'n/st': 190, 'v/st': 33.27685260854186, 'v/ep': 0.8757066475932068, 'rew': 38.0, 'len': 38.0}


# log


| ENV_N   |      replay buffer size      |  rewards of local policies | reward of test policy (global) |
|:----------:|:-------------:|:------:|:------:|
| 3 |  1000 |  240, 213, 196 | 27.5 |
| 10 |    1000   | 147, 439, 246, 212, 222, 185, 238, 213, 193, 460  | 38 |
 

In [13]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer created with the path 'log/dqn'.
# NOTE(review): every trainer call visible in this notebook passes writer=None,
# so this writer appears to be unused — confirm.
writer = SummaryWriter('log/dqn')

In [14]:
# Display `result`.
# NOTE(review): the output recorded below has trainer-style keys ('train_step',
# 'best_reward', ...) rather than the collector statistics printed earlier, and
# this cell's execution count is out of sequence — presumably stale output from
# an earlier session; re-run to confirm.
result

{'train_step': 11940,
 'train_episode': 584.0,
 'train_time/collector': '6.14s',
 'train_time/model': '4.87s',
 'train_speed': '1084.19 step/s',
 'test_step': 200658,
 'test_episode': 1200.0,
 'test_time': '76.90s',
 'test_speed': '2609.21 step/s',
 'best_reward': 196.88,
 'duration': '87.92s'}

# save policy

In [15]:
# Save the policy's state dict to 'dqn.pth'.
# NOTE(review): no cell visible in this notebook defines `policy` (only
# `policies` and `global_policy`), so this presumably raises NameError on a
# fresh kernel — confirm which policy is meant to be saved.
torch.save(policy.state_dict(), 'dqn.pth')

In [4]:
# Save the policy's state dict to 'dqn-best.pth'.
# NOTE(review): no cell visible in this notebook defines `policy` (only
# `policies` and `global_policy`), so this presumably raises NameError on a
# fresh kernel — confirm which policy is meant to be saved.
torch.save(policy.state_dict(), 'dqn-best.pth')

In [19]:
# Ad-hoc extra training run for policy 1.
# NOTE(review): the training collector is train_collectors[0], which was built
# around policies[0] and train_envs[0], while the policy being updated and the
# epsilon callbacks refer to policies[1]. Confirm whether training policy 1 on
# data gathered by policy 0 is intended, or whether this should be
# train_collectors[j].
j = 1
ts.trainer.offpolicy_trainer(
    policies[j],
    train_collectors[0],
    test_collectors[j],
    max_epoch=2,
    step_per_epoch=1000,
    collect_per_step=10,
    episode_per_test=100,
    batch_size=64,
    # Exploration rate: 0.1 while training, 0.05 while testing.
    train_fn=lambda e: policies[j].set_eps(0.1),
    test_fn=lambda e: policies[j].set_eps(0.05),
    # Stop once the value passed by the trainer reaches the environment's
    # reward threshold.
    stop_fn=lambda x: x >= env.spec.reward_threshold,
    writer=None,
)

Epoch #1: 1001it [00:09, 107.26it/s, len=0.00, loss=0.039329, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=11.86, v/st=1991.73]
Epoch #2:   2%|2         | 22/1000 [00:00<00:08, 112.93it/s, len=0.00, loss=0.048284, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=10.14, v/st=1999.42]Epoch #1: test_reward: 152.870000, best_reward: 152.870000 in #1
Epoch #2:  72%|#######1  | 718/1000 [00:06<00:02, 105.88it/s, len=0.00, loss=0.023279, n/ep=0.00, n/st=10.00, rew=0.00, v/ep=9.90, v/st=1951.06]


KeyboardInterrupt: 