In [1]:
import torch, numpy as np
from torch import nn

import tianshou as ts
from tianshou.utils import TensorboardLogger, WandbLogger
from torch.utils.tensorboard import SummaryWriter, writer

from agents import TwoAgentPolicy
from agents.lib_agents import SinePolicy
from agents.lib_agents import SAC

from utils import make_render_env, make_env, make_discrete_env, make_render_discrete_env

In [2]:
# create environment

# Seed the global NumPy and PyTorch RNGs up front so that anything drawing from
# them (e.g. torch weight initialisation, torch/numpy sampling) is repeatable
# under "Restart Kernel -> Run All".
# NOTE(review): RNGs owned by the environments themselves are not seeded here;
# whether make_env / make_render_env expose a seed hook is not visible from this
# notebook -- confirm and seed them too if they do.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single stand-alone instance; later cells only read its observation/action spaces.
env = make_env()
# 10 environment copies feed the training collector; 5 copies built with
# make_render_env feed the test collector.
train_envs = ts.env.DummyVectorEnv([make_env for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([make_render_env for _ in range(5)])



In [3]:
# build the two per-agent policies and the joint policy wrapping them

# Pieces of the environment spec that the SAC agent is configured from.
bar_action_space = env.action_space['bar']
observation_shape = env.observation_space.shape

# First agent: SinePolicy, constructed with no arguments.
p1 = SinePolicy()

# Second agent: SAC(...) hands back a callable, which is then invoked with
# n_step=1 to obtain the actual policy object.
build_sac = SAC(bar_action_space, observation_shape, bar_action_space.shape)
p2 = build_sac(n_step=1)

# Joint policy over the full observation/action spaces; the tuple order is (p1, p2).
policy = TwoAgentPolicy(
    observation_space=env.observation_space,
    action_space=env.action_space,
    policies=(p1, p2),
)

In [4]:
# collectors: one for gathering training data, one for evaluation

# Replay storage for training, constructed from a size of 2000 and the number
# of training environments.
replay_buffer = ts.data.VectorReplayBuffer(2000, len(train_envs))

train_collector = ts.data.Collector(
    policy,
    train_envs,
    replay_buffer,
    exploration_noise=True,
)

# The evaluation collector is created without an explicit buffer argument.
# NOTE(review): exploration_noise is also enabled here -- confirm that noisy
# actions are intended during evaluation.
test_collector = ts.data.Collector(
    policy,
    test_envs,
    exploration_noise=True,
)

In [5]:
# logging: metrics are reported through tianshou's WandbLogger

# All logger options in one place so they are easy to scan and tweak.
# NOTE(review): run_id is a fixed string -- confirm that repeated executions of
# this notebook are meant to share the same W&B run.
wandb_settings = dict(
    save_interval=1,
    project="test-project",
    name='Devanshu Singla',
    entity='penalty-shot-project',
    run_id='sac',
)
logger = WandbLogger(**wandb_settings)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: W&B API key is configured (use `wandb login --relogin` to force relogin)


In [6]:
# training: run the off-policy training loop and report the summary

# Hyper-parameters for the trainer, collected in one mapping.
trainer_settings = dict(
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    logger=logger,
)

# The trainer's return value is kept in `result` and printed verbatim below.
result = ts.trainer.offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    **trainer_settings,
)
print(f'Finished training! Use {result}')

Epoch #1: 10001it [00:23, 419.15it/s, env_step=10000, len=0, loss/actor=-0.941, loss/critic1=0.008, loss/critic2=0.008, n/ep=0, n/st=10, rew=0.00]                           


Epoch #1: test_reward: 0.580000 ± 0.814616, best_reward: 0.900000 ± 0.435890 in #0


Epoch #2: 10001it [00:23, 422.42it/s, env_step=20000, len=0, loss/actor=-1.570, loss/critic1=0.007, loss/critic2=0.007, n/ep=0, n/st=10, rew=0.00]                           


Epoch #2: test_reward: 0.920000 ± 0.391918, best_reward: 0.920000 ± 0.391918 in #2


Epoch #3: 10001it [00:24, 411.13it/s, env_step=30000, len=0, loss/actor=-2.162, loss/critic1=0.006, loss/critic2=0.006, n/ep=0, n/st=10, rew=0.00]                           


Epoch #3: test_reward: 0.800000 ± 0.600000, best_reward: 0.920000 ± 0.391918 in #2


Epoch #4: 10001it [00:24, 412.83it/s, env_step=40000, len=0, loss/actor=-2.723, loss/critic1=0.004, loss/critic2=0.005, n/ep=0, n/st=10, rew=0.00]                           


Epoch #4: test_reward: 0.860000 ± 0.510294, best_reward: 0.920000 ± 0.391918 in #2


Epoch #5: 10001it [00:24, 400.63it/s, env_step=50000, len=0, loss/actor=-3.242, loss/critic1=0.004, loss/critic2=0.004, n/ep=0, n/st=10, rew=0.00]                           


Epoch #5: test_reward: 0.860000 ± 0.510294, best_reward: 0.920000 ± 0.391918 in #2


Epoch #6: 10001it [00:24, 411.76it/s, env_step=60000, len=88, loss/actor=-3.759, loss/critic1=0.002, loss/critic2=0.002, n/ep=1, n/st=10, rew=1.00]                           


Epoch #6: test_reward: 0.780000 ± 0.625780, best_reward: 0.920000 ± 0.391918 in #2


Epoch #7: 10001it [00:24, 409.94it/s, env_step=70000, len=0, loss/actor=-4.156, loss/critic1=0.004, loss/critic2=0.004, n/ep=0, n/st=10, rew=0.00]                           


Epoch #7: test_reward: 0.860000 ± 0.510294, best_reward: 0.920000 ± 0.391918 in #2


Epoch #8: 10001it [00:24, 415.94it/s, env_step=80000, len=0, loss/actor=-4.620, loss/critic1=0.004, loss/critic2=0.004, n/ep=0, n/st=10, rew=0.00]                           


Epoch #8: test_reward: 0.940000 ± 0.341174, best_reward: 0.940000 ± 0.341174 in #8


Epoch #9: 10001it [00:23, 423.57it/s, env_step=90000, len=0, loss/actor=-4.889, loss/critic1=0.004, loss/critic2=0.004, n/ep=0, n/st=10, rew=0.00]                           


Epoch #9: test_reward: 0.860000 ± 0.510294, best_reward: 0.940000 ± 0.341174 in #8


Epoch #10: 10001it [00:22, 434.96it/s, env_step=100000, len=0, loss/actor=-5.309, loss/critic1=0.003, loss/critic2=0.004, n/ep=0, n/st=10, rew=0.00]                           


Epoch #10: test_reward: 0.880000 ± 0.474974, best_reward: 0.940000 ± 0.341174 in #8
Finished training! Use {'test_step': 96986, 'test_episode': 1100, 'test_time': '92.48s', 'test_speed': '1048.73 step/s', 'best_reward': 0.94, 'best_result': '0.94 ± 0.34', 'duration': '332.92s', 'train_time/model': '191.18s', 'train_step': 100000, 'train_episode': 1130, 'train_time/collector': '49.26s', 'train_speed': '415.91 step/s'}
