In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%cd ../..

/home/ubuntu/sustaingym


In [2]:
import gymnasium as gym
# import gym
import numpy as np
import torch

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.continuous import Actor, Critic

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# environments
from sustaingym.envs.evcharging import RealTraceGenerator
from sustaingym.envs.evcharging.ev_charging import EVChargingEnv

test_ranges = (
    ('2019-05-01', '2019-08-31'),
    ('2019-09-01', '2019-12-31'),
    ('2020-02-01', '2020-05-31'),
    ('2021-05-01', '2021-08-31'),
)

env = EVChargingEnv(RealTraceGenerator('caltech', test_ranges[0]))
train_envs = DummyVectorEnv([lambda: EVChargingEnv(RealTraceGenerator('caltech', test_ranges[0])) for _ in range(10)])
test_envs = DummyVectorEnv([lambda: EVChargingEnv(RealTraceGenerator('caltech', test_ranges[0])) for _ in range(5)])
# env = gym.make('CartPole-v0')
# train_envs = DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(20)])
# test_envs = DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(10)])

In [7]:
# model & optimizer

# net = Net(env., hidden_sizes=[64, 64], device=device)
# print(env.action_space.shape)
# actor = Actor(net, 1, device=device).to(device)

# model & optimizer
net = Net(env._vectorized_shape, hidden_sizes=[64, 64], device=device)
actor = Actor(net, env.action_space.shape, device=device).to(device)
critic = Critic(net, device=device).to(device)
actor_critic = ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

# PPO policy
def dist_fn(logits):
    return torch.distributions.Independent(torch.distributions.Normal(logits, 1), 1)

dist = dist_fn #torch.distributions.Categorical() # torch.distributions.Categorical
policy = PPOPolicy(actor, critic, optim, dist, action_space=env.action_space, deterministic_eval=True)
        
          
# collector
train_collector = Collector(policy, train_envs, VectorReplayBuffer(20000, len(train_envs)))
test_collector = Collector(policy, test_envs)

# trainer
result = onpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=10,
    step_per_epoch=50000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    step_per_collect=2000,
    stop_fn=lambda mean_reward: mean_reward >= 195,
)
print(result)

AssertionError: 

In [9]:
env.action_space

Box(0.0, 1.0, (54,), float32)

In [12]:
np.concatenate(env._vectorized_obs)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [7]:
%debug

> [0;32m/home/ubuntu/.conda/envs/sustaingymnasium/lib/python3.9/site-packages/tianshou/env/venvs.py[0m(332)[0;36mstep[0;34m()[0m
[0;32m    330 [0;31m        [0mid[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_wrap_id[0m[0;34m([0m[0mid[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    331 [0;31m        [0;32mif[0m [0;32mnot[0m [0mself[0m[0;34m.[0m[0mis_async[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 332 [0;31m            [0;32massert[0m [0mlen[0m[0;34m([0m[0maction[0m[0;34m)[0m [0;34m==[0m [0mlen[0m[0;34m([0m[0mid[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    333 [0;31m            [0;32mfor[0m [0mi[0m[0;34m,[0m [0mj[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mid[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    334 [0;31m                [0mself[0m[0;34m.[0m[0mworkers[0m[0;34m[[0m[0mj[0m[0;34m][0m[0;34m.[0m[0msend[0m[0;34m([0m[0maction[0m[0;34m[[0m[0mi[0m[0

In [8]:
# Let's watch its performance!
policy.eval()
result = test_collector.collect(n_episode=1, render=False)
print("Final reward: {}, length: {}".format(result["rews"].mean(), result["lens"].mean()))

Final reward: 200.0, length: 200.0


In [6]:
import tianshou
print(tianshou.__version__)

0.4.11
