In [1]:
import gymnasium as gym
import gym_environment

import imitation

import imitation.algorithms

import imitation.data
import imitation.data.rollout
import imitation.data.wrappers

import imitation.policies
import imitation.policies.base

import imitation.util.util

import stable_baselines3
import stable_baselines3.common
import stable_baselines3.common.evaluation

import sys
import abc
import numpy as np
from typing import Union, Dict

# Make the project environment available under a Gymnasium id.
# max_episode_steps=1440 asks Gymnasium to truncate every episode after 1440 steps.
gym.register(
	id="HVAC-v0",
	entry_point=gym_environment.Environment,
	max_episode_steps=1440,
)

# Seed for the generator handed to make_vec_env. A bare default_rng() pulls fresh
# OS entropy on every kernel restart, so the vectorised environments would differ
# from run to run and the numbers printed further down could not be reproduced.
ENV_SEED = 0

# Single environment instance: used for its spaces, by the hand-written policy,
# and for policy evaluation.
env = gym.make("HVAC-v0")

# Four copies of the environment for collecting demonstrations. Each copy is
# wrapped in RolloutInfoWrapper; the second argument passed to a post-wrapper
# is not needed here.
venv = imitation.util.util.make_vec_env(
	"HVAC-v0",
	rng=np.random.default_rng(ENV_SEED),
	n_envs=4,
	# The parameter is deliberately not called `env`, to avoid shadowing the
	# notebook-level `env` defined just above.
	post_wrappers=[lambda wrapped_env, _: imitation.data.wrappers.RolloutInfoWrapper(wrapped_env)],
)


Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_cond=0.15, int_roof_therm_cond=0.62, outside_convection=22, inside_convection=6, settings=[-1, 0, 1], air_density=1.293, air_heat_capacity=718, ext_wall_ext_thick=0.01, ext_wall_ext_heat_capacity=1000, ext_wall_ext_density=1300, ext_wall_int_thick=0.01, ext_wall_int_heat_capacity=837, ext_wall_int_density=801, ext_roof_ext_thick=0.045, ext_roof_ext_heat_capacity=1400, ext_roof_ext_density=860, ext_roof_int_thick=0.015, ext_roof_int_heat_capacity=1860, ext_roof_int_density=630, int_wall_outside_thick=0.01, int_wall_outside_heat_capacity=1000, int_wall_outside_density=800, damper_leak=0.05, epsilon=0.8)
Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_c

In [2]:
class DumbPolicy(imitation.policies.base.NonTrainablePolicy):
	"""Hand-coded dead-band controller for the two-room HVAC environment.

	For every observation the policy compares each room's temperature with its
	setpoint, picks an ``(ac_status, dampers)`` pair and returns the position
	of that pair in the environment's ``actions`` table.  It depends on the
	notebook-level ``env`` created with ``gym.make``.
	"""

	def _choose_action(self, obs: Union[np.ndarray, Dict[str, np.ndarray]],) -> int:
		"""Return the index of the action selected for a single observation.

		Args:
			obs: Seven values, unpacked in order as ``room0_temp``,
				``room0_setp``, ``room1_temp``, ``room1_setp``,
				``outside_temp``, ``prev_setting`` and one trailing value
				that is ignored.

		Returns:
			Position of the chosen ``(ac_status, dampers)`` pair inside
			``env.unwrapped.actions``.

		Raises:
			ValueError: If ``obs`` does not unpack into seven values, or if the
				chosen pair is missing from the action table (assuming the
				table is a list/tuple, whose ``index`` raises ``ValueError``).
		"""
		# Half-width of the dead band around each setpoint.
		epsilon = 0.9

		room0_temp, room0_setp, room1_temp, room1_setp, outside_temp, prev_setting, _ = obs

		# Read the action table from the unwrapped environment.  `env` comes from
		# gym.make(), which returns a stack of wrappers; Gymnasium forwards unknown
		# attributes through wrappers only with a deprecation warning, and stops
		# forwarding them from v1.0 on.
		actions = env.unwrapped.actions
		prev_ac_status, prev_dampers = actions[int(prev_setting)]

		# ac_status is -1 when rooms are too warm and +1 when they are too cold
		# (presumably cool / heat -- confirm against the environment).
		# `dampers` holds one flag per room; the rules below set a room's flag to
		# True when only the *other* room needs conditioning (presumably True
		# closes that room's damper -- TODO confirm).

		# Both rooms on the same side of their setpoints, at least one of them
		# outside the dead band: condition both rooms.
		if room0_temp > room0_setp + epsilon and room1_temp > room1_setp:
			ac_status, dampers = (-1, [[False, False]])
		elif room0_temp < room0_setp - epsilon and room1_temp < room1_setp:
			ac_status, dampers = (1, [[False, False]])

		elif room0_temp > room0_setp and room1_temp > room1_setp + epsilon:
			ac_status, dampers = (-1, [[False, False]])
		elif room0_temp < room0_setp and room1_temp < room1_setp - epsilon:
			ac_status, dampers = (1, [[False, False]])

		# One room too warm while the other is inside its dead band.
		elif room0_temp > room0_setp + epsilon and abs(room1_temp - room1_setp) < epsilon:
			ac_status, dampers = (-1, [[False, True]])
		elif room1_temp > room1_setp + epsilon and abs(room0_temp - room0_setp) < epsilon:
			ac_status, dampers = (-1, [[True, False]])

		# One room too cold while the other is inside its dead band.
		elif room0_temp < room0_setp - epsilon and abs(room1_temp - room1_setp) < epsilon:
			ac_status, dampers = (1, [[False, True]])
		elif room1_temp < room1_setp - epsilon and abs(room0_temp - room0_setp) < epsilon:
			ac_status, dampers = (1, [[True, False]])

		# Rooms need opposite treatment: serve the room with the larger error.
		elif room0_temp > room0_setp + epsilon and room1_temp < room1_setp - epsilon:
			error0, error1 = abs(room0_temp - room0_setp), abs(room1_temp - room1_setp)
			if error0 > error1:
				ac_status, dampers = (-1, [[False, True]])
			else:
				ac_status, dampers = (1, [[True, False]])
		elif room0_temp < room0_setp - epsilon and room1_temp > room1_setp + epsilon:
			error0, error1 = abs(room0_temp - room0_setp), abs(room1_temp - room1_setp)
			if error0 > error1:
				ac_status, dampers = (1, [[False, True]])
			else:
				ac_status, dampers = (-1, [[True, False]])

		# No rule fired: repeat whatever the previous action was.
		else:
			ac_status, dampers = (prev_ac_status, prev_dampers)

		# Override: when both rooms are warmer than outside and -1 was selected,
		# or both are colder than outside and +1 was selected, switch the unit to 0
		# and keep the previous damper positions.
		# NOTE(review): the intent is presumably to let the outside temperature do
		# the work instead of running the unit -- confirm.
		if room0_temp > outside_temp and room1_temp > outside_temp and ac_status < 0:
			ac_status = 0
			dampers = prev_dampers
		if room0_temp < outside_temp and room1_temp < outside_temp and ac_status > 0:
			ac_status = 0
			dampers = prev_dampers

		return actions.index((ac_status, dampers))

In [3]:
# Hand-written controller that acts as the demonstrator.
stupid = DumbPolicy(env.observation_space, env.action_space)

# Seeded generator: it is passed to rollout() here and to the BC trainer in a
# later cell. An unseeded default_rng() draws fresh OS entropy on every kernel
# restart, which makes this NumPy-side randomness impossible to reproduce.
ROLLOUT_SEED = 1
rng = np.random.default_rng(ROLLOUT_SEED)

# Run the demonstrator in the vectorised environment until at least 2000
# complete episodes have been collected (no minimum on the timestep count).
rollouts = imitation.data.rollout.rollout(
	stupid,
	venv,
	imitation.data.rollout.make_sample_until(min_timesteps=None, min_episodes=2000),
	rng=rng,
	verbose=True
)

# Flatten the per-episode trajectories into a single set of transitions.
transitions = imitation.data.rollout.flatten_trajectories(rollouts)

Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_cond=0.15, int_roof_therm_cond=0.62, outside_convection=22, inside_convection=6, settings=[-1, 0, 1], air_density=1.293, air_heat_capacity=718, ext_wall_ext_thick=0.01, ext_wall_ext_heat_capacity=1000, ext_wall_ext_density=1300, ext_wall_int_thick=0.01, ext_wall_int_heat_capacity=837, ext_wall_int_density=801, ext_roof_ext_thick=0.045, ext_roof_ext_heat_capacity=1400, ext_roof_ext_density=860, ext_roof_int_thick=0.015, ext_roof_int_heat_capacity=1860, ext_roof_int_density=630, int_wall_outside_thick=0.01, int_wall_outside_heat_capacity=1000, int_wall_outside_density=800, damper_leak=0.05, epsilon=0.8)
Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_c

  logger.warn(


Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_cond=0.15, int_roof_therm_cond=0.62, outside_convection=22, inside_convection=6, settings=[-1, 0, 1], air_density=1.293, air_heat_capacity=718, ext_wall_ext_thick=0.01, ext_wall_ext_heat_capacity=1000, ext_wall_ext_density=1300, ext_wall_int_thick=0.01, ext_wall_int_heat_capacity=837, ext_wall_int_density=801, ext_roof_ext_thick=0.045, ext_roof_ext_heat_capacity=1400, ext_roof_ext_density=860, ext_roof_int_thick=0.015, ext_roof_int_heat_capacity=1860, ext_roof_int_density=630, int_wall_outside_thick=0.01, int_wall_outside_heat_capacity=1000, int_wall_outside_density=800, damper_leak=0.05, epsilon=0.8)
Constants(cooler_btu=2000, heater_btu=2500, ext_wall_thick=0.15, int_wall_thick=0.12, ext_wall_therm_cond=0.04, int_wall_therm_cond=0.18, ext_roof_thick=0.12, int_roof_thick=0.12, ext_roof_therm_c

KeyboardInterrupt: 

In [8]:
from imitation.algorithms import bc

# Behavioural-cloning trainer fed with the flattened demonstrator transitions.
# No `policy` is supplied, so BC constructs its own.
bc_trainer = bc.BC(
	demonstrations=transitions,
	observation_space=env.observation_space,
	action_space=env.action_space,
	rng=rng,
)

# Baseline: score the still-untrained policy over 10 episodes.
# The second value returned by evaluate_policy is discarded.
reward_before_training, _ = stable_baselines3.common.evaluation.evaluate_policy(
	bc_trainer.policy,
	env,
	n_eval_episodes=10,
)
print(f"Reward before training: {reward_before_training}")

# Ten passes over the demonstrations, logging once every 100000 batches.
bc_trainer.train(
	log_interval=100000,
	n_epochs=10,
)

# Same evaluation again, now on the trained policy.
reward_after_training, _ = stable_baselines3.common.evaluation.evaluate_policy(
	bc_trainer.policy,
	env,
	n_eval_episodes=10,
)
print(f"Reward after training: {reward_after_training}")

Reward before training: -38213.69628582746


0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00248 |
|    entropy        | 2.48     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 78.5     |
|    loss           | 2.48     |
|    neglogp        | 2.48     |
|    prob_true_act  | 0.0834   |
|    samples_so_far | 32       |
--------------------------------


99973batch [03:56, 363.50batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 100000    |
|    ent_loss       | -5.23e-05 |
|    entropy        | 0.0523    |
|    epoch          | 1         |
|    l2_loss        | 0         |
|    l2_norm        | 330       |
|    loss           | 0.0577    |
|    neglogp        | 0.0578    |
|    prob_true_act  | 0.962     |
|    samples_so_far | 3200032   |
---------------------------------


199986batch [07:55, 360.96batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 200000    |
|    ent_loss       | -3.97e-05 |
|    entropy        | 0.0397    |
|    epoch          | 2         |
|    l2_loss        | 0         |
|    l2_norm        | 424       |
|    loss           | 0.14      |
|    neglogp        | 0.14      |
|    prob_true_act  | 0.962     |
|    samples_so_far | 6400032   |
---------------------------------


299979batch [12:38, 353.59batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 300000    |
|    ent_loss       | -8.26e-05 |
|    entropy        | 0.0826    |
|    epoch          | 3         |
|    l2_loss        | 0         |
|    l2_norm        | 503       |
|    loss           | 0.108     |
|    neglogp        | 0.108     |
|    prob_true_act  | 0.94      |
|    samples_so_far | 9600032   |
---------------------------------


399974batch [17:17, 341.03batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 400000    |
|    ent_loss       | -8.63e-05 |
|    entropy        | 0.0863    |
|    epoch          | 4         |
|    l2_loss        | 0         |
|    l2_norm        | 576       |
|    loss           | 0.0866    |
|    neglogp        | 0.0867    |
|    prob_true_act  | 0.944     |
|    samples_so_far | 12800032  |
---------------------------------


499997batch [21:56, 384.46batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500000    |
|    ent_loss       | -8.22e-05 |
|    entropy        | 0.0822    |
|    epoch          | 5         |
|    l2_loss        | 0         |
|    l2_norm        | 646       |
|    loss           | 0.0449    |
|    neglogp        | 0.045     |
|    prob_true_act  | 0.968     |
|    samples_so_far | 16000032  |
---------------------------------


599974batch [26:46, 402.04batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 600000    |
|    ent_loss       | -4.94e-05 |
|    entropy        | 0.0494    |
|    epoch          | 6         |
|    l2_loss        | 0         |
|    l2_norm        | 724       |
|    loss           | 0.0211    |
|    neglogp        | 0.0211    |
|    prob_true_act  | 0.984     |
|    samples_so_far | 19200032  |
---------------------------------


699964batch [31:01, 393.09batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 700000    |
|    ent_loss       | -5.97e-05 |
|    entropy        | 0.0597    |
|    epoch          | 7         |
|    l2_loss        | 0         |
|    l2_norm        | 793       |
|    loss           | 0.12      |
|    neglogp        | 0.12      |
|    prob_true_act  | 0.943     |
|    samples_so_far | 22400032  |
---------------------------------


799991batch [35:44, 362.05batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 800000    |
|    ent_loss       | -4.07e-05 |
|    entropy        | 0.0407    |
|    epoch          | 8         |
|    l2_loss        | 0         |
|    l2_norm        | 862       |
|    loss           | 0.0464    |
|    neglogp        | 0.0465    |
|    prob_true_act  | 0.971     |
|    samples_so_far | 25600032  |
---------------------------------


900000batch [40:27, 370.78batch/s]


Reward after training: -20780.57926607579


In [13]:
# Show which policy class the BC trainer is using (the trainer was constructed
# without an explicit `policy` argument).
type(bc_trainer.policy)

imitation.policies.base.FeedForward32Policy