In [1]:
import gym
import numpy as np
from collections import deque
import random
import re
import os
import sys
import time
import json
import itertools

# import stable_baselines3
from stable_baselines3 import PPO, A2C, DDPG, TD3
from stable_baselines3.common.utils import set_random_seed

from citylearn.citylearn import CityLearnEnv

import functools


In [2]:
class Constants:
    episodes = 3
    schema_path = 'data/citylearn_challenge_2022_phase_1/schema.json'


def action_space_to_dict(aspace):
    """ Only for box space """
    return { "high": aspace.high,
             "low": aspace.low,
             "shape": aspace.shape,
             "dtype": str(aspace.dtype)
    }

def env_reset(env):
    observations = env.reset()
    action_space = env.action_space
    observation_space = env.observation_space
    building_info = env.get_building_information()
    building_info = list(building_info.values())
    action_space_dicts = [action_space_to_dict(asp) for asp in action_space]
    observation_space_dicts = [action_space_to_dict(osp) for osp in observation_space]
    obs_dict = {"action_space": action_space_dicts,
                "observation_space": observation_space_dicts,
                "building_info": building_info,
                "observation": observations }
    return obs_dict

import gym

# here we init the citylearn env
env = CityLearnEnv(schema=Constants.schema_path)

#### IMPORTANT 
# here we choose the observation we want to take from the building env
# we divide observation that are specific to buildings (index_particular)
# and observation that are the same for all the buildings (index_commun)

index_commun = [0, 2, 19, 4, 8, 24]
index_particular = [20, 21, 22, 23]

normalization_value_commun = [12, 24, 2, 100, 100, 1]
normalization_value_particular = [5, 5, 5, 5]

len_tot_index = len(index_commun) + len(index_particular) * 5

## env wrapper for stable baselines
class EnvCityGym(gym.Env):
    """
    Env wrapper coming from the gym library.
    """
    def __init__(self, env):
        self.env = env

        # get the number of buildings
        self.num_buildings = len(env.action_space)

        # define action and observation space
        self.action_space = gym.spaces.Box(low=np.array([-1] * self.num_buildings), high=np.array([1] * self.num_buildings), dtype=np.float32)

        # define the observation space
        self.observation_space = gym.spaces.Box(low=np.array([0] * len_tot_index), high=np.array([1] * len_tot_index), dtype=np.float32)

        # TO THINK : normalize the observation space

    def reset(self):
        obs_dict = env_reset(self.env)
        obs = self.env.reset()

        observation = self.get_observation(obs)

        return observation

    def get_observation(self, obs):
        """
        We retrieve new observation from the building observation to get a proper array of observation
        Basicly the observation array will be something like obs[0][index_commun] + obs[i][index_particular] for i in range(5)

        The first element of the new observation will be "commun observation" among all building like month / hour / carbon intensity / outdoor_dry_bulb_temperature_predicted_6h ...
        The next element of the new observation will be the concatenation of certain observation specific to buildings non_shiftable_load / solar_generation / ...  
        """
        
        # we get the observation commun for each building (index_commun)
        observation_commun = [obs[0][i]/n for i, n in zip(index_commun, normalization_value_commun)]
        observation_particular = [[o[i]/n for i, n in zip(index_particular, normalization_value_particular)] for o in obs]

        observation_particular = list(itertools.chain(*observation_particular))
        # we concatenate the observation
        observation = observation_commun + observation_particular

        return observation

    def step(self, action):
        """
        we apply the same action for all the buildings
        """
        # reprocessing action
        action = [[act] for act in action]

        # we do a step in the environment
        obs, reward, done, info = self.env.step(action)

        observation = self.get_observation(obs)

        return observation, sum(reward), done, info
        
    def render(self, mode='human'):
        return self.env.render(mode)

In [3]:
import gym
import wandb
import os


wandb.init(project="SAC", entity="cleancity_challenge_rl")

wandb.run.name = 'SAC_MlpPolicy'
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback


from stable_baselines3.common.callbacks import BaseCallback




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfrozenwolf[0m ([33mcleancity_challenge_rl[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:

def test_ppo():

    # Modify the petting zoo environment to make a custom observation space (return an array of value for each agent)
    

    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=Constants.schema_path)
    env = EnvCityGym(env)
    
    # we load the model
    model = PPO.load("ppo_citylearn")

    # we reset the environment
    obs = env.reset()

    nb_iter = 8000

    # loop on the number of iteration
    for i in range(nb_iter):
        # we get the action for each agent
        actions = []
        for agent in env.possible_agents:
            action, _states = model.predict(obs[agent], deterministic=True)


            actions.append(action)

        actions = {agent: action for agent, action in zip(env.possible_agents, actions)}

        # we do a step in the environment
        obs, rewards, dones, info = env.step(actions)

        # sometimes check the actions and rewards
        if i % 100 == 0:
            print("actions : ", actions)
            print("rewards : ", rewards)

        


    final_result = sum(env.citylearnenv.evaluate())/2

    print("final result : ", final_result)
    # launch as main

    return final_result
    

In [5]:
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.monitor import Monitor

class CustomCallback(BaseCallback):
    """
    A custom callback that derives from ``BaseCallback``.

    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, verbose, log_dir:str):
        super(CustomCallback, self).__init__(verbose)
        self.log_dir = log_dir
        # Those variables will be accessible in the callback
        # (they are defined in the base class)
        # The RL model
        # self.model = None  # type: BaseAlgorithm
        # An alias for self.model.get_env(), the environment used for training
        # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of time the callback was called
        # self.n_calls = 0  # type: int
        # self.num_timesteps = 0  # type: int
        # local and global variables
        # self.locals = None  # type: Dict[str, Any]
        # self.globals = None  # type: Dict[str, Any]
        # The logger object, used to report things in the terminal
        # self.logger = None  # stable_baselines3.common.logger
        # # Sometimes, for event callback, it is useful
        # # to have access to the parent object
        # self.parent = None  # type: Optional[BaseCallback]

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        pass

    def _on_rollout_end(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        return True


    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        pass

    def _on_step(self) -> None:
        # x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        # print(x,y)
        # print(vars(self.training_env.envs[0].env.env))

        print(f"{self.num_timesteps}", end="\r")
        metrics_t = self.training_env.envs[0].env.env.evaluate()

        metrics = {"price_cost": metrics_t[0], "emmision_cost": metrics_t[1]}

        action, _states = self.model.predict(self.training_env.envs[0].reset(), deterministic=True)
        
        wandb.log({"metric":sum(metrics_t), "Price cost":metrics_t[0], "Emmision cost":metrics_t[1]})
        
        # if self.num_timesteps %1000 == 0:
        #     self.training_env.envs[0].env = self.training_env.envs[0].env.reset()

        if self.num_timesteps%1000 == 0:
            print("Step: {}, Price cost: {}, Emmision cost:{}, metrics: {}".format(self.num_timesteps, metrics_t[0], metrics_t[1], sum(metrics_t)))
            print(action)
            self.model.save("ppo_citylearn")



# function to train the policy with PPO algorithm
def train_ppo():
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)
    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=Constants.schema_path)
    env = EnvCityGym(env)
    # env = Monitor(env, log_dir)

    env.reset()

    # Configure the algorithm

    # load model if exist
    # try:
    #     model = PPO.load("ppo_citylearn")
    # except:
    model = PPO('MlpPolicy', env, verbose=0, gamma=0.99, seed=123, n_steps=10)

    # Train the agent

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
    # Separate evaluation env

    # eval_callback = EvalCallback(env, best_model_save_path='./logs/best_model',
    #                              log_path='./logs/results', eval_freq=100, verbose=2)
    # Create the callback list
    customcallback = CustomCallback(log_dir=log_dir,verbose=0)

    callback = CallbackList([checkpoint_callback, customcallback])

    model.learn(total_timesteps=10000000,  callback=callback)

    model.save("ppo_citylearn")

    return model


In [6]:
model = train_ppo()

  logger.warn(
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=10 and n_envs=1)


Step: 1000, Price cost: 1.9031050573164758, Emmision cost:1.8568708671728067, metrics: 3.7599759244892823
[-0.8663336  -1.          0.36465788 -0.724324   -0.11854772]
Step: 2000, Price cost: 1.3061432786075906, Emmision cost:1.2904703660935049, metrics: 2.5966136447010957
[-1.         -1.         -1.         -0.09882007 -1.        ]
Step: 3000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1.        -1.        -1.        -0.6406799 -0.5325639]
Step: 4000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1. -1. -1. -1. -1.]
Step: 5000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1. -1. -1. -1. -1.]
Step: 6000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1.        -0.9028361 -1.        -1.        -1.       ]
Step: 7000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1.        -0.9602244 -1.        -1.        -1.       ]
Step: 8000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0
[-1. -1. -1. -1. -1.]
Step: 9000, Price cost: 1.0, Emmision cost:1.0, metrics: 2.0


KeyboardInterrupt: 

In [None]:
env = CityLearnEnv(schema=Constants.schema_path)
env = EnvCityGym(env)

env = Monitor(env)
obs = env.reset()

model = PPO.load("ppo_citylearn")

nb_iter = 8750

reward_tot = 0

for i in range(nb_iter):

    action = model.predict(obs)[0]
        
    obs, rewards, dones, info = env.step(action)
    reward_tot += rewards 

    if i % 1000 == 0:
        print("actions : ", action)
        print("rewards : ", rewards)

print(sum(env.env.evaluate())/2)
print(reward_tot)

  logger.warn(


actions :  [ 0.46788347 -1.         -1.         -0.20734441 -0.44856098]
rewards :  -2.8903913786673163
actions :  [ 0.12238503 -0.09601915  0.48492345 -0.5266901   0.70425487]
rewards :  -3.203796022303757


KeyboardInterrupt: 