In [1]:
import os

# import couple of libs some will be useful
import gym
import numpy as np
from collections import deque
import random
import re
import os
import sys
import time
import json
import itertools
from datasets import Dataset

# import stable_baselines3
from stable_baselines3 import PPO, A2C, DDPG, TD3
from stable_baselines3.common.utils import set_random_seed

from citylearn.citylearn import CityLearnEnv

import functools


  __DEFAULT = ''
  __STORAGE_SUFFIX = '_without_storage'
  __PARTIAL_LOAD_SUFFIX = '_and_partial_load'
  __PV_SUFFIX = '_and_pv'


In [2]:
schema = "citylearn_challenge_2022_phase_2"

In [3]:
class Constants:
    episodes = 3
    schema_path = '/content/neurips-2022-citylearn-challenge/data/citylearn_challenge_2022_phase_2/schema.json'


def action_space_to_dict(aspace):
    """ Only for box space """
    return { "high": aspace.high,
             "low": aspace.low,
             "shape": aspace.shape,
             "dtype": str(aspace.dtype)
    }

def env_reset(env):
    observations = env.reset()
    action_space = env.action_space
    observation_space = env.observation_space
    #building_info = env.get_building_information()
    #building_info = list(building_info.values())
    action_space_dicts = [action_space_to_dict(asp) for asp in action_space]
    observation_space_dicts = [action_space_to_dict(osp) for osp in observation_space]
    obs_dict = {"action_space": action_space_dicts,
                "observation_space": observation_space_dicts,
              #  "building_info": building_info,
                "observation": observations }
    return obs_dict

import gym

# here we init the citylearn env
env = CityLearnEnv(schema="citylearn_challenge_2023_phase_3_3")

#### IMPORTANT 
# here we choose the observation we want to take from the building env
# we divide observation that are specific to buildings (index_particular)
# and observation that are the same for all the buildings (index_commun)

index_commun = [0, 2, 19, 4, 8, 24]
index_particular = [20, 21, 22, 23]

normalization_value_commun = [12, 24, 2, 100, 100, 1]
normalization_value_particular = [5, 5, 5, 5]

len_tot_index = len(index_commun) + len(index_particular) * 5

## env wrapper for stable baselines
class EnvCityGym(gym.Env):
    """
    Env wrapper coming from the gym library.
    """
    def __init__(self, env):
        self.env = env

        # get the number of buildings
        self.num_buildings = len(env.action_space)

        # define action and observation space
        self.action_space = gym.spaces.Box(low=np.array([-1] * self.num_buildings), high=np.array([1] * self.num_buildings), dtype=np.float32)

        # define the observation space
        self.observation_space = gym.spaces.Box(low=np.array([0] * len_tot_index), high=np.array([1] * len_tot_index), dtype=np.float32)

        # TO THINK : normalize the observation space
        self.current_obs = None
    def reset(self):
        obs_dict = env_reset(self.env)
        obs = self.env.reset()

        observation = self.get_observation(obs)
        
        self.current_obs = observation
        self.interactions = []

        return observation

    def get_observation(self, obs):
        """
        We retrieve new observation from the building observation to get a proper array of observation
        Basicly the observation array will be something like obs[0][index_commun] + obs[i][index_particular] for i in range(5)

        The first element of the new observation will be "commun observation" among all building like month / hour / carbon intensity / outdoor_dry_bulb_temperature_predicted_6h ...
        The next element of the new observation will be the concatenation of certain observation specific to buildings non_shiftable_load / solar_generation / ...  
        """
        
        # we get the observation commun for each building (index_commun)
        observation_commun = [obs[0][i]/n for i, n in zip(index_commun, normalization_value_commun)]
        observation_particular = [[o[i]/n for i, n in zip(index_particular, normalization_value_particular)] for o in obs]

        observation_particular = list(itertools.chain(*observation_particular))
        # we concatenate the observation
        observation = observation_commun + observation_particular

        return observation

    def step(self, action):
        """
        we apply the same action for all the buildings
        """
        # reprocessing action
        action = [[act] for act in action]

        # we do a step in the environment
        obs, reward, done, info = self.env.step(action)
        
        observation = self.get_observation(obs)
        
        
        self.interactions.append({
            "observations": self.current_obs,
            "next_observations": self.get_observation(obs),  # Assuming next observation is same as current for simplicity
            "actions": action,
            "rewards": reward,
            "dones": done,
            "info": info
        })
        
        self.current_obs = observation
        
        

        return observation, sum(reward), done, info
        
    def render(self, mode='human'):
        return self.env.render(mode)

# function to train the policy with PPO algorithm
def test_ppo():

    # Modify the petting zoo environment to make a custom observation space (return an array of value for each agent)
    

    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=schema)
    env = EnvCityGym(env)
    
    # we load the model
    model = PPO.load("ppo_citylearn")

    # we reset the environment
    obs = env.reset()

    nb_iter = 8000

    # loop on the number of iteration
    for i in range(nb_iter):
        # we get the action for each agent
        actions = []
        for agent in env.possible_agents:
            action, _states = model.predict(obs[agent], deterministic=True)


            actions.append(action)

        actions = {agent: action for agent, action in zip(env.possible_agents, actions)}

        # we do a step in the environment
        obs, rewards, dones, info = env.step(actions)

        # sometimes check the actions and rewards
        if i % 100 == 0:
            print("actions : ", actions)
            print("rewards : ", rewards)

        


    final_result = sum(env.citylearnenv.evaluate())/2

    print("final result : ", final_result)
    # launch as main

    return final_result
    

# function to train the policy with PPO algorithm
def train_ppo():

    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=schema)
    env = EnvCityGym(env)

    env.reset()

    # Configure the algorithm

    # load model if exist
    try:
        model = PPO.load("ppo_citylearn")
    except:
        model = PPO('MlpPolicy', env, verbose=2, gamma=0.99)

    # Train the agent
    model.learn(total_timesteps=10000000)

    model.save("ppo_citylearn")

    return model

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [4]:
schema

'citylearn_challenge_2022_phase_2'

In [5]:
env = CityLearnEnv(schema=schema)
env = EnvCityGym(env)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [6]:
env.action_space

Box(-1.0, 1.0, (5,), float32)

In [7]:
# function to train the policy with PPO algorithm
def test_ppo():

    # Modify the petting zoo environment to make a custom observation space (return an array of value for each agent)
    

    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=Constants.schema_path)
    env = EnvCityGym(env)
    
    # we load the model
    model = PPO.load("ppo_citylearn")

    # we reset the environment
    obs = env.reset()

    nb_iter = 8000

    # loop on the number of iteration
    for i in range(nb_iter):
        # we get the action for each agent
        actions = []
        for agent in env.possible_agents:
            action, _states = model.predict(obs[agent], deterministic=True)


            actions.append(action)

        actions = {agent: action for agent, action in zip(env.possible_agents, actions)}

        # we do a step in the environment
        obs, rewards, dones, info = env.step(actions)

        # sometimes check the actions and rewards
        if i % 100 == 0:
            print("actions : ", actions)
            print("rewards : ", rewards)

        


    final_result = sum(env.citylearnenv.evaluate())/2

    print("final result : ", final_result)
    # launch as main

    return final_result
    

# function to train the policy with PPO algorithm
def train_ppo():

    # first we initialize the environment (petting zoo)
    env = CityLearnEnv(schema=schema)
    env = EnvCityGym(env)

    env.reset()

    # Configure the algorithm

    # load model if exist
    try:
        model = PPO.load("ppo_citylearn")
    except:
        model = PPO('MlpPolicy', env, verbose=2, gamma=0.99)

    # Train the agent
    model.learn(total_timesteps=100000)

    model.save("ppo_citylearn")

    return model

### Saving Interactions

In [8]:
my_dataset = Dataset.from_dict({
    'observations': [],
    'next_observations': [],
    'actions': [],
    'rewards': [],
    'dones': []
})

In [9]:
env = CityLearnEnv(schema=schema)

In [10]:
env = CityLearnEnv(schema=schema)
env = EnvCityGym(env)

In [11]:
model = PPO("MlpPolicy", env, verbose=1)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=26, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=26, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=5, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [13]:
env.reset()

[0.5833333333333334,
 1.0,
 0.08536220341920853,
 0.18299999237060546,
 0.81,
 0.2199999988079071,
 0.2989033222198486,
 0.0,
 0.0,
 0.2989033222198486,
 0.1541433334350586,
 0.0,
 0.0,
 0.1541433334350586,
 1.950581918208627e-08,
 0.0,
 0.0,
 1.950581918208627e-08,
 0.12609000205993653,
 0.0,
 0.0,
 0.12609000205993653,
 0.10914000272750854,
 0.0,
 0.0,
 0.10914000272750854]

In [14]:
model.learn(total_timesteps=1000)

-----------------------------
| time/              |      |
|    fps             | 190  |
|    iterations      | 1    |
|    time_elapsed    | 10   |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f4a3406f280>

In [15]:
dataset = env.interactions

In [16]:
test = Dataset.from_dict({k: [s[k] for s in dataset] for k in dataset[0].keys()})

In [17]:
len(test["dones"])

2048

### Other

In [None]:

model = train_ppo()

In [102]:

# simple run though the env with our PPO policy and we sometimes print our actions / reward to get a sense of what we are doing
env = CityLearnEnv(schema=schema)
env = EnvCityGym(env)

obs = env.reset()

model = PPO.load("ppo_citylearn")

nb_iter = 100000000

reward_tot = 0

for i in range(nb_iter):

    action = model.predict(obs)[0]
        
    obs, rewards, dones, info = env.step(action)
    
    if dones is not False:
        print(dones)
        print(i)
    reward_tot += rewards 

    if i % 10000 == 0:
        print("actions : ", action)
        print("rewards : ", rewards)

print(sum(env.env.evaluate())/2)
print(reward_tot)

actions :  [ 1.         -0.45806432  1.          0.6259297   0.8149345 ]
rewards :  -22.450594544410706
True
8758


IndexError: index 8760 is out of bounds for axis 0 with size 8760

In [None]:
action

df_evaluate =env.env.evaluate()

In [None]:
df_evaluate[df_evaluate.cost_function=="cost_total"]

In [None]:
reward_tot