In [40]:
import os
from typing import Generic, Optional, SupportsFloat, Tuple, TypeVar, Union
import torch
# General-purpose libraries; some of them may not be used below
import gym
import numpy as np
from collections import deque
import random
import re
import os
import sys
import time
import json
import itertools
from datasets import Dataset
from _code.const import PATH_MODEL_SB,PATH_DATA_INTERACTIONS
from citylearn.agents.rbc import BasicRBC as BRBC
# import stable_baselines3
from stable_baselines3 import PPO, A2C, DDPG, TD3,SAC
from stable_baselines3.common.utils import set_random_seed

from citylearn.citylearn import CityLearnEnv
from utils.rewards import CustomReward

ObsType = TypeVar("ObsType")
ActType = TypeVar("ActType")

import functools
from citylearn.wrappers import *


In [2]:
class NormalizedObservationWrapperCustom(ObservationWrapper):
    """Wrapper that logs transitions with min-max and periodically normalized observations.

    The normalized values are obtained from each building through
    :code:`Building.observations(normalize=True, periodic_normalization=True)`: temporal
    observations including `hour`, `day_type` and `month` are periodically normalized using
    sine/cosine transformations and then all observations are min-max normalized between 0 and 1.

    The observations *returned* by :meth:`reset` and :meth:`step` are passed through from the
    wrapped environment unchanged; only the transitions stored in :attr:`dataset` carry the
    normalized observations.

    Parameters
    ----------
    env: CityLearnEnv
        CityLearn environment.

    Attributes
    ----------
    dataset: list of dict
        One entry per call to :meth:`step` since the last :meth:`reset`, with keys
        ``observations``, ``next_observations``, ``actions``, ``rewards``, ``dones`` and ``info``.
    current_obs
        Normalized observations of the current time step (``None`` until :meth:`reset` is called).
    """

    def __init__(self, env: CityLearnEnv) -> None:
        super().__init__(env)
        self.env: CityLearnEnv
        self.dataset = []
        # Defined up front so that step() before reset() does not fail with AttributeError.
        self.current_obs = None

    @property
    def shared_observations_norm(self) -> List[str]:
        """Names of common observations across all buildings i.e. observations that have the same value irrespective of the building.

        Includes the extra observations added during cyclic transformation of :code:`hour`,
        :code:`day_type` and :code:`month`: every periodic observation ``o`` is replaced by
        ``o_cos`` and ``o_sin``.
        """

        shared_observations = []
        periodic_observation_names = list(Building.get_periodic_observation_metadata().keys())

        for o in self.env.shared_observations:
            if o in periodic_observation_names:
                shared_observations += [f'{o}_cos', f'{o}_sin']

            else:
                shared_observations.append(o)

        return shared_observations

    def get_observation_norm(self, observations: List[List[float]]) -> List[List[float]]:
        """Returns normalized observations for the current time step.

        Parameters
        ----------
        observations: List[List[float]]
            Observations returned by the wrapped environment. Kept for interface symmetry only:
            the normalized values are read from the buildings' current state, not from this argument.

        Returns
        -------
        List[List[float]]
            With a central agent: a single inner list in which every shared observation appears
            once and all other observations appear once per building. Otherwise: one inner list
            per building.
        """

        if not self.env.central_agent:
            return [
                list(b.observations(normalize=True, periodic_normalization=True).values())
                for b in self.env.buildings
            ]

        # Evaluate the property once instead of once per observation.
        shared_names = set(self.shared_observations_norm)
        seen_shared = set()
        norm_observations = []

        for i, b in enumerate(self.env.buildings):
            for k, v in b.observations(normalize=True, periodic_normalization=True).items():
                # A shared observation is skipped only when a previous building already
                # contributed it; the first building always contributes everything.
                if i == 0 or k not in shared_names or k not in seen_shared:
                    norm_observations.append(v)

                if k in shared_names:
                    seen_shared.add(k)

        return [norm_observations]

    def reset(self, **kwargs) -> Union[ObsType, Tuple[ObsType, dict]]:
        """Resets the environment with kwargs and clears the recorded dataset.

        The wrapped environment is reset exactly once and its observations are returned unchanged.
        """
        obs = self.env.reset(**kwargs)
        self.current_obs = self.get_observation_norm(obs)
        self.dataset = []
        return obs

    def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
        """Steps through the environment with action and records the transition.

        The wrapped environment is stepped exactly once; the returned tuple is the result of
        that single step, so the logged transition and the one handed to the caller match.
        """
        obs, reward, done, info = self.env.step(action)
        norm_obs = self.get_observation_norm(obs)

        self.dataset.append({
            "observations": self.current_obs,
            "next_observations": norm_obs,
            "actions": action,
            "rewards": reward,
            "dones": done,
            "info": info
        })

        # Keep the normalized observations so that the next transition's "observations"
        # uses the same representation as its "next_observations".
        self.current_obs = norm_obs

        return obs, reward, done, info

In [None]:
from citylearn.agents.rbc import OptimizedRBC as Agent

schema =  "citylearn_challenge_2022_phase_2"
env = CityLearnEnv(schema)
# Enable central-agent mode on the raw environment *before* wrapping it. Assigning the
# attribute on the wrapper would only create an attribute on the wrapper object; the
# wrapped CityLearnEnv (which the wrapper reads through `self.env.central_agent`) would
# keep its original setting.
env.central_agent = True
env = NormalizedObservationWrapperCustom(env)
model_rbc = Agent(env)

# Run the rule-based controller for one episode; the wrapper records every transition
# in `env.dataset`.
model_rbc.learn(episodes=1)

In [3]:
class StableBaselines3WrapperCustom(Wrapper):
    """Wrapper for :code:`stable-baselines3` algorithms that also records transitions.

    Wraps `env` in :py:class:`citylearn.wrappers.StableBaselines3ActionWrapper`,
    :py:class:`citylearn.wrappers.StableBaselines3RewardWrapper`
    and :py:class:`citylearn.wrappers.StableBaselines3ObservationWrapper` (in that order),
    and appends one entry to :attr:`dataset` per call to :meth:`step`.

    Parameters
    ----------
    env: CityLearnEnv
        CityLearn environment.

    Attributes
    ----------
    dataset: list of dict
        One entry per call to :meth:`step` since the last :meth:`reset`, with keys
        ``observations``, ``next_observations``, ``actions``, ``rewards``, ``dones`` and ``info``.
    current_obs
        Observations of the current time step as returned by the wrapped environment
        (``None`` until :meth:`reset` is called).
    """

    def __init__(self, env: CityLearnEnv):
        env = StableBaselines3ActionWrapper(env)
        env = StableBaselines3RewardWrapper(env)
        env = StableBaselines3ObservationWrapper(env)
        super().__init__(env)
        self.env: CityLearnEnv
        self.dataset = []
        # Defined up front so that step() before reset() does not fail with AttributeError.
        self.current_obs = None

    def reset(self, **kwargs) -> Union[ObsType, Tuple[ObsType, dict]]:
        """Resets the environment with kwargs and clears the recorded dataset.

        The wrapped environment is reset exactly once and its observations are returned.
        """
        obs = self.env.reset(**kwargs)
        self.current_obs = obs
        self.dataset = []
        return obs

    def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
        """Steps through the environment with action and records the transition.

        The wrapped environment is stepped exactly once, so the transition stored in
        :attr:`dataset` is the same one that is returned to the caller.
        """
        obs, reward, done, info = self.env.step(action)

        self.dataset.append({
            "observations": self.current_obs,
            "next_observations": obs,
            "actions": action,
            "rewards": reward,
            "dones": done,
            "info": info
        })

        self.current_obs = obs

        return obs, reward, done, info
        

In [4]:
# Name of the CityLearn schema passed to `CityLearnEnv` when building the environment below.
schema =  "citylearn_challenge_2022_phase_2"

In [18]:
# Build the raw CityLearn environment and enable central-agent mode on it directly,
# before any wrapper is applied.
sac_env = CityLearnEnv(schema)
sac_env.central_agent = True

In [19]:
# First wrapper layer: records transitions with normalized observations in its own `.dataset`.
sac_env = NormalizedObservationWrapperCustom(sac_env)

In [20]:
# Outer wrapper layer: applies the stable-baselines3 action/reward/observation wrappers and
# keeps a second transition log; `sac_env.dataset` below refers to this outer log.
sac_env = StableBaselines3WrapperCustom(sac_env)

In [None]:
# Hidden-layer sizes for the actor (`pi`) and Q-function (`qf`) networks.
# stable-baselines3 expects every `net_arch` entry to be a *list* of layer widths,
# so a single 128-unit layer is written as [128], not 128.
# NOTE(review): the `pi`/`qf` keys are the off-policy (SAC/TD3/DDPG) form; on-policy
# algorithms such as PPO use `pi`/`vf`. These kwargs are not passed to any model in the
# visible cells.
policy_kwargs = dict(net_arch=dict(pi=[128], qf=[64]))

In [37]:
# NOTE(review): despite the `sac_` prefix this instantiates PPO, not SAC, and it uses the
# default MlpPolicy architecture (the `policy_kwargs` defined in the previous cell is not
# passed in). The fixed seed makes weight initialisation repeatable.
sac_model = PPO(policy='MlpPolicy', env=sac_env, seed=10)

In [38]:
# Policy object of the model built above (a PPO `MlpPolicy`)
policy = sac_model.policy

# Layer sequence of the `policy_net` branch of the policy's MLP extractor
layers = policy.mlp_extractor.policy_net

In [41]:
def get_activation_functions(layers):
    """Return a display name for every module in ``layers``, in iteration order.

    Modules that are instances of ``torch.nn.ReLU``, ``torch.nn.Tanh`` or
    ``torch.nn.Sigmoid`` (including subclasses) are reported as ``"ReLU"``, ``"Tanh"``
    and ``"Sigmoid"`` respectively; any other module (e.g. ``Linear``) is reported by
    its own class name.

    Parameters
    ----------
    layers:
        Iterable of modules, e.g. a ``torch.nn.Sequential``.

    Returns
    -------
    list of str
        One name per module.
    """
    # Checked in order; extend this table to recognise further activation types.
    named_activations = (
        (torch.nn.ReLU, "ReLU"),
        (torch.nn.Tanh, "Tanh"),
        (torch.nn.Sigmoid, "Sigmoid"),
    )

    def describe(module):
        for activation_cls, label in named_activations:
            if isinstance(module, activation_cls):
                return label
        return module.__class__.__name__

    return [describe(module) for module in layers]

# Report the module names of the policy network, one per line, under a heading
activation_functions = get_activation_functions(layers)
print("Activation functions used in the policy network:")
for layer_name in activation_functions:
    print(layer_name)


Activation functions used in the policy network:
Linear
Tanh
Linear
Tanh


In [27]:
# Total number of scalar entries across all parameter tensors of the policy
total_params = sum(p.numel() for p in sac_model.policy.parameters())
total_params


16523

In [32]:
# Full parameter dictionary of the model.
# NOTE(review): this displays every tensor; showing only names/shapes would keep the output readable.
sac_model.get_parameters()

{'policy': OrderedDict([('log_std',
               tensor([0., 0., 0., 0., 0.], device='cuda:0')),
              ('mlp_extractor.policy_net.0.weight',
               tensor([[-0.0114, -0.1654,  0.0684,  ...,  0.0675, -0.0822, -0.1463],
                       [-0.0396,  0.0658,  0.1144,  ...,  0.2981,  0.0552, -0.1014],
                       [ 0.1162, -0.0856,  0.0187,  ..., -0.1996, -0.0885, -0.2582],
                       ...,
                       [ 0.1543, -0.1522,  0.2204,  ...,  0.2164,  0.2216,  0.0267],
                       [-0.0016,  0.2436,  0.1074,  ...,  0.0855,  0.0170,  0.0462],
                       [-0.0993,  0.0709, -0.0148,  ..., -0.3014, -0.1463,  0.1301]],
                      device='cuda:0')),
              ('mlp_extractor.policy_net.0.bias',
               tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [29]:
# NOTE(review): `mean_params` is not assigned in any visible cell, so this cell raises
# NameError on a fresh kernel (Restart & Run All). The saved output shows a dict of policy
# tensors keyed by parameter name — presumably derived from `sac_model.get_parameters()`;
# restore or re-create its definition before this cell. TODO confirm.
mean_params

{'mlp_extractor.policy_net.0.weight': tensor([[-0.0114, -0.1654,  0.0684,  ...,  0.0675, -0.0822, -0.1463],
         [-0.0396,  0.0658,  0.1144,  ...,  0.2981,  0.0552, -0.1014],
         [ 0.1162, -0.0856,  0.0187,  ..., -0.1996, -0.0885, -0.2582],
         ...,
         [ 0.1543, -0.1522,  0.2204,  ...,  0.2164,  0.2216,  0.0267],
         [-0.0016,  0.2436,  0.1074,  ...,  0.0855,  0.0170,  0.0462],
         [-0.0993,  0.0709, -0.0148,  ..., -0.3014, -0.1463,  0.1301]],
        device='cuda:0'),
 'mlp_extractor.policy_net.0.bias': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        device='cuda:0'),
 'mlp_extractor.policy_net.2.weight': tensor([[-0.2430, -0.0455,  0.1514,  ...,  0.0670,  0.0848,  0.1740],
         [ 0.0629, -0.1825, -0.0005,  ..., 

In [None]:
# Smoke-test training with a budget of a single timestep.
# NOTE(review): presumably PPO still collects one full rollout before returning — confirm.
sac_model.learn(total_timesteps=1)

In [None]:
# Observation names as seen through the wrapped environment
sac_env.observation_names

In [None]:
# Transitions recorded by the outermost wrapper (StableBaselines3WrapperCustom) since its last reset.
# NOTE(review): this displays the whole list; `len(...)` or a slice is easier to read.
sac_env.dataset

In [None]:
# Reward function object exposed by the environment
sac_env.reward_function

In [None]:
# First building of the environment, used in the next cells to inspect its observations
first_b = sac_env.buildings[0]

In [None]:
# (name, value) pairs of the building's observations with default arguments
# (presumably un-normalized — confirm against the CityLearn `Building.observations` defaults)
first_b.observations().items()

In [None]:
# Same building with min-max and periodic normalization requested; this is the call
# NormalizedObservationWrapperCustom.get_observation_norm uses for every building
first_b.observations(normalize=True, periodic_normalization=True).items()

In [None]:
# First transition recorded by the outermost wrapper (raises IndexError if nothing was recorded)
sac_env.dataset[0]