In [1]:
import os
from drl_algo.train_util import train_ddpg_mlp,train_td3_mlp
from drl_algo.config import actor_lr,critic_lr,gamma,tau,batch_size
from citylearn.citylearn import CityLearnEnv
import torch
import numpy as np
class Constants:
    episodes = 3
    schema_path = 'data/citylearn_challenge_2022_phase_1/schema.json'


class ARGs:
    reward_key = 0
    device = 'cuda'
    epochs = 1000
    actor_lr = 1e-5
    critic_lr = 1e-5
    gamma = 0.99
    batch_size = 64
    tau = 0.05
    state_dim = 28
    action_dim = 1
    critic_hidden_dim = 10
    actor_hidden_dim = 10
    extractor_hidden_dim = 10
    attn_hidden_dim = 16
    n_agents = 5
    n_heads = 2
    device = 'cuda'
    update_freq = 1
    random_steps = 5
    max_steps = 1000


args = ARGs()


def action_space_to_dict(aspace):
    """ Only for box space """
    return { "high": aspace.high,
             "low": aspace.low,
             "shape": aspace.shape,
             "dtype": str(aspace.dtype)
    }

def env_reset(env):
    observations = env.reset()
    action_space = env.action_space
    observation_space = env.observation_space
    building_info = env.get_building_information()
    building_info = list(building_info.values())
    action_space_dicts = [action_space_to_dict(asp) for asp in action_space]
    observation_space_dicts = [action_space_to_dict(osp) for osp in observation_space]
    obs_dict = {"action_space": action_space_dicts,
                "observation_space": observation_space_dicts,
                "building_info": building_info,
                "observation": observations }
    return obs_dict



os.mkdir("KEY"+str(args.reward_key))
env = CityLearnEnv(schema=Constants.schema_path)
os.rmdir("KEY"+str(args.reward_key))

env.seed(123456)

torch.manual_seed(123456)
np.random.seed(123456)


0


In [2]:
np.random.normal(scale=0.5)

0.23455614995359314

In [3]:
5000/1.570

3184.7133757961783

In [4]:
for i in range(10000):
    print (i, np.cos(i/3184.713375))


0 1.0
1 0.9999999507020004
2 0.9999998028080064
3 0.9999995563180326
4 0.9999992112321033
5 0.9999987675502525
6 0.999998225272524
7 0.9999975843989714
8 0.9999968449296575
9 0.9999960068646555
10 0.999995070204048
11 0.9999940349479274
12 0.9999929010963955
13 0.9999916686495645
14 0.9999903376075555
15 0.9999889079705
16 0.9999873797385389
17 0.9999857529118229
18 0.9999840274905124
19 0.9999822034747774
20 0.9999802808647978
21 0.9999782596607633
22 0.999976139862873
23 0.999973921471336
24 0.999971604486371
25 0.9999691889082064
26 0.9999666747370805
27 0.9999640619732411
28 0.9999613506169457
29 0.9999585406684618
30 0.9999556321280664
31 0.9999526249960462
32 0.9999495192726978
33 0.9999463149583273
34 0.9999430120532508
35 0.9999396105577937
36 0.9999361104722916
37 0.9999325117970895
38 0.9999288145325422
39 0.9999250186790143
40 0.9999211242368801
41 0.9999171312065234
42 0.999913039588338
43 0.9999088493827273
44 0.9999045605901045
45 0.9999001732108923
46 0.9998956872455235


In [6]:
# The training code for DDPG,TD3 and SAC
from importlib.resources import path
import os
import sys
path_dir = os.path.abspath(os.getcwd())
path_checkpoint = path_dir + "/checkpoint/"

from drl_algo.models import DDPG_MLP_ACTOR,DDPG_MLP_CRITIC,TD3_MLP_ACTOR,TD3_MLP_CRITIC
from drl_algo.memory import DDPG_Memory,TD3_Memory
import wandb
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

env=env
state_dim=env.observation_space[0].shape[0]*5
action_dim=env.action_space[0].shape[0]*5
actor_lr=args.actor_lr
critic_lr=args.critic_lr
tau=args.tau
batch_size=batch_size
device=args.device
random_steps=args.random_steps
episodes=args.epochs
update_freq=args.update_freq
gamma=args.gamma
r=args.reward_key
max_steps = args.max_steps
sin_step_facotor = 100
gaussian_factor = 1

normalizing_factor = [12, 8, 24, 40, 40, 40, 40 , 100, 100, 100, 100, 100, 1200, 1200, 1200, 1200, 100, 100, 100, 100, 1, 1, 1, 1, 1, 1, 1, 1]
    # wandb.init(project="REWARD_SWEEP",name="reward_{}_ddpg_mlp_actor-lr:{}_critic-lr:{}_gamma:{}_tau:{}".format(r, actor_lr,critic_lr,gamma,tau),entity="cleancity_challenge_rl")
    #actor and actor target
actor = DDPG_MLP_ACTOR(state_dim,action_dim,hidden_dim=16).to(device=device)
actor_target = DDPG_MLP_ACTOR(state_dim,action_dim,hidden_dim=16).to(device=device)
actor_target.load_state_dict(actor.state_dict())
#critic and critic target
critic = DDPG_MLP_CRITIC(state_dim,action_dim,hidden_dim=16).to(device=device)
critic_target = DDPG_MLP_CRITIC(state_dim,action_dim,hidden_dim=16).to(device=device)
critic_target.load_state_dict(critic.state_dict())
#actor and critic optimizers
actor_optimizer = optim.Adam(actor.parameters(),lr=actor_lr)
critic_optimizer = torch.optim.Adam(critic.parameters(),lr=critic_lr)
#memory
memory = DDPG_Memory(capacity=10000)
# episodes
total_steps = 0
actor_loss = 0
critic_loss = 0
for i in range(episodes):
    state = np.asarray(env.reset())
    # state = env.reset()
    state = state/normalizing_factor
    state = state.tolist()
    score = 0
    done = False
    steps = 0
    building_1=[]
    building_2=[]
    building_3=[]
    building_4=[]
    building_5=[]
    while not done:
        if total_steps < random_steps:
            action = np.random.normal(scale=0.5,size=action_dim).reshape(-1, 1).tolist()
        else:
            #add gaussian noise 
            action = actor(torch.flatten(torch.FloatTensor(state).to(device=device)))
            action = (action.cpu().detach().numpy() + gaussian_factor*np.random.normal(scale=0.5*np.abs(gaussian_factor),size=action_dim)).clip(-1,1)
            action = [([i]) for i in action]
            # gaussian_factor = gaussian_factor/1.0001
            gaussian_factor = np.cos((total_steps*max_steps + steps)/3184.713375)
            # print(0.5*np.abs(gaussian_factor))

        next_state, reward, done, _ = env.step(action)
        next_state = np.asarray(next_state)
        next_state = next_state/normalizing_factor
        next_state = next_state.tolist()
        if steps == max_steps:
            done = True
        steps = steps + 1
        building_1.append(reward[0])
        building_2.append(reward[1])
        building_3.append(reward[2])
        building_4.append(reward[3])
        building_5.append(reward[4])
        score = score + reward.sum()
        # action = [i[0] for i in action]
        memory.push(state=torch.FloatTensor(state).flatten(),next_state=torch.FloatTensor(next_state).flatten(),action=torch.FloatTensor(action),reward=torch.FloatTensor(reward).sum(),done=torch.tensor(done))
        state = next_state
        if total_steps >= random_steps and total_steps%update_freq == 0:
                #learn
                samples = memory.sample(batch_size=batch_size)
                next_states = torch.stack(list(samples.next_state)).to(device=device)
                states = torch.stack(list(samples.state)).to(device=device)
                actions = torch.stack(list(samples.action)).to(device=device)
                dones = torch.stack(list(samples.done)).to(device=device)
                rewards = torch.stack(list(samples.reward)).to(device=device)
                # Target Q
                with torch.no_grad():
                    Q_ = critic_target(next_states,actor_target(next_states)).squeeze(dim=1)
                    Q_target = rewards + gamma * (~dones) * Q_
                #critic update
                Q_Value = critic(states,actions.view(-1, 5)).squeeze(dim=1)
                critic_loss = F.mse_loss(Q_target,Q_Value)
                critic_optimizer.zero_grad()
                critic_loss.backward() 
                critic_optimizer.step()
                # Freeze crtitic network
                for param in critic.parameters():
                    param.requires_grad = False

                #actor update
                actor_loss = -1 * critic(states,actor(states)).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()
                # Unfreeze critic networks
                for params in critic.parameters():
                    params.requires_grad = True
                # soft target update by polyak average
                for param_critic,target_param_critic,param_actor,target_param_actor in zip(critic.parameters(),critic_target.parameters(),actor.parameters(),actor_target.parameters()):
                    target_param_critic.data.copy_(tau*param_critic.data + (1-tau)*target_param_critic.data)
                    target_param_actor.data.copy_(tau*param_actor.data + (1-tau)*target_param_actor.data)
    total_steps = total_steps + 1   
    metrics_t = env.evaluate()


    # wandb.log({"score":score,"actor_loss":actor_loss,"critic_loss":critic_loss,"Building_Score_1":sum(building_1),"Building_Score_2":sum(building_2),"Building_Score_3":sum(building_3),"Building_Score_4":sum(building_4),"Building_Score_5":sum(building_5)})
    print("Episode:",i,"total_score:",score,"Building_Score_1:",sum(building_1),"Building_Score_2:",sum(building_2),"Building_Score_3:",sum(building_3),"Building_Score_4:",sum(building_4),"Building_Score_5:",sum(building_5), "Price cost:",metrics_t[0], "Emission cost", metrics_t[1], "metrics", sum(metrics_t), end="\n\n")
    print("ACTION: ",action, end = "\n")
    print("REWARD: ",reward, end = "\n\n")


    if total_steps == 45:
        break        

    # torch.save(actor.state_dict(),"{}ddpg-actor_mlp_actor-lr:{}_critic-lr:{}_gamma:{}_tau:{}.pth".format(path_checkpoint,actor_lr,critic_lr,gamma,tau))
    # torch.save(critic.state_dict(),"{}ddpg-critic_mlp_actor-lr:{}_critic-lr:{}_gamma:{}_tau:{}.pth".format(path_checkpoint,actor_lr,critic_lr,gamma,tau))

CARBON EMISSION:  [0.13390473828931687, 0.21563198897421995, 0.23082506914325335, 0.30333736386706595, 0.21084456874078272]
ELECTRICITY PRICE:  [0.18725666666666677, 0.30154666666666663, 0.32279315566473077, 0.4241966666666666, 0.29485178516882843]
ELECTRICITY CONSUMPTION:  [0.8511666666666671, 1.3706666666666665, 1.4672416166578672, 1.9281666666666664, 1.3402353871310384]
REWARD:  [-0.3211614  -0.51717866 -0.55361822 -0.72753403 -0.50569635]

CARBON EMISSION:  [0.1289478367390079, 0.17014594441508796, 0.14049805264539916, 0.6405463437993214, 0]
ELECTRICITY PRICE:  [0.1836120000000001, 0.2422750000000001, 0.20005863684662484, 0.9120897120261839, -0.04376343563974716]
ELECTRICITY CONSUMPTION:  [0.8346000000000005, 1.1012500000000005, 0.9093574402119311, 4.145862327391745, -0.1989247074533962]
REWARD:  [-0.31255984 -0.41242094 -0.34055669 -1.55263606 -0.        ]

CARBON EMISSION:  [0.2516650745869111, 0.13950242718549064, 0.2076413676398449, 0, 0.412426172321117]
ELECTRICITY PRICE:  [0.

KeyboardInterrupt: 

In [None]:
actor(torch.randn((256, 140)).to(device))

tensor([[ 0.3547,  0.1365, -0.1226,  0.0739,  0.0881],
        [ 0.3654,  0.3348, -0.1669,  0.1132,  0.1064],
        [ 0.3048,  0.3386, -0.0395,  0.0903, -0.0506],
        ...,
        [ 0.4383,  0.0727, -0.0946,  0.0779, -0.0516],
        [ 0.3561, -0.3251, -0.2678, -0.6958, -0.1624],
        [ 0.2397,  0.0332,  0.2602, -0.2271,  0.0541]], device='cuda:0',
       grad_fn=<TanhBackward0>)

In [None]:
critic(actor(states), states)

tensor([[-5.7838e-01],
        [-1.6562e+00],
        [-1.4610e+00],
        [-2.3004e-01],
        [-3.1469e-01],
        [-1.9992e+00],
        [-1.9276e+00],
        [-3.6295e+00],
        [-1.8378e+00],
        [-1.7750e-01],
        [-8.1182e-02],
        [-2.6008e+00],
        [-1.7685e+00],
        [-3.6332e-02],
        [-2.5150e-01],
        [-4.1824e-01],
        [-1.7750e-01],
        [-1.1035e+00],
        [-3.1511e-01],
        [-7.0510e-01],
        [-7.2645e-02],
        [-7.1250e-02],
        [-1.0355e-01],
        [-1.0245e-01],
        [-3.8513e+00],
        [-5.7142e-01],
        [-8.8775e-01],
        [-2.6782e+00],
        [-3.0650e+00],
        [-4.1923e+00],
        [-3.0644e-02],
        [-2.0881e+00],
        [-7.8209e-02],
        [-8.9750e-01],
        [-2.4162e-01],
        [-1.2281e+00],
        [-1.8686e-01],
        [-7.2288e-01],
        [-4.6156e+00],
        [-3.6574e-01],
        [-2.2437e+00],
        [-8.1182e-02],
        [-3.4424e+00],
        [-3

In [5]:
env.render()

IndexError: list index out of range

In [8]:
import matplotlib.pyplot as plt
plt.imshow(env.render())


  plt.show()


In [None]:
for i,k in actor_target.network_actor.named_children():
    if str(type(k))[8:-2] == 'torch.nn.modules.linear.Linear':
        print(i,k.weight)
    k

0 Parameter containing:
tensor([[ 1.1141e-01,  1.6567e-01,  1.0638e-01,  ...,  1.3002e-01,
          1.6876e-01,  1.5684e-01],
        [ 1.6846e-01,  1.1112e-01,  1.7215e-01,  ...,  1.3077e-01,
          4.4803e-02,  1.0333e-01],
        [ 6.7540e-02, -3.5617e-03, -5.6268e-02,  ..., -4.4462e-02,
         -3.3273e-02,  1.0692e-02],
        ...,
        [-5.6889e-04,  1.5795e-02, -6.1002e-02,  ...,  4.1127e-02,
         -5.7165e-02, -1.1441e-04],
        [ 5.8480e-02,  3.4635e-03, -5.3207e-03,  ..., -7.7913e-02,
         -3.2132e-02,  4.6599e-02],
        [ 5.1964e-02,  1.7286e-01,  1.0209e-01,  ...,  4.8694e-02,
          9.8905e-02,  5.1087e-02]], device='cuda:0', requires_grad=True)
2 Parameter containing:
tensor([[ 0.2547,  0.2979,  0.0259,  0.1394, -0.0826,  0.0121, -0.0624,  0.0026,
          0.1421,  0.3060,  0.1465,  0.2053, -0.0705,  0.2159,  0.0336,  0.3556],
        [-0.1794, -0.1490, -0.0847, -0.1780,  0.2353, -0.2486, -0.3101,  0.1693,
          0.2198, -0.0896,  0.0661, -0.

In [None]:
type(k)

torch.nn.modules.activation.Tanh

In [None]:
import pandas as pd
a = pd.read_csv("data/citylearn_challenge_2022_phase_1/weather.csv")
a = pd.read_csv("data/citylearn_challenge_2022_phase_1/pricing.csv")
a.max()

Electricity Pricing [$]                   0.54
6h Prediction Electricity Pricing [$]     0.54
12h Prediction Electricity Pricing [$]    0.54
24h Prediction Electricity Pricing [$]    0.54
dtype: float64

In [None]:
import json
a = json.load(open("data/citylearn_challenge_2022_phase_1/schema.json", 'r'))['observations']
n =0
for i in a:
    if a[i]["active"] == True:
        print(i)
        n+=1

n

month
day_type
hour
outdoor_dry_bulb_temperature
outdoor_dry_bulb_temperature_predicted_6h
outdoor_dry_bulb_temperature_predicted_12h
outdoor_dry_bulb_temperature_predicted_24h
outdoor_relative_humidity
outdoor_relative_humidity_predicted_6h
outdoor_relative_humidity_predicted_12h
outdoor_relative_humidity_predicted_24h
diffuse_solar_irradiance
diffuse_solar_irradiance_predicted_6h
diffuse_solar_irradiance_predicted_12h
diffuse_solar_irradiance_predicted_24h
direct_solar_irradiance
direct_solar_irradiance_predicted_6h
direct_solar_irradiance_predicted_12h
direct_solar_irradiance_predicted_24h
carbon_intensity
non_shiftable_load
solar_generation
electrical_storage_soc
net_electricity_consumption
electricity_pricing
electricity_pricing_predicted_6h
electricity_pricing_predicted_12h
electricity_pricing_predicted_24h


28

In [14]:
a = np.asarray([[8, 1, 1, 20.1, 19.4, 22.8, 19.4, 79.0, 79.0, 71.0, 87.0, 0.0, 201.0, 966.0, 0.0, 0.0, 444.0, 747.0, 0.0, 0.1573190581037597, 0.8511666666666671, 0.0, 0.7202769107260067, 5.851166666666667, 0.22, 0.22, 0.22, 0.22], [8, 1, 1, 20.1, 19.4, 22.8, 19.4, 79.0, 79.0, 71.0, 87.0, 0.0, 201.0, 966.0, 0.0, 0.0, 444.0, 747.0, 0.0, 0.1573190581037597, 1.3706666666666665, 0.0, 0.0, 1.3706666666666665, 0.22, 0.22, 0.22, 0.22], [8, 1, 1, 20.1, 19.4, 22.8, 19.4, 79.0, 79.0, 71.0, 87.0, 0.0, 201.0, 966.0, 0.0, 0.0, 444.0, 747.0, 0.0, 0.1573190581037597, 1.0185241699218762e-07, 0.0, 0.0, 1.0185241699218762e-07, 0.22, 0.22, 0.22, 0.22], [8, 1, 1, 20.1, 19.4, 22.8, 19.4, 79.0, 79.0, 71.0, 87.0, 0.0, 201.0, 966.0, 0.0, 0.0, 444.0, 747.0, 0.0, 0.1573190581037597, 1.9281666666666664, 0.0, 0.7202769107260067, 6.928166666666666, 0.22, 0.22, 0.22, 0.22], [8, 1, 1, 20.1, 19.4, 22.8, 19.4, 79.0, 79.0, 71.0, 87.0, 0.0, 201.0, 966.0, 0.0, 0.0, 444.0, 747.0, 0.0, 0.1573190581037597, 0.5158833333333334, 0.0, 0.0, 0.5158833333333334, 0.22, 0.22, 0.22, 0.22]])
b = [12, 8, 24, 40, 40, 40, 40 , 100, 100, 100, 100, 100, 1200, 1200, 1200, 1200, 100, 100, 100, 100, 1, 1, 1, 1, 1, 1, 1, 1]
print(len(b))
a/b

28


array([[6.66666667e-01, 1.25000000e-01, 4.16666667e-02, 5.02500000e-01,
        4.85000000e-01, 5.70000000e-01, 4.85000000e-01, 7.90000000e-01,
        7.90000000e-01, 7.10000000e-01, 8.70000000e-01, 0.00000000e+00,
        1.67500000e-01, 8.05000000e-01, 0.00000000e+00, 0.00000000e+00,
        4.44000000e+00, 7.47000000e+00, 0.00000000e+00, 1.57319058e-03,
        8.51166667e-01, 0.00000000e+00, 7.20276911e-01, 5.85116667e+00,
        2.20000000e-01, 2.20000000e-01, 2.20000000e-01, 2.20000000e-01],
       [6.66666667e-01, 1.25000000e-01, 4.16666667e-02, 5.02500000e-01,
        4.85000000e-01, 5.70000000e-01, 4.85000000e-01, 7.90000000e-01,
        7.90000000e-01, 7.10000000e-01, 8.70000000e-01, 0.00000000e+00,
        1.67500000e-01, 8.05000000e-01, 0.00000000e+00, 0.00000000e+00,
        4.44000000e+00, 7.47000000e+00, 0.00000000e+00, 1.57319058e-03,
        1.37066667e+00, 0.00000000e+00, 0.00000000e+00, 1.37066667e+00,
        2.20000000e-01, 2.20000000e-01, 2.20000000e-01, 2.20000

In [None]:
a[:,21]

array([0., 0., 0., 0., 0.])