Installing required packages and dependencies

In [None]:
%%capture
!pip install py-elvis
!pip install pyyaml==5.4
!pip install plotly==5.9.0
!pip install -U kaleido

!pip install stable-baselines3[extra]
!pip install stable-baselines
!pip install sb3-contrib
!pip install gym
!pip install -q wandb

In [None]:
#Cloning repository and changing directory
!git clone https://github.com/francescomaldonato/RL_VPP_Thesis.git
%cd RL_VPP_Thesis/
%ls

In [None]:
import yaml
import torch
from torch.utils.tensorboard import SummaryWriter
from gym import Env
from VPP_environment import VPPEnv, VPP_Scenario_config
from elvis.config import ScenarioConfig
import os
import wandb
from wandb.integration.sb3 import WandbCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO #The available algoritmhs in sb3-contrib for the custom environment with MultiInputPolicy
from sb3_contrib.common.maskable.utils import get_action_masks
import stable_baselines3 as sb3
from stable_baselines3.common.env_checker import check_env
import random

#Check if cuda device is available for training
print("Torch-Cuda available device:", torch.cuda.is_available())
print(sb3.get_system_info())
!wandb --version

Torch-Cuda available device: True
OS: Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic #1 SMP Fri Aug 26 08:44:51 UTC 2022
Python: 3.7.14
Stable-Baselines3: 1.6.0
PyTorch: 1.12.1+cu113
GPU Enabled: True
Numpy: 1.21.6
Gym: 0.21.0

({'OS': 'Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic #1 SMP Fri Aug 26 08:44:51 UTC 2022', 'Python': '3.7.14', 'Stable-Baselines3': '1.6.0', 'PyTorch': '1.12.1+cu113', 'GPU Enabled': 'True', 'Numpy': '1.21.6', 'Gym': '0.21.0'}, 'OS: Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic #1 SMP Fri Aug 26 08:44:51 UTC 2022\nPython: 3.7.14\nStable-Baselines3: 1.6.0\nPyTorch: 1.12.1+cu113\nGPU Enabled: True\nNumpy: 1.21.6\nGym: 0.21.0\n')
wandb, version 0.13.3


In [None]:
# Ensure deterministic behavior
# Every global random number generator reachable from this notebook is seeded with
# the same value: Python's `random`, NumPy's global RNG, and torch (CPU and CUDA).
import numpy as np  # local import: numpy is not imported anywhere else in this notebook

SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its autotuner, which may
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
#Loading paths for input data
# All paths are built relative to `current_folder` (empty string = working directory).
current_folder = ''
VPP_data_input_path = f"{current_folder}data/data_testing/environment_table/Environment_data_2020.csv"
elvis_input_folder = f"{current_folder}data/config_builder/"

# Scenario description shared by the ELVIS config and the VPP config.
case = 'wohnblock_household_simulation_adaptive.yaml'
# NOTE(review): full_load is used on the scenario file; if it only holds plain
# mappings and scalars, yaml.safe_load would be the more restrictive choice - confirm.
with open(f"{elvis_input_folder}{case}", mode='r') as file:
    yaml_str = yaml.full_load(file)  # despite the name, this holds the parsed YAML content

# Both configuration objects are built from the same parsed YAML content.
elvis_config_file = ScenarioConfig.from_yaml(yaml_str)
VPP_config_file = VPP_Scenario_config(yaml_str)

print(elvis_config_file)
print(VPP_config_file)

Vehicle types: <generator object ScenarioConfig.__str__.<locals>.<genexpr> at 0x7f3fdb498b50>Mean parking time: 18
Std deviation of parking time: 4
Mean value of the SOC distribution: 0.5
Std deviation of the SOC distribution: 0.2
Max parking time: 24
Number of charging events per week: 35
Vehicles are disconnected only depending on their parking time
Queue length: 0
Opening hours: None
Scheduling policy: Uncontrolled

{'start_date': '2022-01-01T00:00:00', 'end_date': '2023-01-01T00:00:00', 'resolution': '0:15:00', 'num_households': 4, 'solar_power': 16, 'wind_power': 12, 'charging_stations_n': 6, 'EVs_n_max': 1827, 'EV_load_max': 66, 'houseRWload_max': 10, 'av_max_energy_price': 0.13}


In [None]:
#Environment initialization
# Build the VPP environment from the data-table path and the two configuration
# objects created from the scenario YAML, then call its ELVIS data plotting helper.
env = VPPEnv(VPP_data_input_path, elvis_config_file, VPP_config_file)
env.plot_ELVIS_data()

Charging event: 1, Arrival time: 2022-01-01 04:30:00, Parking_time: 17.147998909376643, Leaving_time: 2022-01-01 21:38:52.796074, SOC: 0.5863418486035888, SOC target: 1.0, Connected car: Tesla, Model S 
 ... 
 Charging event: 1825, Arrival time: 2022-12-31 16:00:00, Parking_time: 15.201427380293573, Leaving_time: 2023-01-01 07:12:05.138569, SOC: 0.47611396479812984, SOC target: 1.0, Connected car: Tesla, Model S 

ELVIS simulation: Tot_energy_consumed=kWh  60750.3821053751 , Av.load=kW  6.9347772158757 , Std.load=kW  12.742218960579994 , Total_cost=€  2135.55656989922 , Av.EV_load=kW  9.356470156131401 , Charging_events=  1825


In [None]:
#Function to check custom environment and output additional warnings if needed
# check_env is imported from stable_baselines3.common.env_checker.
# NOTE(review): the recorded output shows a simulation being run during this cell,
# so the check presumably resets/steps `env` - confirm it leaves no unwanted state.
check_env(env)
# Call the environment's reward-function plotting helper.
env.plot_reward_functions()

- ELVIS.Simulation:
 Energy_consumed=kWh  61218.63 , Av.load=kW  6.99 , Std.load=kW  12.58 , Total_cost=€  2118.93 , Av.EV_load=kW  9.41 , Av.EV_en_left=kWh  100.0 , Charging_events=  1825
Simulating VPP....


In [None]:
PPO_path = "trained_models/PPO_models/PPO"

#Hosted notebook (e.g. Colab): configure wandb and log in interactively.
# The variables are assigned through os.environ: the `%env "NAME" value` form keeps
# the double quotes as part of the variable name (and value), so a variable set
# that way is never found under its real name.
os.environ['WANDB_DISABLE_CODE'] = 'True'
os.environ['WANDB_NOTEBOOK_NAME'] = 'Hyperparameters_sweep_notebooks/PPO_VPP_Hyperp_Sweep.ipynb'
wandb.login(relogin=True)

#In local notebook, uncomment below:
#your_wandb_login_code = 0123456789abcdefghijklmnopqrstwxyzàèìòù0 #example length
#!wandb login {your_wandb_login_code}

In [None]:
# Sweep definition handed to wandb.sweep: random search that maximizes the
# "cumulative_reward" metric. Entries with "value" are held fixed; entries with
# "values" or "min"/"max" are sampled.
sweep_config = {
    "name": "PPO-sweep",
    "project": "RL_VPP_Thesis",
    #"entity" : "user_avocado",

    "method": "random",
    "metric": {"name": "cumulative_reward", "goal": "maximize"},  # optional: "target": 10000000000
    "early_terminate": {"type": "hyperband", "min_iter": 10, "eta": 3},

    "parameters": {
        # --- fixed run settings ---
        "policy_type": {"value": "MultiInputPolicy"},
        "epochs": {"value": 1},

        # --- rollout / optimisation sizes ---
        # alternative: "n_steps": {"values": [2920, 5840, 11684]}
        "batch_size": {"values": [2920, 8760, 11680, 17520]},  # alternative: "value": 2920
        "n_epochs": {"value": 15},  # alternative: int_uniform in [10, 20]
        "total_timesteps": {"value": 500000},  # alternative: int_uniform in [500000, 1500000]

        # --- learning rate ---
        "learning_rate": {"min": 0.0001, "max": 0.0014, "distribution": "uniform"},
        "lr_schedule": {"value": "constant"},  # alternative: "values": ["linear", "constant"]

        # --- PPO loss terms ---
        "gamma": {"max": 0.9999, "min": 0.89, "distribution": "log_uniform_values"},  # discount factor
        "gae_lambda": {"values": [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]},
        "clip_range": {"values": [0.1, 0.2, 0.3, 0.4]},
        "ent_coef": {"min": 1e-10, "max": 0.1, "distribution": "log_uniform_values"},  # entropy coefficient while calculating the loss
        "vf_coef": {"min": 0.0, "max": 1.0, "distribution": "uniform"},
        "normalize_advantage": {"value": True},  # alternative: "values": [True, False]
        "max_grad_norm": {"values": [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]},

        # --- policy network (string choices are resolved through policy_dict) ---
        "ortho_init": {"value": True},  # alternative: "values": [True, False]
        # previous choice set: ["small-short", "small-long", "medium-short", "medium", "big-short", "big", "big-large"]
        "net_arch": {
            "values": [
                "small-separate", "small-shared",
                "medium-short", "medium", "medium-separate",
                "big-shared", "big", "big-separate",
            ]
        },
        "activation_fn": {"value": "Tanh"},  # alternatives: ["Tanh", "ReLU", "ELU", "LeakyReLU"]
        "optimizer_class": {"value": "RMSprop"},  # alternatives: ["RMSprop", "Adam", "SGD"]
    },
}

# Lookup tables that turn the string choices of the sweep ("net_arch",
# "activation_fn", "optimizer_class") into the objects placed in policy_kwargs.
policy_dict = {
    # A leading integer is a layer listed before the pi/vf dictionary; the
    # dictionary gives the layer sizes of the "pi" and "vf" branches.
    "net_arch": {
        "small-separate": [{"pi": [64, 64, 64], "vf": [64, 64, 64]}],
        "small-shared": [128, {"pi": [64, 64], "vf": [64, 64]}],

        "medium-short": [{"pi": [128, 128], "vf": [128, 128]}],
        "medium": [64, {"pi": [256, 256], "vf": [256, 256]}],
        "medium-separate": [{"pi": [256, 256], "vf": [256, 256]}],

        "big-shared": [256, {"pi": [64, 64], "vf": [64, 64]}],
        "big": [128, {"pi": [512, 512], "vf": [512, 512]}],
        "big-separate": [{"pi": [512, 512], "vf": [512, 512]}],
    },

    # Activation classes, keyed by their class name.
    "activation_fn": {
        name: getattr(torch.nn.modules.activation, name)
        for name in ("Tanh", "ReLU", "ELU", "LeakyReLU")
    },

    # Optimizer classes, keyed by their class name.
    "optimizer_class": {
        name: getattr(torch.optim, name)
        for name in ("RMSprop", "Adam", "SGD")
    },
}

#to enable adjustable learning rate
from typing import Callable, Union
def linear_schedule(initial_value: Union[float, str]) -> Callable[[float], float]:
    """Build a schedule that scales ``initial_value`` by the remaining progress.

    Args:
        initial_value: Starting value; a string is converted with ``float()``.

    Returns:
        A function mapping ``progress_remaining`` to
        ``progress_remaining * initial_value``.
    """
    start_value = float(initial_value) if isinstance(initial_value, str) else initial_value

    def scaled_by_progress(progress_remaining: float) -> float:
        return progress_remaining * start_value

    return scaled_by_progress

#Trained model testing function
def evaluate_model(env, model, wandb_run_dir, deterministic=True):
    """Run ``model`` on ``env`` for one episode and export the resulting plots and table.

    After the episode, each plot returned by the environment's plotting helpers is
    shown and written as a PNG into ``wandb_run_dir``; the VPP table is saved there
    as ``VPP_table.csv``. Every exported file name is then passed to ``wandb.save``.

    Args:
        env: Environment providing ``reset``/``step``, the ``plot_*`` helpers and
            ``save_VPP_table``.
        model: Object whose ``predict`` returns ``(action, state)``.
        wandb_run_dir: Directory the PNG files and the CSV table are written to.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        Tuple ``(cumulative_reward, env.av_EV_energy_left, env.overconsumed_en,
        env.underconsumed_en, env.sim_overcost)``.
    """
    obs = env.reset()
    cumulative_reward = 0
    lstm_states = None        # state handed back to predict() on every step
    episode_starts = [True]   # episode-start flag handed to predict()
    done = False
    while not done:
        # The mask is retrieved on every step but is not passed on to predict().
        action_masks = get_action_masks(env)
        action, lstm_states = model.predict(
            obs,
            state=lstm_states,
            episode_start=episode_starts,
            deterministic=deterministic,
        )
        obs, reward, done, info = env.step(action)
        episode_starts = done
        cumulative_reward += reward

    # (name of the env plotting method, PNG file name) in export order.
    figure_exports = (
        ("plot_VPP_results", "VPP_results_plot.png"),
        ("plot_VPP_energies", "VPP_energies_plot.png"),
        ("plot_rewards_results", "VPP_rewards_plot.png"),
        ("plot_rewards_stats", "VPP_rewards_stats_plot.png"),
        ("plot_EVs_kpi", "EVs_kpi_plot.png"),
        ("plot_actions_kpi", "actions_kpi_plot.png"),
        ("plot_load_kpi", "load_kpi_plot.png"),
        ("plot_yearly_load_log", "yearly_log_load_plot.png"),
        ("plot_VPP_Elvis_comparison", "Elvis_VPP_comparison_plot.png"),
    )
    for plot_method, png_name in figure_exports:
        # The method is looked up right before it is called, one plot at a time.
        figure = getattr(env, plot_method)()
        figure.show()
        figure.write_image(os.path.join(wandb_run_dir, png_name))
    env.save_VPP_table(save_path=os.path.join(wandb_run_dir, 'VPP_table.csv'))
    #wandb.log({"VPP_table": VPP_table})

    # Files are handed to wandb by bare file name, PNGs first, then the table.
    for _, png_name in figure_exports:
        wandb.save(png_name)
    wandb.save("VPP_table.csv")
    # wandb.log({"VPP_results_plot": wandb.Image("VPP_results_plot.png")})
    return (
        cumulative_reward,
        env.av_EV_energy_left,
        env.overconsumed_en,
        env.underconsumed_en,
        env.sim_overcost,
    )

In [None]:
#ENVIRONMENT WRAPPING
# `env` is wrapped in Monitor, and that wrapper is the single environment of the
# DummyVecEnv (vectorized environment wrapper) used for training.
X_env = DummyVecEnv([lambda: Monitor(env)])
#Sync custom tensorboard patch
tensorboard_log_path = "wandb/tensorboard_log/"
#wandb.tensorboard.patch(root_logdir = tensorboard_log_path, pytorch=True)

#Model training function for Hyperparameters Sweep
def train_func():
    """Train, evaluate and log one PPO model for the current sweep run.

    Intended to be passed to ``wandb.agent`` as the sweep function. The
    hyperparameters are read from ``wandb.config``; the string choices
    ``net_arch``, ``activation_fn`` and ``optimizer_class`` are resolved through
    ``policy_dict``. Training uses the module-level ``X_env``; evaluation runs
    ``evaluate_model`` on the module-level ``env`` and stores its artefacts in
    ``wandb.run.dir``.

    Raises:
        ValueError: if ``lr_schedule`` is neither ``"linear"`` nor ``"constant"``.
    """
    with wandb.init(#job_type='PPO_trained_model',
                    reinit=True,
                    #settings=wandb.Settings(symlink=False),
                    sync_tensorboard=True,
                    monitor_gym=False, save_code=False) as run:
        writer = SummaryWriter(tensorboard_log_path)
        try:
            config = wandb.config
            policy_kwargs = dict(
                ortho_init=config["ortho_init"],
                net_arch=policy_dict["net_arch"][config["net_arch"]],
                activation_fn=policy_dict["activation_fn"][config["activation_fn"]],
                optimizer_class=policy_dict["optimizer_class"][config["optimizer_class"]],
            )
            # if config["batch_size"] > config["n_steps"]:
            #     batch_size = config["n_steps"]
            # else: batch_size = config["batch_size"]
            if config["lr_schedule"] == "linear":
                learning_rate = linear_schedule(config["learning_rate"])
            elif config["lr_schedule"] == "constant":
                learning_rate = config["learning_rate"]
            else:
                # Fail with a clear message instead of an unbound-variable error below.
                raise ValueError(f"Unsupported lr_schedule: {config['lr_schedule']!r}")

            #model definition
            # n_steps and batch_size are both taken from the swept "batch_size" value.
            model = PPO(config["policy_type"], X_env,
                        learning_rate=learning_rate,
                        n_steps=config["batch_size"],
                        batch_size=config["batch_size"],
                        n_epochs=config["n_epochs"],
                        gamma=config["gamma"],
                        gae_lambda=config["gae_lambda"],
                        clip_range=config["clip_range"],
                        ent_coef=config["ent_coef"],
                        vf_coef=config["vf_coef"],
                        normalize_advantage=config["normalize_advantage"],
                        max_grad_norm=config["max_grad_norm"],
                        #create_eval_env = False,
                        policy_kwargs=policy_kwargs,
                        verbose=1,
                        tensorboard_log=tensorboard_log_path
                        #tensorboard_log= wandb.run.dir
                        )

            #Training of each hyperprameter set
            for epoch in range(config["epochs"]):
                model.learn(total_timesteps=config["total_timesteps"],
                            tb_log_name=f'PPO_{run.id}',
                            callback=WandbCallback(
                                gradient_save_freq=1000,
                                #model_save_path=f"trained_models/PPO_sweep_{run.id}",
                                verbose=1)
                            )
                ##TESTING trained model
                #cumulative_reward, std_reward = evaluate_policy(model, X_env, n_eval_episodes=1, render=False)
                # evaluate_model returns exactly these five values, in this order.
                (cumulative_reward, av_EV_energy_left, overconsumed_en,
                 underconsumed_en, sim_overcost) = evaluate_model(env, model, wandb.run.dir, deterministic=True)
                ##Logging results
                # "cumulative_reward" is the metric the sweep optimizes.
                wandb.log({"cumulative_reward": cumulative_reward,
                           "av_EV_energy_left": av_EV_energy_left,
                           "overconsumed_en": overconsumed_en,
                           "underconsumed_en": underconsumed_en,
                           "sim_overcost": sim_overcost})
                #model.save(current_folder + PPO_path + f"_{run.id}")
                # NOTE(review): the file name says "MaskablePPO" although the model is a
                # PPO instance; kept unchanged so existing artefact names stay the same.
                model.save(os.path.join(wandb.run.dir, f"model_MaskablePPO.{run.id}"))
                wandb.save(f"model_MaskablePPO.{run.id}")
        finally:
            # Release the event-file writer even when training or evaluation fails.
            writer.close()
        # Sync wandb
        #wandb.save(glob.glob(f"runs/*.pt.trace.json")[0], base_path=f"runs")
        #wandb.save(f'wandb/tensorboard_log/MaskablePPO_{run.id}')
        run.finish()

        #print(f"END OF TRAINING #### Model reward: {cumulative_reward}; Training complete.")

In [None]:
count = 40 # number of Training runs to execute

# Create the sweep from sweep_config, then start an agent that runs train_func
# for the sweep, limited by `count`.
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, train_func, count=count)

#!wandb sync log_dir
# Sync the directory that is also used as tensorboard_log_path.
!wandb sync wandb/tensorboard_log/

In [None]:
# Teardown: close the environment, then call wandb.finish().
env.close()
#run.finish()
wandb.finish()