In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch

from Configs import getEnvConfig, visualizeEnvConfig
from Configs import getDatasetConfig, visualizeDatasetConfig
from Environment.EnvironmentSim import createEnv
from Helpers.EnvInterface import EnvInterface
from Helpers.Eval import eval

In [2]:
# --- Environment ---------------------------------------------------------
# Build the simulator for the selected environment configuration, print the
# configuration for the record, and switch the simulator to its test data.
configIdx = 3
envParams = getEnvConfig(configIdx)
visualizeEnvConfig(envParams)
trafficDataParentPath = 'Datasets/TrafficDataset/TrafficData'
env = createEnv(envParams, trafficDataParentPath)
env.selectMode(mode="test", type="data")

# --- Offline dataset -----------------------------------------------------
# The dataset file name is keyed by the dataset config index and by the
# environment's "sub_agents_idx" entry.
datasetConfigIdx = 7
datasetParams = getDatasetConfig(datasetConfigIdx)
visualizeDatasetConfig(datasetParams)
offlineDatasetPath = (
    f'Datasets/OfflineDataset/subOptimalAgent_encConfig{datasetConfigIdx}'
    f'_{envParams["sub_agents_idx"]}.pkl'
)
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project. The file is named "subOptimalAgent" although the variable is
# called dataset_expert -- it is the baseline the policy is compared against.
with open(offlineDatasetPath, 'rb') as dataset_file:
    dataset_expert = pickle.load(dataset_file)

Environment Configuration
Number of Users:        20
Window Length:          200
Dataflow:               thumb_bk
Sigmoid K List:         [0.1, 0.2, 0.3, 0.4, 0.5]
Sigmoid S List:         [10.0, 10.0, 10.0, 10.0, 10.0]
Resource Bar:           5
Bandwidth:              200
Sub Agents:             [[1, 1, 1, 1, 1]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16, 17, 18, 19]]
Dataset Configuration
Number of Users:        20
Window Length:          200
N_aggregation:          4
Dataflow:               thumb_bk
Random Seed:            999
Resource Bar:           5
Bandwidth:              200
Sigmoid K List:         [0.1, 0.2, 0.3, 0.4, 0.5]
Sigmoid S List:         [10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
Sub Agents:             [[1, 1, 1, 1, 1]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16, 17, 18, 19]]


In [3]:
# Select the best saved DiffusionQL checkpoint for this environment config.
#
# For every candidate index the agent is rebuilt from its pickled
# hyper-parameters, its "<idx>_best" weights are loaded, and it is evaluated
# on the test data. On exit:
#   best_model_idx -- index of the selected checkpoint (None if none qualified)
#   best_reward    -- mean evaluation reward of that checkpoint
#   agent          -- the Agent instance belonging to best_model_idx
from DiffusionQL.DQL_Q_esmb import DQL_Q_esmb as Agent

envInterface = EnvInterface(envParams, discrete_state=False)
print(f"state_dim: {envInterface.state_dim}, action_dim: {envInterface.action_dim}")

MODEL_INDICES = [0]  # checkpoint indices under Models/config_<configIdx>/ to compare

# NOTE(review): selection keeps the *lowest* mean reward (initial value +inf,
# comparison "<"). That is only right if the value returned by eval() is a
# cost-like quantity; if higher is better, start from -np.inf and use ">".
# Direction is left as in the original -- confirm against Helpers.Eval.
best_reward = np.inf
best_model_idx = None
best_agent = None
for model_idx in MODEL_INDICES:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"Models/config_{configIdx}/hyperparams_{model_idx}.pkl", "rb") as f:
        hyperparams = pickle.load(f)
    agent = Agent(
        state_dim=envInterface.state_dim,
        action_dim=envInterface.action_dim,
        **hyperparams
    )
    agent.load_model(f"Models/config_{configIdx}", f'{model_idx}_best')

    # Start every candidate from the same freshly reset test environment.
    env.reset()
    env.selectMode(mode="test", type="data")
    reward, metrics = eval(
        agent, env, envInterface,
        LEN_eval=250, obvMode="predicted", sample_method="greedy",
        N_action_candidates=50, eta=0.1, verbose=True)
    print(f"reward_diffusionQ{model_idx}: {reward}")

    # Compare and store the same quantity (the original compared the raw
    # `reward` but stored its mean).
    mean_reward = np.mean(reward)
    if mean_reward < best_reward:
        best_reward = mean_reward
        best_model_idx = model_idx
        best_agent = agent

# Later cells call agent.load_model(...) with best_model_idx, so `agent` must
# be the instance built with the matching hyper-parameters, not simply the
# last one constructed in the loop.
if best_agent is not None:
    agent = best_agent

print(f"best_model_idx: {best_model_idx}")

state_dim: 20, action_dim: 20


  self.actor.load_state_dict(torch.load(f'{dir}/actor_{id}.pth'))
  self.critic.load_state_dict(torch.load(f'{dir}/critic_{id}.pth'))
                                                                     

reward_diffusionQ0: 0.46926666909253867
best_model_idx: 0




In [4]:
# Repeatedly evaluate the selected policy at two eta settings and, for each
# round, draw an equally sized random sample of rewards from the offline
# dataset as the baseline. Produces three 1-D arrays (one entry per round):
#   reward_expert_list   -- mean of LEN_eval rewards sampled from the dataset
#   reward_policy_list   -- eval() reward with eta = ETA_LOW
#   reward_policy_list_2 -- eval() reward with eta = ETA_HIGH
from tqdm import tqdm

LEN_eval = 50              # passed to eval(); also the number of dataset rewards per sample
N_TRIALS = 20              # number of evaluation rounds
N_ACTION_CANDIDATES = 50   # forwarded to eval() as N_action_candidates
ETA_LOW, ETA_HIGH = 0.01, 1.0
EXPERT_SAMPLE_SEED = 0     # seeds only the dataset subsampling below


def evaluate_policy(eta):
    """Run one eval() pass of the current `agent` on `env` and return its reward.

    Only `eta` varies between calls; every other eval() argument is fixed.
    The second value returned by eval() is discarded.
    """
    reward, _ = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted",
                     sample_method="greedy", N_action_candidates=N_ACTION_CANDIDATES,
                     eta=eta, verbose=True)
    return reward


env.selectMode(mode="test", type="data")
rewards_expert = dataset_expert['rewardRecord']

agent.load_model(f"Models/config_{configIdx}", f'{best_model_idx}_best')

# A dedicated, seeded generator makes the baseline sample identical on every
# Restart & Run All. Any randomness inside env/eval is not controlled here.
expert_rng = np.random.default_rng(EXPERT_SAMPLE_SEED)

reward_expert_list = []
reward_policy_list = []
reward_policy_list_2 = []

for _ in tqdm(range(N_TRIALS)):
    env.reset()
    env.selectMode(mode="test", type="data")
    # replace=False raises ValueError if the dataset holds fewer than LEN_eval rewards.
    reward_expert_sample = expert_rng.choice(rewards_expert, size=LEN_eval, replace=False)
    # The two evaluations run back to back without resetting the environment
    # in between, exactly as in the original cell.
    reward_policy = evaluate_policy(ETA_LOW)
    reward_policy_2 = evaluate_policy(ETA_HIGH)
    reward_expert_list.append(np.mean(reward_expert_sample))
    reward_policy_list.append(reward_policy)
    reward_policy_list_2.append(reward_policy_2)

reward_expert_list = np.array(reward_expert_list)
reward_policy_list = np.array(reward_policy_list)
reward_policy_list_2 = np.array(reward_policy_list_2)


100%|██████████| 20/20 [01:18<00:00,  3.91s/it]


In [5]:
import scipy.stats as stats

def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its Student-t confidence interval.

    Parameters
    ----------
    data : array_like
        Numeric samples. The input is flattened, so any shape is accepted and
        all elements are treated as one sample.
    confidence : float, default 0.95
        Two-sided confidence level of the interval.

    Returns
    -------
    (mean, half_width)
        The interval is ``mean ± half_width``. ``half_width`` is NaN when
        fewer than two samples are given (the standard error is undefined);
        both values are NaN for empty input.
    """
    samples = np.asarray(data).ravel()
    n = samples.size

    # Degenerate inputs: return NaN explicitly instead of letting
    # np.mean / stats.sem produce it through runtime warnings.
    if n == 0:
        return np.nan, np.nan
    m = np.mean(samples)
    if n < 2:
        return m, np.nan

    se = stats.sem(samples)  # standard error of the mean (ddof=1)
    # Two-sided critical value of the t distribution with n-1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

# Summarise each reward series as mean ± 95% confidence half-width.
mean_exp, bound_exp = mean_confidence_interval(reward_expert_list)
mean_pol, bound_pol = mean_confidence_interval(reward_policy_list)
mean_pol_2, bound_pol_2 = mean_confidence_interval(reward_policy_list_2)

# The two policy rows differ only in the eta used during evaluation
# (reward_policy_list: eta=0.01, reward_policy_list_2: eta=1.0), so the label
# states it; previously both rows printed the identical "Policy Reward:".
print(f"Expert Reward: {mean_exp:.6f} ± {bound_exp:.6f}")
print(f"Policy Reward (eta=0.01): {mean_pol:.6f} ± {bound_pol:.6f}")
print(f"Policy Reward (eta=1.0): {mean_pol_2:.6f} ± {bound_pol_2:.6f}")

Expert Reward: 0.466450 ± 0.004510
Policy Reward: 0.469065 ± 0.003221
Policy Reward: 0.477703 ± 0.003002
