In [6]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import sys
sys.path.insert(0, '../../../../')

import numpy as np
import matplotlib.pyplot as plt
import pickle

from src.difsched.config import getEnvConfig, visualizeEnvConfig, getDatasetConfig, visualizeDatasetConfig
from src.difsched.utils.DataSampler import ReplayBuffer, ReplayBufferHybrid
from src.difsched.utils.Visualization import MultiLivePlot
from src.difsched.env.EnvironmentSim import createEnv
from src.difsched.utils.EnvInterface import EnvInterface
from src.difsched.evaluation import eval
from src.difsched.agents.DiffusionQL.DQL_Q_esmb import DQL_Q_esmb as Agent

In [7]:
configIdx = 0
envParams = getEnvConfig(configIdx)
visualizeEnvConfig(envParams)
trafficDataParentPath =  f'../../../../data/raw/traffic'
env = createEnv(envParams, trafficDataParentPath)
env.selectMode(mode="train", type="data")

Environment Configuration
Number of Users:        8
Window Length:          200
Dataflow:               thumb_fr
Sigmoid K List:         [0.1, 0.2, 0.3, 0.4, 0.5]
Sigmoid S List:         [10.0, 10.0, 10.0, 10.0, 10.0]
Resource Bar:           5
Bandwidth:              100
Sub Agents:             [[0, 0]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7]]


In [8]:
hyperparams = {
    'N_diffusion_steps':30,
    'schedule_type': "vp",
    'abs_action_max': 1.0,
    'gamma': 0.99,
    'lr': 5e-4,
    'decay_lr': True,
    'weight_decay': 0.0,
    'num_critics': 8,
    'lcb_coef': 0.15,
    'q_sample_eta': 1.0,
    'weight_entropy_loss': 0.01,
    'weight_q_loss': 1.0,
    'approximate_action': True,
    'ema_tau': 0.001,
    'ema_period': 20,
    'ema_begin_update': 1000,
    'layer_norm': True,
    'grad_clip': 3.0,
    'device': 'cuda',
}

In [9]:
iterations = 100
batch_size = 100
LEN_eval = 50
report_period = 50
warm_up_period = 30
max_sp_ratio, min_sp_ratio = 1.0, 0.2
max_weight_bc_loss, min_weight_bc_loss = 0.0, 0.0
rb_capacity = 30000

save_folder = f"../../../../data/results/dql/config_{configIdx}/without_bc"
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

for N_exp in range(1):
    envInterface = EnvInterface(envParams, discrete_state=False)
    dataSamplerOn = ReplayBufferHybrid(capacity=rb_capacity, envInterface=envInterface, device=hyperparams['device'])

    print(f"state_dim: {envInterface.state_dim}, action_dim: {envInterface.action_dim}")
    agent = Agent(
        state_dim=envInterface.state_dim, 
        action_dim=envInterface.action_dim, 
        **hyperparams
    )
    metrics_train = {'Ld': [], 'Lq': [], 'Le': [], 'loss_Q': [], 'Reward': []}
    ploter = MultiLivePlot(nrows=1, ncols=5, titles=["Ld", "Lq", "Le", "loss_Q", "Reward"], display_window=25)
    best_reward = np.inf
    idx_episode = 1
    reward_last = np.inf
    while(True):
        metrics = agent.train(dataSamplerOn, iterations, batch_size, tqdm_pos=0)
        _, explore_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted", sample_method="exploration", N_action_candidates=10, 
                            eta=np.random.uniform(1.0, 5.0), verbose=True)
        dataSamplerOn.addOnline(explore_data)
        reward, offpolicy_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted", sample_method="greedy", N_action_candidates=50, eta=1.0, verbose=True)
        dataSamplerOn.addOnline(offpolicy_data)
        sample_ratio = np.max([min_sp_ratio, max_sp_ratio - ((max_sp_ratio-min_sp_ratio)/warm_up_period) * idx_episode])
        dataSamplerOn.set_sample_ratio(sample_ratio)
        weight_bc_loss = np.max([min_weight_bc_loss, max_weight_bc_loss - ((max_weight_bc_loss-min_weight_bc_loss)/warm_up_period) * idx_episode])
        agent.set_weight_bc_loss(weight_bc_loss)

        metrics_train['Ld'] += metrics['Ld']
        metrics_train['Lq'] += metrics['Lq']
        metrics_train['Le'] += metrics['Le']
        metrics_train['loss_Q'] += metrics['loss_Q']
        metrics_train['Reward'].append(reward)
        ploter.update(0, idx_episode, np.mean(metrics['Ld']))
        ploter.update(1, idx_episode, np.mean(metrics['Lq']))
        ploter.update(2, idx_episode, np.mean(metrics['Le']))
        ploter.update(3, idx_episode, np.mean(metrics['loss_Q']))
        ploter.update(4, idx_episode, reward)    
        if reward < best_reward:
            best_reward = reward
            agent.save_model(save_folder, f'{N_exp}_best')
            with open(f"{save_folder}/train_metrics_{N_exp}_best.pkl", "wb") as f:
                pickle.dump(metrics_train, f)

        if idx_episode >= 50 and np.mean(metrics_train['Reward'][-10:]) > np.mean(metrics_train['Reward'][-20:-10]) or idx_episode > 50 or np.abs(reward_last - reward) < 1e-7:
            agent.save_model(save_folder, f'{N_exp}_end')
            with open(f"{save_folder}/train_metrics_{N_exp}_end.pkl", "wb") as f:
                pickle.dump(metrics_train, f)
            break
        reward_last = reward

        if idx_episode % report_period == 0:
            print("=" * 20 + f"Iteration {idx_episode}" + "=" * 20)
            print(f"Ld: {np.mean(metrics['Ld'])}, " + 
                f"Lq: {np.mean(metrics['Lq'])}, " + 
                f"Le: {np.mean(metrics['Le'])}, " + 
                f"loss_Q: {np.mean(metrics['loss_Q'])}")
            print(f"Avg. Reward: {np.mean(metrics_train['Reward'][-int(report_period):])}, sample_ratio: {sample_ratio}, weight_bc_loss: {weight_bc_loss}")
            print("=" * 50) 
        idx_episode += 1

state_dim: 8, action_dim: 8


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
                                                                   

Ld: 0.5149996089935303, Lq: 0.40042383819818494, Le: -1.4799671816825866, loss_Q: 0.0255245965719223
Avg. Reward: 0.635945400171526, sample_ratio: 0.2, weight_bc_loss: 0.0


                                                                   

In [10]:
import matplotlib.pyplot as plt

agent_eval = Agent(
    state_dim=envInterface.state_dim, 
    action_dim=envInterface.action_dim, 
    **hyperparams
)
agent_eval.load_model("Models/", f'{N_exp}_best')

LEN_eval = 500

reward, info = eval(agent_eval, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted", sample_method="greedy", N_action_candidates=50, eta=0.5, verbose=True) 
reward_off = np.array(dataset_off['rewards'])[np.random.randint(0, len(dataset_off['rewards']), LEN_eval)]
print(f"reward_off: {np.mean(reward_off)}, reward_policy: {np.mean(reward)}, ")

fig, axs = plt.subplots(1, 1, figsize=(5, 2))
plt.plot(reward_off, label="offline dataset")
plt.plot(info['rewards'], label="policy")
plt.legend()
plt.show()

  self.actor.load_state_dict(torch.load(f'{dir}/actor_{id}.pth'))


FileNotFoundError: [Errno 2] No such file or directory: 'Models//actor_0_best.pth'

In [None]:
action_off = dataset_off['actions']
action = info['actions']

obv_off = [M for [w, r, M, alpha] in action_off]
obv = [M for [w, r, M, alpha] in action]

fig, axs = plt.subplots(1, 1, figsize=(5, 2))
plt.plot(obv_off[:len(obv)], label="offline")
plt.show()
fig, axs = plt.subplots(1, 1, figsize=(5, 2))
plt.plot(obv, label="online")
plt.legend()
plt.show()

In [None]:
spend_off = np.sum([w*r for [w, r, M, alpha] in action_off], axis=1)
spend_on = np.sum([w*r for [w, r, M, alpha] in action], axis=1)
alpha_off = np.array([alpha for [w, r, M, alpha] in action_off])
alpha_on = np.array([alpha for [w, r, M, alpha] in action])

print(action[0])

fig, axs = plt.subplots(1, 1, figsize=(5, 2))
#plt.plot(spend_off - alpha_off*envParams['B'], label="offline")
plt.plot(spend_on - alpha_on*envParams['B'], label="online")
#plt.plot(alpha_off, label="offline")
#plt.plot(alpha_on, label="online")
plt.legend()
plt.show()

(array([0, 0, 0, 1, 0, 0, 0, 0]), array([100. ,  20. , 100. ,  87.2, 103.2,  96.8,   0.8, 180. ]), 9, 0.09999999999999998)
