In [None]:
import os
# Set before the numeric libraries below are imported.
# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably the OpenMP workaround that lets
# several bundled OpenMP runtimes coexist instead of aborting — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import sys
# Prepend the directory four levels up (presumably the repository root) to the
# import path so the `src.difsched...` imports below can be resolved.
sys.path.insert(0, '../../../../')

import numpy as np
import matplotlib.pyplot as plt
import pickle

from src.difsched.config import getEnvConfig, visualizeEnvConfig, getDatasetConfig, visualizeDatasetConfig
from src.difsched.utils.DataSampler import ReplayBuffer, ReplayBufferHybrid
from src.difsched.utils.Visualization import MultiLivePlot
from src.difsched.env.EnvironmentSim import createEnv
from src.difsched.utils.EnvInterface import EnvInterface
from src.difsched.evaluation import eval
from src.difsched.agents.DiffusionQL.DQL_Q_esmb import DQL_Q_esmb as Agent

In [None]:
# Environment configuration: look it up by index and print a summary.
envConfigIdx = 0
envParams = getEnvConfig(envConfigIdx)
visualizeEnvConfig(envParams)

# Dataset configuration: look it up by index and print a summary.
# `datasetConfigIdx` is also used to build the offline-dataset file name when the
# dataset is loaded.
datasetConfigIdx = 0
datasetParams = getDatasetConfig(datasetConfigIdx)
visualizeDatasetConfig(datasetParams)

# Create the environment from the raw traffic traces and switch it to the
# "train" mode (plain string: there is nothing to interpolate in this path).
trafficDataParentPath = '../../../../data/raw/traffic'
env = createEnv(envParams, trafficDataParentPath)
env.selectMode(mode="train", type="data")

Environment Configuration
Number of Users:        8
Window Length:          200
Dataflow:               thumb_fr
Sigmoid K List:         [0.1, 0.2, 0.3, 0.4, 0.5]
Sigmoid S List:         [10.0, 10.0, 10.0, 10.0, 10.0]
Resource Bar:           5
Bandwidth:              100
Sub Agents:             [[0, 0]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7]]
Dataset Configuration
Number of Users:        8
Window Length:          200
N_aggregation:          4
Dataflow:               thumb_fr
Random Seed:            999
Resource Bar:           5
Bandwidth:              100
Sigmoid K List:         [0.3]
Sigmoid S List:         [10.0]
Sub Agents:             [[0, 0]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7]]


In [None]:
# Load the pre-recorded offline dataset that matches the selected dataset
# configuration and sub-agent index.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted,
# locally generated datasets here.
offline_dataset_path = f'../../../../data/processed/offline_dataset/subOptimalAgent_encConfig{datasetConfigIdx}_{envParams["sub_agents_idx"]}.pkl'
with open(offline_dataset_path, 'rb') as dataset_file:
    dataset_expert = pickle.load(dataset_file)

# Re-key the recorded arrays under the field names used by the replay buffers.
record_key_by_field = {
    'observations': 'uRecord',
    'actions': 'actionsRecord',
    'rewards': 'rewardRecord',
    'next_observations': 'uNextRecord',
}
dataset_off = {
    field: dataset_expert[record_key]
    for field, record_key in record_key_by_field.items()
}

# Quick sanity summary of what was loaded.
avg_recorded_reward = np.mean(dataset_expert['rewardRecord'])
n_transitions = len(dataset_off['observations'])
print(f"Avg. packet loss rate: {avg_recorded_reward}")
print(f"length of dataset: {n_transitions}")

Avg. packet loss rate: 0.33388894431115085
length of dataset: 10000


In [None]:
# Keyword arguments forwarded verbatim to the agent constructor (`Agent(**hyperparams)`)
# and pickled alongside each experiment's results.
# The grouping below is by parameter name only; the authoritative meaning of each
# entry is defined by the agent implementation.
hyperparams = dict(
    # diffusion process
    N_diffusion_steps=30,
    schedule_type="vp",
    abs_action_max=1.0,
    # discounting and optimiser
    gamma=0.99,
    lr=5e-4,
    decay_lr=True,
    weight_decay=0.0,
    # critics / Q-guided sampling
    num_critics=8,
    lcb_coef=0.15,
    q_sample_eta=1.0,
    # loss weighting
    weight_entropy_loss=0.01,
    weight_q_loss=1.0,
    approximate_action=True,
    # EMA settings
    ema_tau=0.001,
    ema_period=20,
    ema_begin_update=1000,
    # network / training stability
    layer_norm=True,
    grad_clip=3.0,
    # compute device
    device='cuda',
)

In [None]:
# --- Training schedule -------------------------------------------------------
iterations = 100       # forwarded to agent.train_split once per episode
batch_size = 100       # forwarded to agent.train_split
LEN_eval = 50          # forwarded to eval() as LEN_eval for every rollout
report_period = 10     # print a progress summary every this many episodes
warm_up_period = 100   # episodes over which the values below are annealed
# Sample ratio handed to the hybrid replay buffer: annealed from max to min.
max_sp_ratio, min_sp_ratio = 1.0, 0.5
# Behaviour-cloning loss weight: both bounds are equal, so it stays at 1.0.
max_weight_bc_loss, min_weight_bc_loss = 1.0, 1.0
rb_capacity = 30000    # capacity of both replay buffers

# --- Output location ---------------------------------------------------------
save_folder = f"../../../../data/results/dql/config_{envConfigIdx}"
# exist_ok=True creates the folder if needed and is a no-op otherwise, without
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_folder, exist_ok=True)

def _linear_anneal(upper, lower, period, step):
    """Linearly decay a value from ``upper`` towards ``lower``.

    Args:
        upper: Value at ``step == 0``.
        lower: Floor the result never drops below.
        period: Number of steps the decay from ``upper`` to ``lower`` spans.
        step: Current step (the episode index in this notebook).

    Returns:
        ``max(lower, upper - (upper - lower) / period * step)``.
    """
    return max(lower, upper - ((upper - lower) / period) * step)


window = 5            # number of episodes averaged when smoothing the evaluation reward
min_episodes = 10     # checkpointing / stop checks only start after this many episodes
max_episodes = 50     # hard cap on the number of episodes per experiment
converge_tol = 1e-6   # stop when the smoothed reward changes by less than this
loss_keys = ("Ld", "Lq", "Le", "loss_Q")

# Run two independent experiments; every artefact is suffixed with the experiment index.
for N_exp in range(2):
    # Keep the hyperparameters next to the results they produced.
    with open(f"{save_folder}/hyperparams_{N_exp}.pkl", "wb") as f:
        pickle.dump(hyperparams, f)
    envInterface = EnvInterface(
        envParams, discrete_state=False,
        n_bits_state=3, base_state=5,
        n_bits_action=2, base_action=200,
    )
    # Offline-only buffer plus a hybrid buffer that additionally receives the
    # rollouts collected during training; both start from the offline dataset.
    dataSamplerOff = ReplayBuffer(capacity=rb_capacity, envInterface=envInterface, device=hyperparams['device'])
    dataSamplerOn = ReplayBufferHybrid(capacity=rb_capacity, envInterface=envInterface, device=hyperparams['device'])
    dataSamplerOff.add(dataset_off)
    dataSamplerOn.addOffline(dataset_off)
    # Baseline: mean of the reward entry (index 2) over a full-size sample of the offline buffer.
    batch = dataSamplerOff.sample(len(dataSamplerOff))
    print(f"Expert's Reward: {np.mean(batch[2].cpu().detach().numpy())}")

    print(f"state_dim: {envInterface.state_dim}, action_dim: {envInterface.action_dim}")
    agent = Agent(
        state_dim=envInterface.state_dim,
        action_dim=envInterface.action_dim,
        **hyperparams
    )
    metrics_train = {'Ld': [], 'Lq': [], 'Le': [], 'loss_Q': [], 'Reward': []}
    ploter = MultiLivePlot(nrows=1, ncols=5, titles=["Ld", "Lq", "Le", "loss_Q", "Reward"], display_window=25)
    # The evaluation reward is minimised here: a checkpoint counts as "best" when
    # the smoothed value is lower than any seen before.
    # NOTE(review): presumably eval() reports a packet loss rate — confirm.
    best_reward = np.inf
    idx_episode = 1
    while True:
        # One training round, then two evaluation rollouts whose transitions are
        # fed back into the hybrid buffer: an exploratory one with a random eta
        # and a greedy one that also provides the reported reward.
        metrics = agent.train_split(dataSamplerOff, dataSamplerOn, iterations, batch_size, tqdm_pos=0)
        _, explore_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted",
                               sample_method="exploration", N_action_candidates=10,
                               eta=np.random.uniform(0.5, 3.0), verbose=True)
        dataSamplerOn.addOnline(explore_data)
        reward, offpolicy_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted",
                                      sample_method="greedy", N_action_candidates=50, eta=1.0, verbose=True)
        dataSamplerOn.addOnline(offpolicy_data)

        # Anneal the sample ratio and the BC-loss weight; the new values take
        # effect from the next training round.
        sample_ratio = _linear_anneal(max_sp_ratio, min_sp_ratio, warm_up_period, idx_episode)
        dataSamplerOn.set_sample_ratio(sample_ratio)
        weight_bc_loss = _linear_anneal(max_weight_bc_loss, min_weight_bc_loss, warm_up_period, idx_episode)
        agent.set_weight_bc_loss(weight_bc_loss)

        # Accumulate the loss histories and the per-episode reward.
        for key in loss_keys:
            metrics_train[key] += metrics[key]
        metrics_train['Reward'].append(reward)
        # Live plot: one panel per loss (episode mean) plus the reward panel.
        for panel, key in enumerate(loss_keys):
            ploter.update(panel, idx_episode, np.mean(metrics[key]))
        ploter.update(len(loss_keys), idx_episode, reward)

        if idx_episode > min_episodes:
            # Trailing moving average of the reward history. The history is
            # front-padded with `window` zeros so the result has one entry per
            # episode; entry j is the mean of the `window` rewards ending at j.
            smooth_reward = np.convolve(
                np.concatenate([np.zeros(window), np.array(metrics_train['Reward'])]),
                np.ones(window)/window, mode='valid')[1:]
            # save model
            if smooth_reward[-1] < best_reward:
                best_reward = smooth_reward[-1]
                agent.save_model(save_folder, f'{N_exp}_best')
                print(f"save model {N_exp}_best, smoothed reward: {best_reward}")
                with open(f"{save_folder}/train_metrics_{N_exp}_best.pkl", "wb") as f:
                    pickle.dump(metrics_train, f)
            # stop training once the smoothed reward has plateaued or the
            # episode budget is exhausted
            converged = np.abs(smooth_reward[-1] - smooth_reward[-2]) < converge_tol
            if converged or idx_episode > max_episodes:
                agent.save_model(save_folder, f'{N_exp}_end')
                with open(f"{save_folder}/train_metrics_{N_exp}_end.pkl", "wb") as f:
                    pickle.dump(metrics_train, f)
                break

        if idx_episode % report_period == 0:
            print("=" * 20 + f"Iteration {idx_episode}" + "=" * 20)
            print(f"Ld: {np.mean(metrics['Ld'])}, " +
                f"Lq: {np.mean(metrics['Lq'])}, " +
                f"Le: {np.mean(metrics['Le'])}, " +
                f"loss_Q: {np.mean(metrics['loss_Q'])}")
            print(f"Avg. Reward: {np.mean(metrics_train['Reward'][-int(report_period):])}, sample_ratio: {sample_ratio}, weight_bc_loss: {weight_bc_loss}")
            print("=" * 50)
        idx_episode += 1

Expert's Reward: 0.6659311652183533
state_dim: 8, action_dim: 8


                                                                   

Ld: 0.14931574650108814, Lq: 0.6654142016172409, Le: -1.5740736174583434, loss_Q: 0.013272479884326458
Avg. Reward: 0.4777179394505329, sample_ratio: 0.95, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.43502893197366965


                                                                   

save model 0_best, smoothed reward: 0.43322280857138695


                                                                   

save model 0_best, smoothed reward: 0.4312438449718464


                                                                   

save model 0_best, smoothed reward: 0.41595941435183703


                                                                   

save model 0_best, smoothed reward: 0.397285479412988


                                                                   

save model 0_best, smoothed reward: 0.3907813070310474
Ld: 0.14865337505936624, Lq: 0.6746866953372955, Le: -1.560660080909729, loss_Q: 0.013502478124573827
Avg. Reward: 0.41435471802200025, sample_ratio: 0.9, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.3815947336699306


                                                                   

save model 0_best, smoothed reward: 0.36830823116879513


                                                                   

save model 0_best, smoothed reward: 0.3673121019755942


                                                                   

save model 0_best, smoothed reward: 0.36627724732503586


                                                                   

save model 0_best, smoothed reward: 0.3618868300853788


                                                                   

save model 0_best, smoothed reward: 0.3518281598524299


                                                                   

save model 0_best, smoothed reward: 0.350980663229202


                                                                   

Ld: 0.12880088306963444, Lq: 0.683540067076683, Le: -1.5811374354362489, loss_Q: 0.013530754074454308
Avg. Reward: 0.3600641174187641, sample_ratio: 0.85, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.34981005486966443


                                                                   

save model 0_best, smoothed reward: 0.34957105146543044


                                                                   

Ld: 0.1394987626373768, Lq: 0.6890191715955735, Le: -1.5605802893638612, loss_Q: 0.013742734100669623
Avg. Reward: 0.3575353112358427, sample_ratio: 0.8, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.34598628410732846


                                                                   

save model 0_best, smoothed reward: 0.34533213361565407


                                                                   

save model 0_best, smoothed reward: 0.33779366731512694


                                                                   

Ld: 0.12360217101871968, Lq: 0.6969855779409408, Le: -1.5810922491550445, loss_Q: 0.01319136838428676
Avg. Reward: 0.34802396231324384, sample_ratio: 0.75, weight_bc_loss: 1.0


                                                                   

Expert's Reward: 0.6680389046669006
state_dim: 8, action_dim: 8


                                                                   

KeyboardInterrupt: 

In [None]:
# Reward history of the most recent experiment only: `metrics_train` is
# re-created at the start of every experiment in the training cell.
fig, axs = plt.subplots(1, 1, figsize=(5, 2))
# Draw on the Axes created above instead of going through pyplot's implicit
# "current axes", and label the figure so it can be read on its own.
axs.plot(metrics_train['Reward'])
axs.set_xlabel("Episode")
axs.set_ylabel("Reward")
axs.set_title("Evaluation reward per training episode")
fig.tight_layout()
plt.show()