In [None]:
import os
# Must run before the numpy / torch imports below so the setting is already in
# the process environment when those libraries load.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen when several packages bundle their own libiomp — confirm it is
# still required on the target machine, since it hides the conflict rather
# than fixing it.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch

from Configs import getEnvConfig, visualizeEnvConfig, getDatasetConfig, visualizeDatasetConfig
from Helpers.DataSampler import ReplayBuffer, ReplayBufferHybrid
from Helpers.Visualization import MultiLivePlot
from Environment.EnvironmentSim import createEnv
from Helpers.EnvInterface import EnvInterface
from Helpers.Eval import eval

In [None]:
# --- Environment preset -------------------------------------------------
# Select the environment configuration by index and print it.
envConfigIdx = 0
envParams = getEnvConfig(envConfigIdx)
visualizeEnvConfig(envParams)

# --- Offline-dataset preset ---------------------------------------------
# Select the dataset configuration by index and print it.
datasetConfigIdx = 0
datasetParams = getDatasetConfig(datasetConfigIdx)
visualizeDatasetConfig(datasetParams)

# --- Simulator ----------------------------------------------------------
# Build the environment from the chosen preset and the traffic-data folder,
# then switch it to its training split in "data" mode.
trafficDataParentPath = 'Datasets/TrafficDataset/TrafficData'
env = createEnv(envParams, trafficDataParentPath)
env.selectMode(type="data", mode="train")

Environment Configuration
Number of Users:        20
Window Length:          200
Dataflow:               thumb_bk
Sigmoid K List:         [0.1, 0.2, 0.3, 0.4, 0.5]
Sigmoid S List:         [10.0, 10.0, 10.0, 10.0, 10.0]
Resource Bar:           5
Bandwidth:              200
Sub Agents:             [[1, 1, 1, 1, 1]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16, 17, 18, 19]]
Dataset Configuration
Number of Users:        20
Window Length:          200
N_aggregation:          4
Dataflow:               thumb_bk
Random Seed:            999
Resource Bar:           5
Bandwidth:              200
Sigmoid K List:         [0.3]
Sigmoid S List:         [10.0]
Sub Agents:             [[1, 1, 1, 1, 1]]
User Map:               [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16, 17, 18, 19]]


In [None]:
# Load the pre-collected offline transitions.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only
# point these paths at files you generated yourself.
with open(
    'Datasets/OfflineDataset/'
    f'subOptimalAgent_encConfig{datasetConfigIdx}_{envParams["sub_agents_idx"]}.pkl',
    'rb',
) as f:
    dataset_expert = pickle.load(f)

# NOTE(review): dataset_random is not referenced by any other visible cell —
# confirm whether this load is still needed.
with open('Datasets/OfflineDataset/random_policy_8_users.pkl', 'rb') as f:
    dataset_random = pickle.load(f)

# Re-key the expert records under the field names used for the replay buffers.
_record_fields = {
    'observations': 'uRecord',
    'actions': 'actionsRecord',
    'rewards': 'rewardRecord',
    'next_observations': 'uNextRecord',
}
dataset_off = {
    target: dataset_expert[source] for target, source in _record_fields.items()
}

# The mean of the recorded rewards is reported as the packet loss rate.
print(f"Avg. packet loss rate: {np.mean(dataset_off['rewards'])}")
print(f"length of dataset: {len(dataset_off['observations'])}")

Avg. packet loss rate: 0.4738505039223643
length of dataset: 10000


In [None]:
# Agent hyperparameters. The whole dict is forwarded as keyword arguments to
# the agent constructor (Agent(**hyperparams)) and pickled next to each saved
# model, so every key must be a constructor argument name.
hyperparams = {
    # Diffusion policy settings.
    'N_diffusion_steps':30,
    'schedule_type': "vp",
    'abs_action_max': 1.0,
    # Optimisation settings.
    'gamma': 0.99,
    'lr': 5e-4,
    'decay_lr': True,
    'weight_decay': 0.0,
    # Critic ensemble settings.
    'num_critics': 8,
    'lcb_coef': 0.15,
    'q_sample_eta': 1.0,
    # Loss weights.
    'weight_entropy_loss': 0.01,
    'weight_q_loss': 1.0,
    'approximate_action': True,
    # Target-network (EMA) settings.
    'ema_tau': 0.001,
    'ema_period': 20,
    'ema_begin_update': 1000,
    'layer_norm': True,
    'grad_clip': 3.0,
    # Use the GPU when one is visible, otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

In [None]:
# Offline-to-online training driver.
#
# For each experiment: build fresh replay buffers seeded with the offline
# dataset, create a new agent, then alternate between training and two
# evaluation rollouts (one exploratory, one greedy) whose transitions are fed
# back into the hybrid buffer. The model is checkpointed whenever the smoothed
# greedy reward reaches a new minimum (lower is treated as better here), and
# the experiment ends once the smoothed reward plateaus or the episode budget
# is exhausted.

# Imported once here rather than on every pass of the experiment loop.
from DiffusionQL.DQL_Q_esmb import DQL_Q_esmb as Agent

# ---- Run settings --------------------------------------------------------
iterations = 100        # forwarded to agent.train_split each episode
batch_size = 100        # forwarded to agent.train_split each episode
LEN_eval = 50           # forwarded to eval() as LEN_eval
report_period = 10      # print a summary every `report_period` episodes
warm_up_period = 100    # episodes over which the two schedules below are annealed
max_sp_ratio, min_sp_ratio = 1.0, 0.5
max_weight_bc_loss, min_weight_bc_loss = 1.0, 1.0   # equal bounds -> constant 1.0
rb_capacity = 30000

n_experiments = 2       # independent training runs
min_episodes = 10       # no checkpointing / stop checks until after this episode
max_episodes = 50       # stop once idx_episode exceeds this
window = 5              # moving-average width for the reward curve
plateau_tol = 1e-6      # stop when consecutive smoothed rewards differ by less

loss_keys = ["Ld", "Lq", "Le", "loss_Q"]
plot_titles = loss_keys + ["Reward"]

save_folder = f"Models/config_{envConfigIdx}"
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(save_folder, exist_ok=True)


def _linear_anneal(start, floor, step, horizon):
    """Decrease linearly from `start` by (start - floor) / horizon per step, never going below `floor`."""
    return max(floor, start - ((start - floor) / horizon) * step)


for N_exp in range(n_experiments):
    # Persist the exact hyperparameters next to the models they produced.
    with open(os.path.join(save_folder, f"hyperparams_{N_exp}.pkl"), "wb") as f:
        pickle.dump(hyperparams, f)

    envInterface = EnvInterface(
        envParams, discrete_state=False,
        n_bits_state=3, base_state=5,
        n_bits_action=2, base_action=200,
    )
    dataSamplerOff = ReplayBuffer(capacity=rb_capacity, envInterface=envInterface, device=hyperparams['device'])
    dataSamplerOn = ReplayBufferHybrid(capacity=rb_capacity, envInterface=envInterface, device=hyperparams['device'])
    dataSamplerOff.add(dataset_off)
    dataSamplerOn.addOffline(dataset_off)

    # Baseline: mean of the third element of a batch as large as the offline buffer.
    batch = dataSamplerOff.sample(len(dataSamplerOff))
    print(f"Expert's Reward: {np.mean(batch[2].cpu().detach().numpy())}")

    print(f"state_dim: {envInterface.state_dim}, action_dim: {envInterface.action_dim}")
    agent = Agent(
        state_dim=envInterface.state_dim,
        action_dim=envInterface.action_dim,
        **hyperparams
    )

    metrics_train = {key: [] for key in plot_titles}
    ploter = MultiLivePlot(nrows=1, ncols=len(plot_titles), titles=plot_titles, display_window=25)
    best_reward = np.inf    # lowest smoothed reward seen so far
    idx_episode = 1
    while True:
        metrics = agent.train_split(dataSamplerOff, dataSamplerOn, iterations, batch_size, tqdm_pos=0)

        # Exploratory rollout: eta is redrawn uniformly from [0.5, 3.0) every episode.
        _, explore_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted",
                               sample_method="exploration", N_action_candidates=10,
                               eta=np.random.uniform(0.5, 3.0), verbose=True)
        dataSamplerOn.addOnline(explore_data)

        # Greedy rollout: its reward is the one tracked for checkpointing/stopping.
        reward, offpolicy_data = eval(agent, env, envInterface, LEN_eval=LEN_eval, obvMode="predicted",
                                      sample_method="greedy", N_action_candidates=50, eta=1.0, verbose=True)
        dataSamplerOn.addOnline(offpolicy_data)

        # Schedules are updated after training, so they take effect from the next episode.
        sample_ratio = _linear_anneal(max_sp_ratio, min_sp_ratio, idx_episode, warm_up_period)
        dataSamplerOn.set_sample_ratio(sample_ratio)
        weight_bc_loss = _linear_anneal(max_weight_bc_loss, min_weight_bc_loss, idx_episode, warm_up_period)
        agent.set_weight_bc_loss(weight_bc_loss)

        # Accumulate the per-step losses and plot this episode's means.
        episode_means = {key: np.mean(metrics[key]) for key in loss_keys}
        for col, key in enumerate(loss_keys):
            metrics_train[key] += metrics[key]
            ploter.update(col, idx_episode, episode_means[key])
        metrics_train['Reward'].append(reward)
        ploter.update(len(loss_keys), idx_episode, reward)

        if idx_episode > min_episodes:
            # Zero-padded moving average with one output per recorded reward;
            # only the last two entries are used and, because idx_episode is
            # already past `window`, neither of them touches the padding.
            smooth_reward = np.convolve(
                np.concatenate([np.zeros(window), np.array(metrics_train['Reward'])]),
                np.ones(window) / window, mode='valid')[1:]

            # Checkpoint on a new minimum of the smoothed reward.
            if smooth_reward[-1] < best_reward:
                best_reward = smooth_reward[-1]
                agent.save_model(save_folder, f'{N_exp}_best')
                print(f"save model {N_exp}_best, smoothed reward: {best_reward}")
                with open(os.path.join(save_folder, f"train_metrics_{N_exp}_best.pkl"), "wb") as f:
                    pickle.dump(metrics_train, f)

            # Stop on a plateau or when the episode budget is used up.
            plateaued = np.abs(smooth_reward[-1] - smooth_reward[-2]) < plateau_tol
            if plateaued or idx_episode > max_episodes:
                agent.save_model(save_folder, f'{N_exp}_end')
                with open(os.path.join(save_folder, f"train_metrics_{N_exp}_end.pkl"), "wb") as f:
                    pickle.dump(metrics_train, f)
                break

        if idx_episode % report_period == 0:
            print("=" * 20 + f"Iteration {idx_episode}" + "=" * 20)
            print(", ".join(f"{key}: {episode_means[key]}" for key in loss_keys))
            print(f"Avg. Reward: {np.mean(metrics_train['Reward'][-int(report_period):])}, sample_ratio: {sample_ratio}, weight_bc_loss: {weight_bc_loss}")
            print("=" * 50)
        idx_episode += 1

Expert's Reward: 0.5262174606323242
state_dim: 20, action_dim: 20


                                                                   

Ld: 0.12396391712129116, Lq: 0.526014432311058, Le: -1.5293437361717224, loss_Q: 0.006541097303852439
Avg. Reward: 0.6593772399266051, sample_ratio: 0.95, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.628411368070088


                                                                   

save model 0_best, smoothed reward: 0.6192674180088923


                                                                   

save model 0_best, smoothed reward: 0.6055249510095051


                                                                   

save model 0_best, smoothed reward: 0.5973702406721723


                                                                   

save model 0_best, smoothed reward: 0.5831596307912654


                                                                   

save model 0_best, smoothed reward: 0.5658016138638471


                                                                   

save model 0_best, smoothed reward: 0.5512020907208797


                                                                   

save model 0_best, smoothed reward: 0.5436932253077297


                                                                   

save model 0_best, smoothed reward: 0.5270384094274878


                                                                   

save model 0_best, smoothed reward: 0.5209711251268689
Ld: 0.11964490413665771, Lq: 0.5363591969013214, Le: -1.528318407535553, loss_Q: 0.007012844723649323
Avg. Reward: 0.5520653779590672, sample_ratio: 0.9, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.5164472935627933


                                                                   

save model 0_best, smoothed reward: 0.5090382101036461


                                                                   

save model 0_best, smoothed reward: 0.5034561687229804


                                                                   

save model 0_best, smoothed reward: 0.5013228182084882


                                                                   

save model 0_best, smoothed reward: 0.4970780470789471


                                                                   

save model 0_best, smoothed reward: 0.4908715358813738


                                                                   

save model 0_best, smoothed reward: 0.49065478378567756


                                                                   

save model 0_best, smoothed reward: 0.4870471809219912


                                                                   

save model 0_best, smoothed reward: 0.4844641398705061


                                                                   

save model 0_best, smoothed reward: 0.48191409421057146
Ld: 0.09851015970110893, Lq: 0.5432795852422714, Le: -1.5392620730400086, loss_Q: 0.006659077354706824
Avg. Reward: 0.4894960706447593, sample_ratio: 0.85, weight_bc_loss: 1.0


                                                                   

Ld: 0.10239594727754593, Lq: 0.5480386954545975, Le: -1.5258098673820495, loss_Q: 0.006758152116090059
Avg. Reward: 0.4890375066857301, sample_ratio: 0.8, weight_bc_loss: 1.0


                                                                   

save model 0_best, smoothed reward: 0.47790722317044615


                                                                   

Ld: 0.0861209562420845, Lq: 0.5477991890907288, Le: -1.5268995261192322, loss_Q: 0.006729420279152691
Avg. Reward: 0.4826101521465467, sample_ratio: 0.75, weight_bc_loss: 1.0


                                                                   

Expert's Reward: 0.5266821384429932
state_dim: 20, action_dim: 20


                                                                   

KeyboardInterrupt: 

In [None]:
# Greedy-evaluation reward per training episode. metrics_train is re-created
# at the start of every experiment, so this shows the most recent one only.
fig, ax = plt.subplots(1, 1, figsize=(5, 2))
rewards = metrics_train['Reward']
# One reward is appended per episode and episodes are counted from 1.
ax.plot(range(1, len(rewards) + 1), rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
plt.show()