In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


import sys
sys.path.append("../../../../"
)
from src.difsched.utils import ReplayBuffer
from src.difsched.agents.mdp import MdpKernel
#sys.path.append(os.path.abspath("../../Diffusion-Policies-for-Offline-RL"))
from src.difsched.agents.Dql_external.dql import Diffusion_QL

In [2]:
import numpy as np

def normalize_data(data, min_val, max_val):
    # Map data in [min_val, max_val] to [-1, 1]
    return 2 * (data - min_val) / (max_val - min_val) - 1

def denormalize_data(data, min_val, max_val):
    # Map data in [-1, 1] back to [min_val, max_val]
    return (data + 1) / 2 * (max_val - min_val) + min_val


def generate_transition_and_reward(N_s, N_a, seed=None):
    if seed is not None:
        np.random.seed(seed)
    
    # Random transition probabilities
    P = np.random.rand(N_s, N_a, N_s)
    # Normalize over next-states axis so that ∑_s' P(s'|s,a) = 1
    P /= P.sum(axis=2, keepdims=True)
    
    # Random rewards
    R = np.random.rand(N_s, N_a)
    P = np.transpose(P, (0, 2, 1))
    
    return P, R

def evaluate_diffusionQ(diffusionQ, R, P, N_s, N_a, N_iter=100, sample_method="greedy"):
    metrics  = {'s_record': [], 'a_record': [], 'r_record': [], 's_next_record': []}
    s_init = np.random.randint(0, N_s)
    s = s_init
    for _ in tqdm(range(N_iter), leave=False):
        s_DQ = torch.as_tensor([normalize_data(s, 0, N_s-1)], dtype=torch.float32).to(diffusionQ.device)
        a_DQ = diffusionQ.sample_action(s_DQ.unsqueeze(-1), sample_method=sample_method)
        a = denormalize_data(a_DQ, 0, N_a-1).astype(int)[0]
        r = R[s, a]
        s_next = generate_next_state(P[:, :, a], s)
        s = s_next
        metrics['s_record'].append([s])
        metrics['a_record'].append([a])
        metrics['r_record'].append([r])
        metrics['s_next_record'].append([s_next])

    return np.mean(metrics['r_record']), metrics

def evaluate_deterministic_policy(policyDeter, R, P, sRecord, N_s, N_iter=100):
    metrics  = {'s_record': [], 'a_record': [], 'r_record': [], 's_next_record': []}
    if P is None and sRecord is None:
        raise ValueError("Either P or sRecord must be provided")

    if P is None:
        s = sRecord
    else:
        s = np.random.randint(0, N_s) 

    for i in tqdm(range(N_iter), leave=False):
        if P is None:
            s = sRecord[i]
        else:
            s_next = generate_next_state(P[:, :, policyDeter[s]], s)
        a = policyDeter[s]
        r = R[s, a]
        metrics['r_record'].append([r])
        if P is not None:
            metrics['s_record'].append([s])
            metrics['a_record'].append([a])
            metrics['s_next_record'].append([s_next])
            s = s_next
    return np.mean(metrics['r_record']), metrics

def extract_deterministic_policy(diffusionQ, N_s, N_a, sample_method="greedy"   ):
    policy_deterministic = []
    for s in tqdm(range(N_s), leave=False):
        s_DQ = torch.as_tensor([normalize_data(s, 0, N_s-1)], dtype=torch.float32).to(diffusionQ.device)
        a_DQ = diffusionQ.sample_action(s_DQ.unsqueeze(-1), sample_method=sample_method)[0]
        a = denormalize_data(a_DQ, 0, N_a-1).astype(int)
        policy_deterministic.append(a)
    return policy_deterministic


In [3]:
# Example usage:
N_s = 4  # number of states
N_a = 4
  # number of actions
P, R = generate_transition_and_reward(N_s, N_a, seed=995)
print(R)
print(P.shape)
print(R.shape)

mdpKernel = MdpKernel()
mdpKernel.N_states = N_s
mdpKernel.N_actions = N_a
mdpKernel.transitionTable = P
mdpKernel.rewardTable = -R
V, policyDeter = mdpKernel.optimize_policy(mode="deterministic", gamma=0.99, theta=1e-10)
print(V)
print(policyDeter)


[[0.67918602 0.13145189 0.11357647 0.61079304]
 [0.04191794 0.81719606 0.98816978 0.28420654]
 [0.68759066 0.68304571 0.00542148 0.80897017]
 [0.57091093 0.76788478 0.28030813 0.37841846]]
(4, 4, 4)
(4, 4)


Value Iteration: 100%|██████████| 1000/1000 [00:00<00:00, 1908.33it/s, V=-83.453784]

[-83.35144674 -83.61016774 -83.46227102 -83.39125144]
[0 2 3 1]





In [4]:
diffusionQ = Diffusion_QL( 
    state_dim=1,
    action_dim=1,
    max_action=1,
    device='cuda',
    discount=0.99,
    tau=0.005)

N_iter = 10000 
r_mdp, metrics = evaluate_deterministic_policy(policyDeter, R, P, sRecord=None, N_s=N_s, N_iter=N_iter)
print(f"Avg Reward MDP: {r_mdp}")

                                                       

Avg Reward MDP: 0.8342602392324928




In [8]:

N_iter = 5000 
r_mdp, metrics = evaluate_deterministic_policy(policyDeter, R, P, sRecord=None, N_s=N_s, N_iter=N_iter)
print(f"Avg Reward MDP: {r_mdp}")

rb = ReplayBuffer(capacity=N_iter)
rb.addOffline(
    metrics['s_record'], metrics['a_record'], metrics['r_record'], metrics['s_next_record']
)

diffusionQ = Diffusion_QL( 
    state_dim=1,
    action_dim=1,
    max_action=1,
    eta=1.0,
    device='cuda',
    discount=0.99,
    n_timesteps=200,
    lr=1e-4,
    lr_decay=True,
    tau=0.005,
    update_ema_every=5,
    step_start_ema=100,
    max_q_backup=False)

epochs = 1000
batch_size = 100
sample_method = "greedy"

metric = {'L_actor': [], 'L_critic': []}
mean_metric = lambda x: sum(x) / len(x) if len(x) > 0 else float('nan')

with tqdm(range(1, epochs + 1), desc=f'Epoch', unit='epoch', leave=False) as epoch_bar:
    for ep in epoch_bar:
        samples = rb.sample(batch_size)
        batch = (
            torch.as_tensor(samples[0], dtype=torch.float32).to(diffusionQ.device), 
            torch.as_tensor(samples[1], dtype=torch.float32).to(diffusionQ.device), 
            torch.as_tensor(samples[2], dtype=torch.float32).to(diffusionQ.device), 
            torch.as_tensor(samples[3], dtype=torch.float32).to(diffusionQ.device)
        )
        L_actor, L_critic = diffusionQ.update(batch)
        metric['L_actor'].append(L_actor)
        metric['L_critic'].append(L_critic)
        epoch_bar.set_postfix({'L_actor': f'{L_actor:.6f}', 'L_critic': f'{L_critic:.6f}'})
        if ep % (epochs/10) == 0:        
            policy_DQ = extract_deterministic_policy(diffusionQ, N_s, N_a)
            r_DQ, _ = evaluate_diffusionQ(diffusionQ, R, P, N_iter=100, N_s=N_s, N_a=N_a, sample_method=sample_method)
            print(f"Epoch {ep}/{epochs}" + 
                f" L_actor: {mean_metric(metric['L_actor'][-int(epochs/10):]):.6f}, " + 
                f" L_critic: {mean_metric(metric['L_critic'][-int(epochs/10):]):.6f}, " + 
                f" r_DQ: {r_DQ}, {policyDeter} => {policy_DQ}")

#model_state = diffusionQ.state_dict()
#torch.save(model_state, "model_state.pth")

                                                      

Avg Reward MDP: 0.8344596318286731


Epoch:  10%|█         | 100/1000 [01:19<1:47:13,  7.15s/epoch, L_actor=1.799448, L_critic=5.805151]

Epoch 100/1000 L_actor: 0.929092,  L_critic: 6.119413,  r_DQ: 0.30662726255317807, [0 2 3 1] => [0, 0, 2, 2]


Epoch:  20%|██        | 200/1000 [02:49<1:52:33,  8.44s/epoch, L_actor=-0.714319, L_critic=4.983795]

Epoch 200/1000 L_actor: 0.513368,  L_critic: 5.900342,  r_DQ: 0.479830231363773, [0 2 3 1] => [0, 1, 2, 2]


Epoch:  30%|███       | 300/1000 [04:14<1:19:55,  6.85s/epoch, L_actor=-2.517940, L_critic=5.626985]

Epoch 300/1000 L_actor: -0.352703,  L_critic: 5.602450,  r_DQ: 0.5274882591128633, [0 2 3 1] => [0, 2, 2, 2]


Epoch:  40%|████      | 400/1000 [05:44<1:16:10,  7.62s/epoch, L_actor=-15.899167, L_critic=5.167257]

Epoch 400/1000 L_actor: -2.622108,  L_critic: 5.316918,  r_DQ: 0.45575822948236744, [0 2 3 1] => [2, 2, 2, 2]


                                                                                                     

KeyboardInterrupt: 