# Twin-Delayed DDPG

## Installing the packages

In [None]:
# Cloning Github Repository and navigate into project folder
!git clone https://github.com/francmeister/Masters-Research-Project.git

In [None]:
cd Masters-Research-Project/Multi-User-MEC-System/

In [None]:
!pip install -e Network_Env


In [None]:
cd Network_Env/

## Importing the libraries

In [None]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
#from NetworkEnv_ import NetworkEnv_
import Network_Env
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import pygame
from gym import wrappers
from torch.autograd import Variable
from collections import deque
from google.colab import files
from numpy import interp


## Step 1: We initialize the Experience Replay memory

In [None]:
class ReplayBuffer(object):

  def __init__(self, max_size=1e6):
    self.storage = []
    self.max_size = max_size
    self.ptr = 0

  def add(self, transition):
    if len(self.storage) == self.max_size:
      self.storage[int(self.ptr)] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind: 
      state, next_state, action, reward, done = self.storage[i]
      batch_states.append(np.array(state, copy=False))
      batch_next_states.append(np.array(next_state, copy=False))
      batch_actions.append(np.array(action, copy=False))
      batch_rewards.append(np.array(reward, copy=False))
      batch_dones.append(np.array(done, copy=False))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

## Step 2: We build one neural network for the Actor model and one neural network for the Actor target

In [None]:
class Actor(nn.Module):
  
  def __init__(self, state_dim, action_dim, max_action):
    super(Actor, self).__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    #self.layer_1 = nn.Linear(state_dim, 1000)
    self.layer_2 = nn.Linear(400, 300)
    #self.layer_2 = nn.Linear(1000, 800)
    self.layer_3 = nn.Linear(300, action_dim)
    #self.layer_3 = nn.Linear(800, action_dim)
    self.max_action = max_action

  def forward(self, x):
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    x = self.max_action * torch.sigmoid(self.layer_3(x))
    #x = self.max_action * torch.tanh(self.layer_3(x))
    return x

## Step 3: We build two neural networks for the two Critic models and two neural networks for the two Critic targets

In [None]:
class Critic(nn.Module):
  
  def __init__(self, state_dim, action_dim):
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.layer_1 = nn.Linear(state_dim + action_dim, 400)
    #self.layer_1 = nn.Linear(state_dim + action_dim, 1000)
    self.layer_2 = nn.Linear(400, 300)
    #self.layer_2 = nn.Linear(1000, 800)
    self.layer_3 = nn.Linear(300, 1)
    #self.layer_3 = nn.Linear(800, 1)
    # Defining the second Critic neural network
    self.layer_4 = nn.Linear(state_dim + action_dim, 400)
    #self.layer_4 = nn.Linear(state_dim + action_dim, 1000)
    self.layer_5 = nn.Linear(400, 300)
    #self.layer_5 = nn.Linear(1000, 800)
    self.layer_6 = nn.Linear(300, 1)
    #self.layer_6 = nn.Linear(800, 1)

  def forward(self, x, u):
    xu = torch.cat([x, u], 1)
    # Forward-Propagation on the first Critic Neural Network
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # Forward-Propagation on the second Critic Neural Network
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)
    return x1, x2

  def Q1(self, x, u):
    xu = torch.cat([x, u], 1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

## Steps 4 to 15: Training Process

In [None]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=0.0000001)
    #self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=0.00000001)
    
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=0.0001)
    #self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=0.00001)
    self.max_action = max_action

  def select_action(self, state):
    state = torch.Tensor(state).to(device)
    #return self.actor(state).cpu().data.numpy().flatten()
    return self.actor(state).cpu().data.numpy()

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.3, noise_clip=0.5, policy_freq=2):
    
    for it in range(iterations):
      
      # Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
    
      #batch_states = np.reshape(batch_states,(batch_states.shape[0]*batch_states.shape[1],batch_states.shape[2]))
      #batch_next_states = np.reshape(batch_next_states,(batch_next_states.shape[0]*batch_next_states.shape[1],batch_next_states.shape[2]))
      #batch_actions = np.reshape(batch_actions,(batch_actions.shape[0]*batch_actions.shape[1],batch_actions.shape[2]))

      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)
      
      # Step 5: From the next state s’, the Actor target plays the next action a’
      next_action = self.actor_target(next_state)
      
      # Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
      noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
      noise = noise.clamp(-noise_clip, noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

      
      # Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
      target_Q1, target_Q2 = self.critic_target(next_state, torch.Tensor(next_action).to(device))
      
      # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
      target_Q = torch.min(target_Q1, target_Q2)
      # Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
      target_Q = reward + ((1 - done) * discount * target_Q).detach()
      
      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
      current_Q1, current_Q2 = self.critic(state, action)
      
      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      
      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()
      
      # Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        
        # Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
  
  # Making a save method to save a trained model
  def save(self, filename, directory):
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
  
  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

# We build respective classes for DDPG implementation

## One DDPG Actor 

In [None]:
class Actor_DDPG(nn.Module):
	def __init__(self, state_dim, action_dim, max_action):
		super(Actor_DDPG, self).__init__()

		self.layer_1 = nn.Linear(state_dim, 400)
		self.layer_2 = nn.Linear(400, 300)
		self.layer_3 = nn.Linear(300, action_dim)
		
		self.max_action = max_action

	
	def forward(self, state):
		x = F.relu(self.layer_1(state))
		x = F.relu(self.layer_2(x))
		return self.max_action * torch.sigmoid(self.layer_3(x))

# One DDPG Critic

In [None]:
class Critic_DDPG(nn.Module):
	def __init__(self, state_dim, action_dim):
		super(Critic_DDPG, self).__init__()

		#self.layer_1 = nn.Linear(state_dim + action_dim, 400)
		#self.layer_2 = nn.Linear(400, 300)
		self.layer_1 = nn.Linear(state_dim, 400)
		self.layer_2 = nn.Linear(400 + action_dim, 300)
		self.layer_3 = nn.Linear(300, 1)


	def forward(self, state, action):
		#q = F.relu(self.layer_1(torch.cat([state, action], 1)))
		#q = F.relu(self.layer_2(q))

		q = F.relu(self.layer_1(state))
		q = F.relu(self.layer_2(torch.cat([q, action], 1)))
		return self.layer_3(q)


## DDPG Class

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class DDPG(object):
	def __init__(self, state_dim, action_dim, max_action):
		self.actor = Actor_DDPG(state_dim, action_dim, max_action).to(device)
		self.actor_target = Actor_DDPG(state_dim, action_dim, max_action).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.000001)

		self.critic = Critic_DDPG(state_dim, action_dim).to(device)
		self.critic_target = Critic_DDPG(state_dim, action_dim).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.0001)

	def select_action(self, state):
		#state = torch.FloatTensor(state.reshape(1, -1)).to(device)
		state = torch.Tensor(state).to(device)
		return self.actor(state).cpu().data.numpy()
		#return self.actor(state).cpu().data.numpy().flatten()

	def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
		# Sample replay buffer 
		batch_states, batch_next_states, batch_actions,batch_rewards, batch_dones = replay_buffer.sample(batch_size)
		
		state = torch.Tensor(batch_states).to(device)
		next_state = torch.Tensor(batch_next_states).to(device)
		action = torch.Tensor(batch_actions).to(device)
		reward = torch.Tensor(batch_rewards).to(device)
		done = torch.Tensor(batch_dones).to(device)
      

		# Compute the target Q value
		next_action = self.actor_target(next_state)	
		target_Q = self.critic_target(next_state, torch.Tensor(next_action).to(device))
		#target_Q1, target_Q2 = self.critic_target(next_state, torch.Tensor(next_action).to(device))
		target_Q = reward + ((1 - done) *discount * target_Q).detach()

		# Get current Q estimate
		current_Q = self.critic(state, action)

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q, target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Compute actor loss
		actor_loss = -self.critic(state, self.actor(state)).mean()
		
		# Optimize the actor 
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()

		# Update the frozen target models
		for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
			target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

		for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
			target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


	def save(self, filename, directory):
		torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
		torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))


	def load(self, filename, directory):
		self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
		self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
  
  

## We make a function that evaluates the policy by calculating its average reward over 10 episodes

In [None]:
def evaluate_policy(policy, eval_episodes=10):
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    done = False
    while not done:
      action = policy.select_action(obs)
      action = env.reshape_action_space_from_model_to_dict(action)
      obs, reward, done, _ = env.step(action)
      avg_reward += reward#interp(sum(reward),[720000000,863000000],[0,1000])
  avg_reward /= eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  return avg_reward

## We set the parameters

In [None]:
env_name = "NetworkEnv-v0" # Name of a environment (set it to any Continous environment you want)
policy_type = 'DDPG'
seed = 0 # Random seed number
start_timesteps = 10000 # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5000 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 400000 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.3 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.4 # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5 # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create a file name for the two saved models: the Actor and Critic models

In [None]:
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
file_name_1 = "timestep_rewards_energy_throughput"
file_name_2 = "offloading_actions"
file_name_3 = "power_actions"
file_name_4 = "subcarrier_actions"
file_name_5 = "allocated_RBs"
file_name_6 = "fairnes_index"

file_name_7 = "energy_efficiency_rewards"
file_name_8 = "battery_energy_rewards"
file_name_9 = "throughput_rewards"
file_name_10 = "delay_rewards"
file_name_11 = "sum_allocations_per_RB_matrix"
file_name_12 = "RB_allocation_matrix"
file_name_13 = "energy_rewards"
file_name_14 = "delays"
file_name_15 = "tasks_dropped"
file_name_16 = "outage_probabilties"
file_name_17 = "resource_allocation_constraint_violation_count"
file_name_18 = "urllc_reliability_reward"

file_name_19 = "individual_energy_rewards"
file_name_20 = "individual_channel_rate_rewards"
file_name_21 = "individual_channel_battery_energy_rewards"
file_name_22 = "individual_delay_rewards"
file_name_23 = "individual_queue_delays"
file_name_24 = "individual_tasks_dropped"
file_name_25 = "individual_energy_efficiency"
file_name_26 = "individual_total_reward"
file_name_27 = "total reward"
file_name_28 = "overall_users_reward"
file_name_29 = "q_action"
file_name_30 = "RB_bandwidth"
file_name_31 = "rate_variance"
file_name_32 = "inf_total_reward"
file_name_33 = "inf_energy"
file_name_34 = "inf_throughput"
file_name_35 = "inf_fairness_index"
file_name_36 = "inf_task_delay"
file_name_37 = "urllc_avg_rate"
file_name_38 = "individual_channel_rates"
file_name_39 = "individual_local_queue_delays"
file_name_40 = "individual_offload_queue_delays"
file_name_41 = "individual_local_queue_lengths"
file_name_42 = "individual_offload_queue_lengths"
file_name_43 = "users_lc_service_rates"
file_name_44 = "resource_block_action_matrix"
file_name_45 = "individual_expected_rate_over_prev_T_slot"
file_name_46 = "individual_average_task_size_offload_queue"

file_name_47 = "individual_battery_energy_levels"
file_name_48 = "individual_energy_harvested"
file_name_49 = "throughput_log_reward"
file_name_50 = "individual_local_energy_consumed"
file_name_51 = "individual_offloading_energy"
file_name_52 = "individual_small_scale_gains"
file_name_53 = "individual_large_scale_gains"
file_name_54 = "individual_average_offloading_rates"
file_name_55 = "individual_local_queue_length_num_tasks"
file_name_56 = "individual_offload_queue_length_num_tasks"
file_name_57 = "individual_offload_stability_constraint_reward"
file_name_58 = "total_offload_traffic_reward"
file_name_59 = "individual_offload_traffic_numerator"
file_name_60 = "individual_local_queueing_violation_prob_reward"
file_name_61 = "individual_offload_ratio_reward"
file_name_62 = "total_local_queueing_violation_prob_reward"
file_name_63 = "total_offload_ratio_reward"
file_name_64 = "urllc_total_rate"
file_name_65 = "F_L_inverse"
file_name_66 = "urllc_total_rate_per_second"
file_name_67 = "urllc_total_rate_per_slot"
file_name_68 = "individual_urllc_channel_rate_per_slot_with_penalty"
file_name_69 = "individual_urllc_channel_rate_per_second_penalties"
file_name_70 = "individual_urllc_channel_rate_per_second_without_penalty"
file_name_71 = "individual_urllc_channel_rate_per_second_with_penalty"
file_name_72 = "individual_embb_puncturing_users_sum_data_rates"
file_name_73 = "individual_embb_num_puncturing_users"







print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

## We create a folder inside which will be saved the trained models

In [None]:
if not os.path.exists("./results"):
  os.makedirs("./results")
if not os.path.exists("./inference_results"):
  os.makedirs("./inference_results")
if save_models and not os.path.exists("./pytorch_models"):
  os.makedirs("./pytorch_models")

## We create the PyBullet environment

In [None]:
env = gym.make(env_name)
#env = NetworkEnv_()

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [None]:
#env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space_dim
action_dim = env.action_space_dim
max_action = float(env.box_action_space.high[0][1]) # to change this soon
print('action sapce dim: ', action_dim)


## We create the policy network (the Actor model)

In [None]:
if policy_type == 'TD3':
    policy = TD3(state_dim, action_dim, max_action)
elif policy_type == 'DDPG':
    policy = DDPG(state_dim, action_dim, max_action)

## We create the Experience Replay memory

In [None]:
replay_buffer = ReplayBuffer()

## We define a list where all the evaluation results over 10 episodes are stored

In [None]:
evaluations = [evaluate_policy(policy)]

## We create a new folder directory in which the final results (videos of the agent) will be populated

In [None]:
def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
max_episode_steps = env.STEP_LIMIT
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## We initialize the variables

In [1]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
t0 = time.time()
timestep_rewards = []
timestep_rewards_energy_throughput_delays = []
offload_actions = []
power_actions = []
subcarrier_actions = []
allocated_RBs = []
fairness_index = []
energy_efficiency_rewards = []
battery_energy_rewards = []
energy_rewards = []
throughput_rewards = []
delay_rewards = []
sum_allocations_per_RB_matrix = []
change_action = 0
RB_allocation_matrix = []
delays = []
urllc_reliability_reward = []
tasks_dropped = []
resource_allocation_matrix = []
resource_allocation_constraint_violation_count = []
outage_probabilties = []
urllc_reliability_reward_normalized = []
individual_energy_rewards = []
individual_channel_rate_rewards = []
individual_channel_battery_energy_rewards = []
individual_delay_rewards = []
individual_queue_delays = []
individual_tasks_dropped = []
individual_energy_efficiency = []
individual_total_reward = []
total_reward = []
overall_users_reward = []
q_actions = []
RB_bandwidths = []
rate_variances = []
inf_task_delays = []
urllc_avg_rate = []
individual_channel_rates = []

individual_local_queue_delays = []
individual_offload_queue_delays = []
individual_local_queue_lengths = []
individual_offload_queue_lengths = []

users_lc_service_rates = []
resource_block_action_matrix = []

individual_expected_rate_over_prev_T_slot = []
individual_average_task_size_offload_queue = []

individual_battery_energy_levels = []
individual_energy_harvested = []
throughput_log_reward = []

individual_local_energy_consumed = []
individual_offloading_energy = []

individual_small_scale_gains = []
individual_large_scale_gains = []
individual_average_offloading_rates = []


individual_local_queue_length_num_tasks = []
individual_offload_queue_length_num_tasks = []
individual_offload_stability_constraint_reward = []
total_offload_traffic_reward = []
individual_offload_traffic_numerator = []
individual_local_queueing_violation_prob_reward = []
individual_offload_ratio_reward = []
total_local_queueing_violation_prob_reward = []
total_offload_ratio_reward = []
urllc_total_rate = []
F_L_inverse = []


urllc_total_rate_per_second = []
urllc_total_rate_per_slot = []
individual_urllc_channel_rate_per_slot_with_penalty = []
individual_urllc_channel_rate_per_second_penalties = []
individual_urllc_channel_rate_per_second_without_penalty = []
individual_urllc_channel_rate_per_second_with_penalty = []
individual_embb_puncturing_users_sum_data_rates = []
individual_embb_num_puncturing_users = []

NameError: name 'time' is not defined

## Training

## Before training, generate random observation samples to get their limits

In [None]:
timesteps = np.arange(0,env.STEP_LIMIT,1)
episodes = 1
obs = env.reset()
number_of_users = env.number_of_users
number_of_RBs = env.num_allocate_RB_upper_bound
small_scale_channel_gains = []
large_scale_channel_gains = []
battery_energy_levels = []
local_queue_lengths = []
offloading_queue_lengths = []
#observation = np.column_stack((observation_channel_gains,observation_battery_energies,observation_offloading_queue_lengths,observation_local_queue_lengths,num_urllc_arriving_packets)) #observation_channel_gains.
for episode in episodes:
    for timestep in timesteps:
        #print('----------------------------------------------------------------------------------------------------------------------------------------------------')
        action = env.action_space.sample()
        action = env.enforce_constraint(action)
        #print(action)
        action2, action = env.reshape_action_space_dict(action)
        observation,reward,dones,info = env.step_(action)
        small_scale_channel_gains.append(observation[0:number_of_users*number_of_RBs])
        large_scale_channel_gains.append(observation[number_of_users*number_of_RBs:(number_of_users*number_of_RBs)*2])
        battery_energy_levels.append(observation[(number_of_users*number_of_RBs)*2:(number_of_users*number_of_RBs)*2+number_of_users])
        offloading_queue_lengths.append(observation[(number_of_users*number_of_RBs)*2+number_of_users:(number_of_users*number_of_RBs)*2+number_of_users*2])
        local_queue_lengths.append(observation[(number_of_users*number_of_RBs)*2+number_of_users*2:(number_of_users*number_of_RBs)*2+number_of_users*4])
        print('observation:')
        print(observation)
        print('----------------------------------------------------------')
        print('small_scale_channel_gains:')
        print(small_scale_channel_gains)
        print('----------------------------------------------------------')
        print('large_scale_channel_gains:')
        print(large_scale_channel_gains)
        print('----------------------------------------------------------')
        print('offloading_queue_lengths:')
        print(offloading_queue_lengths)
        print('----------------------------------------------------------')
        print('local_queue_lengths')
        print(local_queue_lengths)

print('small_scale_channel_gains:')
print(small_scale_channel_gains)
print('----------------------------------------------------------')
print('large_scale_channel_gains:')
print(large_scale_channel_gains)
print('----------------------------------------------------------')
print('offloading_queue_lengths:')
print(offloading_queue_lengths)
print('----------------------------------------------------------')
print('local_queue_lengths')
print(local_queue_lengths)

In [1]:
%%time
# We start the main loop over 500,000 timesteps
while total_timesteps < max_timesteps:
  
  # If the episode is done
  if done:

    # If we are not at the very beginning, we start the training process of the model
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      timestep_rewards.append([total_timesteps, episode_reward])
      timestep_rewards_energy_throughput_delays.append([total_timesteps,episode_reward,env.total_energy,env.total_rate,env.SBS1.total_delay])
      offload_actions.append(env.offload_decisions)
      power_actions.append(env.powers)
      subcarrier_actions.append(env.subcarriers)
      allocated_RBs.append(env.Communication_Channel_1.allocated_RBs)
      fairness_index.append(env.SBS1.fairness_index)
      outage_probabilties.append(env.SBS1.outage_probability)

      energy_efficiency_rewards.append(env.SBS1.energy_efficiency_rewards)
      battery_energy_rewards.append(env.SBS1.battery_energy_rewards)
      throughput_rewards.append(env.SBS1.throughput_rewards)
      delay_rewards.append(env.SBS1.delay_rewards)
      sum_allocations_per_RB_matrix.append(env.sum_allocations_per_RB_matrix)
      RB_allocation_matrix.append(env.RB_allocation_matrix)
      energy_rewards.append(env.SBS1.energy_rewards)
      delays.append(env.SBS1.delays)
      tasks_dropped.append(env.SBS1.tasks_dropped)
      resource_allocation_matrix.append(env.resource_block_allocation_matrix)
      resource_allocation_constraint_violation_count.append(env.resource_allocation_constraint_violation)
      urllc_reliability_reward.append(env.SBS1.urllc_reliability_reward)

      individual_energy_rewards.append(env.SBS1.individual_energy_rewards)
      individual_channel_rate_rewards.append(env.SBS1.individual_channel_rate_rewards)
      individual_channel_rates.append(env.SBS1.individual_channel_rates)
      individual_channel_battery_energy_rewards.append(env.SBS1.individual_channel_rate_rewards)
      individual_delay_rewards.append(env.SBS1.individual_delay_rewards)
      individual_queue_delays.append(env.SBS1.individual_queue_delays)
      individual_tasks_dropped.append(env.SBS1.individual_tasks_dropped)
      individual_energy_efficiency.append(env.SBS1.individual_energy_efficiency)
      individual_total_reward.append(env.SBS1.individual_total_reward)
      total_reward.append(env.SBS1.total_reward)
      overall_users_reward.append(env.SBS1.overall_users_reward)
      q_actions.append(env.SBS1.q_action)
      RB_bandwidths.append(env.RB_bandwidth)
      rate_variances.append(env.SBS1.users_rate_variance_sum)
      urllc_avg_rate.append(env.SBS1.average_rate_prev_slots)
      individual_local_queue_delays.append(env.SBS1.individual_local_queue_delays)
      individual_offload_queue_delays.append(env.SBS1.individual_offload_queue_delays)
      individual_local_queue_lengths.append(env.SBS1.individual_local_queue_lengths)
      individual_offload_queue_lengths.append(env.SBS1.individual_offload_queue_lengths)
      users_lc_service_rates.clear()
      users_lc_service_rates.append(env.SBS1.users_lc_service_rates)
      resource_block_action_matrix.append(env.resource_block_action_matrix)

      individual_expected_rate_over_prev_T_slot.append(env.SBS1.individual_expected_rate_over_prev_T_slot)
      individual_average_task_size_offload_queue.append(env.SBS1.individual_average_task_size_offload_queue)

      individual_battery_energy_levels.append(env.SBS1.individual_battery_energy_levels)
      individual_energy_harvested.append(env.SBS1.individual_energy_harvested)
      throughput_log_reward.append(env.SBS1.throughput_log_reward)
      individual_local_energy_consumed.append(env.SBS1.individual_local_energy_consumed)
      individual_offloading_energy.append(env.SBS1.individual_offloading_energy)
      individual_small_scale_gains.append(env.SBS1.individual_small_scale_gains)
      individual_large_scale_gains.append(env.SBS1.individual_large_scale_gains)
      individual_average_offloading_rates.append(env.SBS1.individual_average_offloading_rates)
      individual_local_queue_length_num_tasks.append(env.SBS1.individual_local_queue_length_num_tasks)
      individual_offload_queue_length_num_tasks.append(env.SBS1.individual_offload_queue_length_num_tasks)
      individual_offload_stability_constraint_reward.append(env.SBS1.individual_offload_stability_constraint_reward)
      total_offload_traffic_reward.append(env.SBS1.total_offload_traffic_reward)
      individual_offload_traffic_numerator.append(env.SBS1.individual_offload_traffic_numerator)
      individual_local_queueing_violation_prob_reward.append(env.SBS1.individual_local_queueing_violation_prob_reward)
      individual_offload_ratio_reward.append(env.SBS1.individual_offload_ratio_reward)
      total_local_queueing_violation_prob_reward.append(env.SBS1.total_local_queueing_violation_prob_reward)
      total_offload_ratio_reward.append(env.SBS1.total_offload_ratio_reward)
      urllc_total_rate.append(env.SBS1.urllc_total_rate)
      F_L_inverse.append(env.SBS1.F_L_inverse)

      urllc_total_rate_per_second.append(env.SBS1.urllc_total_rate_per_second)
      urllc_total_rate_per_slot.append(env.SBS1.urllc_total_rate_per_slot)
      individual_urllc_channel_rate_per_slot_with_penalty.append(env.SBS1.individual_urllc_channel_rate_per_slot_with_penalty)
      individual_urllc_channel_rate_per_second_penalties.append(env.SBS1.individual_urllc_channel_rate_per_second_penalties)
      individual_urllc_channel_rate_per_second_without_penalty.append(env.SBS1.individual_urllc_channel_rate_per_second_without_penalty)
      individual_urllc_channel_rate_per_second_with_penalty.append(env.SBS1.individual_urllc_channel_rate_per_second_with_penalty)
      individual_embb_puncturing_users_sum_data_rates.append(env.SBS1.individual_embb_puncturing_users_sum_data_rates)
      individual_embb_num_puncturing_users.append(env.SBS1.individual_embb_num_puncturing_users)

      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    # We evaluate the episode and we save the policy
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(policy))
      policy.save(file_name, directory="./pytorch_models")
      np.save("./results/%s" % (file_name), evaluations)
      np.save("./results/%s" % (file_name_1), timestep_rewards_energy_throughput_delays)
      np.save("./results/%s" % (file_name_2), offload_actions)
      np.save("./results/%s" % (file_name_3), power_actions)
      np.save("./results/%s" % (file_name_4), subcarrier_actions)
      np.save("./results/%s" % (file_name_5), allocated_RBs)
      np.save("./results/%s" % (file_name_6), fairness_index)
      np.save("./results/%s" % (file_name_7), energy_efficiency_rewards)
      np.save("./results/%s" % (file_name_8), battery_energy_rewards)
      np.save("./results/%s" % (file_name_9), throughput_rewards)
      np.save("./results/%s" % (file_name_10), delay_rewards)
      np.save("./results/%s" % (file_name_11), sum_allocations_per_RB_matrix)
      np.save("./results/%s" % (file_name_12), RB_allocation_matrix)
      np.save("./results/%s" % (file_name_13), energy_rewards)
      np.save("./results/%s" % (file_name_14), delays)
      np.save("./results/%s" % (file_name_15), tasks_dropped)
      np.save("./results/%s" % (file_name_16), outage_probabilties)
      np.save("./results/%s" % (file_name_17), resource_allocation_constraint_violation_count)
      np.save("./results/%s" % (file_name_18), urllc_reliability_reward)

      np.save("./results/%s" % (file_name_19), individual_energy_rewards)
      np.save("./results/%s" % (file_name_20), individual_channel_rate_rewards)
      np.save("./results/%s" % (file_name_21), individual_channel_battery_energy_rewards)
      np.save("./results/%s" % (file_name_22), individual_delay_rewards)
      np.save("./results/%s" % (file_name_23), individual_queue_delays)
      np.save("./results/%s" % (file_name_24), individual_tasks_dropped)
      np.save("./results/%s" % (file_name_25), individual_energy_efficiency)
      np.save("./results/%s" % (file_name_26), individual_total_reward)
      np.save("./results/%s" % (file_name_27), total_reward)
      np.save("./results/%s" % (file_name_28), overall_users_reward)
      np.save("./results/%s" % (file_name_29), q_actions)
      np.save("./results/%s" % (file_name_30), RB_bandwidths)
      np.save("./results/%s" % (file_name_37), urllc_avg_rate)
      np.save("./results/%s" % (file_name_38), individual_channel_rates)

      np.save("./results/%s" % (file_name_39), individual_local_queue_delays)
      np.save("./results/%s" % (file_name_40), individual_offload_queue_delays)
      np.save("./results/%s" % (file_name_41), individual_local_queue_lengths)
      np.save("./results/%s" % (file_name_42), individual_offload_queue_lengths)
      np.save("./results/%s" % (file_name_43), users_lc_service_rates)
      np.save("./results/%s" % (file_name_45), individual_expected_rate_over_prev_T_slot)
      np.save("./results/%s" % (file_name_46), individual_average_task_size_offload_queue)
      np.save("./results/%s" % (file_name_47), individual_battery_energy_levels)
      np.save("./results/%s" % (file_name_48), individual_energy_harvested)
      np.save("./results/%s" % (file_name_49), throughput_log_reward)
      np.save("./results/%s" % (file_name_50), individual_local_energy_consumed)
      np.save("./results/%s" % (file_name_51), individual_offloading_energy)
      np.save("./results/%s" % (file_name_52), individual_small_scale_gains)
      np.save("./results/%s" % (file_name_53), individual_large_scale_gains)
      np.save("./results/%s" % (file_name_54), individual_average_offloading_rates)
      np.save("./results/%s" % (file_name_55), individual_local_queue_length_num_tasks)
      np.save("./results/%s" % (file_name_56), individual_offload_queue_length_num_tasks)
      np.save("./results/%s" % (file_name_57), individual_offload_stability_constraint_reward)
      np.save("./results/%s" % (file_name_58), total_offload_traffic_reward)
      np.save("./results/%s" % (file_name_59), individual_offload_traffic_numerator)
      np.save("./results/%s" % (file_name_60), individual_local_queueing_violation_prob_reward)
      np.save("./results/%s" % (file_name_61), individual_offload_ratio_reward)
      np.save("./results/%s" % (file_name_62), total_local_queueing_violation_prob_reward)
      np.save("./results/%s" % (file_name_63), total_offload_ratio_reward)
      np.save("./results/%s" % (file_name_64), urllc_total_rate)
      np.save("./results/%s" % (file_name_65), F_L_inverse)

      np.save("./results/%s" % (file_name_66), urllc_total_rate_per_second)
      np.save("./results/%s" % (file_name_67), urllc_total_rate_per_slot)
      np.save("./results/%s" % (file_name_68), individual_urllc_channel_rate_per_slot_with_penalty)
      np.save("./results/%s" % (file_name_69), individual_urllc_channel_rate_per_second_penalties)
      np.save("./results/%s" % (file_name_70), individual_urllc_channel_rate_per_second_without_penalty)
      np.save("./results/%s" % (file_name_71), individual_urllc_channel_rate_per_second_with_penalty)

      np.save("./results/%s" % (file_name_72), individual_embb_puncturing_users_sum_data_rates)
      np.save("./results/%s" % (file_name_73), individual_embb_num_puncturing_users)     


      #np.save("./results/%s" % (file_name_44), resource_block_action_matrix)
      
      
      

    # When the training step is done, we reset the state of the environment
    obs = env.reset()
    
    # Set the Done to False
    done = False
    
    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
  
  # Before 10000 timesteps, we play random actions
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
    action = env.enforce_constraint(action)
    action2, action = env.reshape_action_space_dict(action)
    
    new_obs, reward, done, _ = env.step(action)

  else: # After 10000 timesteps, we switch to the model
    action = policy.select_action(np.array(obs))
    # If the explore_noise parameter is not 0, we add noise to the action and we clip it
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape)).clip(env.action_space_low, env.action_space_high)

    #print(action)
    action = env.reshape_action_space_from_model_to_dict(action)
    reformed_action = env.apply_resource_allocation_constraint(action)

    new_obs, reward, done, _ = env.step(reformed_action)
    #action = env.enforce_constraint(action)
    #print(action)
    
  
  #print("Action in training")
  #print(action)
  #print(' ')
  # The agent performs the action in the environment, then reaches the next state and receives the reward
  #new_obs, reward, done, _ = env.step(action)
  #done = dones[len(dones) - 1]
  # We check if the episode is done
  done_bool = 0 if episode_timesteps + 1 == env.STEP_LIMIT else float(done)
  
  # We increase the total reward
  episode_reward += reward
  #print('episode_reward: ', episode_reward)
  #print('episode reward')
  #print(episode_reward)
  #episode_reward = interp(episode_reward,[720000000,863000000],[0,1000])
  # We store the new transition into the Experience Replay memory (ReplayBuffer)
  action = env.reshape_action_space_for_model(action)
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)
np.save("./results/%s" % (file_name_1), timestep_rewards_energy_throughput_delays)
np.save("./results/%s" % (file_name_2), offload_actions)
np.save("./results/%s" % (file_name_3), power_actions)
np.save("./results/%s" % (file_name_4), subcarrier_actions)
np.save("./results/%s" % (file_name_5), allocated_RBs)
np.save("./results/%s" % (file_name_6), fairness_index)
np.save("./results/%s" % (file_name_7), energy_efficiency_rewards)
np.save("./results/%s" % (file_name_8), battery_energy_rewards)
np.save("./results/%s" % (file_name_9), throughput_rewards)
np.save("./results/%s" % (file_name_10), delay_rewards)
np.save("./results/%s" % (file_name_11), sum_allocations_per_RB_matrix)
np.save("./results/%s" % (file_name_12), RB_allocation_matrix)
np.save("./results/%s" % (file_name_13), energy_rewards)
np.save("./results/%s" % (file_name_14), delays)
np.save("./results/%s" % (file_name_15), tasks_dropped)
np.save("./results/%s" % (file_name_16), outage_probabilties)
np.save("./results/%s" % (file_name_17), resource_allocation_constraint_violation_count)
np.save("./results/%s" % (file_name_18), urllc_reliability_reward)

np.save("./results/%s" % (file_name_19), individual_energy_rewards)
np.save("./results/%s" % (file_name_20), individual_channel_rate_rewards)
np.save("./results/%s" % (file_name_21), individual_channel_battery_energy_rewards)
np.save("./results/%s" % (file_name_22), individual_delay_rewards)
np.save("./results/%s" % (file_name_23), individual_queue_delays)
np.save("./results/%s" % (file_name_24), individual_tasks_dropped)
np.save("./results/%s" % (file_name_25), individual_energy_efficiency)
np.save("./results/%s" % (file_name_26), individual_total_reward)
np.save("./results/%s" % (file_name_27), total_reward)
np.save("./results/%s" % (file_name_28), overall_users_reward)
np.save("./results/%s" % (file_name_37), urllc_avg_rate)
np.save("./results/%s" % (file_name_38), individual_channel_rates)
np.save("./results/%s" % (file_name_39), individual_local_queue_delays)
np.save("./results/%s" % (file_name_40), individual_offload_queue_delays)
np.save("./results/%s" % (file_name_41), individual_local_queue_lengths)
np.save("./results/%s" % (file_name_42), individual_offload_queue_lengths)
np.save("./results/%s" % (file_name_43), users_lc_service_rates)
np.save("./results/%s" % (file_name_45), individual_expected_rate_over_prev_T_slot)
np.save("./results/%s" % (file_name_46), individual_average_task_size_offload_queue)
np.save("./results/%s" % (file_name_47), individual_battery_energy_levels)
np.save("./results/%s" % (file_name_48), individual_energy_harvested)
np.save("./results/%s" % (file_name_49), throughput_log_reward)
np.save("./results/%s" % (file_name_50), individual_local_energy_consumed)
np.save("./results/%s" % (file_name_51), individual_offloading_energy)
np.save("./results/%s" % (file_name_52), individual_small_scale_gains)
np.save("./results/%s" % (file_name_53), individual_large_scale_gains)
np.save("./results/%s" % (file_name_54), individual_average_offloading_rates)
np.save("./results/%s" % (file_name_55), individual_local_queue_length_num_tasks)
np.save("./results/%s" % (file_name_56), individual_offload_queue_length_num_tasks)
np.save("./results/%s" % (file_name_57), individual_offload_stability_constraint_reward)
np.save("./results/%s" % (file_name_58), total_offload_traffic_reward)
np.save("./results/%s" % (file_name_59), individual_offload_traffic_numerator)
np.save("./results/%s" % (file_name_60), individual_local_queueing_violation_prob_reward)
np.save("./results/%s" % (file_name_61), individual_offload_ratio_reward)
np.save("./results/%s" % (file_name_62), total_local_queueing_violation_prob_reward)
np.save("./results/%s" % (file_name_63), total_offload_ratio_reward)
np.save("./results/%s" % (file_name_64), urllc_total_rate)
np.save("./results/%s" % (file_name_65), F_L_inverse)
np.save("./results/%s" % (file_name_66), urllc_total_rate_per_second)
np.save("./results/%s" % (file_name_67), urllc_total_rate_per_slot)
np.save("./results/%s" % (file_name_68), individual_urllc_channel_rate_per_slot_with_penalty)
np.save("./results/%s" % (file_name_69), individual_urllc_channel_rate_per_second_penalties)
np.save("./results/%s" % (file_name_70), individual_urllc_channel_rate_per_second_without_penalty)
np.save("./results/%s" % (file_name_71), individual_urllc_channel_rate_per_second_with_penalty)
np.save("./results/%s" % (file_name_72), individual_embb_puncturing_users_sum_data_rates)
np.save("./results/%s" % (file_name_73), individual_embb_num_puncturing_users) 
#np.save("./results/%s" % (file_name_44), resource_block_action_matrix)

IndentationError: unexpected indent (3466533948.py, line 192)

## Inference

In [None]:
inf_total_reward = []
inf_energy = []
inf_throughput = []
inf_fairness_index = []
inf_num_RBs_allocated = []
inf_outage_probability = []
inf_individual_channel_rates = []

In [None]:
import math

In [None]:
class Actor(nn.Module):
  
  def __init__(self, state_dim, action_dim, max_action):
    super(Actor, self).__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    x = self.max_action * torch.sigmoid(self.layer_3(x))
    #x = self.max_action * torch.tanh(self.layer_3(x))
    return x

class Critic(nn.Module):
  
  def __init__(self, state_dim, action_dim):
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.layer_1 = nn.Linear(state_dim + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)
    # Defining the second Critic neural network
    self.layer_4 = nn.Linear(state_dim + action_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    xu = torch.cat([x, u], 1)
    # Forward-Propagation on the first Critic Neural Network
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # Forward-Propagation on the second Critic Neural Network
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)
    return x1, x2

  def Q1(self, x, u):
    xu = torch.cat([x, u], 1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  # Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=0.0000001)
    #self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=0.00000001)
    
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=0.0001)
    #self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=0.00001)
    self.max_action = max_action

  def select_action(self, state):
    state = torch.Tensor(state).to(device)
    #return self.actor(state).cpu().data.numpy().flatten()
    return self.actor(state).cpu().data.numpy()

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    
    for it in range(iterations):
      
      # Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
    
      #batch_states = np.reshape(batch_states,(batch_states.shape[0]*batch_states.shape[1],batch_states.shape[2]))
      #batch_next_states = np.reshape(batch_next_states,(batch_next_states.shape[0]*batch_next_states.shape[1],batch_next_states.shape[2]))
      #batch_actions = np.reshape(batch_actions,(batch_actions.shape[0]*batch_actions.shape[1],batch_actions.shape[2]))

      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)
      
      # Step 5: From the next state s’, the Actor target plays the next action a’
      next_action = self.actor_target(next_state)
      
      # Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
      noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
      noise = noise.clamp(-noise_clip, noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

      
      # Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
      target_Q1, target_Q2 = self.critic_target(next_state, torch.Tensor(next_action).to(device))
      
      # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
      target_Q = torch.min(target_Q1, target_Q2)
      # Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
      target_Q = reward + ((1 - done) * discount * target_Q).detach()
      
      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
      current_Q1, current_Q2 = self.critic(state, action)
      
      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      
      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()
      
      # Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        
        # Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
  
  # Making a save method to save a trained model
  def save(self, filename, directory):
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
  
  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

def evaluate_policy(policy, eval_episodes=10):
  inf_outage_probability=[]
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    done = False
    while not done:
      action = policy.select_action(obs)
      action = env.reshape_action_space_from_model_to_dict(action)
      reformed_action = env.apply_resource_allocation_constraint(action)
      obs, reward, done, _ = env.step(reformed_action)
      inf_energy.append(env.total_energy)
      inf_throughput.append(env.total_rate)
      inf_total_reward.append(reward)
      inf_fairness_index.append(env.SBS1.fairness_index)
      inf_task_delays.append(env.SBS1.delays)
      inf_num_RBs_allocated.append(env.num_RBs_allocated)
      inf_outage_probability.append(env.SBS1.outage_probability)
      inf_individual_channel_rates.append(env.SBS1.individual_channel_rates)
      avg_reward += reward
  avg_reward /= eval_episodes

  av_reward = sum(inf_total_reward)/len(inf_total_reward)
  av_energy = sum(inf_energy)/len(inf_energy)
  av_throughput = sum(inf_throughput)/len(inf_throughput)
  av_fairness_index = sum(inf_fairness_index)/len(inf_fairness_index)
  av_task_delay = sum(inf_task_delays)/len(inf_task_delays)
  av_num_RBs_allocated = sum(inf_num_RBs_allocated)/len(inf_num_RBs_allocated)
  inf_outage_probability = [0 if math.isnan(x) else x for x in inf_outage_probability]
  av_outage_probability = sum(inf_outage_probability)/len(inf_outage_probability)
  av_individual_channel_rates = np.array(inf_individual_channel_rates)
  av_individual_channel_rates = np.mean(av_individual_channel_rates, axis=0)

  
  np.save("./inference_results/%s" % (file_name_32), av_reward)
  np.save("./inference_results/%s" % (file_name_33), av_energy)
  np.save("./inference_results/%s" % (file_name_34), av_throughput)
  np.save("./inference_results/%s" % (file_name_35), av_fairness_index)
  np.save("./inference_results/%s" % (file_name_36), av_task_delay)
  np.save("./inference_results/%s" % (file_name_38), av_individual_channel_rates)
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Energy over the Evaluation Step: %f" % (av_energy))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Throughput over the Evaluation Step: %f" % (av_throughput))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Fairness Index over the Evaluation Step: %f" % (av_fairness_index))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average task delay over the Evaluation Step: %f" % (av_task_delay))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Number of allocated RBs over the Evaluation Step: %f" % (av_num_RBs_allocated))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Outage Probability over the Evaluation Step: %f" % (av_outage_probability))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Individual Channel Rates over the Evaluation Step: ",av_individual_channel_rates)
  print ("---------------------------------------")
  return avg_reward

env_name = "NetworkEnv-v0"
seed = 0

file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
env = gym.make(env_name)
#env = NetworkEnv_()
#max_episode_steps = env._max_episode_steps
#if save_env_vid:
#  env = wrappers.Monitor(env, monitor_dir, force = True)
#  env.reset()
#env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space_dim
action_dim = env.action_space_dim
max_action = float(env.box_action_space.high[0][1]) # to change this soon
policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, 'pytorch_models')
_ = evaluate_policy(policy, eval_episodes=eval_episodes)

In [None]:
class Actor_DDPG(nn.Module):
	def __init__(self, state_dim, action_dim, max_action):
		super(Actor_DDPG, self).__init__()

		self.layer_1 = nn.Linear(state_dim, 400)
		self.layer_2 = nn.Linear(400, 300)
		self.layer_3 = nn.Linear(300, action_dim)
		
		self.max_action = max_action

	
	def forward(self, state):
		x = F.relu(self.layer_1(state))
		x = F.relu(self.layer_2(x))
		return self.max_action * torch.sigmoid(self.layer_3(x))

class Critic_DDPG(nn.Module):
	def __init__(self, state_dim, action_dim):
		super(Critic_DDPG, self).__init__()

		#self.layer_1 = nn.Linear(state_dim + action_dim, 400)
		#self.layer_2 = nn.Linear(400, 300)
		self.layer_1 = nn.Linear(state_dim, 400)
		self.layer_2 = nn.Linear(400 + action_dim, 300)
		self.layer_3 = nn.Linear(300, 1)


	def forward(self, state, action):
		#q = F.relu(self.layer_1(torch.cat([state, action], 1)))
		#q = F.relu(self.layer_2(q))

		q = F.relu(self.layer_1(state))
		q = F.relu(self.layer_2(torch.cat([q, action], 1)))
		return self.layer_3(q)

# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  
  def __init__(self, state_dim, action_dim, max_action):
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  # Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class DDPG(object):
	def __init__(self, state_dim, action_dim, max_action):
		self.actor = Actor_DDPG(state_dim, action_dim, max_action).to(device)
		self.actor_target = Actor_DDPG(state_dim, action_dim, max_action).to(device)
		self.actor_target.load_state_dict(self.actor.state_dict())
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.000001)

		self.critic = Critic_DDPG(state_dim, action_dim).to(device)
		self.critic_target = Critic_DDPG(state_dim, action_dim).to(device)
		self.critic_target.load_state_dict(self.critic.state_dict())
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.0001)

	def select_action(self, state):
		#state = torch.FloatTensor(state.reshape(1, -1)).to(device)
		state = torch.Tensor(state).to(device)
		return self.actor(state).cpu().data.numpy()
		#return self.actor(state).cpu().data.numpy().flatten()

	def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
		# Sample replay buffer 
		batch_states, batch_next_states, batch_actions,batch_rewards, batch_dones = replay_buffer.sample(batch_size)
		
		state = torch.Tensor(batch_states).to(device)
		next_state = torch.Tensor(batch_next_states).to(device)
		action = torch.Tensor(batch_actions).to(device)
		reward = torch.Tensor(batch_rewards).to(device)
		done = torch.Tensor(batch_dones).to(device)
      

		# Compute the target Q value
		next_action = self.actor_target(next_state)	
		target_Q = self.critic_target(next_state, torch.Tensor(next_action).to(device))
		#target_Q1, target_Q2 = self.critic_target(next_state, torch.Tensor(next_action).to(device))
		target_Q = reward + ((1 - done) *discount * target_Q).detach()

		# Get current Q estimate
		current_Q = self.critic(state, action)

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q, target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Compute actor loss
		actor_loss = -self.critic(state, self.actor(state)).mean()
		
		# Optimize the actor 
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()

		# Update the frozen target models
		for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
			target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

		for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
			target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


	def save(self, filename, directory):
		torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
		torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))


	def load(self, filename, directory):
		self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
		self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

def evaluate_policy(policy, eval_episodes=10):
  inf_outage_probability=[]
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = env.reset()
    done = False
    while not done:
      action = policy.select_action(obs)
      action = env.reshape_action_space_from_model_to_dict(action)
      reformed_action = env.apply_resource_allocation_constraint(action)
      obs, reward, done, _ = env.step(reformed_action)
      inf_energy.append(env.total_energy)
      inf_throughput.append(env.total_rate)
      inf_total_reward.append(reward)
      inf_fairness_index.append(env.SBS1.fairness_index)
      inf_task_delays.append(env.SBS1.delays)
      inf_num_RBs_allocated.append(env.num_RBs_allocated)
      inf_outage_probability.append(env.SBS1.outage_probability)
      inf_individual_channel_rates.append(env.SBS1.individual_channel_rates)
      avg_reward += reward
  avg_reward /= eval_episodes

  av_reward = sum(inf_total_reward)/len(inf_total_reward)
  av_energy = sum(inf_energy)/len(inf_energy)
  av_throughput = sum(inf_throughput)/len(inf_throughput)
  av_fairness_index = sum(inf_fairness_index)/len(inf_fairness_index)
  av_task_delay = sum(inf_task_delays)/len(inf_task_delays)
  av_num_RBs_allocated = sum(inf_num_RBs_allocated)/len(inf_num_RBs_allocated)
  inf_outage_probability = [0 if math.isnan(x) else x for x in inf_outage_probability]
  av_outage_probability = sum(inf_outage_probability)/len(inf_outage_probability)
  av_individual_channel_rates = np.array(inf_individual_channel_rates)
  av_individual_channel_rates = np.mean(av_individual_channel_rates, axis=0)

  
  np.save("./inference_results/%s" % (file_name_32), av_reward)
  np.save("./inference_results/%s" % (file_name_33), av_energy)
  np.save("./inference_results/%s" % (file_name_34), av_throughput)
  np.save("./inference_results/%s" % (file_name_35), av_fairness_index)
  np.save("./inference_results/%s" % (file_name_36), av_task_delay)
  np.save("./inference_results/%s" % (file_name_38), av_individual_channel_rates)
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Energy over the Evaluation Step: %f" % (av_energy))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Throughput over the Evaluation Step: %f" % (av_throughput))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average Fairness Index over the Evaluation Step: %f" % (av_fairness_index))
  print ("---------------------------------------")
  print('')
  print ("---------------------------------------")
  print ("Average task delay over the Evaluation Step: %f" % (av_task_delay))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Number of allocated RBs over the Evaluation Step: %f" % (av_num_RBs_allocated))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Outage Probability over the Evaluation Step: %f" % (av_outage_probability))
  print ("---------------------------------------")
  print ("---------------------------------------")
  print ("Average Individual Channel Rates over the Evaluation Step: ",av_individual_channel_rates)
  print ("---------------------------------------")
  return avg_reward

env_name = "NetworkEnv-v0"
seed = 0

file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
env = gym.make(env_name)
#env = NetworkEnv_()
#max_episode_steps = env._max_episode_steps
#if save_env_vid:
#  env = wrappers.Monitor(env, monitor_dir, force = True)
#  env.reset()
#env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space_dim
action_dim = env.action_space_dim
max_action = float(env.box_action_space.high[0][1]) # to change this soon
policy = DDPG(state_dim, action_dim, max_action)
policy.load(file_name, 'pytorch_models')
_ = evaluate_policy(policy, eval_episodes=eval_episodes)