# Twin-Delayed DDPG

## Installing the packages

In [None]:
# Clone the GitHub repository; the following cell changes into the project folder
!git clone https://github.com/francmeister/Masters-Research-Project.git

In [None]:
cd Masters-Research-Project/ICARTI-Project/

In [None]:
!pip install -e Network_Env

In [None]:
cd Network_Env/

In [None]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import Network_Env
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import pygame
from gym import wrappers
from torch.autograd import Variable
from collections import deque
from google.colab import files


In [None]:
class Memory(object):
  """Bounded store of (state, action) transitions with ring-buffer overwrite.

  Transitions are appended until `max_size` entries are held; after that
  each new transition overwrites an existing slot, cycling through the
  slots in order starting from index 0.
  """

  def __init__(self, max_size=1e6):
    """Create an empty store.

    Args:
      max_size: Maximum number of transitions to keep. Accepts a float such
        as ``1e6`` for convenience; it is converted to ``int``.

    Raises:
      ValueError: If `max_size` is smaller than 1.
    """
    # Keep the capacity as an int so `ptr` always remains a valid list index
    # (a float capacity would turn `ptr` into a float after the modulo).
    capacity = int(max_size)
    if capacity < 1:
      raise ValueError("max_size must be at least 1")
    self.storage = []
    self.max_size = capacity
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting an existing slot once full.

    Args:
      transition: A ``(state, action)`` pair.
    """
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self):
    """Draw as many transitions as are stored, uniformly with replacement.

    Returns:
      A tuple ``(states, actions)`` of NumPy arrays whose first dimension
      equals the number of stored transitions.

    Raises:
      ValueError: If the store is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty Memory")
    batch_size = len(self.storage)
    ind = np.random.randint(0, batch_size, size=batch_size)
    # np.asarray avoids a copy when possible; np.array(..., copy=False)
    # raises under NumPy 2.0 whenever a copy is unavoidable (e.g. list input).
    batch_states = [np.asarray(self.storage[i][0]) for i in ind]
    batch_actions = [np.asarray(self.storage[i][1]) for i in ind]
    return np.array(batch_states), np.array(batch_actions)

In [None]:
class ObservationSpaceMemory(object):
  """Bounded store of (state, action) transitions that samples states only.

  Transitions are appended until `max_size` entries are held; after that
  each new transition overwrites an existing slot, cycling through the
  slots in order starting from index 0.
  """

  def __init__(self, max_size=1e6):
    """Create an empty store.

    Args:
      max_size: Maximum number of transitions to keep. Accepts a float such
        as ``1e6`` for convenience; it is converted to ``int``.

    Raises:
      ValueError: If `max_size` is smaller than 1.
    """
    # Keep the capacity as an int so `ptr` always remains a valid list index
    # (a float capacity would turn `ptr` into a float after the modulo).
    capacity = int(max_size)
    if capacity < 1:
      raise ValueError("max_size must be at least 1")
    self.storage = []
    self.max_size = capacity
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting an existing slot once full.

    Args:
      transition: A ``(state, action)`` pair.
    """
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw `batch_size` stored states uniformly with replacement.

    Args:
      batch_size: Number of states to draw.

    Returns:
      A NumPy array whose first dimension is `batch_size`.

    Raises:
      ValueError: If the store is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ObservationSpaceMemory")
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states = []
    for i in ind:
      # Each entry must be a 2-item (state, action) pair; the action is unused.
      state, _ = self.storage[i]
      # np.asarray avoids a copy when possible; np.array(..., copy=False)
      # raises under NumPy 2.0 whenever a copy is unavoidable.
      batch_states.append(np.asarray(state))
    return np.array(batch_states)

In [None]:
class Actor(nn.Module):
  """Fully connected policy network mapping a state to a bounded action.

  Two ReLU hidden layers (400 and 300 units) feed a sigmoid output head
  whose result is multiplied by `max_action`.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the three linear layers.

    Args:
      state_dim: Size of the last dimension of the input state.
      action_dim: Size of the last dimension of the produced action.
      max_action: Factor applied to the sigmoid output.
    """
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state tensor `x`, scaled by `max_action`."""
    hidden = torch.relu(self.layer_1(x))
    hidden = torch.relu(self.layer_2(hidden))
    # Sigmoid (rather than tanh) keeps every output component non-negative.
    squashed = torch.sigmoid(self.layer_3(hidden))
    return self.max_action * squashed

In [None]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole training process into a class

class TD3(object):
  """Trainer that fits an `Actor` network to stored (state, action) pairs.

  NOTE(review): despite the name, this class defines no critics or target
  networks; it only regresses the actor onto the actions held in memory.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Create the actor on `device` together with its Adam optimizer.

    Args:
      state_dim: Size of the last dimension of a state.
      action_dim: Size of the last dimension of an action.
      max_action: Scaling factor passed through to the actor.
    """
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Run the actor on `state` and return its output as a NumPy array.

    Args:
      state: Array-like state (or batch of states) convertible to float32.

    Returns:
      A NumPy array with the actor's output; the shape is not flattened.
    """
    state = torch.as_tensor(state, dtype=torch.float32, device=device)
    # Inference only: skip building an autograd graph instead of stripping
    # it afterwards through the discouraged `.data` attribute.
    with torch.no_grad():
      action = self.actor(state)
    return action.cpu().numpy()

  def train(self, memory, train_iterations):
    """Fit the actor to the transitions held in `memory`.

    Args:
      memory: Object whose ``sample()`` returns ``(states, actions)`` arrays.
      train_iterations: Number of gradient steps to take.
    """
    for _ in range(train_iterations):
      # Sample a batch of (state, action) pairs from the memory.
      batch_states, batch_actions = memory.sample()
      state = torch.as_tensor(batch_states, dtype=torch.float32, device=device)
      action = torch.as_tensor(batch_actions, dtype=torch.float32, device=device)

      # Regress the actor's prediction onto the stored action (MSE loss).
      prediction = self.actor(state)
      loss = F.mse_loss(prediction, action)

      self.actor_optimizer.zero_grad()
      loss.backward()
      self.actor_optimizer.step()


In [None]:
# --- Environment and reproducibility ---
# Identifier passed to gym.make (any continuous-control environment id works).
env_name = "NetworkEnv-v0"
# Seed applied to the random number generators.
seed = 0

# --- Schedule (timesteps) ---
# Timesteps during which actions are chosen randomly before the policy takes over.
start_timesteps = 1e4
# Interval, in timesteps, between evaluation steps.
eval_freq = 5e3
# Total number of timesteps.
max_timesteps = 7e5
# Whether to save the pre-trained model.
save_models = True

# --- Learning settings ---
# Number of transitions per batch.
batch_size = 10
# Discount factor gamma for the total discounted reward.
discount = 0.99
# Update rate of the target network.
tau = 0.005
# Number of iterations between updates of the policy (actor) network.
policy_freq = 2

# --- Noise settings ---
# Standard deviation of the Gaussian exploration noise.
expl_noise = 0.1
# Standard deviation of the Gaussian noise added to the policy's actions.
policy_noise = 0.2
# Largest magnitude allowed for the Gaussian policy noise.
noise_clip = 0.5

In [None]:
# Instantiate the environment identified by `env_name`.
# NOTE(review): presumably the id is registered by importing Network_Env -- confirm.
env = gym.make(env_name)

In [None]:
# Seed the PyTorch and NumPy random generators with `seed`.
# NOTE(review): the environment itself is not seeded; the call below is disabled.
#env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Network input/output widths are read from axis 1 of the space shapes
# (presumably axis 0 indexes the users/agents -- TODO confirm against Network_Env).
state_dim = env.observation_space.shape[1]
action_dim = env.action_space.shape[1]
# Scaling factor for the actor's output, taken from a single entry
# (row 0, column 1) of the action-space upper bounds.
max_action = float(env.action_space.high[0][1]) # to change this soon

In [None]:
# Build the trainer (actor network plus optimizer) for the environment's dimensions.
policy = TD3(state_dim, action_dim, max_action)

In [None]:
# Store for the (state, action) pairs the policy is trained on (default capacity).
memory = Memory()
# ObservationSpaceMemory is currently unused; a plain list is used instead.
#obsSpaceMemory = ObservationSpaceMemory()

In [None]:
# Number of passes of the outer training loop.
epochs = 25
# Number of random observations drawn per epoch.
sampling_frequency = 100
# Gradient steps taken by the policy at the end of each epoch.
train_iterations = 10
# Best reward seen so far; starts at a very low sentinel so any realistic
# reward exceeds it.
Fmax = -10_000_000_000
# Holds the observations sampled in the current epoch.
obsSpaceMemory = []

In [None]:
# Each epoch: draw random observations, query the policy on each one, keep the
# (observation row, action row) pairs whose reward sets a new best (Fmax),
# then fit the actor to everything kept so far.
# The ranges run exactly `epochs` epochs and draw exactly `sampling_frequency`
# samples (the previous range(1, n) form ran one short of each).
for epoch in range(1, epochs + 1):
    print("Epoch: ", epoch)
    obsSpaceMemory = np.array(
        [env.observation_space.sample() for _ in range(sampling_frequency)]
    )
    obs = env.reset()
    for obsSample in obsSpaceMemory:
        obsAction = policy.select_action(obsSample)
        obs, rewards, done, _ = env.step(obsAction)

        # `rewards` is iterated entry by entry; entry k is paired with row k
        # of the observation and of the action.
        # NOTE(review): presumably one reward per user/agent -- confirm.
        for index, reward in enumerate(rewards):
            if reward > Fmax:
                memory.add((obsSample[index], obsAction[index]))
                Fmax = reward
                print("Fmax")
                print(Fmax)

    # Nothing to fit until at least one transition has been stored.
    if len(memory.storage) > 0:
        policy.train(memory, train_iterations)