# Twin Delayed Deep Deterministic Policy Gradient (TD3)

# Imports

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
#from tensorboardX import SummaryWriter

import gym
#import roboschool
import sys

# Networks

In [4]:
def hidden_init(layer):
    """Return the symmetric uniform-initialisation range for a linear layer.

    The bound is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of
    input features feeding each unit of ``layer``.

    Args:
        layer (nn.Linear): layer whose ``weight`` tensor is inspected

    Returns:
        tuple: ``(-lim, lim)`` bounds for a uniform weight initialisation
    """
    # nn.Linear stores its weight as (out_features, in_features), so the
    # fan-in is the second dimension; index 0 would be the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [5]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

        Args:
            state_dim (int): Dimension of each state
            action_dim (int): Dimension of each action
            max_action (float): scale applied to the tanh output, so every
                action component lies in [-max_action, max_action]
            h1_units (int): Number of nodes in first hidden layer
            h2_units (int): Number of nodes in second hidden layer

        Return:
            action output of network with tanh activation
    """

    def __init__(self, state_dim, action_dim, max_action, h1_units=400, h2_units=300):
        super(Actor, self).__init__()

        # Attribute names l1..l3 are kept so existing checkpoints still load.
        self.l1 = nn.Linear(state_dim, h1_units)
        self.l2 = nn.Linear(h1_units, h2_units)
        self.l3 = nn.Linear(h2_units, action_dim)

        self.max_action = max_action

    def forward(self, x):
        """Compute the action for a batch of states.

            Args:
                x (Tensor): batch of states; last dimension is state_dim

            Return:
                Tensor of actions scaled to [-max_action, max_action]
        """
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        # tanh squashes to [-1, 1]; scaling maps that onto the action range.
        x = self.max_action * torch.tanh(self.l3(x))
        return x



In [6]:
class Critic(nn.Module):
    """Twin Q-networks that each score a (state, action) pair.

        Two independent heads (Q1: l1-l3, Q2: l4-l6) share the same input,
        the concatenation of state and action.

        Args:
            state_dim (int): Dimension of each state
            action_dim (int): Dimension of each action
            h1_units (int): Number of nodes in first hidden layer
            h2_units (int): Number of nodes in second hidden layer

        Return:
            value output of network
    """

    def __init__(self, state_dim, action_dim, h1_units=400, h2_units=300):
        super(Critic, self).__init__()

        # Attribute names l1..l6 are kept so existing checkpoints still load.
        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, h1_units)
        self.l2 = nn.Linear(h1_units, h2_units)
        self.l3 = nn.Linear(h2_units, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, h1_units)
        self.l5 = nn.Linear(h1_units, h2_units)
        self.l6 = nn.Linear(h2_units, 1)

    def _q1_head(self, xu):
        """Run the first Q head on an already concatenated input."""
        x1 = F.relu(self.l1(xu))
        x1 = F.relu(self.l2(x1))
        return self.l3(x1)

    def _q2_head(self, xu):
        """Run the second Q head on an already concatenated input."""
        x2 = F.relu(self.l4(xu))
        x2 = F.relu(self.l5(x2))
        return self.l6(x2)

    def forward(self, x, u):
        """Return both Q estimates for a batch of states and actions.

            Args:
                x (Tensor): batch of states, shape (batch, state_dim)
                u (Tensor): batch of actions, shape (batch, action_dim)

            Return:
                tuple (Q1, Q2), each of shape (batch, 1)
        """
        xu = torch.cat([x, u], 1)
        return self._q1_head(xu), self._q2_head(xu)

    def Q1(self, x, u):
        """Return only the first Q estimate (used for the actor loss).

            Args:
                x (Tensor): batch of states, shape (batch, state_dim)
                u (Tensor): batch of actions, shape (batch, action_dim)

            Return:
                Tensor of shape (batch, 1)
        """
        return self._q1_head(torch.cat([x, u], 1))

# Memory

In [7]:
# Code based on: 
# https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

# Expects tuples of (state, next_state, action, reward, done)
class ReplayBuffer(object):
    """Buffer to store tuples of experience replay

    Each stored item is a tuple ordered as
    (state, next_state, action, reward, done). Once max_size items are held,
    the oldest entries are overwritten in ring-buffer fashion.
    """

    def __init__(self, max_size=1000000):
        """
        Args:
            max_size (int): total amount of tuples to store
        """

        self.storage = []
        self.max_size = max_size
        # Index of the next slot to overwrite once the buffer is full.
        self.ptr = 0

    def add(self, data):
        """Add experience tuples to buffer

        Args:
            data (tuple): experience replay tuple
                (state, next_state, action, reward, done)
        """

        if len(self.storage) == self.max_size:
            # Full: overwrite the oldest entry and advance the write pointer.
            self.storage[int(self.ptr)] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        """Samples a random amount of experiences from buffer of batch size

        Indices are drawn uniformly with replacement.

        Args:
            batch_size (int): size of sample

        Returns:
            tuple of arrays in storage order:
            (states, next_states, actions, rewards, dones); rewards and
            dones are reshaped to (batch_size, 1)

        Raises:
            ValueError: if the buffer is empty (raised by np.random.randint)
        """

        indices = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, dones = [], [], [], [], []

        for i in indices:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray avoids a copy when possible but, unlike
            # np.array(..., copy=False) on NumPy >= 2.0, never raises when a
            # copy is required (lists, Python scalars).
            states.append(np.asarray(state))
            next_states.append(np.asarray(next_state))
            actions.append(np.asarray(action))
            rewards.append(np.asarray(reward))
            dones.append(np.asarray(done))

        return (
            np.array(states),
            np.array(next_states),
            np.array(actions),
            np.array(rewards).reshape(-1, 1),
            np.array(dones).reshape(-1, 1),
        )

# Agent

In [8]:
class TD3(object):
    """Agent class that handles the training of the networks and provides outputs as actions

        The torch device is not a constructor argument: it is read from the
        module-level ``device`` variable, which must be defined before the
        agent is instantiated.

        Args:
            state_dim (int): state size
            action_dim (int): action size
            max_action (float): highest action to take
            env (env): gym environment to use

    """

    def __init__(self, state_dim, action_dim, max_action, env):
        # Capture the module-level device once so every method uses the same one.
        self.device = device

        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.env = env

    def select_action(self, state, noise=0.1):
        """Select an appropriate action from the agent policy

            Args:
                state (array): current state of environment
                noise (float): standard deviation of the Gaussian noise added
                    to the action; 0 disables the noise

            Returns:
                action (array): action clipped within the env's action range

        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)

        # Inference only: no autograd graph is needed for the action.
        with torch.no_grad():
            action = self.actor(state).cpu().numpy().flatten()

        if noise != 0:
            action = action + np.random.normal(0, noise, size=self.env.action_space.shape[0])

        return action.clip(self.env.action_space.low, self.env.action_space.high)

    @staticmethod
    def _soft_update(source, target, tau):
        """Blend source parameters into target: target = tau*source + (1-tau)*target."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Train and update actor and critic networks

            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                iterations (int): how many times to run training
                batch_size(int): batch size to sample from replay buffer
                discount (float): discount factor
                tau (float): soft update for main networks to target networks
                policy_noise (float): std of the noise added to target actions
                noise_clip (float): absolute bound applied to that noise
                policy_freq (int): the actor and the target networks are
                    updated once every policy_freq critic updates

            Return:
                None. The networks are updated in place.

        """

        for it in range(iterations):

            # Sample replay buffer; tuples are stored and returned as
            # (state, next_state, action, reward, done)
            state_np, next_state_np, action_np, reward_np, done_np = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(state_np).to(self.device)
            action = torch.FloatTensor(action_np).to(self.device)
            next_state = torch.FloatTensor(next_state_np).to(self.device)
            # 1 for non-terminal transitions, 0 for terminal ones
            not_done = torch.FloatTensor(1 - done_np).to(self.device)
            reward = torch.FloatTensor(reward_np).to(self.device)

            # The target is a constant for the critic update, so build it
            # without tracking gradients.
            with torch.no_grad():
                # Select action according to policy and add clipped noise
                noise = torch.empty(action.shape).normal_(0, policy_noise).to(self.device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # Compute the target Q value from the smaller of the twin estimates
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                target_Q = reward + not_done * discount * torch.min(target_Q1, target_Q2)

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if it % policy_freq == 0:

                # Compute actor loss
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                self._soft_update(self.critic, self.critic_target, tau)
                self._soft_update(self.actor, self.actor_target, tau)

    def save(self, filename, directory):
        """Write the actor and critic weights to ``directory``.

            Args:
                filename (str): prefix of the checkpoint files
                directory (str): target directory; created if missing
        """
        import os

        # torch.save does not create missing directories.
        os.makedirs(directory, exist_ok=True)
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="./saves"):
        """Restore the actor and critic weights written by ``save``.

            Args:
                filename (str): prefix of the checkpoint files
                directory (str): directory holding the checkpoint files
        """
        # map_location lets a checkpoint saved on GPU be restored on CPU (and vice versa).
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename), map_location=self.device))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename), map_location=self.device))

        # Targets are not checkpointed; resync them so further training does
        # not bootstrap from unrelated, randomly initialised target networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

# Runner

In [9]:
class Runner():
    """Carries out the environment steps and adds experiences to memory

        Args:
            env (env): gym environment to step
            agent (agent): object providing select_action(state, noise=...)
            replay_buffer (ReplayBuffer): buffer receiving the transitions
            max_episode_steps (int): episode length at which an episode end
                is treated as a time-limit cut-off, not a terminal state
    """

    def __init__(self, env, agent, replay_buffer, max_episode_steps=200):

        self.env = env
        self.agent = agent
        self.replay_buffer = replay_buffer
        self.max_episode_steps = max_episode_steps
        self.obs = env.reset()
        self.done = False

    def next_step(self, episode_timesteps, noise=0.1):
        """Take one environment step with the agent and store the transition.

            Args:
                episode_timesteps (int): steps already taken in this episode
                noise (float): exploration noise passed to the agent

            Returns:
                tuple (reward, done): the step reward and whether the
                episode ended on this step
        """

        action = self.agent.select_action(np.array(self.obs), noise=noise)

        # Perform action
        new_obs, reward, done, _ = self.env.step(action)

        # An episode that ends only because it hit the step limit is stored
        # as non-terminal, so the critic still bootstraps from its last state.
        done_bool = 0 if episode_timesteps + 1 == self.max_episode_steps else float(done)

        # Store data in this runner's own replay buffer
        self.replay_buffer.add((self.obs, new_obs, action, reward, done_bool))

        if done:
            # Start the next episode right away so self.obs is always valid.
            self.obs = self.env.reset()
            return reward, True

        self.obs = new_obs
        return reward, done

# Evaluate

In [10]:
def evaluate_policy(policy, env, eval_episodes=100,render=False):
    """Roll out the policy without exploration noise and report its mean return.

        Args:
            policy (agent): agent to evaluate; must expose select_action
            env (env): gym environment
            eval_episodes (int): how many test episodes to run
            render (bool): render the environment before every step

        Returns:
            avg_reward (float): summed reward of all episodes divided by
            the number of episodes

    """

    reward_sum = 0.
    for _ in range(eval_episodes):
        obs, done = env.reset(), False
        while not done:
            if render:
                env.render()
            # noise=0 asks the policy for its deterministic action
            obs, reward, done, _info = env.step(
                policy.select_action(np.array(obs), noise=0))
            reward_sum += reward

    avg_reward = reward_sum / eval_episodes

    banner = "---------------------------------------"
    print("\n" + banner)
    print("Evaluation over {:d} episodes: {:f}" .format(eval_episodes, avg_reward))
    print(banner)
    return avg_reward

# Observation

In [11]:
def observe(env,replay_buffer, observation_steps, max_episode_steps=200):
    """run episodes while taking random actions and filling replay_buffer

        Transitions are stored as (obs, new_obs, action, reward, done) with
        the same done convention as Runner.next_step: a float flag that is 0
        when the episode ended only because it reached max_episode_steps.

        Args:
            env (env): gym environment
            replay_buffer(ReplayBuffer): buffer to store experience replay
            observation_steps (int): how many steps to observe for
            max_episode_steps (int): episode length treated as a time-limit
                cut-off rather than a terminal state

    """

    time_steps = 0
    episode_timesteps = 0
    obs = env.reset()

    while time_steps < observation_steps:
        action = env.action_space.sample()
        new_obs, reward, done, _ = env.step(action)

        # Store a float flag; time-limit endings are recorded as non-terminal.
        done_bool = 0.0 if episode_timesteps + 1 == max_episode_steps else float(done)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        obs = new_obs
        time_steps += 1
        episode_timesteps += 1

        if done:
            obs = env.reset()
            episode_timesteps = 0

        print("\rPopulating Buffer {}/{}.".format(time_steps, observation_steps), end="")
        sys.stdout.flush()

# Train

In [13]:
def train(agent, test_env):
    """Train the agent until EXPLORATION timesteps or REWARD_THRESH is reached

        Besides its arguments, this function relies on module-level names:
        ``runner`` (steps the environment), ``replay_buffer`` and the
        configuration constants EXPLORATION, REWARD_THRESH, BATCH_SIZE,
        GAMMA, TAU, NOISE, NOISE_CLIP, POLICY_FREQUENCY and EXPLORE_NOISE.

        After every finished episode the running average over the last 100
        episode rewards is logged, the agent is checkpointed to
        ``saves/best_avg_*`` when that average improves, and the agent is
        trained for as many iterations as the episode had timesteps.

        Args:
            agent (Agent): agent to use
            test_env (environment): gym environment for evaluation; currently
                unused because periodic evaluation is disabled

    """
    import os

    total_timesteps = 0
    episode_num = 0
    episode_reward = 0
    episode_timesteps = 0
    done = False
    rewards = []
    # -inf so the very first finished episode always produces a checkpoint.
    best_avg = float("-inf")

    # Start from a fresh episode and keep the runner's cached observation in
    # sync with it; the env may have been stepped since the runner was built.
    runner.obs = runner.env.reset()

    # TensorBoard logging is optional: fall back to no logging when the
    # tensorboard package is not installed.
    try:
        from torch.utils.tensorboard import SummaryWriter
        writer = SummaryWriter(comment="-TD3_Baseline_HalfCheetah")
    except ImportError:
        writer = None

    # agent.save writes into this directory.
    os.makedirs("saves", exist_ok=True)

    try:
        while total_timesteps < EXPLORATION:

            if done:
                rewards.append(episode_reward)
                avg_reward = np.mean(rewards[-100:])

                if writer is not None:
                    writer.add_scalar("avg_reward", avg_reward, total_timesteps)
                    writer.add_scalar("reward_step", reward, total_timesteps)
                    writer.add_scalar("episode_reward", episode_reward, total_timesteps)

                if best_avg < avg_reward:
                    best_avg = avg_reward
                    print("saving best model....\n")
                    agent.save("best_avg","saves")

                print("\rTotal T: {:d} Episode Num: {:d} Reward: {:f} Avg Reward: {:f}".format(
                    total_timesteps, episode_num, episode_reward, avg_reward), end="")
                sys.stdout.flush()

                if avg_reward >= REWARD_THRESH:
                    break

                # One gradient iteration per environment step of the episode.
                agent.train(replay_buffer, episode_timesteps, BATCH_SIZE, GAMMA, TAU, NOISE, NOISE_CLIP, POLICY_FREQUENCY)

                # NOTE: periodic evaluation on test_env is currently disabled.

                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            reward, done = runner.next_step(episode_timesteps, noise=EXPLORE_NOISE)
            episode_reward += reward

            episode_timesteps += 1
            total_timesteps += 1
    finally:
        # Flush pending events even when training is interrupted.
        if writer is not None:
            writer.close()

# Config

In [12]:
# Run configuration read by the Main cell and by train().
# NOTE(review): train() tags its TensorBoard run "-TD3_Baseline_HalfCheetah"
# while ENV names a Pendulum task - confirm which environment these values
# (REWARD_THRESH in particular) were tuned for.
ENV = "Pendulum-v0"  # gym environment id passed to gym.make
SEED = 0  # seed applied to the environment, torch and NumPy
OBSERVATION = 10000  # random-action steps used to pre-fill the replay buffer
EXPLORATION = 5000000  # upper bound on environment timesteps in train()
BATCH_SIZE = 100  # replay batch size per training iteration
GAMMA = 0.99  # discount factor
TAU = 0.005  # soft-update rate for the target networks
NOISE = 0.2  # std of the noise added to target actions (policy_noise)
NOISE_CLIP = 0.5  # absolute clip applied to that target-action noise
EXPLORE_NOISE = 0.1  # not referenced by the visible code; Runner.next_step hard-codes 0.1
POLICY_FREQUENCY = 2  # actor/target updates happen every N critic updates
EVAL_FREQUENCY = 5000  # only referenced by the commented-out evaluation code in train()
REWARD_THRESH = 8000  # train() stops once the 100-episode average reward reaches this

# Main

In [14]:
# Build the environment and pick the torch device. `device` is read as a
# module-level global by the TD3 class, so it must exist before TD3(...) runs.
env = gym.make(ENV)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Problem dimensions taken from the environment's spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] 
# Only the first component of the upper bound is used as the action scale
max_action = float(env.action_space.high[0])

policy = TD3(state_dim, action_dim, max_action, env)

replay_buffer = ReplayBuffer()

# Runner resets the environment on construction and keeps the current observation
runner = Runner(env, policy, replay_buffer)

# NOTE: train() keeps its own local counters; these module-level values are
# not read by the visible code.
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

In [15]:
# Populate replay buffer with OBSERVATION random-action transitions before training
observe(env, replay_buffer, OBSERVATION)

Populating Buffer 10000/10000.

In [None]:
# Train agent (runs until EXPLORATION timesteps or the REWARD_THRESH average is reached)
train(policy, env)

saving best model....

Total T: 20 Episode Num: 0 Reward: 16.743121 Avg Reward: 16.743121saving best model....

Total T: 1514 Episode Num: 58 Reward: 21.769903 Avg Reward: 20.342496saving best model....

Total T: 1536 Episode Num: 59 Reward: 22.781277 Avg Reward: 20.383142saving best model....

Total T: 1582 Episode Num: 61 Reward: 17.289137 Avg Reward: 20.385368saving best model....

Total T: 1630 Episode Num: 63 Reward: 19.862503 Avg Reward: 20.485710saving best model....

Total T: 1650 Episode Num: 64 Reward: 21.866313 Avg Reward: 20.506950saving best model....

Total T: 1675 Episode Num: 65 Reward: 26.819443 Avg Reward: 20.602593saving best model....

Total T: 1700 Episode Num: 66 Reward: 23.843442 Avg Reward: 20.650964saving best model....

Total T: 1726 Episode Num: 67 Reward: 21.144815 Avg Reward: 20.658227saving best model....

Total T: 1750 Episode Num: 68 Reward: 24.633037 Avg Reward: 20.715833saving best model....

Total T: 1773 Episode Num: 69 Reward: 21.303502 Avg Reward: 

Total T: 39806 Episode Num: 578 Reward: 208.145489 Avg Reward: 105.773231saving best model....

Total T: 40153 Episode Num: 579 Reward: 188.060161 Avg Reward: 106.239159saving best model....

Total T: 41153 Episode Num: 580 Reward: 555.741063 Avg Reward: 110.771692saving best model....

Total T: 42153 Episode Num: 581 Reward: 569.504707 Avg Reward: 115.972529saving best model....

Total T: 46021 Episode Num: 592 Reward: 198.298114 Avg Reward: 116.316214saving best model....

Total T: 46418 Episode Num: 593 Reward: 210.076767 Avg Reward: 117.462085saving best model....

Total T: 46844 Episode Num: 594 Reward: 246.176531 Avg Reward: 119.406595saving best model....

Total T: 47203 Episode Num: 595 Reward: 209.325910 Avg Reward: 120.992479saving best model....

Total T: 47523 Episode Num: 596 Reward: 185.038532 Avg Reward: 122.203928saving best model....

Total T: 47685 Episode Num: 597 Reward: 84.376518 Avg Reward: 122.501402saving best model....

Total T: 48022 Episode Num: 598 Reward: 1

Total T: 138656 Episode Num: 836 Reward: 607.065039 Avg Reward: 439.941842saving best model....

Total T: 139656 Episode Num: 837 Reward: 595.689970 Avg Reward: 445.555158saving best model....

Total T: 140656 Episode Num: 838 Reward: 732.384281 Avg Reward: 452.465407saving best model....

Total T: 141531 Episode Num: 839 Reward: 647.656644 Avg Reward: 459.453589saving best model....

Total T: 142531 Episode Num: 840 Reward: 756.008799 Avg Reward: 465.000707saving best model....

Total T: 143531 Episode Num: 841 Reward: 695.013929 Avg Reward: 471.457913saving best model....

Total T: 144531 Episode Num: 842 Reward: 768.923176 Avg Reward: 478.003507saving best model....

Total T: 145531 Episode Num: 843 Reward: 742.911920 Avg Reward: 483.338175saving best model....

Total T: 146531 Episode Num: 844 Reward: 762.849011 Avg Reward: 488.611679saving best model....

Total T: 147531 Episode Num: 845 Reward: 759.828828 Avg Reward: 492.855337saving best model....

Total T: 148531 Episode Num: 8

Total T: 623005 Episode Num: 1369 Reward: 1187.826921 Avg Reward: 969.779982saving best model....

Total T: 624005 Episode Num: 1370 Reward: 1163.409350 Avg Reward: 973.381144saving best model....

Total T: 625005 Episode Num: 1371 Reward: 918.428779 Avg Reward: 975.207919saving best model....

Total T: 626005 Episode Num: 1372 Reward: 1244.181364 Avg Reward: 980.535153saving best model....

Total T: 627005 Episode Num: 1373 Reward: 1105.513651 Avg Reward: 985.010907saving best model....

Total T: 628005 Episode Num: 1374 Reward: 1131.012411 Avg Reward: 988.240308saving best model....

Total T: 728909 Episode Num: 1484 Reward: 1428.454555 Avg Reward: 981.490534saving best model....

Total T: 729909 Episode Num: 1485 Reward: 1434.856087 Avg Reward: 993.059802saving best model....

Total T: 730909 Episode Num: 1486 Reward: 1137.569908 Avg Reward: 999.046120saving best model....

Total T: 731909 Episode Num: 1487 Reward: 1443.342884 Avg Reward: 1009.637547saving best model....

Total T: 7

Total T: 1282653 Episode Num: 2151 Reward: 2278.623270 Avg Reward: 1955.972979saving best model....

Total T: 1283653 Episode Num: 2152 Reward: 2296.058155 Avg Reward: 1958.867957saving best model....

Total T: 1285653 Episode Num: 2154 Reward: 2047.697984 Avg Reward: 1959.030327saving best model....

Total T: 1286653 Episode Num: 2155 Reward: 2173.697953 Avg Reward: 1962.093588saving best model....

Total T: 1287653 Episode Num: 2156 Reward: 2275.344447 Avg Reward: 1965.456756saving best model....

Total T: 1288653 Episode Num: 2157 Reward: 2041.508042 Avg Reward: 1976.215511saving best model....

Total T: 1289653 Episode Num: 2158 Reward: 2285.170873 Avg Reward: 1992.755884saving best model....

Total T: 1290653 Episode Num: 2159 Reward: 2247.382220 Avg Reward: 1992.975531saving best model....

Total T: 1302653 Episode Num: 2171 Reward: 2123.387622 Avg Reward: 2009.603248saving best model....

Total T: 1303653 Episode Num: 2172 Reward: 2033.092230 Avg Reward: 2010.851613saving best m

Total T: 2697225 Episode Num: 3580 Reward: 2511.032046 Avg Reward: 2352.228172saving best model....

Total T: 2809146 Episode Num: 3693 Reward: 2350.420103 Avg Reward: 2352.268838saving best model....

Total T: 2810146 Episode Num: 3694 Reward: 2332.138665 Avg Reward: 2353.183528saving best model....

Total T: 2811146 Episode Num: 3695 Reward: 2569.125164 Avg Reward: 2378.745147saving best model....

Total T: 2812146 Episode Num: 3696 Reward: 2508.268060 Avg Reward: 2379.971951saving best model....

Total T: 2813146 Episode Num: 3697 Reward: 2337.819026 Avg Reward: 2380.401157saving best model....

Total T: 2814146 Episode Num: 3698 Reward: 2342.367980 Avg Reward: 2383.982854saving best model....

Total T: 2815146 Episode Num: 3699 Reward: 2383.442357 Avg Reward: 2384.515742saving best model....

Total T: 2816146 Episode Num: 3700 Reward: 2445.194353 Avg Reward: 2386.066220saving best model....

Total T: 2817146 Episode Num: 3701 Reward: 2515.765400 Avg Reward: 2386.134727saving best m

In [None]:
# Restore the checkpoint using load()'s defaults (files "best_avg_*" under "./saves")
policy.load()

# Each call runs evaluate_policy's default 100 episodes with rendering on,
# so this loop plays 100 evaluation rounds in total.
for i in range(100):
    evaluate_policy(policy, env, render=True)

In [None]:
# Shut the environment down once evaluation is finished
env.close()