# Twin Delayed Deep Deterministic Policy Gradient (TD3)

# Imports

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from tensorboardX import SummaryWriter

import gym
import roboschool
import sys

# Networks

In [2]:
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [3]:
class Actor(nn.Module):
    """Initialize parameters and build model.
        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            max_action (float): highest action to take
            seed (int): Random seed
            h1_units (int): Number of nodes in first hidden layer
            h2_units (int): Number of nodes in second hidden layer
            
        Return:
            action output of network with tanh activation
    """
    
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action


    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = self.max_action * torch.tanh(self.l3(x)) 
        return x



In [4]:
class Critic(nn.Module):
    """Initialize parameters and build model.
        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            max_action (float): highest action to take
            seed (int): Random seed
            h1_units (int): Number of nodes in first hidden layer
            h2_units (int): Number of nodes in second hidden layer
            
        Return:
            value output of network 
    """
    
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)


    def forward(self, x, u):
        xu = torch.cat([x, u], 1)

        x1 = F.relu(self.l1(xu))
        x1 = F.relu(self.l2(x1))
        x1 = self.l3(x1)

        x2 = F.relu(self.l4(xu))
        x2 = F.relu(self.l5(x2))
        x2 = self.l6(x2)
        return x1, x2


    def Q1(self, x, u):
        xu = torch.cat([x, u], 1)

        x1 = F.relu(self.l1(xu))
        x1 = F.relu(self.l2(x1))
        x1 = self.l3(x1)
        return x1

# Memory

In [5]:
# Code based on: 
# https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

# Expects tuples of (state, next_state, action, reward, done)
class ReplayBuffer(object):
    """Buffer to store tuples of experience replay"""
    
    def __init__(self, max_size=1000000):
        """
        Args:
            max_size (int): total amount of tuples to store
        """
        
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, data):
        """Add experience tuples to buffer
        
        Args:
            data (tuple): experience replay tuple
        """
        
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        """Samples a random amount of experiences from buffer of batch size
        
        Args:
            batch_size (int): size of sample
        """
        
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        states, actions, next_states, rewards, dones = [], [], [], [], []

        for i in ind: 
            s, a, s_, r, d = self.storage[i]
            states.append(np.array(s, copy=False))
            actions.append(np.array(a, copy=False))
            next_states.append(np.array(s_, copy=False))
            rewards.append(np.array(r, copy=False))
            dones.append(np.array(d, copy=False))

        return np.array(states), np.array(actions), np.array(next_states), np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1)

# Agent

In [6]:
class TD3(object):
    """Agent class that handles the training of the networks and provides outputs as actions
    
        Args:
            state_dim (int): state size
            action_dim (int): action size
            max_action (float): highest action to take
            device (device): cuda or cpu to process tensors
            env (env): gym environment to use
    
    """
    
    def __init__(self, state_dim, action_dim, max_action, env):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.env = env


        
    def select_action(self, state, noise=0.1):
        """Select an appropriate action from the agent policy
        
            Args:
                state (array): current state of environment
                noise (float): how much noise to add to acitons
                
            Returns:
                action (float): action clipped within action range
        
        """
        
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        print(state)
        
        action = self.actor(state).cpu().data.numpy().flatten()
        if noise != 0: 
            action = (action + np.random.normal(0, noise, size=self.env.action_space.shape[0]))
            
        return action.clip(self.env.action_space.low, self.env.action_space.high)

    
    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Train and update actor and critic networks
        
            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                iterations (int): how many times to run training
                batch_size(int): batch size to sample from replay buffer
                discount (float): discount factor
                tau (float): soft update for main networks to target networks
                
            Return:
                actor_loss (float): loss from actor network
                critic_loss (float): loss from critic network
        
        """
        
        for it in range(iterations):

            # Sample replay buffer 
            x, y, u, r, d = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(1 - d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Select action according to policy and add clipped noise 
            noise = torch.FloatTensor(u).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (done * discount * target_Q).detach()

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if it % policy_freq == 0:

                # Compute actor loss
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

                # Optimize the actor 
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))


    def load(self, filename="best_avg", directory="./saves"):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))

# Runner

In [7]:
class Runner():
    """Carries out the environment steps and adds experiences to memory"""
    
    def __init__(self, env, agent, replay_buffer):
        
        self.env = env
        self.agent = agent
        self.replay_buffer = replay_buffer
        self.obs = env.reset()
        self.done = False
        
    def next_step(self, episode_timesteps, noise=0.1):
        
        action = self.agent.select_action(np.array(self.obs), noise=0.1)
        
        # Perform action
        new_obs, reward, done, _ = self.env.step(action) 
        done_bool = 0 if episode_timesteps + 1 == 200 else float(done)
    
        # Store data in replay buffer
        replay_buffer.add((self.obs, new_obs, action, reward, done_bool))
        
        self.obs = new_obs
        
        if done:
            self.obs = self.env.reset()
            done = False
            
            return reward, True
        
        return reward, done

# Evaluate

In [8]:
def evaluate_policy(policy, env, eval_episodes=100,render=False):
    """run several episodes using the best agent policy
        
        Args:
            policy (agent): agent to evaluate
            env (env): gym environment
            eval_episodes (int): how many test episodes to run
            render (bool): show training
        
        Returns:
            avg_reward (float): average reward over the number of evaluations
    
    """
    
    avg_reward = 0.
    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            if render:
                env.render()
            action = policy.select_action(np.array(obs), noise=0)
            obs, reward, done, _ = env.step(action)
            avg_reward += reward

    avg_reward /= eval_episodes

    print("\n---------------------------------------")
    print("Evaluation over {:d} episodes: {:f}" .format(eval_episodes, avg_reward))
    print("---------------------------------------")
    return avg_reward

# Observation

In [9]:
def observe(env,replay_buffer, observation_steps):
    """run episodes while taking random actions and filling replay_buffer
    
        Args:
            env (env): gym environment
            replay_buffer(ReplayBuffer): buffer to store experience replay
            observation_steps (int): how many steps to observe for
    
    """
    
    time_steps = 0
    obs = env.reset()
    done = False

    while time_steps < observation_steps:
        action = env.action_space.sample()
        new_obs, reward, done, _ = env.step(action)

        replay_buffer.add((obs, new_obs, action, reward, done))

        obs = new_obs
        time_steps += 1

        if done:
            obs = env.reset()
            done = False

        print("\rPopulating Buffer {}/{}.".format(time_steps, observation_steps), end="")
        sys.stdout.flush()

# Train

In [10]:
def train(agent, test_env):
    """Train the agent for exploration steps
    
        Args:
            agent (Agent): agent to use
            env (environment): gym environment
            writer (SummaryWriter): tensorboard writer
            exploration (int): how many training steps to run
    
    """

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    episode_timesteps = 0
    done = False 
    obs = env.reset()
    evaluations = []
    rewards = []
    best_avg = -2000
    
    writer = SummaryWriter(comment="-TD3_Baseline_HalfCheetah")
    
    while total_timesteps < EXPLORATION:
    
        if done: 

            if total_timesteps != 0: 
                rewards.append(episode_reward)
                avg_reward = np.mean(rewards[-100:])
                
                writer.add_scalar("avg_reward", avg_reward, total_timesteps)
                writer.add_scalar("reward_step", reward, total_timesteps)
                writer.add_scalar("episode_reward", episode_reward, total_timesteps)
                
                if best_avg < avg_reward:
                    best_avg = avg_reward
                    print("saving best model....\n")
                    agent.save("best_avg","saves")

                print("\rTotal T: {:d} Episode Num: {:d} Reward: {:f} Avg Reward: {:f}".format(
                    total_timesteps, episode_num, episode_reward, avg_reward), end="")
                sys.stdout.flush()


                if avg_reward >= REWARD_THRESH:
                    break

                agent.train(replay_buffer, episode_timesteps, BATCH_SIZE, GAMMA, TAU, NOISE, NOISE_CLIP, POLICY_FREQUENCY)

                # Evaluate episode
#                 if timesteps_since_eval >= EVAL_FREQUENCY:
#                     timesteps_since_eval %= EVAL_FREQUENCY
#                     eval_reward = evaluate_policy(agent, test_env)
#                     evaluations.append(avg_reward)
#                     writer.add_scalar("eval_reward", eval_reward, total_timesteps)

#                     if best_avg < eval_reward:
#                         best_avg = eval_reward
#                         print("saving best model....\n")
#                         agent.save("best_avg","saves")

                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1 

        reward, done = runner.next_step(episode_timesteps)
        episode_reward += reward

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

# Config

In [11]:
ENV = "RoboschoolHalfCheetah-v1"#"Pendulum-v0"
SEED = 0
OBSERVATION = 10000
EXPLORATION = 5000000
BATCH_SIZE = 100
GAMMA = 0.99
TAU = 0.005
NOISE = 0.2
NOISE_CLIP = 0.5
EXPLORE_NOISE = 0.1
POLICY_FREQUENCY = 2
EVAL_FREQUENCY = 5000
REWARD_THRESH = 8000

# Main

In [12]:
env = gym.make(ENV)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] 
max_action = float(env.action_space.high[0])

policy = TD3(state_dim, action_dim, max_action, env)

replay_buffer = ReplayBuffer()

runner = Runner(env, policy, replay_buffer)

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

In [None]:
# Populate replay buffer
observe(env, replay_buffer, OBSERVATION)

Populating Buffer 10000/10000.

In [13]:
# Train agent
train(policy, env)

tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.4511,  0.0000,  0.1186,  0.0000, -0.2801,  0.0000,  0.2678,  0.0000,
         -0.0257,  0.0000,  1.2390,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-5.7925e-04,  0.0000e+00,  1.0000e+00,  4.1982e-01,  0.0000e+00,
         -2.8052e-02,  0.0000e+00, -4.4921e-04, -3.4898e-01,  1.2153e-01,
          3.0647e-02, -9.6289e-02, -4.5571e-01,  1.0223e-01,  3.1420e-01,
          5.7355e-02, -3.1098e-02,  6.0102e-02,  1.2698e+00, -1.7329e-01,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-2.9562e-03,  0.0000e+00,  1.0000e+00,  7.0635e-01,  0.0000e+00,
         -6.7841e-02,  0.0000e+00, -9.9834e-04, -3.1236e-01,  2.0574e-01,
         -3.4452e-03, -2.0218e-01, -4.2228e-01,  1.3136e-01,  3.2625e-01,
          1.0007e-01, -2.7499e-02,  4.2870e-03,  1.2545e+00, -1.0438e-01,
          0.0000e+00,  0.0000e+00,  0.

Total T: 38 Episode Num: 1 Reward: -8.080337 Avg Reward: 5.697363tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.4545,  0.0000,  0.1076,  0.0000, -0.2298,  0.0000,  0.2201,  0.0000,
          0.0308,  0.0000,  1.2422,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 0.0047,  0.0000,  1.0000,  1.3613,  0.0000,  0.2320,  0.0000, -0.0043,
         -0.3963,  0.4422,  0.0441, -0.4849, -0.3750, -0.8304,  0.1613, -0.6801,
          0.1846,  1.7678,  1.1378, -1.4460,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 0.0215,  0.0000,  1.0000,  2.5051,  0.0000,  0.4357,  0.0000, -0.0193,
         -0.2511,  0.8185, -0.1264, -1.0202, -0.7389, -1.5754, -0.0476, -2.1045,
          0.7049,  5.0000,  0.8079, -3.8109,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 3.6047e-02,  0.0000e+00,  1.0000e+00,  3.8663e+00,  0.0000e+00,
          2.5442e-01,  0.0000e+00, -1.1797e-0

Total T: 76 Episode Num: 3 Reward: -16.665749 Avg Reward: -6.177595tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.3848,  0.0000, -0.0422,  0.0000, -0.3190,  0.0000,  0.3746,  0.0000,
          0.0277,  0.0000,  1.2002,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 0.0027,  0.0000,  1.0000,  1.6518,  0.0000,  0.1306,  0.0000, -0.0020,
         -0.3196,  0.4956, -0.1082, -0.5055, -0.4763, -0.8997,  0.3580, -0.1936,
          0.1095,  0.9265,  1.1142, -1.1737,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 0.0139,  0.0000,  1.0000,  3.4349,  0.0000,  0.3518,  0.0000, -0.0111,
         -0.1430,  1.0444, -0.3008, -1.1783, -0.8605, -1.6491,  0.2694, -0.9387,
          0.4114,  2.9447,  0.8539, -2.8981,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[ 0.0328,  0.0000,  1.0000,  4.8301,  0.0000,  0.3664,  0.0000, -0.0213,
          0.1390,  1.4853, -0.7084, 

Total T: 114 Episode Num: 5 Reward: -5.834756 Avg Reward: -5.997606tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.4645,  0.0000, -0.0566,  0.0000, -0.1660,  0.0000,  0.2624,  0.0000,
          0.1164,  0.0000,  1.2667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0027,  0.0000,  1.0000,  0.9946,  0.0000, -0.1397, -0.0000,  0.0073,
         -0.4295,  0.2650, -0.1058, -0.3771, -0.1936, -0.1554,  0.3226,  0.6619,
          0.0029, -1.2405,  1.2667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0134,  0.0000,  1.0000,  2.0248,  0.0000, -0.3576, -0.0000,  0.0355,
         -0.3368,  0.5424, -0.2436, -0.8327, -0.2674, -0.3186,  0.4709,  1.2413,
         -0.2652, -2.1870,  1.2667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0339,  0.0000,  1.0000,  3.2170,  0.0000, -0.7208, -0.0000,  0.0884,
         -0.1778,  0.8855, -0.5137, 

Total T: 222 Episode Num: 6 Reward: -25.516873 Avg Reward: -8.786072tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.3262,  0.0000,  0.1060,  0.0000, -0.4698,  0.0000,  0.2385,  0.0000,
          0.1118,  0.0000,  1.1882,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0056,  0.0000,  1.0000, -1.2361,  0.0000, -0.2710, -0.0000,  0.0033,
         -0.3790, -0.4037,  0.1630,  0.4329, -0.3209,  0.8580,  0.2915,  0.5791,
         -0.0080, -1.3096,  1.1882,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0271,  0.0000,  1.0000, -2.5189,  0.0000, -0.5496, -0.0000,  0.0150,
         -0.5183, -0.8231,  0.3186,  0.9197,  0.0327,  1.5242,  0.4163,  1.0226,
         -0.2854, -2.2540,  1.1882,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0659,  0.0000,  1.0000, -4.1206,  0.0000, -0.7789, -0.0000,  0.0335,
         -0.7580, -1.3433,  0.5705,

Total T: 261 Episode Num: 8 Reward: -12.115935 Avg Reward: -9.552315tensor([[ 0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         -0.2433,  0.0000, -0.0955,  0.0000, -0.4559,  0.0000,  0.3244,  0.0000,
          0.0562,  0.0000,  1.1544,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0040,  0.0000,  1.0000, -1.2760,  0.0000, -0.1972,  0.0000, -0.0014,
         -0.2957, -0.3984, -0.0209,  0.5640, -0.3175,  0.7983,  0.3457,  0.2371,
          0.0132, -0.4759,  1.1544,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0178,  0.0000,  1.0000, -2.6087,  0.0000, -0.3656,  0.0000, -0.0084,
         -0.4327, -0.8056,  0.1655,  1.0705,  0.0574,  1.6862,  0.3829,  0.2735,
         -0.0579, -0.5073,  1.1544,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])
tensor([[-0.0395,  0.0000,  1.0000, -4.1276,  0.0000, -0.5798,  0.0000, -0.0271,
         -0.6614, -1.2682,  0.4501,

tensor([[-2.9088e-01,  0.0000e+00,  1.0000e+00,  1.0722e-02,  0.0000e+00,
         -3.7756e-03,  0.0000e+00, -1.8572e-01, -1.0003e+00,  4.8112e-03,
          1.2893e+00,  7.6565e-03,  1.0033e+00, -9.1458e-03,  1.0002e+00,
          8.3763e-04, -2.1978e-01,  6.2292e-03,  1.0003e+00, -1.1265e-02,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-2.9105e-01,  0.0000e+00,  1.0000e+00,  1.9142e-02,  0.0000e+00,
          3.6918e-03,  0.0000e+00, -1.8620e-01, -1.0005e+00,  6.8011e-03,
          1.2907e+00,  1.2202e-03,  1.0028e+00, -1.0141e-02,  1.0002e+00,
          2.6036e-04, -2.1951e-01,  3.7565e-03,  1.0002e+00, -9.1702e-03,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-2.9121e-01,  0.0000e+00,  1.0000e+00, -4.0650e-02,  0.0000e+00,
         -6.7347e-03,  0.0000e+00, -1.8708e-01, -1.0006e+00, -1.3975e-02,
          1.2924e+00,  5.6894e-03,  1.0035e+00,  1.1313e-02,  1.

tensor([[-3.1339e-01,  0.0000e+00,  1.0000e+00, -7.1182e-02,  0.0000e+00,
         -1.7000e-02,  0.0000e+00, -2.6387e-01, -1.0010e+00, -2.3882e-02,
          1.4947e+00,  7.2232e-03,  1.0033e+00,  1.4256e-02,  1.0002e+00,
         -1.2810e-04, -1.7305e-01, -1.0390e-02,  1.0000e+00,  9.9141e-03,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-3.1348e-01,  0.0000e+00,  1.0000e+00, -6.3400e-02,  0.0000e+00,
         -1.5885e-02,  0.0000e+00, -2.6434e-01, -1.0009e+00, -2.1163e-02,
          1.4959e+00,  6.2903e-03,  1.0033e+00,  1.2641e-02,  1.0002e+00,
          9.8345e-04, -1.7286e-01, -1.6304e-03,  1.0002e+00, -1.1346e-02,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-0.3137,  0.0000,  1.0000, -0.0816,  0.0000, -0.0203,  0.0000, -0.2647,
         -1.0012, -0.0265,  1.4972,  0.0070,  1.0032,  0.0124,  1.0003,  0.0015,
         -0.1731, -0.0046,  1.0003, -0.0137,  1.00

tensor([[-3.4645e-01,  0.0000e+00,  1.0000e+00,  2.5580e-02,  0.0000e+00,
          1.6931e-03,  0.0000e+00, -3.3181e-01, -9.9977e-01,  1.1476e-02,
          1.7055e+00,  9.3587e-03,  1.0026e+00,  1.1732e-03,  1.0003e+00,
          1.0711e-03, -1.9722e-01,  5.8323e-03,  1.0003e+00, -1.4518e-02,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-3.4673e-01,  0.0000e+00,  1.0000e+00,  2.6940e-02,  0.0000e+00,
          2.9801e-03,  0.0000e+00, -3.3237e-01, -9.9946e-01,  1.0305e-02,
          1.7071e+00,  9.8853e-03,  1.0028e+00,  1.4592e-03,  1.0002e+00,
         -8.6293e-05, -1.9705e-01,  1.1940e-02,  1.0001e+00, -1.2590e-02,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-0.3470,  0.0000,  1.0000, -0.1134,  0.0000, -0.0418,  0.0000, -0.3326,
         -1.0020, -0.0377,  1.7082,  0.0063,  1.0033,  0.0084,  1.0004,  0.0040,
         -0.1990, -0.0128,  1.0005, -0.0143,  1.00

tensor([[-3.9062e-01,  0.0000e+00,  1.0000e+00,  2.6788e-02,  0.0000e+00,
          3.8547e-03,  0.0000e+00, -3.8353e-01, -9.9938e-01,  1.3230e-02,
          1.9121e+00,  9.0521e-03,  1.0032e+00,  3.2745e-03,  1.0003e+00,
         -9.9836e-04, -2.8192e-01, -1.8616e-02,  1.0010e+00,  3.8381e-02,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-3.9089e-01,  0.0000e+00,  1.0000e+00,  2.6910e-02,  0.0000e+00,
          1.0699e-02,  0.0000e+00, -3.8373e-01, -1.0006e+00,  1.1892e-02,
          1.9132e+00,  2.5195e-03,  1.0031e+00, -8.2483e-03,  1.0003e+00,
         -9.2749e-04, -2.8191e-01, -1.3491e-03,  9.9988e-01, -5.2174e-03,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00]])
tensor([[-3.9115e-01,  0.0000e+00,  1.0000e+00,  2.9372e-02,  0.0000e+00,
          6.1150e-03,  0.0000e+00, -3.8388e-01, -1.0002e+00,  1.3202e-02,
          1.9143e+00,  6.2370e-03,  1.0030e+00, -8.2636e-04,  1.

KeyboardInterrupt: 

In [None]:
policy.load()

for i in range(100):
    evaluate_policy(policy, env, render=True)

In [None]:
env.close()