In [1]:
import numpy as np
import torch
import gym
import os
from agent import TD3
from util import ReplayBuffer

In [2]:
# --- Run identity ---------------------------------------------------------
# These three values are joined into `filename`, the stem used by the training
# cell for both the saved evaluation history and the saved model.
policy_name = "TD3"
env_name = "BipedalWalker-v2"
seed = 0
filename = f"{policy_name}-{env_name}-{seed}"

# --- Schedule -------------------------------------------------------------
start_timesteps = 1e4   # while total steps are below this, actions are sampled from the action space
eval_freq = 5e3         # steps that must elapse since the last evaluation before the next one is triggered
max_timesteps = 1e6     # the training loop stops once total steps reach this value
save_models = True      # when true, the policy is saved under ./pytorch_models

# --- Exploration ----------------------------------------------------------
expl_noise = 0.1        # std of the Gaussian noise added to the policy's action during training

# --- Values forwarded unchanged to `policy.train(...)` --------------------
# Their exact semantics live in `agent.TD3.train`, which is not part of this notebook.
batch_size = 100
discount = 0.99
tau = 0.005
policy_noise = 0.2
noise_clip = 0.5
policy_freq = 2

# print(filename)

# Output directories. `exist_ok=True` makes this cell safe to re-run and avoids
# the check-then-create race of testing `os.path.exists` first.
os.makedirs("./results", exist_ok=True)
if save_models:
    os.makedirs("./pytorch_models", exist_ok=True)

TD3-BipedalWalker-v2-0


In [3]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
    """Roll out ``policy`` for whole episodes and report the mean episode return.

    Actions come straight from ``policy.select_action``; this function adds no
    noise of its own.

    Args:
        policy: Object exposing ``select_action(observation)``. It is called
            with the current observation wrapped in ``np.array``.
        eval_episodes: Number of complete episodes to run. Must be at least 1.
        eval_env: Environment to roll out in. It must provide ``reset()`` and a
            ``step(action)`` that returns ``(obs, reward, done, info)``. When
            omitted, the notebook-level ``env`` is used, which keeps existing
            ``evaluate_policy(policy)`` calls working unchanged.

    Returns:
        The sum of every reward collected across all episodes divided by
        ``eval_episodes``.

    Raises:
        ValueError: If ``eval_episodes`` is less than 1 (previously this
            surfaced as a ``ZeroDivisionError`` or a meaningless ``-0.0``).
    """
    if eval_episodes < 1:
        raise ValueError(f"eval_episodes must be >= 1, got {eval_episodes}")

    if eval_env is None:
        # Fall back to the environment created by the training cell.
        eval_env = env

    total_reward = 0.
    for _ in range(eval_episodes):
        obs = eval_env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            obs, reward, done, _info = eval_env.step(action)
            total_reward += reward

    avg_reward = total_reward / eval_episodes
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward}")
    return avg_reward

In [4]:
# --- Environment and seeding ----------------------------------------------
# Build the environment named in the config cell, then seed the environment,
# PyTorch, and NumPy's global RNG with the same `seed`.
env = gym.make(env_name)
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Sizes are read from the first axis of each space; `max_action` is taken from
# the first component of the action upper bound.
# NOTE(review): this presumes all action dimensions share that bound — confirm.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

policy = TD3(state_dim, action_dim, max_action)

replay_buffer = ReplayBuffer()

# Score the untrained policy first so the evaluation history has a baseline entry.
evaluations = [evaluate_policy(policy)]

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
# Starting with done=True forces the first loop iteration through the
# episode-reset branch, which is also where the per-episode variables
# (`obs`, `episode_reward`, `episode_timesteps`) are first defined.
done = True

# --- Training loop: one environment step per iteration --------------------
while total_timesteps < max_timesteps:
    if done:
        # An episode has just finished (skipped on the very first iteration,
        # when no episode has run yet): log it and train the policy.
        if total_timesteps !=0:
            print(f"Total T: {total_timesteps} Episode Num: {episode_num} Episode T: {episode_timesteps} Reward: {episode_reward}")
            # The finished episode's length is passed as the second argument.
            # NOTE(review): presumably the number of training iterations —
            # confirm against agent.TD3.train.
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
        # Evaluation is only checked at episode boundaries, so the actual gap
        # between evaluations can exceed `eval_freq`.
        # NOTE(review): the comparison is strict (`>`), so an episode ending
        # exactly `eval_freq` steps after the last evaluation does not trigger
        # one — confirm whether `>=` was intended.
        if timesteps_since_eval > eval_freq:
            # Keep the overshoot so the next evaluation is not pushed back by it.
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))

            # The model is saved only when `save_models` is set; the evaluation
            # history is written on every evaluation regardless.
            if save_models: policy.save(filename, directory="./pytorch_models")
            np.save(f"./results/{filename}",evaluations)

        # Start a new episode and clear the per-episode accumulators.
        obs = env.reset()
        done = False
        episode_reward = 0
        episode_timesteps = 0
        episode_num +=1


    # Action selection: actions are sampled from the action space until
    # `start_timesteps` total steps have been taken; afterwards the policy acts.
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else:
        action = policy.select_action(np.array(obs))
        if expl_noise != 0:
            # Add zero-mean Gaussian noise with std `expl_noise`, one draw per
            # action dimension, then clip back into the action-space bounds.
            action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

    new_obs, reward, done, _ = env.step(action)
    # Store 0 as the done flag when this step is the `_max_episode_steps`-th of
    # the episode; otherwise store the environment's `done` as a float.
    # NOTE(review): presumably so hitting the step limit is not recorded as a
    # true terminal state — confirm. `_max_episode_steps` is a private
    # attribute; verify it exists on the installed gym version.
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
    episode_reward += reward

    # Transition tuple layout: (obs, next_obs, action, reward, done_flag).
    replay_buffer.add((obs, new_obs, action, reward, done_bool))
    obs = new_obs

    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# --- Final evaluation and save --------------------------------------------
# These lines run only if the loop above finishes; an interrupted run never
# reaches them.
evaluations.append(evaluate_policy(policy))
if save_models: policy.save(f"{filename}",directory="./pytorch_models")
np.save(f"./results/{filename}",evaluations)

Evaluation over 10 episodes: -118.46731245457529
Total T: 64 Episode Num: 1 Episode T: 64 Reward: -100.18085529850983
Total T: 1664 Episode Num: 2 Episode T: 1600 Reward: -82.02193539054794
Total T: 3264 Episode Num: 3 Episode T: 1600 Reward: -79.50127513945797
Total T: 4864 Episode Num: 4 Episode T: 1600 Reward: -82.10784610391238
Total T: 4985 Episode Num: 5 Episode T: 121 Reward: -101.48584581220507
Total T: 6585 Episode Num: 6 Episode T: 1600 Reward: -90.7804433609247
Evaluation over 10 episodes: -23.646632089354284
Total T: 8185 Episode Num: 7 Episode T: 1600 Reward: -83.27493059260918
Total T: 8279 Episode Num: 8 Episode T: 94 Reward: -103.32958148858836
Total T: 9879 Episode Num: 9 Episode T: 1600 Reward: -75.885076106441
Total T: 11479 Episode Num: 10 Episode T: 1600 Reward: -23.55360106786585
Evaluation over 10 episodes: -132.94366374796013
Total T: 11608 Episode Num: 11 Episode T: 129 Reward: -130.14037812755734
Total T: 11652 Episode Num: 12 Episode T: 44 Reward: -112.088960

Total T: 69995 Episode Num: 104 Episode T: 116 Reward: -96.79344124117063
Total T: 71595 Episode Num: 105 Episode T: 1600 Reward: -74.668877926624
Evaluation over 10 episodes: -113.58407384996278
Total T: 73195 Episode Num: 106 Episode T: 1600 Reward: -38.769573342459566
Total T: 74795 Episode Num: 107 Episode T: 1600 Reward: -58.901384211567034
Total T: 76395 Episode Num: 108 Episode T: 1600 Reward: -98.04567151799925
Evaluation over 10 episodes: -116.07138444236588
Total T: 76479 Episode Num: 109 Episode T: 84 Reward: -118.79033690415356
Total T: 78079 Episode Num: 110 Episode T: 1600 Reward: -53.47971898815172
Total T: 78616 Episode Num: 111 Episode T: 537 Reward: -165.03562682510363
Total T: 80216 Episode Num: 112 Episode T: 1600 Reward: -55.89486768524502
Evaluation over 10 episodes: -79.96136488918138
Total T: 81816 Episode Num: 113 Episode T: 1600 Reward: -57.87778329592001
Total T: 83416 Episode Num: 114 Episode T: 1600 Reward: -47.88437741760918
Total T: 85016 Episode Num: 115

KeyboardInterrupt: 