In [1]:
import numpy as np
import torch
import gym
import pybullet_envs
import os

import utils
import TD3
import OurDDPG
import DDPG

In [2]:
from gym.envs.registration import registry, make, spec


def register(id, *args, **kvargs):
    """Register a gym environment only if its id is not in the registry yet.

    The check makes the registration cell safe to re-run in the same kernel:
    an id that is already present in ``registry.env_specs`` is left alone.

    Args:
        id: environment id to register.
        *args, **kvargs: forwarded unchanged to ``gym.envs.registration.register``.

    Returns:
        Whatever ``gym.envs.registration.register`` returns when a registration
        is performed, otherwise ``None``.
    """
    already_registered = id in registry.env_specs
    if not already_registered:
        return gym.envs.registration.register(id, *args, **kvargs)
    return None

In [3]:
# Make the custom Ant environment available to gym.make() under the id
# 'MyAntBulletEnv-v0'. The entry point string names MyAntBulletEnv in the
# override_ant module.
register(
    id='MyAntBulletEnv-v0',
    entry_point='override_ant:MyAntBulletEnv',
    max_episode_steps=1000,
    reward_threshold=2500.0,
)

In [4]:
def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Run `policy` for `eval_episodes` episodes and return the average reward.

    A separate environment is created for evaluation and seeded with
    ``seed + 100``, so every evaluation made with the same `seed` uses the
    same fixed environment seed. The environment is closed before the
    function returns, including when an episode raises; this function is
    called repeatedly during training, so an unclosed environment would
    accumulate one leaked instance per evaluation.

    Args:
        policy: object exposing ``select_action(state)``.
        env_name: environment id passed to ``gym.make``.
        seed: base seed; the evaluation environment is seeded with ``seed + 100``.
        eval_episodes: number of episodes to average over.

    Returns:
        The sum of all rewards collected across the episodes divided by
        ``eval_episodes``.
    """
    eval_env = gym.make(env_name)
    try:
        eval_env.seed(seed + 100)

        total_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = policy.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # Always release the evaluation environment, even on failure.
        eval_env.close()

    avg_reward = total_reward / eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward


In [5]:
def main(**overrides):
    """Train a TD3 / DDPG / OurDDPG agent and evaluate it periodically.

    The run is configured by the defaults below; any of them can be replaced
    by passing the option as a keyword argument, e.g.
    ``main(max_timesteps=5e4, seed=1)``. Calling ``main()`` with no arguments
    uses the defaults unchanged.

    Side effects: creates ``./results`` (and ``./models`` when ``save_model``
    is truthy), prints progress, writes the list of evaluation scores to
    ``./results/<policy>_<env>_<seed>`` via ``np.save`` after each evaluation,
    and saves the policy to ``./models/<policy>_<env>_<seed>`` when
    ``save_model`` is truthy.

    Args:
        **overrides: replacements for the default options listed in ``args``.

    Raises:
        TypeError: if an override names an option that does not exist.
        ValueError: if ``policy`` is not one of "TD3", "OurDDPG" or "DDPG".
    """
    args = {
            "policy" : "TD3",                  # Policy name (TD3, DDPG or OurDDPG)
            "env" : "MyAntBulletEnv-v0",         # OpenAI gym environment name
            "seed" : 0,                        # Sets Gym, PyTorch and Numpy seeds
            "start_timesteps" : 25e3,          # Time steps initial random policy is used
            "eval_freq" : 5e3,                 # How often (time steps) we evaluate
            "max_timesteps" : 2e6,             # Max time steps to run environment
            "expl_noise" : 0.1,                # Std of Gaussian exploration noise
            "batch_size" : 256,                # Batch size for both actor and critic
            "discount" : 0.99,                 # Discount factor
            "tau" : 0.005,                     # Target network update rate
            "policy_noise" : 0.2,              # Noise added to target policy during critic update
            "noise_clip" : 0.5,                # Range to clip target policy noise
            "policy_freq" : 2,                 # Frequency of delayed policy updates
            "save_model" : True,               # Save model and optimizer parameters
            "load_model" : "",                 # Model load file name, "" doesn't load, "default" uses file_name
           }

    # Reject misspelled options instead of silently ignoring them.
    unknown_options = sorted(set(overrides) - set(args))
    if unknown_options:
        raise TypeError(
            f"main() got unexpected option(s): {', '.join(unknown_options)}"
        )
    args.update(overrides)

    # Fail before creating directories or environments if the policy name
    # cannot be dispatched below.
    known_policies = ("TD3", "OurDDPG", "DDPG")
    if args['policy'] not in known_policies:
        raise ValueError(
            f"Unknown policy {args['policy']!r}; expected one of {', '.join(known_policies)}"
        )

    # Step counts are written as floats (e.g. 25e3) in the config; use ints in the loop.
    start_timesteps = int(args['start_timesteps'])
    eval_freq = int(args['eval_freq'])
    max_timesteps = int(args['max_timesteps'])

    file_name = f"{args['policy']}_{args['env']}_{args['seed']}"
    print("---------------------------------------")
    print(f"Policy: {args['policy']}, Env: {args['env']}, Seed: {args['seed']}")
    print("---------------------------------------")

    os.makedirs("./results", exist_ok=True)
    if args['save_model']:
        os.makedirs("./models", exist_ok=True)

    env = gym.make(args['env'])
    try:
        # Set seeds
        env.seed(args['seed'])
        env.action_space.seed(args['seed'])
        torch.manual_seed(args['seed'])
        np.random.seed(args['seed'])

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": args['discount'],
            "tau": args['tau'],
        }

        # Initialize policy
        if args['policy'] == "TD3":
            # Target policy smoothing is scaled wrt the action scale
            kwargs["policy_noise"] = args['policy_noise'] * max_action
            kwargs["noise_clip"] = args['noise_clip'] * max_action
            kwargs["policy_freq"] = args['policy_freq']
            policy = TD3.TD3(**kwargs)
        elif args['policy'] == "OurDDPG":
            policy = OurDDPG.DDPG(**kwargs)
        else:  # "DDPG" -- the name was validated above
            policy = DDPG.DDPG(**kwargs)

        if args['load_model'] != "":
            policy_file = file_name if args['load_model'] == "default" else args['load_model']
            policy.load(f"./models/{policy_file}")

        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

        # Evaluate untrained policy
        evaluations = [eval_policy(policy, args['env'], args['seed'])]

        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0

        for t in range(max_timesteps):

            episode_timesteps += 1

            # Select action randomly or according to policy
            if t < start_timesteps:
                action = env.action_space.sample()
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * args['expl_noise'], size=action_dim)
                ).clip(-max_action, max_action)

            # Perform action
            next_state, reward, done, _ = env.step(action)
            # An episode that ends exactly at the step limit is stored as not-done;
            # presumably so the learner does not treat a time-limit cut-off as a
            # true terminal state -- confirm against the replay buffer's consumers.
            done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

            # Store data in replay buffer
            replay_buffer.add(state, action, next_state, reward, done_bool)

            state = next_state
            episode_reward += reward

            # Train agent after collecting sufficient data
            if t >= start_timesteps:
                policy.train(replay_buffer, args['batch_size'])

            if done:
                # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
                print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
                # Reset environment
                state, done = env.reset(), False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            # Evaluate episode
            if (t + 1) % eval_freq == 0:
                evaluations.append(eval_policy(policy, args['env'], args['seed']))
                np.save(f"./results/{file_name}", evaluations)
                if args['save_model']:
                    policy.save(f"./models/{file_name}")
    finally:
        # Release the training environment even if the run is interrupted or fails.
        env.close()


In [6]:
# Start the training run. With the default configuration this loops for up to
# 2e6 environment steps and evaluates every 5e3 steps.
main()

---------------------------------------
Policy: TD3, Env: MyAntBulletEnv-v0, Seed: 0
---------------------------------------


pybullet build time: Jul 29 2021 19:39:46


---------------------------------------
Evaluation over 10 episodes: 9.411
---------------------------------------
Total T: 1000 Episode Num: 1 Episode T: 1000 Reward: 521.987
Total T: 2000 Episode Num: 2 Episode T: 1000 Reward: 490.852
Total T: 3000 Episode Num: 3 Episode T: 1000 Reward: 512.882
Total T: 4000 Episode Num: 4 Episode T: 1000 Reward: 487.381
Total T: 4163 Episode Num: 5 Episode T: 163 Reward: 76.051
---------------------------------------
Evaluation over 10 episodes: 9.411
---------------------------------------
Total T: 5163 Episode Num: 6 Episode T: 1000 Reward: 471.709
Total T: 6163 Episode Num: 7 Episode T: 1000 Reward: 475.883
Total T: 7163 Episode Num: 8 Episode T: 1000 Reward: 473.663
Total T: 8163 Episode Num: 9 Episode T: 1000 Reward: 426.268
Total T: 9163 Episode Num: 10 Episode T: 1000 Reward: 500.778
---------------------------------------
Evaluation over 10 episodes: 9.411
---------------------------------------
Total T: 10163 Episode Num: 11 Episode T: 1000