In [1]:
ENV_NAME = 'Pendulum-v1'
alias = 'TD_INVASE'
RED_ACTION_DIM = 100
import gym
print('\n now evaluating: \n       ', ENV_NAME)


import matplotlib.pyplot as plt
import numpy as np
import torch
import argparse
import os
import torch.nn.functional as F
import utils
import TD3_INVASE_TD

def eval_policy(policy, eval_episodes=10):
    eval_env = gym.make(ENV_NAME)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action[:-RED_ACTION_DIM])
            avg_reward += reward

    avg_reward /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

env = gym.make(ENV_NAME)
torch.manual_seed(0)
np.random.seed(0)

#spec = env.action_space
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] + RED_ACTION_DIM
max_action = env.action_space.high[0]

args_policy_noise = 0.2
args_noise_clip = 0.5
args_policy_freq = 2
args_max_timesteps = 20000
args_expl_noise = 0.1
args_batch_size = 256
args_eval_freq = 1000
args_start_timesteps = 10000

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": 0.99,
    "tau": 0.005
}

for repeat in range(5):
    kwargs["policy_noise"] = args_policy_noise * max_action
    kwargs["noise_clip"] = args_noise_clip * max_action
    kwargs["policy_freq"] = args_policy_freq
    policy = TD3_INVASE_TD.TD3(**kwargs)
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy)]
    
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    counter = 0
    msk_list = []        
    temp_curve = [eval_policy(policy)]
    temp_val = []
    for t in range(int(args_max_timesteps)):
        episode_timesteps += 1
        counter += 1
        # Select action randomly or according to policy
        if t < args_start_timesteps:
            action = np.random.uniform(-max_action, max_action, action_dim)
        else:
            if np.random.uniform(0,1) < 0.0:
                action = np.random.uniform(-max_action, max_action, action_dim)
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * args_expl_noise, size=action_dim)
                ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action[:-RED_ACTION_DIM])
        

        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        if t >= args_start_timesteps:
            '''TD3'''
            Lmd = t/args_max_timesteps * 0.1
            Thr = 0.5*(1 - t/args_max_timesteps)
            policy.train(replay_buffer, args_batch_size, Lmd, Thr)
                    
                    
        # Train agent after collecting sufficient data
        if done:
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            msk_list = []
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1 

        # Evaluate episode
        if (t + 1) % args_eval_freq == 0:
            evaluations.append(eval_policy(policy))
            print('recent Evaluation:',evaluations[-1])
            np.save('results/evaluations_alias{}_ENV{}_Repeat{}'.format(alias,ENV_NAME,repeat),evaluations)
print('JUPYTER CELL ENDING')


 now evaluating: 
        Pendulum-v1


  logger.warn(


---------------------------------------
Evaluation over 10 episodes: -1428.961
---------------------------------------
---------------------------------------
Evaluation over 10 episodes: -1428.961
---------------------------------------
Total T: 200 Episode Num: 1 Episode T: 200 Reward: -1087.151
Total T: 400 Episode Num: 2 Episode T: 200 Reward: -879.068
Total T: 600 Episode Num: 3 Episode T: 200 Reward: -1252.688
Total T: 800 Episode Num: 4 Episode T: 200 Reward: -1454.375
Total T: 1000 Episode Num: 5 Episode T: 200 Reward: -1168.417
---------------------------------------
Evaluation over 10 episodes: -1428.961
---------------------------------------
recent Evaluation: -1428.9613687610868
Total T: 1200 Episode Num: 6 Episode T: 200 Reward: -1426.576
Total T: 1400 Episode Num: 7 Episode T: 200 Reward: -1168.346
Total T: 1600 Episode Num: 8 Episode T: 200 Reward: -1713.521
Total T: 1800 Episode Num: 9 Episode T: 200 Reward: -1343.202
Total T: 2000 Episode Num: 10 Episode T: 200 Reward

In [3]:
test_epochs = 1
test_steps = 1000

# Original pendulum
env.action_space.low = -2.0
env.action_space.high = 2.0

print(env.action_space.low , env.action_space.high)

test_reward_list_O = []

# Only test
for i_ in range(test_epochs): # Run test by visualizing TODO: use reward instead of visualization
    with torch.no_grad():
        observation_ = env.reset()
        done_ = False
        j_ = 0
        while (not done_) and j_ < test_steps:
            env.render()
            # time.sleep(1e-3)
            action = (policy.select_action(np.array(observation_))+ np.random.normal(0, max_action * args_expl_noise, size=action_dim)).clip(-max_action, max_action)
            o_, test_reward_, done, _ = env.step(action[:-RED_ACTION_DIM])
            observation_ = o_
            j_ += 1
            test_reward_list_O.append(test_reward_)
            #print(test_reward_)
            print(j_)
plt.plot(test_reward_list_O)
env.close()

-2.0 2.0


error: display Surface quit