In [1]:
import gymnasium as gym


env = gym.make("CartPole-v1")

In [2]:
env.action_space
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [8]:
%load_ext autoreload
%autoreload 2
import gym
import numpy as np
from datetime import datetime
from ppo import PPO
import os

env_name = "CartPole-v1"
has_continuous_action_space = False

max_ep_len = 400
max_training_timesteps = 100000

print_freq = max_ep_len * 2
log_freq = 10
save_model_freq = int(1e5)

action_std = None
K_epochs = 200
eps_clip = 0.1
gamma = 0.99

lr_actor = 0.0001  
lr_critic = 0.0001 

env = gym.make(env_name)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n if not has_continuous_action_space else env.action_space.shape[0]

ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

log_dir = "PPO_logs"
os.makedirs(log_dir, exist_ok=True)
log_f_name = log_dir + '/' + env_name + "_PPO_log_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".csv"

print(f"Training environment: {env_name}")

log_f = open(log_f_name, "w+")
log_f.write('episode,timestep,reward\n')

start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT):", start_time)

time_step = 0
i_episode = 0


# printing and logging variables
print_running_reward = 0
print_running_episodes = 0

while time_step <= max_training_timesteps:
    state = env.reset()[0]
    current_ep_reward = 0

    for t in range(1, max_ep_len + 1):
        action = ppo_agent.select_action(state)
        state, reward, done, truncated, info = env.step(action)

        ppo_agent.buffer.rewards.append(reward)
        ppo_agent.buffer.is_terminals.append(done)

        time_step += 1
        current_ep_reward += reward

        if time_step % max_ep_len * 3 == 0:
            ppo_agent.update()

        if time_step % log_freq == 0 and print_running_episodes > 0:
            log_f.write(f'{i_episode},{time_step},{current_ep_reward}\n')
            # print average reward till last episode
            print_avg_reward = print_running_reward / print_running_episodes
            print_avg_reward = round(print_avg_reward, 2)

            print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

            print_running_reward = 0
            print_running_episodes = 0


        if time_step % save_model_freq == 0:
            ppo_agent.save(f"PPO_{env_name}.pth")

        if done:
            break

    print_running_reward += current_ep_reward
    print_running_episodes += 1

    i_episode += 1

log_f.close()
end_time = datetime.now().replace(microsecond=0)
print("Finished training at (GMT):", end_time)
print("Total training time:", end_time - start_time)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training environment: CartPole-v1
Started training at (GMT): 2024-06-17 12:10:07
Episode : 1 		 Timestep : 20 		 Average Reward : 16.0
Episode : 2 		 Timestep : 50 		 Average Reward : 27.0
Episode : 3 		 Timestep : 70 		 Average Reward : 24.0
Episode : 4 		 Timestep : 90 		 Average Reward : 20.0
Episode : 5 		 Timestep : 110 		 Average Reward : 14.0
Episode : 6 		 Timestep : 120 		 Average Reward : 15.0
Episode : 7 		 Timestep : 140 		 Average Reward : 16.0
Episode : 8 		 Timestep : 150 		 Average Reward : 14.0
Episode : 9 		 Timestep : 180 		 Average Reward : 29.0
Episode : 10 		 Timestep : 230 		 Average Reward : 48.0
Episode : 11 		 Timestep : 240 		 Average Reward : 15.0
Episode : 12 		 Timestep : 270 		 Average Reward : 30.0
Episode : 13 		 Timestep : 300 		 Average Reward : 23.0
Episode : 14 		 Timestep : 340 		 Average Reward : 40.0
Episode : 15 		 Timestep : 350 		 Average Reward : 16.0
Epis

KeyboardInterrupt: 

In [2]:
import gym
env = gym.make("LunarLanderContinuous-v2")
float(env.action_space.high[0]), env.observation_space.shape[0], env.action_space.shape[0], env._max_episode_steps

(1.0, 8, 2, 1000)

In [3]:
import torch
import random
def reproducibility(seed: int):
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
reproducibility(3)

NameError: name 'os' is not defined

In [26]:
%load_ext autoreload
%autoreload 2
import gym
import numpy as np
from datetime import datetime
from ppo import PPO
from ddpg import DDPG
from td import TD3
import os

env_name = "LunarLanderContinuous-v2"
max_ep_len = 1000
max_training_timesteps = 5e6

print_freq = max_ep_len * 2
log_freq = 10
save_model_freq = int(1e5)

action_std = None
K_epochs = 200
eps_clip = 0.1
gamma = 0.99

lr_actor = 0.0001  
lr_critic = 0.0001 

env = gym.make(env_name)

env.observation_space.shape[0]
env.action_space.shape[0]
float(env.action_space.high[0]) 

state_dim = 8
action_dim = 2

ddpg_agent = TD3(state_dim, action_dim, max_action=1)

log_dir = "PPO_logs"
os.makedirs(log_dir, exist_ok=True)
log_f_name = log_dir + '/' + env_name + "_PPO_log_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".csv"

print(f"Training environment: {env_name}")

log_f = open(log_f_name, "w+")
log_f.write('episode,timestep,reward\n')

start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT):", start_time)

time_step = 0
i_episode = 0


# printing and logging variables
print_running_reward = 0
print_running_episodes = 0

def evaluate_policy(env, agent, turns = 3):
    total_scores = 0
    for j in range(turns):
        s, info = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores/turns)

while time_step <= max_training_timesteps:
    state = env.reset()[0]
    current_ep_reward = 0
    done = False

    while not done:
        action = ddpg_agent.select_action(state, False)

        state_, reward, done, truncated, info = env.step(action)
        done = (done or truncated)

        reward = Reward_adapter(reward, 1)

        ddpg_agent.buffer.store(state, action, reward, state_, done)

        time_step += 1
        current_ep_reward += reward

        if time_step > 2000 and time_step % 50 == 0:
            print("Updating the Model")
            ddpg_agent.update()

        if time_step % log_freq == 0 and print_running_episodes > 0:
            log_f.write(f'{i_episode},{time_step},{current_ep_reward}\n')
            # print average reward till last episode
            print_avg_reward = print_running_reward / print_running_episodes
            print_avg_reward = round(print_avg_reward, 2)

            ddpg_agent.explore_noise *= 0.9998
            ep_r = evaluate_policy(env, ddpg_agent, turns=3)

            print("Episode : {} \t\t Timestep : {} \t\t Reward: {} \t\t Average Reward : {}".format(i_episode, time_step, ep_r, print_avg_reward))

            print_running_reward = 0
            print_running_episodes = 0

        state = state_

    print_running_reward += current_ep_reward
    print_running_episodes += 1

    i_episode += 1

log_f.close()
end_time = datetime.now().replace(microsecond=0)
print("Finished training at (GMT):", end_time)
print("Total training time:", end_time - start_time)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Device set to : NVIDIA TITAN RTX
Device set to : NVIDIA TITAN RTX
Training environment: LunarLanderContinuous-v2
Started training at (GMT): 2024-07-14 18:07:19
Episode : 1 		 Timestep : 90 		 Reward: -140 		 Average Reward : -132.58


  if not isinstance(terminated, (bool, np.bool8)):


Episode : 2 		 Timestep : 100 		 Reward: -131 		 Average Reward : -9.31
Episode : 3 		 Timestep : 110 		 Reward: -139 		 Average Reward : -17.3
Episode : 4 		 Timestep : 120 		 Reward: -119 		 Average Reward : -19.83
Episode : 5 		 Timestep : 130 		 Reward: -88 		 Average Reward : -15.13
Episode : 6 		 Timestep : 140 		 Reward: -178 		 Average Reward : -18.51
Episode : 7 		 Timestep : 150 		 Reward: -155 		 Average Reward : -20.11
Episode : 8 		 Timestep : 160 		 Reward: -138 		 Average Reward : -16.98
Episode : 9 		 Timestep : 170 		 Reward: -144 		 Average Reward : -11.7
Episode : 10 		 Timestep : 180 		 Reward: -127 		 Average Reward : -12.46
Episode : 11 		 Timestep : 190 		 Reward: -179 		 Average Reward : -12.11
Episode : 12 		 Timestep : 200 		 Reward: -139 		 Average Reward : -17.04
Episode : 13 		 Timestep : 210 		 Reward: -162 		 Average Reward : -16.04
Episode : 14 		 Timestep : 220 		 Reward: -122 		 Average Reward : -17.43
Episode : 15 		 Timestep : 230 		 Reward: -89 		 A

KeyboardInterrupt: 

In [23]:
#Just ignore this function~
def str2bool(v):
    '''transfer str to bool for argparse'''
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'True','true','TRUE', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'False','false','FALSE', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

#reward engineering for better training
def Reward_adapter(r, EnvIdex):
    # For Pendulum-v0
    if EnvIdex == 0:
        r = (r + 8) / 8

    # For LunarLander
    elif EnvIdex == 1:
        if r <= -100: r = -10

    # For BipedalWalker
    elif EnvIdex == 4 or EnvIdex == 5:
        if r <= -100: r = -1
    return r

def evaluate_policy(env, agent, turns = 3):
    total_scores = 0
    for j in range(turns):
        s, info = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores/turns)


In [25]:
%load_ext autoreload
%autoreload 2
import gymnasium as gym
from datetime import datetime
import numpy as np
import os, shutil
import argparse
import torch
from td import TD3


'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cuda', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=1, help='PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=30, help='which model to load')

parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--update_every', type=int, default=50, help='training frequency')
parser.add_argument('--Max_train_steps', type=int, default=int(5e6), help='Max training steps')
parser.add_argument('--save_interval', type=int, default=int(1e5), help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=int(2e3), help='Model evaluating interval, in steps.')

parser.add_argument('--delay_freq', type=int, default=1, help='Delayed frequency for Actor and Target Net')
parser.add_argument('--gamma', type=float, default=0.99, help='Discounted Factor')
parser.add_argument('--net_width', type=int, default=256, help='Hidden net width, s_dim-400-300-a_dim')
parser.add_argument('--a_lr', type=float, default=1e-4, help='Learning rate of actor')
parser.add_argument('--c_lr', type=float, default=1e-4, help='Learning rate of critic')
parser.add_argument('--batch_size', type=int, default=256, help='batch_size of training')
parser.add_argument('--explore_noise', type=float, default=0.15, help='exploring noise when interacting')
parser.add_argument('--explore_noise_decay', type=float, default=0.998, help='Decay rate of explore noise')
opt = parser.parse_args('')
opt.dvc = torch.device(opt.dvc) # from str to torch.device
print(opt)


def main():
    EnvName = ['Pendulum-v1','LunarLanderContinuous-v2','Humanoid-v4','HalfCheetah-v4','BipedalWalker-v3','BipedalWalkerHardcore-v3']
    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4','BWv3', 'BWHv3']

    # Build Env
    env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
    eval_env = gym.make(EnvName[opt.EnvIdex])
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])   #remark: action space【-max,max】
    opt.max_e_steps = env._max_episode_steps
    print(f'Env:{EnvName[opt.EnvIdex]}  state_dim:{opt.state_dim}  action_dim:{opt.action_dim}  '
          f'max_a:{opt.max_action}  min_a:{env.action_space.low[0]}  max_e_steps:{opt.max_e_steps}')

    # Seed Everything
    env_seed = opt.seed
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Build SummaryWriter to record training curves
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)


    # Build DRL model
    if not os.path.exists('model'): os.mkdir('model')
    agent = TD3(state_dim=opt.state_dim, action_dim=opt.action_dim, max_action=1) # var: transfer argparse to dictionary
    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, turns=1)
            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            s, info = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False

            '''Interact & trian'''
            while not done:
                if total_steps < (10*opt.max_e_steps): a = env.action_space.sample() # warm up
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a) # dw: dead&win; tr: truncated
                # r = Reward_adapter(r, opt.EnvIdex)
                done = (dw or tr)

                agent.buffer.store(s, a, r, s_next, dw)
                s = s_next
                total_steps += 1

                '''train if its time'''
                # train 50 times every 50 steps rather than 1 training per step. Better!
                if (total_steps >= 2*opt.max_e_steps) and (total_steps % opt.update_every == 0):
                    agent.update()

                '''record & log'''
                if total_steps % opt.eval_interval == 0:
                    agent.explore_noise *= opt.explore_noise_decay
                    ep_r = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Namespace(dvc=device(type='cuda'), EnvIdex=1, write=False, render=False, Loadmodel=False, ModelIdex=30, seed=0, update_every=50, Max_train_steps=5000000, save_interval=100000, eval_interval=2000, delay_freq=1, gamma=0.99, net_width=256, a_lr=0.0001, c_lr=0.0001, batch_size=256, explore_noise=0.15, explore_noise_decay=0.998)
Env:LunarLanderContinuous-v2  state_dim:8  action_dim:2  max_a:1.0  min_a:-1.0  max_e_steps:1000
Random Seed: 0
EnvName:LLdV2, Steps: 2k, Episode Reward:-1166
EnvName:LLdV2, Steps: 4k, Episode Reward:-452
EnvName:LLdV2, Steps: 6k, Episode Reward:-208
EnvName:LLdV2, Steps: 8k, Episode Reward:-92
EnvName:LLdV2, Steps: 10k, Episode Reward:-128
EnvName:LLdV2, Steps: 12k, Episode Reward:-72
EnvName:LLdV2, Steps: 14k, Episode Reward:19
EnvName:LLdV2, Steps: 16k, Episode Reward:-85
EnvName:LLdV2, Steps: 18k, Episode Reward:-17
EnvName:LLdV2, Steps: 20k, Episode Reward:-32
EnvName:LLdV2,

KeyboardInterrupt: 

In [30]:
import gym
import numpy as np
from datetime import datetime
from ppo import PPO
from ddpg import DDPG
from td import TD3
import os


env_name = "LunarLanderContinuous-v2"

env = gym.make(env_name)
eval_env = gym.make(env_name)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])   #remark: action space【-max,max】
max_e_steps = env._max_episode_steps

reproducibility(3)
agent = TD3(state_dim=opt.state_dim, action_dim=opt.action_dim, max_action=1) 

total_steps = 0
env_seed = 3


while total_steps < 100000:

    state, info = env.reset(seed=3)
    env_seed += 1
    done = False

    while not done:
        if total_steps < 10*max_e_steps: 
            action = env.action_space.sample()
        else:
            action = agent.select_action(state, deterministic=False)
        
        state_, reward, done, tr, info = env.step(action)
        reward = Reward_adapter(reward, 1)
        done = (done or tr)

        agent.buffer.store(state, action, reward, state_, done)
        state = state_
        total_steps +=1

        if (total_steps >= 2*max_e_steps) and (total_steps % 50 == 0):
            agent.update()

        if total_steps % 1000 == 0:
            agent.explore_noise *= 0.988

            total_scores = 0
            for j in range(5):
                s, info = eval_env.reset()
                done = False
                while not done:
                    # Take deterministic actions at test time
                    a = agent.select_action(s, deterministic=True)
                    s_next, r, dw, tr, info = eval_env.step(a)
                    done = (dw or tr)

                    total_scores += r
                    s = s_next
            eval_s = total_scores / 5
            print(f'Steps: {int(total_steps)}, Episode Reward:{eval_s}')

    env.close()
    eval_env.close()


  if not isinstance(terminated, (bool, np.bool8)):


Steps: 1000, Episode Reward:-133.70651539273302
Steps: 2000, Episode Reward:-386.2711131134291
Steps: 3000, Episode Reward:-1079.5909419748411
Steps: 4000, Episode Reward:-531.8640370024275
Steps: 5000, Episode Reward:-357.3779769169705
Steps: 6000, Episode Reward:-393.44752598834987
Steps: 7000, Episode Reward:-298.6332106530454
Steps: 8000, Episode Reward:-390.49976887155395
Steps: 9000, Episode Reward:-494.52058582956926
Steps: 10000, Episode Reward:-71.08657570679081
Steps: 11000, Episode Reward:72.63717296339482
Steps: 12000, Episode Reward:-103.05285536330696
Steps: 13000, Episode Reward:-69.8877710872083
Steps: 14000, Episode Reward:-64.58550788672065
Steps: 15000, Episode Reward:-24.684381887021196
Steps: 16000, Episode Reward:-45.573699303399465
Steps: 17000, Episode Reward:7.338005654791584
Steps: 18000, Episode Reward:77.81901165201396
Steps: 19000, Episode Reward:156.68480132240438
Steps: 20000, Episode Reward:189.0323423219489
Steps: 21000, Episode Reward:176.3679329075756