In [1]:
import gymnasium as gym


# Instantiate the CartPole-v1 environment by its registered id; the resulting
# `env` is what the following cell inspects (action / observation spaces).
env = gym.make("CartPole-v1")

In [2]:
# A notebook cell only renders the value of its *last* expression, so a bare
# `env.action_space` on an earlier line is evaluated and thrown away. Send it
# through display() (injected into the namespace by the IPython kernel) so both
# spaces are visible; the observation space stays the cell's result.
display(env.action_space)
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [8]:
%load_ext autoreload
%autoreload 2
import gym
import numpy as np
from datetime import datetime
from ppo import PPO
import os

# ---- Environment -----------------------------------------------------------
env_name = "CartPole-v1"
# Chooses how action_dim is read from the action space further down
# (`.n` when False, `.shape[0]` when True).
has_continuous_action_space = False

# ---- Run length (counted in environment steps) -----------------------------
max_ep_len = 400                   # upper bound on steps taken in one episode
max_training_timesteps = 100_000   # the outer loop runs while time_step <= this

# ---- Reporting / checkpoint cadence (environment steps) --------------------
# NOTE: print_freq is defined here but not referenced by the training loop in
# this cell; console output is currently driven by log_freq.
print_freq = 2 * max_ep_len
log_freq = 10
save_model_freq = 100_000

# ---- PPO hyperparameters (forwarded unchanged to the PPO constructor) ------
action_std = None
K_epochs = 200
eps_clip = 0.1
gamma = 0.99

lr_actor = 1e-4
lr_critic = 1e-4

# Build the environment and read the agent's input/output sizes off its spaces.
env = gym.make(env_name)

state_dim = env.observation_space.shape[0]
if has_continuous_action_space:
    action_dim = env.action_space.shape[0]
else:
    action_dim = env.action_space.n

ppo_agent = PPO(
    state_dim,
    action_dim,
    lr_actor,
    lr_critic,
    gamma,
    K_epochs,
    eps_clip,
    has_continuous_action_space,
    action_std,
)

# One timestamped CSV per run, written under PPO_logs/.
log_dir = "PPO_logs"
os.makedirs(log_dir, exist_ok=True)
log_f_name = f"{log_dir}/{env_name}_PPO_log_{datetime.now():%Y-%m-%d_%H-%M-%S}.csv"

print(f"Training environment: {env_name}")

# The handle stays open for the duration of training; it is closed after the
# training loop at the end of this cell.
log_f = open(log_f_name, "w+")
log_f.write('episode,timestep,reward\n')

# NOTE(review): datetime.now() yields naive local time, so the "(GMT)" label is
# only accurate when the kernel's timezone is UTC.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT):", start_time)

# Global step / episode counters.
time_step = i_episode = 0

# Accumulators behind the running average that is printed during training.
print_running_reward = print_running_episodes = 0

# Main training loop: roll out episodes, feed rewards / episode-boundary flags
# into the agent's buffer, and periodically update, log and checkpoint.
#
# The whole loop sits inside try/finally so the CSV log is closed even when the
# run is interrupted (e.g. KeyboardInterrupt from stopping the cell).
try:
    while time_step <= max_training_timesteps:
        state = env.reset()[0]
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):
            action = ppo_agent.select_action(state)
            state, reward, done, truncated, info = env.step(action)

            # An episode is over when the environment reports termination OR
            # truncation; stepping again without reset() after either is invalid.
            episode_over = done or truncated

            ppo_agent.buffer.rewards.append(reward)
            # NOTE(review): assumes the buffer uses is_terminals to delimit
            # episodes when computing returns - confirm against ppo.py.
            ppo_agent.buffer.is_terminals.append(episode_over)

            time_step += 1
            current_ep_reward += reward

            # Update the policy every 3 * max_ep_len steps. `%` and `*` share
            # the same precedence and associate left-to-right, so the
            # parentheses are required to get the intended modulus.
            if time_step % (max_ep_len * 3) == 0:
                ppo_agent.update()

            # Log + print every log_freq steps, but only once at least one
            # episode has finished since the last report (avoids dividing by 0).
            if time_step % log_freq == 0 and print_running_episodes > 0:
                # The CSV row records the reward accumulated so far in the
                # episode currently in progress.
                log_f.write(f'{i_episode},{time_step},{current_ep_reward}\n')
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            if time_step % save_model_freq == 0:
                ppo_agent.save(f"PPO_{env_name}.pth")

            if episode_over:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        i_episode += 1
finally:
    # Always release the log file handle, including on interruption or error.
    log_f.close()

# Reached only when training ran to completion.
# NOTE(review): datetime.now() yields naive local time, so the "(GMT)" label is
# only accurate when the kernel's timezone is UTC.
end_time = datetime.now().replace(microsecond=0)
print("Finished training at (GMT):", end_time)
print("Total training time:", end_time - start_time)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training environment: CartPole-v1
Started training at (GMT): 2024-06-17 12:10:07
Episode : 1 		 Timestep : 20 		 Average Reward : 16.0
Episode : 2 		 Timestep : 50 		 Average Reward : 27.0
Episode : 3 		 Timestep : 70 		 Average Reward : 24.0
Episode : 4 		 Timestep : 90 		 Average Reward : 20.0
Episode : 5 		 Timestep : 110 		 Average Reward : 14.0
Episode : 6 		 Timestep : 120 		 Average Reward : 15.0
Episode : 7 		 Timestep : 140 		 Average Reward : 16.0
Episode : 8 		 Timestep : 150 		 Average Reward : 14.0
Episode : 9 		 Timestep : 180 		 Average Reward : 29.0
Episode : 10 		 Timestep : 230 		 Average Reward : 48.0
Episode : 11 		 Timestep : 240 		 Average Reward : 15.0
Episode : 12 		 Timestep : 270 		 Average Reward : 30.0
Episode : 13 		 Timestep : 300 		 Average Reward : 23.0
Episode : 14 		 Timestep : 340 		 Average Reward : 40.0
Episode : 15 		 Timestep : 350 		 Average Reward : 16.0
Epis

KeyboardInterrupt: 