## 1. Installing all the dependencies

In [1]:
# !pip install 'stable-baselines3[extra]' 

In [2]:
!python3 --version

Python 3.11.5


In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Load Environment 

In [3]:
env = gym.make("CartPole-v1", render_mode="human")

Test out the environment and actually loading the environment

In [4]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, truncated, info = env.step(action)
        score += reward
        # print('Episode:{}, Score :{} \n ---> n_state:{} \n ---> done:{}, truncated:{}, info:{}'.format(episode, score, n_state, done, truncated, info))
    print("Episodes:", episode, " Score:", score)
env.close()


Episodes: 1  Score: 51.0
Episodes: 2  Score: 14.0
Episodes: 3  Score: 10.0
Episodes: 4  Score: 14.0
Episodes: 5  Score: 42.0


2.1 Understanding the Environment

ACTION_SPACE: of pole
0: Push cart to the left
1: Push cart to the right

In [5]:
print( "action_space: ", env.action_space)
print( "action_space.sample(): ", env.action_space.sample())

action_space:  Discrete(2)
action_space.sample():  0


OBSERVATION_SPACE : CartPole 
Box(4,)
0: Cart_position
1: Cart_Velocity
2: Pole_angle
3: Pole_Angular_Velocity

In [6]:
print( "obs_space:", env.observation_space)
print( "obs_space.sample():", env.observation_space.sample())

obs_space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
obs_space.sample(): [ 4.4520354e+00  2.0612396e+37 -2.4021810e-01 -2.4867722e+37]


## 3. Train the RL Model

In [3]:
# Make the directories first training/logs
log_path = os.path.join('Training', 'Logs')

In [9]:
log_path

'Training/Logs'

Setting up the RL_Algorithm

In [31]:
env = gym.make("CartPole-v1", render_mode="human")
env = DummyVecEnv([lambda:env])

#choosed apprioprate model 
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)


Using cpu device


In [None]:
PPO??

Executing the Agent Learning, Model Training or  Training the Agent

In [11]:
## executing the learning or model training  or training the agent
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_12
-----------------------------
| time/              |      |
|    fps             | 4912 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 3344       |
|    iterations           | 2          |
|    time_elapsed         | 1          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00998391 |
|    clip_fraction        | 0.117      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.685     |
|    explained_variance   | -0.00613   |
|    learning_rate        | 0.0003     |
|    loss                 | 4.81       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0173    |
|    value_loss           | 42.6       |
----------------------------------------
--------------------

<stable_baselines3.ppo.ppo.PPO at 0x17a0a95d0>

## 4. Save and Reload the Model

In [12]:
## saving the model 
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO-CartPole-model-v0')
model.save(PPO_Path) 

In [13]:
# trying delete then reloading the saved model
del model

In [29]:
# reloading
model = PPO.load(PPO_Path, env=env)

In [None]:
# checking if loaded or not
# model??

In [23]:
# reusing the trained agent or model 
model.learn(total_timesteps= 20000)

Logging to Training/Logs/PPO_15
-----------------------------
| time/              |      |
|    fps             | 45   |
|    iterations      | 1    |
|    time_elapsed    | 45   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 46           |
|    iterations           | 2            |
|    time_elapsed         | 88           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0038913414 |
|    clip_fraction        | 0.029        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.561       |
|    explained_variance   | 0.729        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.503        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00286     |
|    value_loss           | 26           |
---------------------------

<stable_baselines3.ppo.ppo.PPO at 0x292923b90>

## 5. Evaluation

In [24]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(500.0, 0.0)

In [32]:
# Enjoy trained agent
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")
print("done")

done


## 6. Test Model

In [33]:
#testing the model

episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    while not done:
        env.render()
        action, _state = model.predict(obs) # using saved model 
        obs, reward, done, info = env.step(action)
        score += reward
        print( "--> Episode:{} => ".format(episode), action, obs, reward, done )
    print("Episdoe:{}, Score:{}".format(episode, score))

env.close()   
        

--> Episode:1 =>  [0] [[-0.02599332 -0.18074583 -0.02502888  0.32777467]] [1.] [False]
--> Episode:1 =>  [0] [[-0.02960824 -0.37550268 -0.01847339  0.61246073]] [1.] [False]
--> Episode:1 =>  [1] [[-0.03711829 -0.18012749 -0.00622417  0.3140171 ]] [1.] [False]
--> Episode:1 =>  [1] [[-4.0720843e-02  1.5082573e-02  5.6170313e-05  1.9377774e-02]] [1.] [False]
--> Episode:1 =>  [0] [[-0.04041919 -0.18004018  0.00044373  0.31207842]] [1.] [False]
--> Episode:1 =>  [0] [[-0.04402    -0.37516844  0.00668529  0.60490125]] [1.] [False]
--> Episode:1 =>  [1] [[-0.05152337 -0.18014063  0.01878332  0.3143315 ]] [1.] [False]
--> Episode:1 =>  [1] [[-0.05512618  0.01470879  0.02506995  0.02763092]] [1.] [False]
--> Episode:1 =>  [1] [[-0.054832    0.20946242  0.02562257 -0.25703794]] [1.] [False]
--> Episode:1 =>  [1] [[-0.05064275  0.40420935  0.02048181 -0.54153025]] [1.] [False]
--> Episode:1 =>  [0] [[-0.04255857  0.2088056   0.0096512  -0.24246487]] [1.] [False]
--> Episode:1 =>  [1] [[-0.0383

## 7. Viewing the logs in Tensorboard

In [4]:
training_log_path = os.path.join(log_path, 'PPO_14')

In [5]:
training_log_path

'Training/Logs/PPO_14'

In [6]:
os.listdir()

['.DS_Store',
 'Training',
 'CartPole-v1 based on PPO.ipynb',
 '.ipynb_checkpoints']

In [None]:
#viewing in the tensor_board 
!tensorboard --logdir={training_log_path}

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.14.1 at http://localhost:6006/ (Press CTRL+C to quit)


In [None]:
# !pip install tensorflow

## 8. Adding to callback to the training stages : BenchMarking wrt Reward_threshold

In [4]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
save_path = os.path.join('Training', 'Saved Models')
save_path

'Training/Saved Models'

In [5]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=250, verbose=1) # setting stop benchmark with reward:250
eval_callback = EvalCallback(env,
                             callback_on_new_best = stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)


NameError: name 'env' is not defined

In [None]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
# continuing the agent training from the previous state: eval_callback
model.learn(total_timesteps=20000, callback=eval_callback)


## 9. Changing the Policies
it means changing the neural network for PPO model 

In [6]:
net_arch = [dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [7]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

NameError: name 'env' is not defined

In [8]:
model??

Object `model` not found.


In [None]:
# retraining againg with changed policy
model.learn(total_timesteps=20000, callback=eval_callback)

## 10. Using Alternative Algorithm: for this DQN from PPO
 if used model doesn't provides optimal metrics and performance evaluation then change Algorthim

In [9]:
# import alternative model 
from stable_baselines3 import DQN

In [10]:
3model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # intansitating the alternative model 

NameError: name 'env' is not defined

In [None]:
# training new model 
model.learn(total_timesteps=20000)

In [None]:
# save the alternative model 
model.save()