## 1. Installing all the dependencies

In [1]:
# !pip install 'stable-baselines3[extra]' 

In [2]:
!python3 --version

Python 3.11.5


In [3]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

## 2. Load Environment 

In [4]:
env = gym.make("CartPole-v1", render_mode="human")

Test out the environment and actually loading the environment

In [5]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, truncated, info = env.step(action)
        score += reward
        # print('Episode:{}, Score :{} \n ---> n_state:{} \n ---> done:{}, truncated:{}, info:{}'.format(episode, score, n_state, done, truncated, info))
    print("Episodes:", episode, " Score:", score)
env.close()


Episodes: 1  Score: 53.0
Episodes: 2  Score: 30.0
Episodes: 3  Score: 18.0
Episodes: 4  Score: 11.0
Episodes: 5  Score: 40.0


2.1 Understanding the Environment

ACTION_SPACE: of pole
0: Push cart to the left
1: Push cart to the right

In [6]:
print( "action_space: ", env.action_space)
print( "action_space.sample(): ", env.action_space.sample())

action_space:  Discrete(2)
action_space.sample():  0


OBSERVATION_SPACE : CartPole 
Box(4,)
0: Cart_position
1: Cart_Velocity
2: Pole_angle
3: Pole_Angular_Velocity

In [7]:
print( "obs_space:", env.observation_space)
print( "obs_space.sample():", env.observation_space.sample())

obs_space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
obs_space.sample(): [ 3.5990555e+00 -2.2408152e+38  4.1125914e-01 -1.9779791e+38]


## 3. Train the RL Model

In [8]:
# Make the directories first training/logs
log_path = os.path.join('Training', 'Logs')

In [9]:
log_path

'Training/Logs'

Setting up the RL_Algorithm

In [10]:
env = gym.make("CartPole-v1", render_mode="human")
env = DummyVecEnv([lambda:env])

#choosed apprioprate model 
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)


Using cpu device


In [11]:
PPO??

[0;31mInit signature:[0m
[0mPPO[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpolicy[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mType[0m[0;34m[[0m[0mstable_baselines3[0m[0;34m.[0m[0mcommon[0m[0;34m.[0m[0mpolicies[0m[0;34m.[0m[0mActorCriticPolicy[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mgymnasium[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mEnv[0m[0;34m,[0m [0mstable_baselines3[0m[0;34m.[0m[0mcommon[0m[0;34m.[0m[0mvec_env[0m[0;34m.[0m[0mbase_vec_env[0m[0;34m.[0m[0mVecEnv[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0.0003[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_steps[0m[0;34m:[0m [0mint[

Executing the Agent Learning, Model Training or  Training the Agent

In [12]:
## executing the learning or model training  or training the agent
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_16
-----------------------------
| time/              |      |
|    fps             | 45   |
|    iterations      | 1    |
|    time_elapsed    | 45   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 89          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009017691 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.0143     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.72        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 51          |
-----------------------------------------
--

<stable_baselines3.ppo.ppo.PPO at 0x15cb67390>

## 4. Save and Reload the Model

In [13]:
## saving the model 
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO-CartPole-model-v0')
model.save(PPO_Path) 

In [14]:
# trying delete then reloading the saved model
del model

In [15]:
# reloading
model = PPO.load(PPO_Path, env=env)

In [16]:
# checking if loaded or not
# model??

In [17]:
# reusing the trained agent or model 
model.learn(total_timesteps= 20000)

Logging to Training/Logs/PPO_17
-----------------------------
| time/              |      |
|    fps             | 47   |
|    iterations      | 1    |
|    time_elapsed    | 43   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 47           |
|    iterations           | 2            |
|    time_elapsed         | 86           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0031677184 |
|    clip_fraction        | 0.0373       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.56        |
|    explained_variance   | 0.716        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.08         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00303     |
|    value_loss           | 16.8         |
---------------------------

<stable_baselines3.ppo.ppo.PPO at 0x16d5374d0>

## 5. Evaluation

In [18]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(500.0, 0.0)

In [19]:
# Enjoy trained agent
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")
print("done")

done


## 6. Test Model

In [20]:
#testing the model

episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    while not done:
        env.render()
        action, _state = model.predict(obs) # using saved model 
        obs, reward, done, info = env.step(action)
        score += reward
        print( "--> Episode:{} => ".format(episode), action, obs, reward, done )
    print("Episdoe:{}, Score:{}".format(episode, score))

env.close()   
        

--> Episode:1 =>  [1] [[ 0.00027563  0.21398045  0.01653245 -0.23789153]] [1.] [False]
--> Episode:1 =>  [0] [[0.00455524 0.01862626 0.01177462 0.05996003]] [1.] [False]
--> Episode:1 =>  [1] [[ 0.00492777  0.21357742  0.01297382 -0.22898479]] [1.] [False]
--> Episode:1 =>  [1] [[ 0.00919932  0.4085116   0.00839412 -0.51754725]] [1.] [False]
--> Episode:1 =>  [1] [[ 0.01736955  0.6035144  -0.00195682 -0.80757326]] [1.] [False]
--> Episode:1 =>  [0] [[ 0.02943983  0.40841928 -0.01810828 -0.5155065 ]] [1.] [False]
--> Episode:1 =>  [0] [[ 0.03760822  0.21355695 -0.02841841 -0.22858445]] [1.] [False]
--> Episode:1 =>  [1] [[ 0.04187936  0.40907323 -0.0329901  -0.5300944 ]] [1.] [False]
--> Episode:1 =>  [0] [[ 0.05006082  0.21443053 -0.04359199 -0.24798648]] [1.] [False]
--> Episode:1 =>  [0] [[ 0.05434943  0.01995737 -0.04855172  0.03063415]] [1.] [False]
--> Episode:1 =>  [1] [[ 0.05474858  0.21574074 -0.04793904 -0.27696344]] [1.] [False]
--> Episode:1 =>  [0] [[ 0.0590634   0.02133433

## 7. Viewing the logs in Tensorboard

In [21]:
training_log_path = os.path.join(log_path, 'PPO_14')

In [22]:
training_log_path

'Training/Logs/PPO_14'

In [23]:
os.listdir()

['.DS_Store',
 'Training',
 'CartPole-v1 based on PPO.ipynb',
 '.ipynb_checkpoints',
 'Read must.txt']

In [None]:
#viewing in the tensor_board 
!tensorboard --logdir={training_log_path}

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.15.0 at http://localhost:6006/ (Press CTRL+C to quit)


In [None]:
# !pip install tensorflow

## 8. Adding to callback to the training stages : BenchMarking wrt Reward_threshold

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
save_path = os.path.join('Training', 'Saved Models')
save_path

In [None]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=250, verbose=1) # setting stop benchmark with reward:250
eval_callback = EvalCallback(env,
                             callback_on_new_best = stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)


In [None]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
# continuing the agent training from the previous state: eval_callback
model.learn(total_timesteps=20000, callback=eval_callback)


## 9. Changing the Policies
it means changing the neural network for PPO model 

In [None]:
net_arch = [dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [None]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

In [None]:
model??

In [None]:
# retraining againg with changed policy
model.learn(total_timesteps=20000, callback=eval_callback)

## 10. Using Alternative Algorithm: for this DQN from PPO
 if used model doesn't provides optimal metrics and performance evaluation then change Algorthim

In [None]:
# import alternative model 
from stable_baselines3 import DQN

In [None]:
3model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # intansitating the alternative model 

In [None]:
# training new model 
model.learn(total_timesteps=20000)

In [None]:
# save the alternative model 
model.save()