# TFM | Reinforcement Learning | Daniel Zorrilla | Cartpole

## Installing additional dependencies

###### Installing Stable-Baselines3 and pyglet, a library for developing games and other visually rich applications

In [1]:
!pip install stable-baselines3[extra]



In [2]:
!pip install pyglet



# PPO Algorithm

## 1. Importing Dependencies

In [3]:
import os # provides a way of using OS dependent functionality. (files)
import gym # Open AI gym
from stable_baselines3 import PPO #PPO RL Algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # Creates a simple vectorized wrapper for multiple environments
from stable_baselines3.common.evaluation import evaluate_policy # Test how well a model is performing

## 2. Environments

In [4]:
# Gym ID of the environment used throughout the notebook.
environment_name = "CartPole-v0"

# Instantiate the environment from its ID.
env = gym.make(environment_name)

In [5]:
# Play a few episodes with randomly sampled actions and print each episode's
# accumulated reward.
episodes = 5
for episode in range(1, episodes + 1):
    # Start a new episode: reset the environment and zero the running score.
    state, done, score = env.reset(), False, 0

    while True:
        env.render()  # draw the current frame
        action = env.action_space.sample()  # random action from the action space
        n_state, reward, done, info = env.step(action)  # advance one step
        score += reward  # accumulate the episode reward
        if done:  # the environment reported the end of the episode
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:9.0
Episode:2 Score:9.0
Episode:3 Score:19.0
Episode:4 Score:16.0
Episode:5 Score:14.0


In [6]:
env.close() # Close the environment and the render window opened by env.render()

In [7]:
env.action_space # Inspect the agent's action space (displayed below as Discrete(2))

Discrete(2)

In [8]:
env.action_space.sample() # Draw one random action from the action space

1

In [9]:
env.observation_space # Inspect the observation space (displayed below as a 4-element float32 Box)

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [10]:
env.observation_space.sample() # Draw one random observation from the observation space

array([-4.2134857e-01, -9.8147361e+37,  2.6317251e-01, -1.5495867e+37],
      dtype=float32)

## 3. Train and create RL Model

In [11]:
log_path = os.path.join('Training', 'Cartpole') # Directory passed to the models as their TensorBoard log location

In [12]:
log_path # Show the resulting path (separator depends on the operating system)

'Training\\Cartpole'

#### Install PyTorch first if it is missing: `conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch`

In [13]:
# Build the environment inside the factory instead of closing over the name
# `env`: the original `lambda: env` captured a variable that the same statement
# rebinds to the vectorized wrapper, and only worked because DummyVecEnv calls
# the factory right away during construction.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# PPO with a multi-layer-perceptron policy; training metrics are logged under
# log_path for TensorBoard.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [15]:
model.learn(total_timesteps=100000) # Train the model for 100,000 timesteps

Logging to Training\Cartpole\PPO_1
-----------------------------
| time/              |      |
|    fps             | 920  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 593         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.002222451 |
|    clip_fraction        | 0.0167      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.359      |
|    explained_variance   | 0.588       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00205    |
|    n_updates            | 500         |
|    policy_gradient_loss | -0.00121    |
|    value_loss           | 5.11e-06    |
-----------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x19eb1552d30>

## 4. Save and Reload Model

In [16]:
PPO_Path = os.path.join('Training', 'Saved Models Cartpole', 'PPO_Model_Cartpole') # Path where the trained PPO model is saved

In [17]:
model.save(PPO_Path) # Save the trained model to PPO_Path

In [18]:
del model # Delete the in-memory model to simulate reloading it in production

In [19]:
model = PPO.load(PPO_Path, env = env) # Reload the model saved at PPO_Path and attach the environment to it

In [20]:
PPO_Path # Show the save path

'Training\\Saved Models Cartpole\\PPO_Model_Cartpole'

## 5. Evaluation

In [21]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the model over 5 episodes, rendering the environment



(200.0, 0.0)

In [22]:
env.close() # Close the environment and the render window used during evaluation

## 6. Testing Model

In [23]:
# Run the trained model for a few rendered episodes and print each episode's
# accumulated reward.
episodes = 5
for episode in range(1, episodes + 1):
    # Start a new episode: reset the environment and zero the running score.
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()  # draw the current frame
        action, _ = model.predict(obs)  # let the trained model choose the action
        obs, reward, done, info = env.step(action)  # advance one step
        score += reward
        if done:  # the environment reported the end of the episode
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [24]:
env.close() # Close the environment and the render window used in the test loop

## 7. Viewing Logs in Tensorboard

In [25]:
training_log_path = os.path.join(log_path, 'PPO_1') # Log folder of the first PPO run; the run name 'PPO_1' is hard-coded

In [26]:
training_log_path # Show the log folder path

'Training\\Cartpole\\PPO_1'

In [None]:
# Start TensorBoard on the selected log folder; this cell keeps running until it is stopped.
!tensorboard --logdir={training_log_path}

#### Run the TensorBoard command (from this cell or a command line), open http://localhost:6006 to view the visualization, and stop the cell to continue

## 8. Adding a callback to the training stage

In [27]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [28]:
save_path = os.path.join('Training','Saved Models Cartpole') # Folder where the evaluation callback saves the best model

In [29]:
# Evaluate on a dedicated environment. Running the evaluation episodes on the
# training environment would reset and step it in the middle of training.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Stop training as soon as the evaluated mean reward reaches 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Periodically evaluate the model during training.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,  # run the stop check on every new best model
    eval_freq=10000,                     # evaluate every 10,000 callback steps
    best_model_save_path=save_path,      # save the model whenever a new best is found
    verbose=1,
)

In [30]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # Fresh PPO model to train with the evaluation callback

Using cpu device


In [31]:
model.learn(total_timesteps=20000, callback=eval_callback) # Train for up to 20,000 timesteps with the evaluation callback attached

Logging to Training\Cartpole\PPO_2
-----------------------------
| time/              |      |
|    fps             | 915  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 593        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00835478 |
|    clip_fraction        | 0.101      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | 0.0032     |
|    learning_rate        | 0.0003     |
|    loss                 | 8.9        |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.016     |
|    value_loss           | 58.9       |
----------------------------------------
-----------------

<stable_baselines3.ppo.ppo.PPO at 0x19ec0b1fa00>

## 9. Changing Policies

In [40]:
# Custom network architecture: the actor (pi) and the value function (vf) each
# get four layers of 128 units.
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

In [41]:
# Create a new PPO model whose policy uses the custom architecture in net_arch
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [42]:
# Use a fresh evaluation callback for this run. An EvalCallback instance keeps
# its best mean reward between learn() calls, so reusing the one from the
# previous PPO run would compare this model against the old model's best score:
# a new best might never be recorded, saved, or trigger the stop threshold.
# A dedicated eval env and save folder keep this run independent of the others.
custom_arch_eval_callback = EvalCallback(
    DummyVecEnv([lambda: gym.make(environment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'PPO_custom_arch'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=custom_arch_eval_callback)

Logging to Training\Cartpole\PPO_4
-----------------------------
| time/              |      |
|    fps             | 655  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 378          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0147063695 |
|    clip_fraction        | 0.208        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.682       |
|    explained_variance   | 0.012        |
|    learning_rate        | 0.0003       |
|    loss                 | 4.14         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.026       |
|    value_loss           | 23.2         |
------------------------

<stable_baselines3.ppo.ppo.PPO at 0x19ec0b3cca0>

In [35]:
# Run the model with the new architecture for a few rendered episodes and
# print each episode's accumulated reward.
episodes = 5
for episode in range(1, episodes + 1):
    # New episode: reset the environment and zero the running score.
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        action, _ = model.predict(obs)  # action chosen by the trained model
        obs, reward, done, info = env.step(action)
        score += reward
        if done:  # episode finished
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [36]:
env.close() # Close the environment and the render window used in the test loop

## 10. Using DQN Algorithm

In [44]:
from stable_baselines3 import DQN # DQN RL Algorithm

In [45]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # DQN model with an MLP policy, logging to log_path

Using cpu device


In [46]:
# Use a fresh evaluation callback for DQN. An EvalCallback instance keeps its
# best mean reward between learn() calls, so reusing the one from the PPO runs
# would compare DQN against PPO's best score: DQN's best model might never be
# saved and the stop threshold might never fire.
# A dedicated eval env and save folder keep this run independent of the others.
# NOTE(review): the original trailing remark "#50.000" suggests 50,000
# timesteps was also tried - confirm the intended budget.
dqn_eval_callback = EvalCallback(
    DummyVecEnv([lambda: gym.make(environment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'DQN'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=dqn_eval_callback)

Logging to Training\Cartpole\DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4042     |
|    time_elapsed     | 0        |
|    total_timesteps  | 113      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4064     |
|    time_elapsed     | 0        |
|    total_timesteps  | 202      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.942    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4675     |
|    time_elapsed     | 0        |
|    total_timesteps  | 307      |
----------------------------------
--------------------

<stable_baselines3.dqn.dqn.DQN at 0x19ec9e739d0>

In [47]:
# Build the path for the trained DQN model and save the model there
DQN_Path = os.path.join('Training','Saved Models Cartpole', 'DQN_Model_Cartpole')
model.save(DQN_Path)

In [48]:
DQN_Path # Show the save path

'Training\\Saved Models Cartpole\\DQN_Model_Cartpole'

In [49]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the model over 5 episodes, rendering the environment

(44.8, 46.41292923313503)

In [50]:
env.close() # Close the environment and the render window used during evaluation

In [51]:
model # Display the current model object (shown below as a DQN instance)

<stable_baselines3.dqn.dqn.DQN at 0x19ec9e739d0>

In [52]:
# Run the trained DQN model for a few rendered episodes and print each
# episode's accumulated reward.
episodes = 5
for episode in range(1, episodes + 1):
    # New episode: reset the environment and zero the running score.
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()  # draw the current frame
        action, _ = model.predict(obs)  # action chosen by the trained model
        obs, reward, done, info = env.step(action)  # advance one step
        score += reward
        if done:  # episode finished
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[10.]
Episode:2 Score:[62.]
Episode:3 Score:[14.]
Episode:4 Score:[94.]
Episode:5 Score:[10.]


In [53]:
env.close() # Close the environment and the render window used in the test loop

## 11. Using A2C Algorithm

In [54]:
from stable_baselines3 import A2C # A2C RL Algorithm

In [56]:
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # A2C model with an MLP policy, logging to log_path

Using cpu device


In [59]:
# Use a fresh evaluation callback for A2C. An EvalCallback instance keeps its
# best mean reward between learn() calls, so reusing the one from the earlier
# runs would compare A2C against their best score: A2C's best model might never
# be saved and the stop threshold might never fire.
# A dedicated eval env and save folder keep this run independent of the others.
# NOTE(review): the original remark says the run was done first with 20,000 and
# then with 200,000 timesteps - confirm the intended budget.
a2c_eval_callback = EvalCallback(
    DummyVecEnv([lambda: gym.make(environment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'A2C'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=a2c_eval_callback)

Logging to Training\Cartpole\A2C_3
------------------------------------
| time/                 |          |
|    fps                | 435      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.516   |
|    explained_variance | -0.0634  |
|    learning_rate      | 0.0007   |
|    n_updates          | 20099    |
|    policy_loss        | 0.000105 |
|    value_loss         | 6.17e-08 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 430      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.463   |
|    explained_variance | 0.000849 |
|    learning_rate      | 0.0007   |
|    n_updates          | 20199    |
|    policy_loss        | 0.00011  |
|  

<stable_baselines3.a2c.a2c.A2C at 0x19ec9e7e940>

In [60]:
# Build the path for the trained A2C model and save the model there
A2C_Path = os.path.join('Training','Saved Models Cartpole', 'A2C_Model_Cartpole')
model.save(A2C_Path)

In [61]:
A2C_Path # Show the save path

'Training\\Saved Models Cartpole\\A2C_Model_Cartpole'

In [62]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the model over 5 episodes, rendering the environment

(200.0, 0.0)

In [63]:
env.close() # Close the environment and the render window used during evaluation

In [64]:
model # Display the current model object (shown below as an A2C instance)

<stable_baselines3.a2c.a2c.A2C at 0x19ec9e7e940>

In [65]:
# Run the trained A2C model for a few rendered episodes and print each
# episode's accumulated reward.
episodes = 5
for episode in range(1, episodes + 1):
    # New episode: reset the environment and zero the running score.
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()  # draw the current frame
        action, _ = model.predict(obs)  # action chosen by the trained model
        obs, reward, done, info = env.step(action)  # advance one step
        score += reward
        if done:  # episode finished
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [66]:
env.close() # Close the environment and the render window used in the test loop