# TFM | Reinforcement Learning | Daniel Zorrilla | Acrobot

## Installing additional dependencies

###### Installing Stable-Baselines3 and pyglet (a library for developing games and other visually rich applications)

In [1]:
!pip install stable-baselines3[extra]



In [2]:
!pip install pyglet



# PPO Algorithm

## 1. Importing Dependencies

In [1]:
import os # provides a way of using OS dependent functionality. (files)
import gym # Open AI gym
from stable_baselines3 import PPO #PPO RL Algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # Creates a simple vectorized wrapper for multiple environments
from stable_baselines3.common.evaluation import evaluate_policy # Test how well a model is performing

## 2. Environments

In [2]:
# Gym id of the task used throughout the notebook.
environment_name = "Acrobot-v1"

# Plain (non-vectorized) gym environment, used below for the random-agent rollout
# and for inspecting the action/observation spaces.
env = gym.make(environment_name)

In [3]:
# Baseline: play a few episodes with purely random actions and report each episode's return.
# Relies on the classic gym API in which step() returns (observation, reward, done, info).
episodes = 5  # Number of episodes to play
for episode in range(1, episodes + 1):
    env.reset()  # Start a new episode; a random agent does not need the initial observation
    done = False
    score = 0

    while not done:  # Keep stepping until the environment reports the episode is over
        env.render()  # Draw the current frame
        action = env.action_space.sample()  # Random action drawn from the action space
        # Only the reward and the done flag are used; observation and info are discarded.
        _obs, reward, done, _info = env.step(action)
        score += reward  # Accumulate the episode return
    print('Episode:{} Score:{}'.format(episode, score))  # Report episode number and return

Episode:1 Score:-500.0
Episode:2 Score:-481.0
Episode:3 Score:-500.0
Episode:4 Score:-500.0
Episode:5 Score:-500.0


In [4]:
# Close the environment, which also shuts the render window opened by the loop above.
env.close()

In [5]:
# Inspect the agent's action space (the cell output shows it is Discrete(3)).
env.action_space

Discrete(3)

In [6]:
# Draw one random action from the action space to see what an action looks like.
env.action_space.sample()

0

In [7]:
# Inspect the observation space (the cell output shows a 6-dimensional float32 Box).
env.observation_space

Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)

In [8]:
# Draw one random point from the observation space to see what an observation looks like.
env.observation_space.sample()

array([  0.73386836,  -0.4865981 ,   0.3885829 ,   0.76086515,
        -9.824807  , -13.917628  ], dtype=float32)

## 3. Train and create RL Model

In [9]:
# Directory under which the TensorBoard logs of every training run are written.
log_path = os.path.join(
    "Training",
    "Acrobot",
)

In [10]:
# Echo the log directory so the resolved path is visible in the notebook.
log_path

'Training\\Acrobot'

#### Install PyTorch first: *conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch*

In [11]:
# Build the vectorized training environment. The factory creates its own gym environment
# instead of closing over the notebook-level name `env`, which this very statement rebinds
# to the DummyVecEnv wrapper (a closure over `env` only works if it is called before the rebind).
env = DummyVecEnv([lambda: gym.make(environment_name)])

# PPO agent with a multi-layer-perceptron policy; training metrics go to TensorBoard under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [12]:
# Train the PPO agent for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

Logging to Training\Acrobot\PPO_1
-----------------------------
| time/              |      |
|    fps             | 695  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 493         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008623523 |
|    clip_fraction        | 0.0198      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.0207     |
|    learning_rate        | 0.0003      |
|    loss                 | 18.8        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0045     |
|    value_loss           | 141         |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x1fa03cea310>

## 4. Save and Reload Model

In [13]:
# File path (without extension) under which the trained PPO agent is stored.
PPO_Path = os.path.join(
    "Training",
    "Saved Models Acrobot",
    "PPO_Model_Acrobot",
)

In [14]:
# Write the trained PPO agent to PPO_Path.
model.save(PPO_Path)

In [15]:
# Drop the in-memory model so the next cell demonstrates a genuine reload from disk.
del model

In [16]:
# Reload the agent saved at PPO_Path and attach the existing vectorized environment to it.
model = PPO.load(PPO_Path, env = env)

In [17]:
# Echo the save path so the resolved location is visible in the notebook.
PPO_Path

'Training\\Saved Models Acrobot\\PPO_Model_Acrobot'

## 5. Evaluation

In [18]:
# Evaluate the reloaded PPO agent over 5 rendered episodes.
# The cell output is a pair of numbers (per the SB3 docs: mean episode reward and its standard deviation).
evaluate_policy(model, env, n_eval_episodes=5, render=True)



(-115.4, 48.565831610299846)

In [19]:
# Close the environment to shut the render window opened during evaluation.
env.close()

## 6. Testing Model

In [20]:
# Watch the trained PPO agent play a few rendered episodes and print each episode's return.
# `env` is the vectorized wrapper here, so rewards (and therefore the score) are one-element arrays.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # First observation of the new episode
    score = 0
    done = False

    while True:
        env.render()  # Show the current frame
        action, _ = model.predict(obs)  # Action chosen by the trained policy
        obs, reward, done, info = env.step(action)  # Advance the environment by one step
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-150.]
Episode:2 Score:[-131.]
Episode:3 Score:[-117.]
Episode:4 Score:[-176.]
Episode:5 Score:[-78.]


In [21]:
# Close the environment to shut the render window opened by the test loop.
env.close()

## 7. Viewing Logs in Tensorboard

In [22]:
# Log directory of the first PPO run.
# NOTE(review): the run suffix is hard-coded; later runs in this notebook log to PPO_2, PPO_3, ...
# so adjust 'PPO_1' to the run you want to inspect.
training_log_path = os.path.join(log_path, 'PPO_1')

In [23]:
# Echo the run directory so the resolved path is visible in the notebook.
training_log_path

'Training\\Acrobot\\PPO_1'

In [None]:
# Launch TensorBoard on the selected run directory via the shell.
# The In [None] marker shows this cell never finished; interrupt it to continue with the notebook.
!tensorboard --logdir={training_log_path}

#### The cell above launches TensorBoard from the command line; open http://localhost:6006 in a browser to view it, then stop the cell to continue.

## 8. Adding a callback to the training stage

In [24]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [25]:
# Directory in which the evaluation callback stores the best model found so far.
save_path = os.path.join(
    "Training",
    "Saved Models Acrobot",
)

In [26]:
# Stop training once the evaluated mean reward reaches the threshold.
# Acrobot returns are never positive (the random-agent episodes above score between -500 and -481),
# so a positive threshold such as 200 can never be reached and the callback would never fire.
# -100 is the "solved" threshold registered for Acrobot-v1 in Gym.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-100, verbose=1)

# Evaluate on a dedicated environment: running evaluation episodes on the training environment
# would reset it in the middle of rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Periodic evaluation during training.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,  # Checked every time evaluation finds a new best model
    eval_freq=10000,                     # Evaluate every 10,000 callback steps
    best_model_save_path=save_path,      # Save the model whenever a new best is found
    verbose=1,
)

In [27]:
# Fresh PPO agent with the default MLP policy, to be trained with the evaluation callback.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [28]:
# Train for 20,000 timesteps, passing eval_callback so evaluation runs during training.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Acrobot\PPO_2
-----------------------------
| time/              |      |
|    fps             | 735  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 504          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0072079394 |
|    clip_fraction        | 0.0326       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | -0.0122      |
|    learning_rate        | 0.0003       |
|    loss                 | 15.9         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00282     |
|    value_loss           | 131          |
-------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1fa114b2c70>

## 9. Changing Policies

In [29]:
# Custom network layout: separate actor ("pi") and value-function ("vf") networks,
# each made of four hidden layers with 128 units per layer.
net_arch = [
    {
        'pi': [128] * 4,
        'vf': [128] * 4,
    }
]

In [30]:
# Fresh PPO agent whose policy is built with the custom architecture defined in net_arch.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [31]:
# Train the custom-architecture PPO agent for 20,000 timesteps with the evaluation callback.
# NOTE(review): this passes the same eval_callback instance already used for the previous model;
# presumably its internal state (e.g. best reward so far) carries over between models - confirm this is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Acrobot\PPO_3
-----------------------------
| time/              |      |
|    fps             | 523  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 329         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012025047 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.0575      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.69        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00592    |
|    value_loss           | 40.3        |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x1fa11537b80>

In [32]:
# Watch the PPO agent with the custom architecture play a few rendered episodes.
# `env` is the vectorized wrapper, so the printed score is a one-element array.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # First observation of the new episode
    score = 0
    done = False

    while True:
        env.render()  # Show the current frame
        action, _ = model.predict(obs)  # Action chosen by the trained policy
        obs, reward, done, info = env.step(action)  # Advance the environment by one step
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-87.]
Episode:2 Score:[-90.]
Episode:3 Score:[-78.]
Episode:4 Score:[-85.]
Episode:5 Score:[-96.]


In [33]:
# Close the environment to shut the render window opened by the test loop.
env.close()

## 10. Using DQN Algorithm

In [34]:
from stable_baselines3 import DQN # DQN RL Algorithm

In [35]:
# DQN agent with an MLP policy on the same vectorized environment; logs go to the shared log_path.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [38]:
# Train the DQN agent for 20,000 timesteps with the evaluation callback.
# (The original trailing note read "50.000", which does not match the value used here.)
# NOTE(review): the same eval_callback instance was already used for the PPO runs - confirm reuse is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Acrobot\DQN_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.62     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2250     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.24     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2274     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 2270     |
|    time_elapsed     | 2        |
|    total_timesteps  | 6000     |
----------------------------------
Eval num_timesteps=76

<stable_baselines3.dqn.dqn.DQN at 0x1fa1153f760>

In [39]:
# Store the trained DQN agent alongside the other saved Acrobot models.
DQN_Path = os.path.join(
    'Training',
    'Saved Models Acrobot',
    'DQN_Model_Acrobot',
)
model.save(DQN_Path)

In [40]:
# Echo the DQN save path so the resolved location is visible in the notebook.
DQN_Path

'Training\\Saved Models Acrobot\\DQN_Model_Acrobot'

In [41]:
# Evaluate the DQN agent over 5 rendered episodes.
# The cell output is a pair of numbers (per the SB3 docs: mean episode reward and its standard deviation).
evaluate_policy(model, env, n_eval_episodes=5, render=True)

(-500.0, 0.0)

In [42]:
# Close the environment to shut the render window opened during evaluation.
env.close()

In [43]:
# Echo the current model object to confirm it is the DQN agent.
model

<stable_baselines3.dqn.dqn.DQN at 0x1fa1153f760>

In [44]:
# Watch the trained DQN agent play a few rendered episodes and print each episode's return.
# `env` is the vectorized wrapper, so the printed score is a one-element array.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # First observation of the new episode
    score = 0
    done = False

    while True:
        env.render()  # Show the current frame
        action, _ = model.predict(obs)  # Action chosen by the trained agent
        obs, reward, done, info = env.step(action)  # Advance the environment by one step
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-500.]
Episode:2 Score:[-500.]
Episode:3 Score:[-500.]
Episode:4 Score:[-500.]
Episode:5 Score:[-500.]


In [45]:
# Close the environment to shut the render window opened by the test loop.
env.close()

## 11. Using A2C Algorithm

In [46]:
from stable_baselines3 import A2C # A2C RL Algorithm

In [47]:
# A2C agent with an MLP policy on the same vectorized environment; logs go to the shared log_path.
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [50]:
# Train the A2C agent for 20,000 timesteps with the evaluation callback.
# (Author's note: a first run used 20.000 steps, a later one 200.000; the value here is 20000.)
# NOTE(review): the same eval_callback instance was already used for earlier models - confirm reuse is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Acrobot\A2C_3
-------------------------------------
| time/                 |           |
|    fps                | 374       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.0074   |
|    explained_variance | -0.0329   |
|    learning_rate      | 0.0007    |
|    n_updates          | 20099     |
|    policy_loss        | -7.09e-07 |
|    value_loss         | 6.78e-07  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 374       |
|    iterations         | 200       |
|    time_elapsed       | 2         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.0073   |
|    explained_variance | -0.0936   |
|    learning_rate      | 0.0007    |
|    n_updates          | 20199     |
|    policy_loss

<stable_baselines3.a2c.a2c.A2C at 0x1fa1155f160>

In [51]:
# Store the trained A2C agent alongside the other saved Acrobot models.
A2C_Path = os.path.join(
    'Training',
    'Saved Models Acrobot',
    'A2C_Model_Acrobot',
)
model.save(A2C_Path)

In [52]:
# Echo the A2C save path so the resolved location is visible in the notebook.
A2C_Path

'Training\\Saved Models Acrobot\\A2C_Model_Acrobot'

In [53]:
# Evaluate the A2C agent over 5 rendered episodes.
# The cell output is a pair of numbers (per the SB3 docs: mean episode reward and its standard deviation).
evaluate_policy(model, env, n_eval_episodes=5, render=True)

(-500.0, 0.0)

In [54]:
# Close the environment to shut the render window opened during evaluation.
env.close()

In [55]:
# Echo the current model object to confirm it is the A2C agent.
model

<stable_baselines3.a2c.a2c.A2C at 0x1fa1155f160>

In [56]:
# Watch the trained A2C agent play a few rendered episodes and print each episode's return.
# `env` is the vectorized wrapper, so the printed score is a one-element array.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # First observation of the new episode
    score = 0
    done = False

    while True:
        env.render()  # Show the current frame
        action, _ = model.predict(obs)  # Action chosen by the trained agent
        obs, reward, done, info = env.step(action)  # Advance the environment by one step
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-500.]
Episode:2 Score:[-500.]
Episode:3 Score:[-500.]
Episode:4 Score:[-500.]
Episode:5 Score:[-500.]


In [66]:
# Close the environment to shut the render window opened by the test loop.
env.close()