# TFM | Reinforcement Learning | Daniel Zorrilla | Acrobot

## Installing additional dependencies

###### Installing Stable-Baselines3 and pyglet (a library for developing games and other visually rich applications)

In [None]:
!pip install stable-baselines3[extra]

In [None]:
!pip install pyglet

# PPO Algorithm

## 1. Importing Dependencies

In [1]:
import os # provides a way of using OS dependent functionality. (files)
import gym # Open AI gym
from stable_baselines3 import PPO #PPO RL Algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # Creates a simple vectorized wrapper for multiple environments
from stable_baselines3.common.evaluation import evaluate_policy # Test how well a model is performing

## 2. Environments

In [2]:
environment_name = 'Acrobot-v1' # Naming the Acrobot environment
env = gym.make(environment_name) # Creating the environment

In [3]:
# Baseline: play a few episodes with purely random actions and print each episode's return.
episodes = 5  # Number of episodes
for episode in range(1, episodes + 1):
    env.reset()  # Start a new episode; the initial observation is not needed for random play
    done = False
    score = 0

    while not done:  # While episode active
        env.render()  # Visualizing environment
        action = env.action_space.sample()  # Random action drawn from the action space
        _next_obs, reward, done, _info = env.step(action)  # Only the reward and the done flag are used
        score += reward  # Accumulate the episode return
    print('Episode:{} Score:{}'.format(episode,score)) # Printing episode and score

Episode:1 Score:-500.0
Episode:2 Score:-500.0
Episode:3 Score:-500.0
Episode:4 Score:-500.0
Episode:5 Score:-500.0


In [4]:
env.close() # Closing the render

In [5]:
env.action_space # Understanding the action space of the agent

Discrete(3)

In [6]:
env.action_space.sample() # Action random sample

1

In [7]:
env.observation_space # Understanding the observation space of this environment

Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)

In [8]:
env.observation_space.sample()

array([ 0.1180349,  0.7808099,  0.6646522,  0.7259256,  0.8066243,
       19.631912 ], dtype=float32)

## 3. Train and create RL Model

In [9]:
log_path = os.path.join('Training', 'Acrobot') # Directory where the TensorBoard logs are written

In [10]:
log_path

'Training\\Acrobot'

#### Install PyTorch: `conda install pytorch torchvision torchaudio cpuonly -c pytorch`

In [11]:
# Build the training environment. The factory creates its own Acrobot instance
# instead of closing over the name `env`, which this same cell rebinds to the
# DummyVecEnv wrapper (a late-binding `lambda: env` would return the wrapper
# itself if it were ever called again).
env = DummyVecEnv([lambda: gym.make(environment_name)])  # Vectorized wrapper around a single Acrobot env
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)  # PPO with a multi-layer perceptron policy

Using cpu device


In [12]:
model.learn(total_timesteps=40000) # Train model 40.000 steps. 

Logging to Training\Acrobot\PPO_1
-----------------------------
| time/              |      |
|    fps             | 523  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 692        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00898079 |
|    clip_fraction        | 0.0606     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | 0.0111     |
|    learning_rate        | 0.0003     |
|    loss                 | 21         |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0082    |
|    value_loss           | 147        |
----------------------------------------
------------------

<stable_baselines3.ppo.ppo.PPO at 0x1445b0a44f0>

## 4. Save and Reload Model

In [13]:
PPO_Path = os.path.join('Training', 'Saved Models Acrobot', 'PPO_Model_Acrobot') # Locate path

In [14]:
model.save(PPO_Path) #save model in PPO_Path

In [15]:
del model #Delete model to simulate reloading in production

In [16]:
model = PPO.load(PPO_Path, env = env) # Loading again the model saved in PPO_Path

In [17]:
PPO_Path

'Training\\Saved Models Acrobot\\PPO_Model_Acrobot'

## 5. Evaluation

In [18]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the model over 5 episodes; the tuple shown is (mean reward, std of reward)



(-102.0, 40.52653451752321)

In [19]:
env.close()

## 6. Testing Model

In [20]:
# Watch the trained PPO agent play a few episodes and print each episode's return.
# `env` is the DummyVecEnv built earlier, so rewards and done flags come back as
# one-element arrays (hence scores printed like `[-94.]`).
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # Initial observation of the new episode
    score = 0
    done = False

    while True:
        env.render()  # Visualize model
        action, _ = model.predict(obs)  # Ask the trained policy for the next action
        obs, reward, done, info = env.step(action)  # Advance the environment one step
        score += reward
        if done:  # Episode finished
            break
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[-94.]
Episode:2 Score:[-95.]
Episode:3 Score:[-102.]
Episode:4 Score:[-84.]
Episode:5 Score:[-103.]


In [21]:
env.close()

## 7. Viewing Logs in Tensorboard Dev

!tensorboard dev upload --logdir {log_path} --name "Experiment"

#### Run TensorBoard from the command line and open the visualization at http://localhost:6006; stop the cell to continue

## 8. Adding a callback to the training stage

In [22]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [23]:
save_path = os.path.join('Training','Saved Models Acrobot') #Where the best model is going to be saved

In [24]:
# Acrobot returns are never positive (the runs in this notebook score between about
# -500 and -80), so a positive threshold such as 200 can never be reached and early
# stopping would never fire. -100 is the reward threshold Gym registers for Acrobot-v1.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-100, verbose=1)

# Evaluate on a dedicated environment: running the evaluation episodes on the
# training env would reset it in the middle of a training rollout.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# NOTE(review): this one callback object is passed to several `learn` calls below,
# so its best-mean-reward bookkeeping presumably carries over between models — confirm
# that is intended, or build a fresh callback per model.
eval_callback = EvalCallback(eval_env,  # Environment used only for the periodic evaluations
                             callback_on_new_best=stop_callback,  # Run the threshold check whenever a new best model is found
                             eval_freq=10000,  # Evaluate every 10,000 calls of the callback
                             best_model_save_path=save_path,  # Save the model every time there is a new best one
                             verbose=1)

In [25]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [26]:
model.learn(total_timesteps=40000, callback=eval_callback) # Training model with callback argument

Logging to Training\Acrobot\PPO_2
-----------------------------
| time/              |      |
|    fps             | 1821 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1229        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004805004 |
|    clip_fraction        | 0.00708     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | -0.18       |
|    learning_rate        | 0.0003      |
|    loss                 | 17.6        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00185    |
|    value_loss           | 143         |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x14471ecb2b0>

## 9. Changing Policies

In [27]:
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])] # Custom network layout: the policy (pi) and value-function (vf) networks
                                                              # each get 4 hidden layers of 128 units

In [28]:
# Build a new PPO model whose policy uses the custom architecture defined in net_arch
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [29]:
model.learn(total_timesteps=40000, callback=eval_callback)

Logging to Training\Acrobot\PPO_3
-----------------------------
| time/              |      |
|    fps             | 1383 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 829         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009211478 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.00776    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.87        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00387    |
|    value_loss           | 34          |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x14471ee9370>

In [30]:
# Test the PPO model trained with the custom network architecture:
# play 5 rendered episodes and print the return of each one.
episodes = 5
for episode in range (1, episodes+1):
    obs = env.reset() # Initial observation of the new episode
    done = False
    score = 0

    while not done: # Step until the episode is reported as finished
        env.render() # Visualize the agent
        action, _ = model.predict(obs) # Use the trained model to choose the next action
        obs, reward, done, info = env.step(action) # Advance the environment one step
        score += reward # Accumulate the episode return
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[-90.]
Episode:2 Score:[-89.]
Episode:3 Score:[-97.]
Episode:4 Score:[-82.]
Episode:5 Score:[-85.]


In [31]:
env.close()

## 10. Using DQN Algorithm

In [32]:
from stable_baselines3 import DQN # DQN RL Algorithm

In [33]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [36]:
model.learn(total_timesteps=40000, callback=eval_callback) #40.000

Logging to Training\Acrobot\DQN_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.525    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5257     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5310     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 5278     |
|    time_elapsed     | 1        |
|    total_timesteps  | 6000     |
----------------------------------
---------------------

<stable_baselines3.dqn.dqn.DQN at 0x14471ee98b0>

In [37]:
DQN_Path = os.path.join('Training','Saved Models Acrobot', 'DQN_Model_Acrobot')
model.save(DQN_Path)

In [38]:
DQN_Path

'Training\\Saved Models Acrobot\\DQN_Model_Acrobot'

In [39]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the DQN model over 5 episodes; the tuple shown is (mean reward, std of reward)

(-500.0, 0.0)

In [40]:
env.close()

In [41]:
model

<stable_baselines3.dqn.dqn.DQN at 0x14471ee98b0>

In [42]:
# Test the trained DQN model: play 5 rendered episodes and print the return of each one.
episodes = 5
for episode in range (1, episodes+1): 
    obs = env.reset()  # Initial observation of the new episode
    done = False
    score = 0

    while not done: # Step until the episode is reported as finished
        env.render() # Visualize model
        action, _ = model.predict(obs) # Use the trained model to choose the next action
        obs, reward, done, info = env.step(action) # Advance the environment one step
        score += reward # Accumulate the episode return
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[-500.]
Episode:2 Score:[-500.]
Episode:3 Score:[-500.]
Episode:4 Score:[-500.]
Episode:5 Score:[-500.]


In [43]:
env.close()

## 11. Using A2C Algorithm

In [44]:
from stable_baselines3 import A2C # A2C RL Algorithm

In [45]:
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [48]:
model.learn(total_timesteps=40000, callback=eval_callback) #40.000

Logging to Training\Acrobot\A2C_3
-------------------------------------
| time/                 |           |
|    fps                | 603       |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.000128 |
|    explained_variance | -0.0171   |
|    learning_rate      | 0.0007    |
|    n_updates          | 16099     |
|    policy_loss        | -1.34e-08 |
|    value_loss         | 2.25e-06  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 601       |
|    iterations         | 200       |
|    time_elapsed       | 1         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.000151 |
|    explained_variance | -0.0432   |
|    learning_rate      | 0.0007    |
|    n_updates          | 16199     |
|    policy_loss

<stable_baselines3.a2c.a2c.A2C at 0x14474542910>

In [49]:
A2C_Path = os.path.join('Training','Saved Models Acrobot', 'A2C_Model_Acrobot')
model.save(A2C_Path)

In [50]:
A2C_Path

'Training\\Saved Models Acrobot\\A2C_Model_Acrobot'

In [51]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the A2C model over 5 episodes; the tuple shown is (mean reward, std of reward)

(-500.0, 0.0)

In [52]:
env.close()

In [53]:
model

<stable_baselines3.a2c.a2c.A2C at 0x14474542910>

In [54]:
# Test the trained A2C model: play 5 rendered episodes and print the return of each one.
episodes = 5
for episode in range (1, episodes+1): 
    obs = env.reset()  # Initial observation of the new episode
    done = False
    score = 0

    while not done: # Step until the episode is reported as finished
        env.render() # Visualize model
        action, _ = model.predict(obs) # Use the trained model to choose the next action
        obs, reward, done, info = env.step(action) # Advance the environment one step
        score += reward # Accumulate the episode return
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[-500.]
Episode:2 Score:[-500.]
Episode:3 Score:[-500.]
Episode:4 Score:[-500.]
Episode:5 Score:[-500.]


In [55]:
env.close()