# TFM | Reinforcement Learning | Daniel Zorrilla | MountainCar

## Installing additional dependencies

###### Installing Stable-Baselines3 and pyglet, a library for developing games and other visually rich applications

In [None]:
!pip install stable-baselines3[extra]

In [None]:
!pip install pyglet

# PPO Algorithm

## 1. Importing Dependencies

In [1]:
import os # provides a way of using OS dependent functionality. (files)
import gym # Open AI gym
from stable_baselines3 import PPO #PPO RL Algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # Creates a simple vectorized wrapper for multiple environments
from stable_baselines3.common.evaluation import evaluate_policy # Test how well a model is performing

## 2. Environments

In [2]:
# Gym environment id used throughout the notebook, and the (unwrapped) environment built from it.
environment_name = 'MountainCar-v0' # Id of the MountainCar environment
env = gym.make(environment_name) # Create the environment

In [3]:
# Baseline: play a few rendered episodes with randomly sampled actions and print each episode's score.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()  # Start a new episode
    score, done = 0, False

    # Keep stepping until the environment reports that the episode is over.
    while not done:
        env.render()
        action = env.action_space.sample()  # Random action from the action space
        n_state, reward, done, info = env.step(action)
        score += reward  # Accumulate the episode score
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0


In [4]:
env.close() # Close the environment and its render window once the random episodes are done

In [5]:
env.action_space # Action space of the agent; it displays as Discrete(3), i.e. three discrete actions

Discrete(3)

In [6]:
env.action_space.sample() # Draw one random action from the action space

0

In [7]:
env.observation_space # Observation space of this environment; it displays as a float32 Box with 2 components

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [8]:
env.observation_space.sample() # Draw one random observation from the observation space

array([-0.45368654,  0.04150966], dtype=float32)

## 3. Train and create RL Model

In [9]:
log_path = os.path.join('Training', 'MountainCar') # Directory where the TensorBoard logs are saved

In [10]:
# Display the log directory path
log_path

'Training\\MountainCar'

#### Install PyTorch with: *conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch*

In [11]:
# Build the vectorized training environment from a self-contained factory.
# The previous `lambda: env` closed over the very name that is rebound to the wrapper on the
# same line, so it only worked because DummyVecEnv calls the factory immediately.
env = DummyVecEnv([lambda: gym.make(environment_name)]) # Wrapped environment using DummyVecEnv
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path) # PPO with a multi-layer perceptron policy, logging to log_path

Using cpu device


In [12]:
model.learn(total_timesteps=20000) # Train the model for 20,000 timesteps

Logging to Training\MountainCar\PPO_1
-----------------------------
| time/              |      |
|    fps             | 870  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 567         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008367155 |
|    clip_fraction        | 0.00122     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | 0.000112    |
|    learning_rate        | 0.0003      |
|    loss                 | 12.5        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0017     |
|    value_loss           | 130         |
--------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2230c01edf0>

## 4. Save and Reload Model

In [13]:
PPO_Path = os.path.join('Training', 'Saved Models MountainCar', 'PPO_Model_MountainCar') # Path under which the trained PPO model is saved

In [14]:
model.save(PPO_Path) # Save the trained model to PPO_Path

In [15]:
del model # Delete the in-memory model so the next cell demonstrates reloading it from disk

In [16]:
model = PPO.load(PPO_Path, env = env) # Reload the model saved in PPO_Path and attach it to the environment

In [17]:
# Display the path of the saved PPO model
PPO_Path

'Training\\Saved Models MountainCar\\PPO_Model_MountainCar'

## 5. Evaluation

In [18]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the model over 5 rendered episodes; displays (mean reward, reward std)



(-200.0, 0.0)

In [19]:
env.close() # Close the render window opened during evaluation

## 6. Testing Model

In [59]:
# Roll out the current `model` for a few rendered episodes and print the score of each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # Start a new episode
    score, done = 0, False

    while not done:
        env.render()
        # Let the model choose the action for the latest observation.
        action, _states = model.predict(obs)
        # Apply the action; `done` ends the inner loop when the episode finishes.
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-200.]
Episode:2 Score:[-200.]
Episode:3 Score:[-200.]
Episode:4 Score:[-200.]
Episode:5 Score:[-200.]


In [60]:
env.close() # Close the render window opened while testing the model

## 7. Viewing Logs in Tensorboard

In [22]:
training_log_path = os.path.join(log_path, 'PPO_1') # Log sub-directory of the first PPO run (PPO_1)

In [23]:
# Display the path of the PPO_1 training logs
training_log_path

'Training\\MountainCar\\PPO_1'

In [24]:
# Launch TensorBoard on the PPO_1 logs. The command keeps running, so the cell must be
# interrupted before the rest of the notebook can continue.
!tensorboard --logdir={training_log_path}

^C


#### Run the TensorBoard command (in the cell above or from a command line), open http://localhost:6006 to view the logs, and stop the cell to continue

## 8. Adding a callback to the training stage

In [25]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [26]:
save_path = os.path.join('Training','Saved Models MountainCar') # Directory where the best model is going to be saved

In [27]:
# MountainCar-v0 gives a reward of -1 per step and episodes are capped at 200 steps, so episode
# returns are always negative and the previous threshold of +200 could never be reached.
# -110 is the mean return at which Gym considers MountainCar-v0 solved, so training can stop early.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-110, verbose=1)
# Periodically evaluate the agent during training and keep the best-performing model.
eval_callback = EvalCallback(env,
                            callback_on_new_best=stop_callback, # Run the stop check whenever a new best mean reward is found
                            eval_freq=10000, # Evaluate every 10,000 steps
                            best_model_save_path=save_path, # Save the model every time there is a new best model
                            verbose=1)

In [28]:
# Fresh PPO model (MLP policy) to be trained with the evaluation callback; logs go to log_path
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [29]:
model.learn(total_timesteps=20000, callback=eval_callback) # Train for 20,000 timesteps, passing the evaluation callback

Logging to Training\MountainCar\PPO_2
-----------------------------
| time/              |      |
|    fps             | 874  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 570         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009148173 |
|    clip_fraction        | 0.00425     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | -0.000311   |
|    learning_rate        | 0.0003      |
|    loss                 | 13.7        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00217    |
|    value_loss           | 132         |
--------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2231b5a1ca0>

## 9. Changing Policies

In [30]:
# Custom network architecture: separate actor (pi) and value-function (vf) networks,
# each with 4 hidden layers of 128 units.
net_arch = [{
    'pi': [128] * 4,
    'vf': [128] * 4,
}]

In [61]:
# Build a new PPO model whose policy uses the custom network architecture
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device


In [32]:
model.learn(total_timesteps=20000, callback=eval_callback) # Train the custom-architecture model for 20,000 timesteps with the evaluation callback

Logging to Training\MountainCar\PPO_3
-----------------------------
| time/              |      |
|    fps             | 613  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 364          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0084432345 |
|    clip_fraction        | 0.0496       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | -0.000114    |
|    learning_rate        | 0.0003       |
|    loss                 | 4.83         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00384     |
|    value_loss           | 35.4         |
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x2231512fac0>

In [62]:
# Test the model that uses the new architecture: play rendered episodes and print each score
episodes = 5
for episode in range(1, episodes + 1):
    score = 0
    done = False
    obs = env.reset()

    # Step the environment with the model's actions until the episode ends.
    while not done:
        env.render()
        action, _states = model.predict(obs)  # Action chosen by the trained model
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-200.]
Episode:2 Score:[-200.]
Episode:3 Score:[-200.]
Episode:4 Score:[-200.]
Episode:5 Score:[-200.]


In [63]:
env.close() # Close the render window opened while testing the custom-architecture model

## 10. Using DQN Algorithm

In [35]:
from stable_baselines3 import DQN # DQN RL Algorithm

In [36]:
# DQN model with an MLP policy on the same environment; logs go to log_path
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [39]:
model.learn(total_timesteps=20000, callback=eval_callback) # Train DQN for 20,000 timesteps with the evaluation callback (author's note: 50.000)

Logging to Training\MountainCar\DQN_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.848    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2914     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.696    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3305     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1600     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.544    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 3534     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2400     |
----------------------------------
-----------------

<stable_baselines3.dqn.dqn.DQN at 0x2231b637970>

In [40]:
# Path under which the trained DQN model is saved, followed by the save itself
DQN_Path = os.path.join('Training','Saved Models MountainCar', 'DQN_Model_MountainCar')
model.save(DQN_Path)

In [41]:
# Display the path of the saved DQN model
DQN_Path

'Training\\Saved Models MountainCar\\DQN_Model_MountainCar'

In [42]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the DQN model over 5 rendered episodes; displays (mean reward, reward std)

(-200.0, 0.0)

In [43]:
env.close() # Close the render window opened during the DQN evaluation

In [44]:
# Display the current model object to confirm it is the DQN instance
model

<stable_baselines3.dqn.dqn.DQN at 0x2231b637970>

In [45]:
# Test the trained DQN agent on a few rendered episodes, printing the score of each
episodes = 5
for episode in range(1, episodes + 1):
    # Fresh episode: initial observation, not finished, zero score.
    obs = env.reset()
    done, score = False, 0

    while not done:
        env.render()
        action, _ = model.predict(obs)  # Action chosen by the trained DQN model
        obs, reward, done, info = env.step(action)  # Apply it and read the outcome
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-200.]
Episode:2 Score:[-200.]
Episode:3 Score:[-200.]
Episode:4 Score:[-200.]
Episode:5 Score:[-200.]


In [46]:
env.close() # Close the render window opened while testing the DQN model

## 11. Using A2C Algorithm

In [47]:
from stable_baselines3 import A2C # A2C RL Algorithm

In [48]:
# A2C model with an MLP policy on the same environment; logs go to log_path
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [51]:
model.learn(total_timesteps=20000, callback=eval_callback) # Train A2C for 20,000 timesteps with the evaluation callback (author's note: first 20.000, then 200.000)

Logging to Training\MountainCar\A2C_3
------------------------------------
| time/                 |          |
|    fps                | 421      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.018   |
|    explained_variance | -0.191   |
|    learning_rate      | 0.0007   |
|    n_updates          | 20099    |
|    policy_loss        | 1.62e-06 |
|    value_loss         | 5.92e-07 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 396      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.0243  |
|    explained_variance | 0.025    |
|    learning_rate      | 0.0007   |
|    n_updates          | 20199    |
|    policy_loss        | 2.38e-06 |


<stable_baselines3.a2c.a2c.A2C at 0x2231b669a00>

In [52]:
# Path under which the trained A2C model is saved, followed by the save itself
A2C_Path = os.path.join('Training','Saved Models MountainCar', 'A2C_Model_MountainCar')
model.save(A2C_Path)

In [53]:
# Display the path of the saved A2C model
A2C_Path

'Training\\Saved Models MountainCar\\A2C_Model_MountainCar'

In [54]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluate the A2C model over 5 rendered episodes; displays (mean reward, reward std)

(-200.0, 0.0)

In [55]:
env.close() # Close the render window opened during the A2C evaluation

In [56]:
# Display the current model object to confirm it is the A2C instance
model

<stable_baselines3.a2c.a2c.A2C at 0x2231b669a00>

In [57]:
# Test the trained A2C agent: run rendered episodes and print the score obtained in each
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()  # Begin a new episode
    done = False
    score = 0

    # Alternate between predicting an action and stepping the environment until done.
    while not done:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward  # Accumulate the episode score
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[-200.]
Episode:2 Score:[-200.]
Episode:3 Score:[-200.]
Episode:4 Score:[-200.]
Episode:5 Score:[-200.]


In [58]:
env.close() # Close the render window opened while testing the A2C model