# TFM | Reinforcement Learning | Daniel Zorrilla | Cartpole

## Installing additional dependencies

###### Installing stable baselines and pyglet library for developing games and other visually-rich applications

In [1]:
!pip install stable-baselines3[extra]



In [2]:
!pip install pyglet



# PPO Algorithm

## 1. Importing Dependencies

In [3]:
import os # provides a way of using OS dependent functionality. (files)
import gym # Open AI gym
from stable_baselines3 import PPO #PPO RL Algorithm
from stable_baselines3.common.vec_env import DummyVecEnv # Creates a simple vectorized wrapper for multiple environments
from stable_baselines3.common.evaluation import evaluate_policy # Test how well a model is performing

## 2. Environments

In [4]:
environment_name = 'CartPole-v0' # Naming the Cartpole-v0 environment
env = gym.make(environment_name) # Creating the environment

In [5]:
episodes = 5  # Number of episodes
for episode in range (1, episodes+1): # Resetting environment  
    state = env.reset() 
    done = False
    score = 0
    
    while not done: # While episode active
        env.render() # Visualizing environment
        action = env.action_space.sample() # Creating sample actions
        n_state, reward, done, info = env.step(action) # Defining step action
        score += reward # Getting score
    print('Episode:{} Score:{}'.format(episode,score)) # Printing episode and score

Episode:1 Score:33.0
Episode:2 Score:19.0
Episode:3 Score:24.0
Episode:4 Score:12.0
Episode:5 Score:31.0


In [6]:
env.close() # Closing the render

In [7]:
env.action_space # Understanding the action space of the agent

Discrete(2)

In [8]:
env.action_space.sample() # Action random sample

1

In [9]:
env.observation_space # Understanding the observation space of this environment

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [10]:
env.observation_space.sample()

array([-3.0860278e-01,  5.0972073e+37, -2.8087461e-01,  1.0164241e+38],
      dtype=float32)

## 3. Train and create RL Model

In [11]:
log_path = os.path.join('Training', 'Cartpole') #where it is saved the tensorboard log

In [12]:
log_path

'Training\\Cartpole'

#### Install Pytorch *conda install pytorch torchvision torchaudio cpuonly -c pytorch

In [13]:
env = gym.make(environment_name) # Create environment
env = DummyVecEnv([lambda: env]) # Wrapped environment using DummyVecEnv
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path) # Creating PPO Algorithm with MultiLayerPerceptron Policy

Using cpu device


In [14]:
model.learn(total_timesteps=40000) # Train model 40.000 steps. 

Logging to Training\Cartpole\PPO_1
-----------------------------
| time/              |      |
|    fps             | 404  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 587         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008680257 |
|    clip_fraction        | 0.0933      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.004      |
|    learning_rate        | 0.0003      |
|    loss                 | 6.86        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0131     |
|    value_loss           | 55.2        |
-----------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1752b698610>

## 4. Save and Reload Model

In [15]:
PPO_Path = os.path.join('Training', 'Saved Models Cartpole', 'PPO_Model_Cartpole') # Locate path

In [16]:
model.save(PPO_Path) #save model in PPO_Path

In [17]:
del model #Delete model to simulate reloading in production

In [18]:
model = PPO.load(PPO_Path, env = env) # Loading again the model saved in PPO_Path

In [19]:
PPO_Path

'Training\\Saved Models Cartpole\\PPO_Model_Cartpole'

## 5. Evaluation

In [20]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluating model with 10 episodes



(200.0, 0.0)

In [21]:
env.close()

## 6. Testing Model

In [22]:
episodes = 5
for episode in range (1, episodes+1): 
    obs = env.reset()  # Resetting episodes
    done = False
    score = 0
    
    while not done:
        env.render() # Visualize model
        action, _ = model.predict(obs) # Using trained model to predict actions
        obs, reward, done, info = env.step(action) # Defining step action
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [23]:
env.close()

## 7. Viewing Logs in Tensorboard Dev

!tensorboard dev upload --logdir {Path_To_Log} --name "Experiment"

## 8. Adding a callback to the training stage

In [24]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [25]:
save_path = os.path.join('Training','Saved Models Cartpole') #Where the best model is going to be saved

In [26]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1) #Stop our training when we achieved a 200 rwd
eval_callback = EvalCallback(env,  #callback that is triggered after each training run
                            callback_on_new_best=stop_callback, #callback to run in the new best model
                            eval_freq=10000, #Evaluation Frequency to 10.000 time steps
                            best_model_save_path=save_path, # Save the model everytime there is a new best model
                            verbose=1)

In [27]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [28]:
model.learn(total_timesteps=40000, callback=eval_callback) # Training model with callback argument

Logging to Training\Cartpole\PPO_2
-----------------------------
| time/              |      |
|    fps             | 2386 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1531       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00818003 |
|    clip_fraction        | 0.0813     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | -0.00773   |
|    learning_rate        | 0.0003     |
|    loss                 | 5.87       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.011     |
|    value_loss           | 47         |
----------------------------------------
-----------------

<stable_baselines3.ppo.ppo.PPO at 0x17539a71880>

## 9. Changing Policies

In [29]:
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])] #dictionary neural network for our custom actor=PI and valueFunctn
                                                              #128 un/eachLayer (4Lyrs)

In [30]:
#associating this new_Arch to the model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [31]:
model.learn(total_timesteps=40000, callback=eval_callback)

Logging to Training\Cartpole\PPO_3
-----------------------------
| time/              |      |
|    fps             | 1744 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1000        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013630532 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00535    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.92        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0248     |
|    value_loss           | 19.8        |
-----------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x17539a1c490>

In [32]:
# Testing the model with new architecture
episodes = 5
for episode in range (1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = model.predict(obs) #using trained model to predict actions
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [33]:
env.close()

## 10. Using DQN Algorithm

In [34]:
from stable_baselines3 import DQN # DQN RL Algorithm

In [36]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [39]:
model.learn(total_timesteps=40000, callback=eval_callback) #40.000

Logging to Training\Cartpole\DQN_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1823     |
|    time_elapsed     | 0        |
|    total_timesteps  | 65       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3463     |
|    time_elapsed     | 0        |
|    total_timesteps  | 165      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.931    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4720     |
|    time_elapsed     | 0        |
|    total_timesteps  | 291      |
----------------------------------
--------------------

<stable_baselines3.dqn.dqn.DQN at 0x1753bfa1790>

In [40]:
DQN_Path = os.path.join('Training','Saved Models Cartpole', 'DQN_Model_Cartpole')
model.save(DQN_Path)

In [41]:
DQN_Path

'Training\\Saved Models Cartpole\\DQN_Model_Cartpole'

In [42]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluating model with 10 episodes

(9.4, 0.48989794855663565)

In [43]:
env.close()

In [44]:
model

<stable_baselines3.dqn.dqn.DQN at 0x1753bfa1790>

In [45]:
# Testing DQN Trained algorithm
episodes = 5
for episode in range (1, episodes+1): 
    obs = env.reset()  # Resetting episodes
    done = False
    score = 0
    
    while not done:
        env.render() # Visualize model
        action, _ = model.predict(obs) # Using trained model to predict actions
        obs, reward, done, info = env.step(action) # Defining step action
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[9.]
Episode:2 Score:[10.]
Episode:3 Score:[11.]
Episode:4 Score:[10.]
Episode:5 Score:[9.]


In [46]:
env.close()

## 11. Using A2C Algorithm

In [48]:
from stable_baselines3 import A2C # A2C RL Algorithm

In [50]:
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [53]:
model.learn(total_timesteps=40000, callback=eval_callback) # 40.000

Logging to Training\Cartpole\A2C_3
------------------------------------
| time/                 |          |
|    fps                | 642      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.0155  |
|    explained_variance | -0.286   |
|    learning_rate      | 0.0007   |
|    n_updates          | 16099    |
|    policy_loss        | 4.93e-06 |
|    value_loss         | 2.74e-06 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 650       |
|    iterations         | 200       |
|    time_elapsed       | 1         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.00409  |
|    explained_variance | -0.238    |
|    learning_rate      | 0.0007    |
|    n_updates          | 16199     |
|    policy_loss        | -1.

<stable_baselines3.a2c.a2c.A2C at 0x175450c9ac0>

In [54]:
A2C_Path = os.path.join('Training','Saved Models Cartpole', 'A2C_Model_Cartpole')
model.save(A2C_Path)

In [55]:
A2C_Path

'Training\\Saved Models Cartpole\\A2C_Model_Cartpole'

In [56]:
evaluate_policy(model, env, n_eval_episodes=5, render=True) # Evaluating model with 10 episodes

(200.0, 0.0)

In [57]:
env.close()

In [58]:
model

<stable_baselines3.a2c.a2c.A2C at 0x175450c9ac0>

In [59]:
# Testing DQN Trained algorithm
episodes = 5
for episode in range (1, episodes+1): 
    obs = env.reset()  # Resetting episodes
    done = False
    score = 0
    
    while not done:
        env.render() # Visualize model
        action, _ = model.predict(obs) # Using trained model to predict actions
        obs, reward, done, info = env.step(action) # Defining step action
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [60]:
env.close()