# RL using TF, Keras, OpenAI gym

# Enviroment

In [1]:
import gym
import random
import numpy as np
import os

In [2]:
env = gym.make('MountainCar-v0')
states = env.observation_space.shape[0]  #obv format
actions = env.action_space.n  #possible actions

In [3]:
episodes = 5
for episode in range(1, episodes+1):
    obv = env.reset()
    done = False
    score = 0
    
    while not done: #if we win done == true
        env.render()
        action = random.choice([0,1]) #random element from list [0,1] or just use sample
        obv, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0
Episode:6 Score:-200.0
Episode:7 Score:-200.0
Episode:8 Score:-200.0
Episode:9 Score:-200.0
Episode:10 Score:-200.0


# Deep model

In [4]:
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Flatten
#from tensorflow.keras.optimizers import Adam

In [5]:
#model 4*24*24*2
#def build_model(states, actions):
#    model = Sequential()
#    model.add(Flatten(input_shape=(1,states)))
#    model.add(Dense(24, activation='relu'))
#    model.add(Dense(24, activation='relu'))
#    model.add(Dense(actions, activation='linear'))
#    return model

In [6]:
#model = build_model(states, actions)
#model.summary() to view the architecture

# Build Agent

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [6]:
log_path = os.path.join('Training', 'logs')

In [7]:
log_path

'Training\\logs'

In [8]:
env = DummyVecEnv([lambda:env]) #vetorized enviroment
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # 3 possibile policies as baselines3
#We can open the logs for better visualize the evaluation 

Using cuda device


In [10]:
model.learn(total_timesteps=60000)#training model, complexity = more steps

Logging to Training\logs\PPO_8
-----------------------------
| time/              |      |
|    fps             | 743  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 598         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011240096 |
|    clip_fraction        | 0.0156      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | -0.0104     |
|    learning_rate        | 0.0003      |
|    loss                 | 17.7        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00377    |
|    value_loss           | 89.7        |
-----------------------------------------
---

-----------------------------------------
| time/                   |             |
|    fps                  | 494         |
|    iterations           | 13          |
|    time_elapsed         | 53          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008102974 |
|    clip_fraction        | 0.00386     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.08       |
|    explained_variance   | 1.49e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 59.9        |
|    n_updates            | 130         |
|    policy_gradient_loss | -0.000773   |
|    value_loss           | 159         |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 493         |
|    iterations           | 14          |
|    time_elapsed         | 58          |
|    total_timesteps      | 28672 

-----------------------------------------
| time/                   |             |
|    fps                  | 491         |
|    iterations           | 24          |
|    time_elapsed         | 100         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.002651828 |
|    clip_fraction        | 0.000146    |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.936      |
|    explained_variance   | 0.0195      |
|    learning_rate        | 0.0003      |
|    loss                 | 54.3        |
|    n_updates            | 240         |
|    policy_gradient_loss | -0.000227   |
|    value_loss           | 172         |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 492         |
|    iterations           | 25          |
|    time_elapsed         | 103         |
|    total_timesteps      | 51200 

<stable_baselines3.ppo.ppo.PPO at 0x1cf125a6e48>

# Saving model (optional)

In [4]:
PPO_path = os.path.join('Training/','saved models','cartPoleModel')
PPO_path

'Training/saved models\\cartPoleModel'

In [11]:
#model.save(PPO_path)    #del model to delete the model

In [30]:
#model = PPO.load(PPO_path, env=env)

# Evaluate the Policy

In [9]:
evaluate_policy(model, env, n_eval_episodes=10, render=True )



(200.0, 0.0)

In [10]:
env.close()

# Test the model

In [12]:
#similar to the one wrote before
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done: #if we win done == true
        env.render()
        action, _ = model.predict(obs) #using model prediction
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[-200.]
Episode:2 Score:[-200.]
Episode:3 Score:[-200.]
Episode:4 Score:[-200.]
Episode:5 Score:[-200.]
