# RL using TF, Keras, OpenAI gym

# Enviroment

In [1]:
import gym
import random
import numpy as np
import os

In [2]:
env = gym.make('CartPole-v0')
states = env.observation_space.shape[0]  #obv format
actions = env.action_space.n  #possible actions

In [3]:
episodes = 10
for episode in range(1, episodes+1):
    obv = env.reset()
    done = False
    score = 0
    
    while not done: #if we win done == true
        env.render()
        action = random.choice([0,1]) #random element from list [0,1] or just use sample
        obv, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode,score))
env.close()

Episode:1 Score:27.0
Episode:2 Score:29.0
Episode:3 Score:11.0
Episode:4 Score:15.0
Episode:5 Score:44.0
Episode:6 Score:14.0
Episode:7 Score:27.0
Episode:8 Score:22.0
Episode:9 Score:30.0
Episode:10 Score:25.0


# Deep model

In [4]:
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Flatten
#from tensorflow.keras.optimizers import Adam

In [5]:
#model 4*24*24*2
#def build_model(states, actions):
#    model = Sequential()
#    model.add(Flatten(input_shape=(1,states)))
#    model.add(Dense(24, activation='relu'))
#    model.add(Dense(24, activation='relu'))
#    model.add(Dense(actions, activation='linear'))
#    return model

In [6]:
#model = build_model(states, actions)
#model.summary() to view the architecture

# Build Agent

In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [5]:
log_path = os.path.join('Training', 'logs')

In [6]:
log_path

'Training\\logs'

In [7]:
env = DummyVecEnv([lambda:env]) #vetorized enviroment
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path) # 3 possibile policies as baselines3
#We can open the logs for better visualize the evaluation 

Using cuda device


In [13]:
model.learn(total_timesteps=20000)#training model, complexity = more steps

Logging to Training\logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 907  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 610        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00462535 |
|    clip_fraction        | 0.0477     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.565     |
|    explained_variance   | 0.501      |
|    learning_rate        | 0.0003     |
|    loss                 | 12.2       |
|    n_updates            | 110        |
|    policy_gradient_loss | -0.00338   |
|    value_loss           | 88         |
----------------------------------------
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x296cf4d5240>

# Saving model (optional)

In [10]:
PPO_path = os.path.join('Training/','saved models','cartPoleModel')
PPO_path

'Training/saved models\\cartPoleModel'

In [11]:
#model.save(PPO_path)    #del model to delete the model

In [30]:
#model = PPO.load(PPO_path, env=env)

# Evaluate the Policy

In [9]:
evaluate_policy(model, env, n_eval_episodes=10, render=True )



(200.0, 0.0)

In [10]:
env.close()

# Test the model

In [14]:
#similar to the one wrote before
episodes = 10
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done: #if we win done == true
        env.render()
        action, _ = model.predict(obs) #using model prediction
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]
Episode:6 Score:[200.]
Episode:7 Score:[200.]
Episode:8 Score:[200.]
Episode:9 Score:[200.]
Episode:10 Score:[200.]
