# Imports

In [1]:
import gym
from gym.wrappers import Monitor
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

# Load Environment

In [2]:
# This is needed to load and render the Atari environments
!python -m atari_py.import_roms .\Roms\ROMS

In [3]:
env_name = 'Breakout-v0'
env = gym.make(env_name)

We'll check the action space to determine what algorithm to use. We'll also check the observation space as it might be a series of images allowing us to use a CNN incorporated into the policy.

In [4]:
env.action_space

Discrete(4)

In [5]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

We'll quickly render the environment with the agent taking random steps just to see what we're dealing with.

In [6]:
episodes = 5
for episode in range(1, episodes + 1):
    # resets state to its initial values
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        # renders the actual environment
        env.render()
        # generates random action
        action = env.action_space.sample()
        # env.step outputs 4 values that corresponds to our variables below
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()



Episode:1 Score:1.0
Episode:2 Score:0.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:1.0


# Modeling

Now we'll make the training environment. It will be vectorized so that we can train 4 environments simultaneously, speeding up training time

In [15]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

## A2C, 100k timesteps

We'll use an A2C with a CNN policy because the action space is discrete and a CNN because the observation space is a series of images.

In [16]:
log_path = os.path.join('Training', 'Logs')
model = A2C('CnnPolicy', env, verbose = 0, tensorboard_log=log_path)

In [17]:
%%time
model.learn(total_timesteps=100_000)

Wall time: 22min 36s


<stable_baselines3.a2c.a2c.A2C at 0x273fcfa6460>

### Evaluation

Out trained model is definitely better than taking random actions. So we'll increase the training time with the assumption that it will improve performance.

In [18]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model, env, n_eval_episodes=10, render = True)

(5.9, 2.118962010041709)

In [19]:
# closes rendering of environment
env.close()

In [24]:
# saving the model for future use
# path = os.path.join('Training', 'Saved Models', 'a2c_breakout')
# model.save(path)

## Model 2: A2C, 500k timesteps

In [20]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

log_path = os.path.join('Training', 'Logs')
model2 = A2C('CnnPolicy', env, verbose = 0, tensorboard_log=log_path)

In [21]:
%%time
model2.learn(total_timesteps=500_000)

Wall time: 1h 49min 7s


<stable_baselines3.a2c.a2c.A2C at 0x273fcfc1a90>

### Evaluation

Further evaluation has shown our assumption to be correct: More training results in a higher score.

In [22]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model2, env, n_eval_episodes=10, render = True)

(9.0, 1.7888543819998317)

In [23]:
# closes rendering of environment
env.close()

In [25]:
# save model
# path2 = os.path.join('Training', 'Saved Models', 'a2c_breakout500k')
# model2.save(path2)

## 1mil timesteps

In [26]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

In [27]:
%%time
model2.learn(total_timesteps=500_000)

Wall time: 1h 45min


<stable_baselines3.a2c.a2c.A2C at 0x273fcfc1a90>

### Evaluation

Our best model for Breakout! Had we more time and computation, we could increase our score.

In [28]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model2, env, n_eval_episodes=10, render = True)

(16.9, 6.862215385719105)

In [29]:
env.close()

In [30]:
# saving the model for future use
# path3 = os.path.join('Training', 'Saved Models', 'a2c_breakout1mil')
# model2.save(path3)