In [1]:
import gym
from gym.wrappers import Monitor
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

In [2]:
!python -m atari_py.import_roms .\Roms\ROMS

In [3]:
env_name = 'Breakout-v0'
env = gym.make(env_name)

In [4]:
env.action_space

Discrete(4)

In [5]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

In [6]:
episodes = 5
for episode in range(1, episodes + 1):
    # resets state to its initial values
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        # renders the actual environment
        env.render()
        # generates random action
        action = env.action_space.sample()
        # env.step outputs 4 values that corresponds to our variables below
        obs, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()



Episode:1 Score:1.0
Episode:2 Score:0.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:1.0


In [15]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

In [16]:
log_path = os.path.join('Training', 'Logs')
model = A2C('CnnPolicy', env, verbose = 0, tensorboard_log=log_path)

In [17]:
%%time
model.learn(total_timesteps=100_000)

Wall time: 22min 36s


<stable_baselines3.a2c.a2c.A2C at 0x273fcfa6460>

In [18]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model, env, n_eval_episodes=10, render = True)

(5.9, 2.118962010041709)

In [19]:
env.close()

In [24]:
path = os.path.join('Training', 'Saved Models', 'a2c_breakout')
model.save(path)

## Model 2: A2C, 500k timesteps

In [20]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

log_path = os.path.join('Training', 'Logs')
model2 = A2C('CnnPolicy', env, verbose = 0, tensorboard_log=log_path)

In [21]:
%%time
model2.learn(total_timesteps=500_000)

Wall time: 1h 49min 7s


<stable_baselines3.a2c.a2c.A2C at 0x273fcfc1a90>

In [22]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model2, env, n_eval_episodes=10, render = True)

(9.0, 1.7888543819998317)

In [23]:
env.close()

In [25]:
path2 = os.path.join('Training', 'Saved Models', 'a2c_breakout500k')
model2.save(path2)

In [26]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

In [27]:
%%time
model2.learn(total_timesteps=500_000)

Wall time: 1h 45min


<stable_baselines3.a2c.a2c.A2C at 0x273fcfc1a90>

## 1mil timesteps

In [28]:
env = make_atari_env('Breakout-v0', n_envs = 1, seed = 0)
env = VecFrameStack(env, n_stack = 4)

evaluate_policy(model2, env, n_eval_episodes=10, render = True)

(16.9, 6.862215385719105)

In [29]:
env.close()

In [30]:
path3 = os.path.join('Training', 'Saved Models', 'a2c_breakout1mil')
model2.save(path3)

In [None]:
env = make_atari_env('Breakout-v0', n_envs = 4, seed = 0)
env = VecFrameStack(env, n_stack = 4)

In [None]:
%%time
model2.learn(total_timesteps=500_000)