## Breakout Atari 2600

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
#import matplotlib as plt
import matplotlib.pyplot as plt

#!conda install -c conda-forge gym
import gym

In [2]:
# Seed TensorFlow's and NumPy's global random number generators.
tf.random.set_seed(42)
np.random.seed(42)

#!pip install tf-agents

# Reinforcement Learning Breakout Atari 2600

## 1 - Preparar el Ambiente

Vamos a crear una clase AtariPreprocessingWithAutoFire, ya que el juego requiere disparar (acción Fire) para poner la bola en juego. Así, apenas se haga reset, el wrapper inicia el juego con una acción Fire, y hace lo mismo cada vez que se pierde una vida.

El ambiente a usar es **"BreakoutNoFrameskip-v4"**, ya que este no usa max pooling ni frame skipping.

In [3]:
from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4 # Stack 4 frames previos

max_episode_steps = 27000 # <=> 108k ALE frames, since 1 step = 4 frames
environment_name = "BreakoutNoFrameskip-v4"

class AtariPreprocessingWithAutoFire(AtariPreprocessing):
    """AtariPreprocessing variant that sends action 1 (FIRE) automatically.

    Action 1 is issued once right after every reset and once after every
    step in which a life was lost while the episode is still running.
    """

    def reset(self, **kwargs):
        """Reset the wrapped environment, then send action 1 once.

        Returns the observation produced by the reset itself; the result of
        the extra step is discarded.
        """
        initial_obs = super().reset(**kwargs)
        super().step(1)  # start the game
        return initial_obs

    def step(self, action):
        """Apply `action`, then send action 1 if that step cost a life.

        Returns the (obs, rewards, done, info) values of the step taken with
        `action`; the result of the automatic extra step is discarded.
        """
        lives_at_start = self.ale.lives()
        observation, reward, episode_over, info = super().step(action)
        life_lost = self.ale.lives() < lives_at_start
        if life_lost and not episode_over:
            # Restart play automatically after the ball was missed.
            super().step(1)
        return observation, reward, episode_over, info

# Build the environment with two gym wrappers: the custom auto-fire wrapper and FrameStack4.
env = suite_atari.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessingWithAutoFire, FrameStack4])

In [4]:
# Wrap the Python environment in a TFPyEnvironment so it can be driven from TensorFlow code.

from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(env)

### DQN

Vamos a crear una capa pequeña (una `Lambda` de Keras) para normalizar las observaciones. Las imágenes se almacenan usando bytes de 0 a 255 para usar menos RAM, pero queremos pasar flotantes de 0.0 a 1.0 a la red neuronal:

In [5]:
from tf_agents.networks.q_network import QNetwork

# Cast observations to float32 and scale them from 0..255 down to 0.0..1.0
preprocessing_layer = keras.layers.Lambda(
                          lambda obs: tf.cast(obs, np.float32) / 255.)

# Convolutional layers and the filters they use
# (each tuple is presumably (filters, kernel size, stride) -- confirm against the QNetwork docs)
conv_layer_params=[(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]

# One Dense(512) layer; after it come the 4 output values (Q-Values)
fc_layer_params=[512]

# Deep Q-Network (Q-Learning)
# This network returns one Q-Value per action (there are 4 of them).
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(), # 4 actions
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params)

### Definicion del DqnAgent

In [6]:
from tf_agents.agents.dqn.dqn_agent import DqnAgent

# Counter handed to the agent as its train_step_counter; also read by the epsilon schedule.
train_step = tf.Variable(0)
update_period = 4 # collect 4 environment steps per training iteration
# Use `learning_rate`: the `lr` keyword is a deprecated alias that newer
# Keras optimizers no longer accept.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)

# Schedule that computes the ε used for ε-greedy exploration
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, # initial ε
    decay_steps=250000 // update_period, # <=> 1,000,000 ALE frames
    end_learning_rate=0.01) # final ε

# The agent, built from the DQN created above plus all the parameters it needs
agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000, # <=> 32,000 ALE frames
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99, # discount factor
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))

# Initialize the agent
agent.initialize()

### Creamos el Replay Buffer

In [7]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Uniform replay buffer storing trajectories that match the agent's collect data spec.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=100000) # reduce this if you hit an Out-of-Memory (OOM) error

### Creamos el Observador

In [8]:
# The observer is simply the buffer's add_batch method; collect_driver receives it in its observers list.
replay_buffer_observer = replay_buffer.add_batch

### Funciones Utilitarias

In [9]:
class ShowProgress:
    """Observer that prints a one-line ``counter/total`` progress readout.

    Instances are callable with a trajectory. Every trajectory that is not a
    boundary increments ``counter``; the readout is rewritten in place (using
    a leading carriage return) every ``every`` counted steps and once more
    when ``counter`` reaches ``total``.

    Args:
        total: Value shown after the slash, i.e. the expected number of steps.
        every: Print once per this many counted steps. Defaults to 100.

    Raises:
        ValueError: If ``every`` is smaller than 1.
    """
    def __init__(self, total, every=100):
        if every < 1:
            raise ValueError("every must be >= 1")
        self.counter = 0
        self.total = total
        self.every = every
    def __call__(self, trajectory):
        # Boundary trajectories are not counted; the counter did not change,
        # so there is nothing new to print either.
        if trajectory.is_boundary():
            return
        self.counter += 1
        # Also print on completion so the last readout is total/total even
        # when total is not a multiple of the print interval.
        if self.counter % self.every == 0 or self.counter == self.total:
            print("\r{}/{}".format(self.counter, self.total), end="")

In [10]:
from tf_agents.metrics import tf_metrics

# Metrics tracked during training; collect_driver receives them as observers.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

# Inspect the first metric (NumberOfEpisodes) right after creation.
train_metrics[0].result()

<tf.Tensor: shape=(), dtype=int64, numpy=0>

In [11]:
from tf_agents.eval.metric_utils import log_metrics
import logging
# Set the root logger to INFO, then log the current value of every training metric.
logging.getLogger().setLevel(logging.INFO)
log_metrics(train_metrics)

INFO:absl: 
		 NumberOfEpisodes = 0
		 EnvironmentSteps = 0
		 AverageReturn = 0.0
		 AverageEpisodeLength = 0.0


### Creamos el Driver

In [12]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Driver used during training: it runs the agent's collect policy and passes each
# trajectory to the replay buffer observer and to the training metrics.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps=update_period) # collect `update_period` (4) steps per run

### Creamos El Collect Policy

El collect driver es quien explora el ambiente siguiendo una política dada, y luego manda esa información al observer, quien a su vez la pasa al Replay Buffer.

In [13]:
from tf_agents.policies.random_tf_policy import RandomTFPolicy

# Random policy used to pre-fill the replay buffer before training starts.
initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
# DynamicStepDriver
# - collects experience for a given number of steps per run

init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(20000)],
    num_steps=20000) # <=> 80,000 ALE frames

# Run the initial collection with the random policy
final_time_step, final_policy_state = init_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


20000/20000

### Crear el DataSet

In [14]:
# Dataset over the replay buffer: batches of 64 samples, each spanning 2 steps.
dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=2,
    num_parallel_calls=3).prefetch(3) # prefetch 3 batches

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [15]:
# Wrap the driver's run and the agent's train with tf_agents' `function` before using them in the main loop.
from tf_agents.utils.common import function

collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

### Main Loop (Entrenamiento del Agente)

In [16]:
def train_agent(n_iterations):
    """Alternate experience collection and training for `n_iterations` rounds.

    Each round runs the collect driver once, takes the next batch from the
    replay-buffer dataset, trains the agent on it and rewrites the current
    loss on a single output line. Every 1000th round (starting with round 0)
    the training metrics are logged as well.

    Args:
        n_iterations: Number of collect/train rounds to run.
    """
    current_step = None
    collect_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    batches = iter(dataset)
    for round_index in range(n_iterations):
        # Collect new experience, continuing from where the last round stopped.
        current_step, collect_state = collect_driver.run(current_step, collect_state)
        experience, _buffer_info = next(batches)
        loss_info = agent.train(experience)
        loss_value = loss_info.loss.numpy()
        print("\r{} loss:{:.5f}".format(round_index, loss_value), end="")
        if not round_index % 1000:
            log_metrics(train_metrics)
            
train_agent(n_iterations=50000) # how about 10,000,000? better than a human? try it....

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
INFO:absl: 
		 NumberOfEpisodes = 0
		 EnvironmentSteps = 4
		 AverageReturn = 0.0
		 AverageEpisodeLength = 0.0


999 loss:0.00003

INFO:absl: 
		 NumberOfEpisodes = 27
		 EnvironmentSteps = 4004
		 AverageReturn = 0.699999988079071
		 AverageEpisodeLength = 144.39999389648438


1995 loss:0.00004

INFO:absl: 
		 NumberOfEpisodes = 51
		 EnvironmentSteps = 8004
		 AverageReturn = 0.8999999761581421
		 AverageEpisodeLength = 150.10000610351562


2996 loss:0.00173

INFO:absl: 
		 NumberOfEpisodes = 78
		 EnvironmentSteps = 12004
		 AverageReturn = 1.100000023841858
		 AverageEpisodeLength = 158.3000030517578


3995 loss:0.00018

INFO:absl: 
		 NumberOfEpisodes = 101
		 EnvironmentSteps = 16004
		 AverageReturn = 1.100000023841858
		 AverageEpisodeLength = 160.0


4997 loss:0.00007

INFO:absl: 
		 NumberOfEpisodes = 125
		 EnvironmentSteps = 20004
		 AverageReturn = 1.2000000476837158
		 AverageEpisodeLength = 162.10000610351562


5996 loss:0.00002

INFO:absl: 
		 NumberOfEpisodes = 152
		 EnvironmentSteps = 24004
		 AverageReturn = 0.699999988079071
		 AverageEpisodeLength = 142.5


6998 loss:0.00001

INFO:absl: 
		 NumberOfEpisodes = 175
		 EnvironmentSteps = 28004
		 AverageReturn = 1.5
		 AverageEpisodeLength = 177.0


7995 loss:0.00001

INFO:absl: 
		 NumberOfEpisodes = 201
		 EnvironmentSteps = 32004
		 AverageReturn = 0.30000001192092896
		 AverageEpisodeLength = 129.6999969482422


8996 loss:0.00095

INFO:absl: 
		 NumberOfEpisodes = 225
		 EnvironmentSteps = 36004
		 AverageReturn = 1.600000023841858
		 AverageEpisodeLength = 177.8000030517578


9999 loss:0.00003

INFO:absl: 
		 NumberOfEpisodes = 247
		 EnvironmentSteps = 40004
		 AverageReturn = 1.899999976158142
		 AverageEpisodeLength = 192.1999969482422


10996 loss:0.00012

INFO:absl: 
		 NumberOfEpisodes = 271
		 EnvironmentSteps = 44004
		 AverageReturn = 1.2999999523162842
		 AverageEpisodeLength = 169.5


11997 loss:0.00005

INFO:absl: 
		 NumberOfEpisodes = 295
		 EnvironmentSteps = 48004
		 AverageReturn = 0.5
		 AverageEpisodeLength = 140.6999969482422


12998 loss:0.00006

INFO:absl: 
		 NumberOfEpisodes = 317
		 EnvironmentSteps = 52004
		 AverageReturn = 1.2000000476837158
		 AverageEpisodeLength = 162.8000030517578


13997 loss:0.00002

INFO:absl: 
		 NumberOfEpisodes = 342
		 EnvironmentSteps = 56004
		 AverageReturn = 1.100000023841858
		 AverageEpisodeLength = 162.60000610351562


14999 loss:0.00029

INFO:absl: 
		 NumberOfEpisodes = 365
		 EnvironmentSteps = 60004
		 AverageReturn = 2.200000047683716
		 AverageEpisodeLength = 204.10000610351562


15999 loss:0.00008

INFO:absl: 
		 NumberOfEpisodes = 389
		 EnvironmentSteps = 64004
		 AverageReturn = 1.5
		 AverageEpisodeLength = 175.1999969482422


16999 loss:0.00038

INFO:absl: 
		 NumberOfEpisodes = 412
		 EnvironmentSteps = 68004
		 AverageReturn = 1.5
		 AverageEpisodeLength = 177.0


17997 loss:0.00089

INFO:absl: 
		 NumberOfEpisodes = 437
		 EnvironmentSteps = 72004
		 AverageReturn = 1.5
		 AverageEpisodeLength = 171.1999969482422


18997 loss:0.00021

INFO:absl: 
		 NumberOfEpisodes = 460
		 EnvironmentSteps = 76004
		 AverageReturn = 1.100000023841858
		 AverageEpisodeLength = 156.0


19998 loss:0.00064

INFO:absl: 
		 NumberOfEpisodes = 485
		 EnvironmentSteps = 80004
		 AverageReturn = 1.2000000476837158
		 AverageEpisodeLength = 162.8000030517578


20997 loss:0.00062

INFO:absl: 
		 NumberOfEpisodes = 509
		 EnvironmentSteps = 84004
		 AverageReturn = 1.2000000476837158
		 AverageEpisodeLength = 164.3000030517578


21997 loss:0.00095

INFO:absl: 
		 NumberOfEpisodes = 534
		 EnvironmentSteps = 88004
		 AverageReturn = 1.399999976158142
		 AverageEpisodeLength = 170.60000610351562


22997 loss:0.00006

INFO:absl: 
		 NumberOfEpisodes = 555
		 EnvironmentSteps = 92004
		 AverageReturn = 1.7999999523162842
		 AverageEpisodeLength = 188.39999389648438


23997 loss:0.00019

INFO:absl: 
		 NumberOfEpisodes = 578
		 EnvironmentSteps = 96004
		 AverageReturn = 1.2999999523162842
		 AverageEpisodeLength = 167.60000610351562


24998 loss:0.00020

INFO:absl: 
		 NumberOfEpisodes = 600
		 EnvironmentSteps = 100004
		 AverageReturn = 2.0999999046325684
		 AverageEpisodeLength = 201.8000030517578


25998 loss:0.00012

INFO:absl: 
		 NumberOfEpisodes = 625
		 EnvironmentSteps = 104004
		 AverageReturn = 0.8999999761581421
		 AverageEpisodeLength = 150.39999389648438


26996 loss:0.00015

INFO:absl: 
		 NumberOfEpisodes = 648
		 EnvironmentSteps = 108004
		 AverageReturn = 1.2999999523162842
		 AverageEpisodeLength = 165.60000610351562


27998 loss:0.00035

INFO:absl: 
		 NumberOfEpisodes = 673
		 EnvironmentSteps = 112004
		 AverageReturn = 1.600000023841858
		 AverageEpisodeLength = 177.0


28998 loss:0.00012

INFO:absl: 
		 NumberOfEpisodes = 699
		 EnvironmentSteps = 116004
		 AverageReturn = 0.8999999761581421
		 AverageEpisodeLength = 154.10000610351562


29995 loss:0.00050

INFO:absl: 
		 NumberOfEpisodes = 719
		 EnvironmentSteps = 120004
		 AverageReturn = 1.7999999523162842
		 AverageEpisodeLength = 190.0


30996 loss:0.00013

INFO:absl: 
		 NumberOfEpisodes = 742
		 EnvironmentSteps = 124004
		 AverageReturn = 1.5
		 AverageEpisodeLength = 172.3000030517578


31997 loss:0.00019

INFO:absl: 
		 NumberOfEpisodes = 766
		 EnvironmentSteps = 128004
		 AverageReturn = 1.7000000476837158
		 AverageEpisodeLength = 180.89999389648438


32999 loss:0.00053

INFO:absl: 
		 NumberOfEpisodes = 788
		 EnvironmentSteps = 132004
		 AverageReturn = 2.0999999046325684
		 AverageEpisodeLength = 182.8000030517578


33998 loss:0.00181

INFO:absl: 
		 NumberOfEpisodes = 809
		 EnvironmentSteps = 136004
		 AverageReturn = 1.7999999523162842
		 AverageEpisodeLength = 178.1999969482422


34999 loss:0.00019

INFO:absl: 
		 NumberOfEpisodes = 829
		 EnvironmentSteps = 140004
		 AverageReturn = 2.0999999046325684
		 AverageEpisodeLength = 193.5


35998 loss:0.00217

INFO:absl: 
		 NumberOfEpisodes = 849
		 EnvironmentSteps = 144004
		 AverageReturn = 2.4000000953674316
		 AverageEpisodeLength = 196.5


36995 loss:0.00043

INFO:absl: 
		 NumberOfEpisodes = 869
		 EnvironmentSteps = 148004
		 AverageReturn = 2.0999999046325684
		 AverageEpisodeLength = 197.89999389648438


37998 loss:0.00027

INFO:absl: 
		 NumberOfEpisodes = 889
		 EnvironmentSteps = 152004
		 AverageReturn = 2.5
		 AverageEpisodeLength = 199.6999969482422


38996 loss:0.00044

INFO:absl: 
		 NumberOfEpisodes = 909
		 EnvironmentSteps = 156004
		 AverageReturn = 3.0999999046325684
		 AverageEpisodeLength = 211.3000030517578


39996 loss:0.00056

INFO:absl: 
		 NumberOfEpisodes = 931
		 EnvironmentSteps = 160004
		 AverageReturn = 2.0
		 AverageEpisodeLength = 174.1999969482422


40998 loss:0.00034

INFO:absl: 
		 NumberOfEpisodes = 950
		 EnvironmentSteps = 164004
		 AverageReturn = 3.5
		 AverageEpisodeLength = 242.10000610351562


41997 loss:0.00089

INFO:absl: 
		 NumberOfEpisodes = 966
		 EnvironmentSteps = 168004
		 AverageReturn = 4.0
		 AverageEpisodeLength = 249.8000030517578


42999 loss:0.00043

INFO:absl: 
		 NumberOfEpisodes = 984
		 EnvironmentSteps = 172004
		 AverageReturn = 3.5999999046325684
		 AverageEpisodeLength = 223.0


43997 loss:0.00052

INFO:absl: 
		 NumberOfEpisodes = 1000
		 EnvironmentSteps = 176004
		 AverageReturn = 5.400000095367432
		 AverageEpisodeLength = 246.3000030517578


44999 loss:0.00097

INFO:absl: 
		 NumberOfEpisodes = 1015
		 EnvironmentSteps = 180004
		 AverageReturn = 6.199999809265137
		 AverageEpisodeLength = 262.8999938964844


45999 loss:0.00055

INFO:absl: 
		 NumberOfEpisodes = 1030
		 EnvironmentSteps = 184004
		 AverageReturn = 3.799999952316284
		 AverageEpisodeLength = 232.6999969482422


46998 loss:0.00047

INFO:absl: 
		 NumberOfEpisodes = 1043
		 EnvironmentSteps = 188004
		 AverageReturn = 8.100000381469727
		 AverageEpisodeLength = 329.1000061035156


47996 loss:0.00054

INFO:absl: 
		 NumberOfEpisodes = 1056
		 EnvironmentSteps = 192004
		 AverageReturn = 6.0
		 AverageEpisodeLength = 303.5


48996 loss:0.00037

INFO:absl: 
		 NumberOfEpisodes = 1068
		 EnvironmentSteps = 196004
		 AverageReturn = 8.600000381469727
		 AverageEpisodeLength = 341.1000061035156


49999 loss:0.00131

In [17]:
from matplotlib.animation import FuncAnimation

def update_scene(num, frames, patch):
    """Put frame number `num` on `patch` and return the patch in a 1-tuple.

    Args:
        num: Index into `frames` of the frame to show.
        frames: Indexable collection of frames.
        patch: Object exposing `set_data`, updated in place.

    Returns:
        A one-element tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a FuncAnimation that shows `frames` one after another.

    Args:
        frames: Sequence of images; the first one is drawn immediately.
        repeat: Passed to FuncAnimation as `repeat`.
        interval: Passed to FuncAnimation as `interval`.

    Returns:
        The FuncAnimation object. The current pyplot figure is closed before
        returning.
    """
    figure = plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    result = FuncAnimation(figure, update_scene, **animation_options)
    plt.close()
    return result

In [18]:
%matplotlib inline
import matplotlib as mpl
# Font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Rendered frames collected by save_frames, in the order they were captured.
frames = []
def save_frames(trajectory):
    """Observer that renders the first wrapped environment and stores the frame.

    The `trajectory` argument is not used; one frame is appended to the
    module-level `frames` list on every call.
    """
    py_env = tf_env.pyenv.envs[0]
    frames.append(py_env.render(mode="rgb_array"))

# Run the agent's policy for 1000 steps, saving a rendered frame and updating the progress readout at each step.
watch_driver = DynamicStepDriver(
    tf_env,
    agent.policy,
    observers=[save_frames, ShowProgress(1000)],
    num_steps=1000)
final_time_step, final_policy_state = watch_driver.run()

# Show the recorded frames as an animation.
plot_animation(frames)

900/1000

INFO:matplotlib.animation:Animation.save using <class 'matplotlib.animation.HTMLWriter'>


1000/1000