<a href="https://colab.research.google.com/github/felipeserna/cacharreando/blob/master/deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [2]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [4]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()

<pyvirtualdisplay.display.Display at 0x7fd8852d4590>

In [None]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars
!pip install keras-rl2
!pip install gym
!pip install gym[atari]

In [None]:
# From stackoverflow for displaying
env = gym.make("Breakout-v0")
env.reset()
prev_screen = env.render(mode='rgb_array')
plt.imshow(prev_screen)

for i in range(50):
  action = env.action_space.sample()
  obs, reward, done, info = env.step(action)
  screen = env.render(mode='rgb_array')

  plt.imshow(screen)
  ipythondisplay.clear_output(wait=True)
  ipythondisplay.display(plt.gcf())

  if done:
    break

ipythondisplay.clear_output(wait=True)
env.close()

In [7]:
#!/usr/bin/env python3
"""
train.py
"""
import gym
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from tensorflow.keras import layers
import tensorflow.keras as K
from rl.processors import Processor
from rl.callbacks import ModelIntervalCheckpoint, FileLogger
from PIL import Image
import numpy as np


class AtariProcessor(Processor):
    """
    preprocessing
    """
    def process_observation(self, observation):
        """ resizing and grayscale """
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        # resize and convert to grayscale
        img = img.resize((84, 84), Image.ANTIALIAS).convert('L')
        processed_observation = np.array(img)
        assert processed_observation.shape == (84, 84)
        # saves storage in experience memory
        return processed_observation.astype('uint8')

    def process_state_batch(self, batch):
        """
        Rescale without using too much memory
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """ rewards between -1 and 1 """
        return np.clip(reward, -1., 1.)


def create_q_model(num_actions, window):
    """
    Preprocessing
    """
    # Network

    inputs = layers.Input(shape=(window, 84, 84))
    # comment the line below to use with GPU
    inputs_sort = layers.Permute((2, 3, 1))(inputs)

    # Change data_format="channels_first" to use GPU
    # change inputs_sort by inputs to use GPU
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu",
                           data_format="channels_last")(inputs_sort)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu",
                           data_format="channels_last")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu",
                           data_format="channels_last")(layer2)

    layer4 = layers.Flatten()(layer3)

    layer5 = layers.Dense(512, activation="relu")(layer4)
    action = layers.Dense(num_actions, activation="linear")(layer5)

    return K.Model(inputs=inputs, outputs=action)


if __name__ == '__main__':
    env = gym.make("Breakout-v0")
    env.reset()
    num_actions = env.action_space.n
    window = 4
    model = create_q_model(num_actions, window)
    model.summary()
    memory = SequentialMemory(limit=1000000, window_length=window)
    processor = AtariProcessor()

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                  value_max=1., value_min=.1, value_test=.05,
                                  nb_steps=1000000)

    dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy,
                   memory=memory, processor=processor,
                   nb_steps_warmup=50000, gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])
    
    dqn.fit(env,
            nb_steps=5000,
            log_interval=10000,
            visualize=False,
            verbose=2)

    dqn.save_weights('policy.h5', overwrite=True)

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 4, 84, 84)]       0         
_________________________________________________________________
permute_5 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten_5 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 512)               160



  479/5000: episode: 1, duration: 2.276s, episode steps: 479, steps per second: 210, episode reward:  5.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 1.449 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  659/5000: episode: 2, duration: 0.802s, episode steps: 180, steps per second: 225, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.628 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  873/5000: episode: 3, duration: 0.957s, episode steps: 214, steps per second: 224, episode reward:  1.000, mean reward:  0.005 [ 0.000,  1.000], mean action: 1.463 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
 1055/5000: episode: 4, duration: 0.809s, episode steps: 182, steps per second: 225, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.511 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
 1450/5000: episode: 5, duration: 1.729s, episode steps: 395, steps per second: 228, episode

In [8]:
#!/usr/bin/env python3
"""
play.py
"""
import gym
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
import tensorflow.keras as K

# create_q_model = __import__('train').create_q_model
# AtariProcessor = __import__('train').AtariProcessor


if __name__ == '__main__':
    env = gym.make("Breakout-v0")
    env.reset()
    num_actions = env.action_space.n
    # screenshots per state
    window = 4  
    model = create_q_model(num_actions, window)  
    memory = SequentialMemory(limit=1000000, window_length=window)
    processor = AtariProcessor()
    dqn = DQNAgent(model=model, nb_actions=num_actions,
                   processor=processor, memory=memory)
    dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

    dqn.load_weights('policy.h5')

    # Only works with 'False'
    dqn.test(env, nb_episodes=4, visualize=False)

Testing for 4 episodes ...




Episode 1: reward: 0.000, steps: 10000
Episode 2: reward: 0.000, steps: 10000
Episode 3: reward: 0.000, steps: 10000
Episode 4: reward: 0.000, steps: 10000
