<a href="https://colab.research.google.com/github/felipeserna/cacharreando/blob/master/deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Xvfb virtual framebuffer and the Python OpenGL bindings.
# stdout and stderr are both redirected to /dev/null, so a failed install is
# silent here and will only surface later when the packages are used.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [2]:
# Install gym and pyvirtualdisplay (the latter is imported in a later cell).
# All pip output, including errors, is discarded by the redirect.
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [4]:
from pyvirtualdisplay import Display
# Create a hidden (visible=0) 400x300 virtual display and start it.
# NOTE(review): the global name `display` is bound to this Display instance;
# presumably that is why IPython's display module is imported under the alias
# `ipythondisplay` in the imports cell - confirm before renaming either.
display = Display(visible=0, size=(400, 300))
# Left as a bare expression: its return value is what the cell shows as output.
display.start()

<pyvirtualdisplay.display.Display at 0x7fd8852d4590>

In [None]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars
!pip install keras-rl2
!pip install gym
!pip install gym[atari]

In [None]:
# Render up to 50 steps of a random-action rollout inline
# (display recipe adapted from Stack Overflow).
env = gym.make("Breakout-v0")
try:
    env.reset()
    prev_screen = env.render(mode='rgb_array')
    # Create the image artist once; later frames only swap its pixel data.
    # Calling plt.imshow() on every step would stack a new artist on the same
    # axes each iteration.
    frame_artist = plt.imshow(prev_screen)

    for i in range(50):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        screen = env.render(mode='rgb_array')

        frame_artist.set_data(screen)
        # wait=True defers the clear until the next output arrives.
        ipythondisplay.clear_output(wait=True)
        ipythondisplay.display(plt.gcf())

        if done:
            break

    ipythondisplay.clear_output(wait=True)
finally:
    # Close the environment even if rendering or stepping raised.
    env.close()

In [9]:
#!/usr/bin/env python3
"""
train.py
"""
import gym
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from tensorflow.keras import layers
import tensorflow.keras as K
from rl.processors import Processor
from rl.callbacks import ModelIntervalCheckpoint, FileLogger
from PIL import Image
import numpy as np


class AtariProcessor(Processor):
    """
    Preprocessing hooks for Atari frames, state batches and rewards.

    Observations are shrunk to 84x84 grayscale uint8 images, batches are
    rescaled to float32 in [0, 1], and rewards are clipped to [-1, 1].
    """
    def process_observation(self, observation):
        """
        Resize a frame to 84x84 and convert it to grayscale.

        Args:
            observation: numpy array with three dimensions
                (height, width, channel).

        Returns:
            numpy uint8 array of shape (84, 84).
        """
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
        # resampling filter and exists in older Pillow releases as well.
        img = img.resize((84, 84), Image.LANCZOS).convert('L')
        processed_observation = np.array(img)
        assert processed_observation.shape == (84, 84)
        # uint8 keeps each stored frame at one byte per pixel.
        return processed_observation.astype('uint8')

    def process_state_batch(self, batch):
        """
        Rescale a batch of stored frames from [0, 255] to [0, 1].

        The conversion to float32 happens here, per batch, so the frames
        returned by process_observation can stay uint8.

        Args:
            batch: numpy array of frame values.

        Returns:
            float32 numpy array with the same shape, divided by 255.
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """
        Clip the reward to the interval [-1, 1].

        Args:
            reward: scalar (or array-like) reward value.

        Returns:
            The reward limited to [-1.0, 1.0] via np.clip.
        """
        return np.clip(reward, -1., 1.)


def create_q_model(num_actions, window):
    """
    Build the convolutional Q-network.

    Args:
        num_actions: number of units in the final linear layer, one output
            per action.
        window: number of stacked 84x84 frames per input state.

    Returns:
        A Keras Model mapping inputs of shape (window, 84, 84) to
        `num_actions` linear outputs.
    """
    frames = layers.Input(shape=(window, 84, 84))

    # Move the frame axis last so the convolutions can run channels_last.
    # Original author's note: to run on GPU, drop this Permute, feed `frames`
    # straight into the first convolution and switch data_format to
    # "channels_first".
    features = layers.Permute((2, 3, 1))(frames)

    # (filters, kernel size, stride) for each convolution, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))
    for filters, kernel_size, stride in conv_specs:
        features = layers.Conv2D(filters, kernel_size, strides=stride,
                                 activation="relu",
                                 data_format="channels_last")(features)

    flattened = layers.Flatten()(features)
    hidden = layers.Dense(512, activation="relu")(flattened)
    q_values = layers.Dense(num_actions, activation="linear")(hidden)

    return K.Model(inputs=frames, outputs=q_values)


if __name__ == '__main__':
    # Train a DQN agent on Breakout and save its weights to 'policy.h5'.
    env = gym.make("Breakout-v0")
    try:
        env.reset()
        num_actions = env.action_space.n
        # Number of consecutive frames stacked into one state.
        window = 4
        model = create_q_model(num_actions, window)
        model.summary()
        memory = SequentialMemory(limit=1000000, window_length=window)
        processor = AtariProcessor()

        # Epsilon is annealed linearly from 1.0 to 0.1 over 1,000,000 steps.
        # NOTE(review): with nb_steps=100000 below, only the first tenth of
        # this schedule is traversed, so the agent stays mostly random -
        # confirm this is intended.
        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                                      value_max=1., value_min=.1,
                                      value_test=.05,
                                      nb_steps=1000000)

        # NOTE(review): nb_steps_warmup=50000 is half of the 100000 steps
        # passed to fit(); the captured log shows no loss values until step
        # ~50000.
        dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy,
                       memory=memory, processor=processor,
                       nb_steps_warmup=50000, gamma=.99,
                       target_model_update=10000,
                       train_interval=4,
                       delta_clip=1.)

        dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

        dqn.fit(env,
                nb_steps=100000,
                log_interval=10000,
                visualize=False,
                verbose=2)

        dqn.save_weights('policy.h5', overwrite=True)
    finally:
        # Close the environment even if training raised.
        env.close()

Model: "model_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 4, 84, 84)]       0         
_________________________________________________________________
permute_7 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten_7 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 512)               160



   241/100000: episode: 1, duration: 1.336s, episode steps: 241, steps per second: 180, episode reward:  1.000, mean reward:  0.004 [ 0.000,  1.000], mean action: 1.477 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   552/100000: episode: 2, duration: 1.390s, episode steps: 311, steps per second: 224, episode reward:  2.000, mean reward:  0.006 [ 0.000,  1.000], mean action: 1.395 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   727/100000: episode: 3, duration: 0.784s, episode steps: 175, steps per second: 223, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.526 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   992/100000: episode: 4, duration: 1.197s, episode steps: 265, steps per second: 221, episode reward:  2.000, mean reward:  0.008 [ 0.000,  1.000], mean action: 1.498 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  1165/100000: episode: 5, duration: 0.788s, episode steps: 173, steps per secon



 50280/100000: episode: 204, duration: 9.034s, episode steps: 296, steps per second:  33, episode reward:  2.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 1.527 [0.000, 3.000],  loss: 0.002268, mae: 0.019614, mean_q: 0.000393, mean_eps: 0.954874
 50512/100000: episode: 205, duration: 5.403s, episode steps: 232, steps per second:  43, episode reward:  1.000, mean reward:  0.004 [ 0.000,  1.000], mean action: 1.453 [0.000, 3.000],  loss: 0.001660, mae: 0.015650, mean_q: -0.004336, mean_eps: 0.954645
 50750/100000: episode: 206, duration: 5.531s, episode steps: 238, steps per second:  43, episode reward:  1.000, mean reward:  0.004 [ 0.000,  1.000], mean action: 1.534 [0.000, 3.000],  loss: 0.004169, mae: 0.015213, mean_q: -0.001293, mean_eps: 0.954433
 51070/100000: episode: 207, duration: 7.298s, episode steps: 320, steps per second:  44, episode reward:  3.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 1.522 [0.000, 3.000],  loss: 0.002351, mae: 0.014193, mean_q: -0.

In [10]:
#!/usr/bin/env python3
"""
play.py
"""
import gym
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
import tensorflow.keras as K

# create_q_model = __import__('train').create_q_model
# AtariProcessor = __import__('train').AtariProcessor


if __name__ == '__main__':
    # Rebuild the agent, load the weights saved in 'policy.h5' and run
    # 10 test episodes.
    # In the notebook, create_q_model and AtariProcessor come from the
    # training cell; as a standalone play.py they would need the commented-out
    # imports above.
    env = gym.make("Breakout-v0")
    try:
        env.reset()
        num_actions = env.action_space.n
        # screenshots per state
        window = 4
        model = create_q_model(num_actions, window)
        memory = SequentialMemory(limit=1000000, window_length=window)
        processor = AtariProcessor()
        dqn = DQNAgent(model=model, nb_actions=num_actions,
                       processor=processor, memory=memory)
        dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

        dqn.load_weights('policy.h5')

        # Original author's note: only works with visualize=False.
        # NOTE(review): presumably because no interactive window is available
        # in this environment - confirm.
        dqn.test(env, nb_episodes=10, visualize=False)
    finally:
        # Close the environment even if loading or testing raised.
        env.close()

Testing for 10 episodes ...




Episode 1: reward: 1.000, steps: 228
Episode 2: reward: 2.000, steps: 297
Episode 3: reward: 0.000, steps: 163
Episode 4: reward: 1.000, steps: 222
Episode 5: reward: 0.000, steps: 158
Episode 6: reward: 2.000, steps: 295
Episode 7: reward: 0.000, steps: 160
Episode 8: reward: 0.000, steps: 168
Episode 9: reward: 0.000, steps: 165
Episode 10: reward: 1.000, steps: 232
