<a href="https://colab.research.google.com/github/felipeserna/cacharreando/blob/master/deep_q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the xvfb and python-opengl system packages.
# stdout and stderr are both redirected to /dev/null, so a failed install is silent.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [3]:
# Install gym and pyvirtualdisplay with pip.
# stdout and stderr are both redirected to /dev/null, so a failed install is silent.
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [4]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

In [5]:
from pyvirtualdisplay import Display
# Create a non-visible 400x300 virtual display and start it
# (presumably so the environment can render without a physical screen - confirm).
display = Display(size=(400, 300), visible=0)
display.start()

<pyvirtualdisplay.display.Display at 0x7fb63f098210>

In [None]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars
!pip install keras-rl2
!pip install gym
!pip install gym[atari]

In [7]:
#!/usr/bin/env python3
"""
train.py
Script that utilizes keras, keras-rl, and gym
to train an agent that can play Atari’s Breakout:
* Use keras-rl's DQNAgent, SequentialMemory, and EpsGreedyQPolicy
* Save the final policy network as policy.h5
"""
import gym
import numpy as np
from PIL import Image
from rl.agents.dqn import DQNAgent
from rl.callbacks import ModelIntervalCheckpoint, FileLogger
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.processors import Processor
import tensorflow.keras as K
from tensorflow.keras import layers


class AtariProcessor(Processor):
    """
    keras-rl Processor for Atari: preprocesses observations, state
    batches and rewards before the agent uses them.

    Preprocessing is based on the Deep Learning Quick Reference
    by Mike Bernico.
    """
    # (width, height) every frame is resized to
    FRAME_SIZE = (84, 84)

    def process_observation(self, observation):
        """
        Resize a frame to 84x84 and convert it to grayscale.

        Args:
            observation: numpy array with 3 dimensions
                (height, width, channel).

        Returns:
            numpy uint8 array of shape (84, 84).
        """
        # (height, width, channel)
        assert observation.ndim == 3
        # Retrieve image from array
        img = Image.fromarray(observation)
        # Resize, then convert to grayscale. Image.LANCZOS is the same
        # filter as the former Image.ANTIALIAS alias, which was removed
        # in Pillow 10 and raises AttributeError there.
        img = img.resize(self.FRAME_SIZE, Image.LANCZOS).convert('L')
        # Convert back to array
        processed_observation = np.array(img)
        # Assert output shape
        assert processed_observation.shape == self.FRAME_SIZE

        # Returned as uint8; scaling to float32 is done per batch in
        # process_state_batch
        return processed_observation.astype('uint8')

    def process_state_batch(self, batch):
        """
        Convert a batch of images to float32 and divide by 255.

        Args:
            batch: numpy array of frames.

        Returns:
            float32 numpy array with the same shape as batch.
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """
        Clip the reward to the range [-1, 1].

        Args:
            reward: reward value to clip.

        Returns:
            The reward limited to [-1, 1].
        """
        return np.clip(reward, -1., 1.)


def create_q_model(num_actions, window):
    """
    Build the Keras CNN defined by the Deepmind paper.

    Args:
        num_actions: number of units in the final linear layer
            (one output per action).
        window: number of 84x84 frames that make up one input state.

    Returns:
        A Keras Model taking inputs of shape (window, 84, 84) and
        producing num_actions outputs.
    """
    # (filters, kernel_size, strides) of each convolution, in order
    conv_specs = (
        (32, 8, (4, 4)),
        (64, 4, (2, 2)),
        (64, 3, (1, 1)),
    )

    frames = layers.Input(shape=(window, 84, 84))
    # Move the frame axis to the end: (window, 84, 84) -> (84, 84, window)
    features = layers.Permute((2, 3, 1))(frames)

    for n_filters, kernel, stride in conv_specs:
        conv = layers.Conv2D(filters=n_filters, kernel_size=kernel,
                             strides=stride, activation="relu",
                             data_format="channels_last")
        features = conv(features)

    features = layers.Flatten()(features)
    features = layers.Dense(512, activation="relu")(features)
    q_values = layers.Dense(num_actions, activation="linear")(features)

    return K.Model(inputs=frames, outputs=q_values)


if __name__ == '__main__':
    # Number of frames that make up one state
    window = 4

    # Environment
    env = gym.make("Breakout-v0")
    env.reset()
    num_actions = env.action_space.n

    # Q-network
    model = create_q_model(num_actions, window)
    model.summary()

    # Replay memory and observation/reward preprocessing
    memory = SequentialMemory(limit=1000000, window_length=window)
    processor = AtariProcessor()

    # Epsilon-greedy exploration with eps annealed linearly from 1.0 to 0.1.
    # NOTE(review): annealing spans 1,000,000 steps but fit() below runs only
    # 100,000, so eps ends at roughly 0.91 - confirm this is intended.
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps',
        value_max=1.,
        value_min=.1,
        value_test=.05,
        nb_steps=1000000,
    )

    dqn = DQNAgent(
        model=model,
        nb_actions=num_actions,
        policy=policy,
        memory=memory,
        processor=processor,
        nb_steps_warmup=50000,
        gamma=.99,
        target_model_update=10000,
        train_interval=4,
        delta_clip=1.,
    )
    dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

    # Train the agent
    dqn.fit(env, nb_steps=100000, log_interval=10000,
            visualize=False, verbose=2)

    # Save the final policy network weights
    dqn.save_weights('policy.h5', overwrite=True)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4, 84, 84)]       0         
_________________________________________________________________
permute (Permute)            (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3136)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               160614



   175/100000: episode: 1, duration: 29.706s, episode steps: 175, steps per second:   6, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.429 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   353/100000: episode: 2, duration: 0.624s, episode steps: 178, steps per second: 285, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.466 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   633/100000: episode: 3, duration: 0.946s, episode steps: 280, steps per second: 296, episode reward:  2.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 1.443 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   967/100000: episode: 4, duration: 1.123s, episode steps: 334, steps per second: 297, episode reward:  3.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 1.626 [0.000, 3.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  1133/100000: episode: 5, duration: 0.566s, episode steps: 166, steps per seco



 50072/100000: episode: 208, duration: 3.894s, episode steps: 172, steps per second:  44, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.517 [0.000, 3.000],  loss: 0.000960, mae: 0.048676, mean_q: 0.075536, mean_eps: 0.954968
 50249/100000: episode: 209, duration: 3.435s, episode steps: 177, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.610 [0.000, 3.000],  loss: 0.001811, mae: 0.042916, mean_q: 0.068084, mean_eps: 0.954856
 50433/100000: episode: 210, duration: 3.514s, episode steps: 184, steps per second:  52, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.511 [0.000, 3.000],  loss: 0.002109, mae: 0.042745, mean_q: 0.065920, mean_eps: 0.954692
 50623/100000: episode: 211, duration: 3.562s, episode steps: 190, steps per second:  53, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 1.516 [0.000, 3.000],  loss: 0.001039, mae: 0.041501, mean_q: 0.060

In [8]:
#!/usr/bin/env python3
"""
play.py
Script that can display a game played by the agent trained by train.py:
* Load the policy network saved in policy.h5
* The agent uses the GreedyQPolicy
"""
import gym
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy
import tensorflow.keras as K
# AtariProcessor = __import__('train').AtariProcessor
# create_q_model = __import__('train').create_q_model


if __name__ == '__main__':
    # Number of frames (screenshots) per state
    window = 4

    env = gym.make("Breakout-v0")
    env.reset()
    num_actions = env.action_space.n

    # Deep Q-Network, built with the same function used for training
    model = create_q_model(num_actions, window)
    memory = SequentialMemory(limit=1000000, window_length=window)
    processor = AtariProcessor()

    # Agent that always picks the action with the highest Q-value
    dqn = DQNAgent(
        model=model,
        nb_actions=num_actions,
        policy=GreedyQPolicy(),
        processor=processor,
        memory=memory,
    )
    dqn.compile(K.optimizers.Adam(learning_rate=.00025), metrics=['mae'])

    # Load the policy network weights saved by the training script
    dqn.load_weights('policy.h5')

    # Only works with 'visualize=False' if in Colab
    dqn.test(env, nb_episodes=10, visualize=False)

Testing for 10 episodes ...




Episode 1: reward: 0.000, steps: 170
Episode 2: reward: 1.000, steps: 230
Episode 3: reward: 3.000, steps: 302
Episode 4: reward: 0.000, steps: 171
Episode 5: reward: 1.000, steps: 232
Episode 6: reward: 1.000, steps: 232
Episode 7: reward: 1.000, steps: 10000
Episode 8: reward: 2.000, steps: 252
Episode 9: reward: 0.000, steps: 159
Episode 10: reward: 1.000, steps: 230
