In [26]:
from keras import layers, backend as K, losses, models, optimizers
import numpy as np

from PIL import Image

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
from retro_contest.local import make
# Build the local retro-contest environment: Sonic the Hedgehog (Genesis),
# starting from the Green Hill Zone Act 1 save state.
env_spec = dict(
    game='SonicTheHedgehog-Genesis',
    state='GreenHillZone.Act1'
)
env = make( **env_spec )

In [15]:
# Discrete action set offered to the agent. Every discrete action expands to a
# 12-slot 0/1 button vector laid out in this fixed order:
#     B  A  M  S  U  D  L  R  C  Y  X  Z
button_order = 'BAMSUDLRCYXZ'

# One entry per discrete action: the letters of the buttons held down.
action_combos = (
    '',    # {}
    'L',   # { LEFT }
    'R',   # { RIGHT }
    'LD',  # { LEFT, DOWN }
    'RD',  # { RIGHT, DOWN }
    'D',   # { DOWN }
    'DB',  # { DOWN, B }
    'B',   # { B }
)

# Expand each combo into its button vector (1 = pressed, 0 = released).
actual_actions = [
    [ 1 if button in combo else 0 for button in button_order ]
    for combo in action_combos
]

In [34]:
# Size every observation is resized to before it reaches the network.
INPUT_SHAPE = ( 84, 84 )
# NOTE(review): WINDOW_LENGTH is not referenced by any of the visible cells
# (the replay memory is built with window_length=1) - confirm whether frame
# stacking was intended.
WINDOW_LENGTH = 4


class RetroProcessor( Processor ):
    """Adapter between the retro environment and the agent.

    Shrinks and grayscales observations, rescales state batches to floats,
    clips rewards, and maps discrete action indices to button vectors.
    """

    def process_observation( self, obs ):
        """Return `obs` resized to INPUT_SHAPE as an 8-bit grayscale array.

        `obs` must be an array that `Image.fromarray` accepts.
        """
        frame = Image.fromarray( obs ).resize( INPUT_SHAPE )
        grayscale = frame.convert( 'L' )

        return np.array( grayscale, dtype=np.uint8 )

    def process_state_batch( self, batch ):
        """Cast `batch` to float32 and divide by 255."""
        as_float = batch.astype( np.float32 )

        return as_float / 255.

    def process_reward( self, reward ):
        """Clamp `reward` to the interval [-1, 1].

        Presumably this follows the usual DQN practice of bounding the reward
        scale so one learning rate suits different games - confirm.
        """
        return np.clip( reward, a_min=-1., a_max=1. )

    def process_action( self, action ):
        """Look up the button vector in `actual_actions` for index `action`."""
        buttons = actual_actions[ action ]

        return buttons

PPO: loosely based on baselines/ppo2. (Note: the cells that follow build a keras-rl DQN agent, not PPO.)

In [57]:
# Convolutional Q-network: three conv layers, a 512-unit dense layer, and one
# linear output per discrete action.

# ( filters, kernel edge, stride ) for each convolution, in order.
conv_specs = (
    ( 32, 8, 4 ),
    ( 64, 4, 2 ),
    ( 64, 3, 1 ),
)

# Input is ( window, height, width ); the leading 1 matches the replay
# memory's window_length=1 and 84x84 matches the processor's INPUT_SHAPE.
input_layer = layers.Input( ( 1, 84, 84 ) )

# Move the window axis last so it serves as the channel axis for Conv2D.
# NOTE(review): this assumes a channels-last image data format - confirm.
x = layers.Permute( ( 2, 3, 1 ) )( input_layer )

for n_filters, kernel_edge, stride in conv_specs:
    x = layers.Conv2D(
        filters=n_filters,
        kernel_size=( kernel_edge, kernel_edge ),
        strides=stride,
        activation='relu'
    )( x )

x = layers.Flatten()( x )
x = layers.Dense( 512, activation='relu' )( x )

# One unbounded (linear) output per entry in actual_actions.
x = layers.Dense( len( actual_actions ), activation='linear' )( x )

model = models.Model( inputs=input_layer, outputs=x )

In [58]:
# Observation / reward / action adapter defined above.
processor = RetroProcessor()

# Replay memory holding up to 100000 entries, one frame per state.
# NOTE(review): WINDOW_LENGTH (4) is defined earlier but 1 is used here,
# consistent with the model's ( 1, 84, 84 ) input - confirm this is intended.
memory = SequentialMemory( window_length=1, limit=100000 )

# Epsilon-greedy exploration whose `eps` attribute is annealed linearly
# between value_max and value_min over nb_steps; value_test is the setting
# supplied for test runs.
eps_greedy = EpsGreedyQPolicy()
policy = LinearAnnealedPolicy(
    eps_greedy,
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000
)

In [59]:
# Scalar hyper-parameters of the agent, kept together so they are easy to tune.
agent_settings = dict(
    nb_actions=len( actual_actions ),
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=1,
    delta_clip=1.
)

# Assemble the agent from the network, exploration policy, replay memory and
# processor built in the cells above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    processor=processor,
    **agent_settings
)

# Compile with Adam and report mean absolute error as an extra metric.
optimizer = optimizers.Adam( lr=.00025 )
dqn.compile( optimizer, metrics=[ 'mae' ] )

In [None]:
# Short training run. Keras-style callbacks can be supplied through
# `callbacks=[ ... ]`; checkpointing and JSON logging are currently disabled.
# The disabled setup was:
#     ModelIntervalCheckpoint( 'dqn_' + args.env_name + '_weights_{step}.h5f', interval=250000 )
#     FileLogger( 'dqn_{}_log.json'.format( args.env_name ), interval=100 )
# NOTE(review): `args` is not defined in any visible cell, so those file names
# need replacing before the callbacks can be re-enabled.
# NOTE(review): nb_steps=10 is far below the agent's nb_steps_warmup=50000, so
# presumably no learning updates happen during this run - confirm.
fit_options = dict( nb_steps=10, log_interval=1 )
dqn.fit( env, **fit_options )

# Store the final weights once training has returned.
# NOTE(review): 'wrights' looks like a typo for 'weights'; the path is kept
# unchanged because other code may load from it.
weights_path = 'data/wrights'
dqn.save_weights( weights_path, overwrite=True )

# Evaluate the agent for 10 episodes with rendering enabled.
dqn.test( env, nb_episodes=10, visualize=True )

Training for 10 steps ...
Interval 1 (0 steps performed)
