In [1]:
import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

In [2]:
import GameHard
import DisplayIA


class YoushiEnv(py_environment.PyEnvironment):
    
    def __init__(self):
        self._env = GameHard.Game()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(15, 2), dtype=np.float64, minimum=-999, maximum=600, name='observation')
        self._state = self._env.reset()
        self._episode_ended = False
        
    def action_spec(self):
        return self._action_spec
    
    def observation_spec(self):
        return self._observation_spec
    
    def _reset(self):
        self._state = self._env.reset()
        self._episode_ended = False
        return ts.restart(self._state)
    
    def _step(self, action):
        
        if self._episode_ended:
            return self.reset()
        
        cumulative_reward = 0
        cumulative_done = False
        for i in range(4):
            obs, reward, done, info = self._env.step(action)
            cumulative_reward += reward
            self._state = obs
            if (done):
                cumulative_done = True
                break
        
        if cumulative_done:
            self._episode_ended = True
            return ts.termination(self._state, reward)
        else:
            return ts.transition(self._state, reward, discount=0.98)

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(YoushiEnv)

In [4]:
from tf_agents.networks.q_network import QNetwork
from tensorflow import keras

preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float64) / 1000)
fc_layer_params=[300, 200, 300]

q_net = QNetwork(tf_env.observation_spec(),
                tf_env.action_spec(),
                preprocessing_layers=preprocessing_layer,
                conv_layer_params=None,
                fc_layer_params=fc_layer_params)

In [5]:
from tf_agents.agents.dqn.dqn_agent import DqnAgent

train_step = tf.Variable(0)
update_period = 4
optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0, epsilon=0.00001, centered=True)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=1.0, 
                                                        decay_steps=250000 // update_period,
                                                        end_learning_rate=0.01)
agent = DqnAgent(tf_env.time_step_spec(), tf_env.action_spec(),
                 q_network=q_net, optimizer=optimizer,
                 target_update_period=2000,
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99,
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float

In [6]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                                               batch_size=tf_env.batch_size,
                                                               max_length=100000)
replay_buffer_observer = replay_buffer.add_batch

In [7]:
class ShowProgress:
    def __init__(self, total):
        self.counter = 0
        self.total = total
    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 100 == 0:
            print("/r{}/{}".format(self.counter, self.total), end="")

In [8]:
from tf_agents.metrics import tf_metrics

train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric()]

In [9]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

collect_driver = DynamicStepDriver(tf_env,
                                   agent.collect_policy,
                                   observers=[replay_buffer_observer] + train_metrics,
                                   num_steps=update_period)

In [10]:
from tf_agents.policies.random_tf_policy import RandomTFPolicy

initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
init_driver = DynamicStepDriver(tf_env,
                                initial_collect_policy,
                                observers=[replay_buffer.add_batch, ShowProgress(20000)],
                                num_steps=20000)
final_time_step, final_policy_state = init_driver.run()

/r100/20000/r200/20000/r300/20000/r400/20000/r500/20000/r600/20000/r700/20000/r800/20000/r900/20000/r1000/20000/r1100/20000/r1200/20000/r1300/20000/r1400/20000/r1500/20000/r1600/20000/r1700/20000/r1800/20000/r1900/20000/r2000/20000/r2100/20000/r2200/20000/r2300/20000/r2400/20000/r2500/20000/r2600/20000/r2700/20000/r2800/20000/r2900/20000/r3000/20000/r3100/20000/r3200/20000/r3300/20000/r3400/20000/r3500/20000/r3600/20000/r3700/20000/r3800/20000/r3900/20000/r4000/20000/r4100/20000/r4200/20000/r4300/20000/r4400/20000/r4500/20000/r4600/20000/r4600/20000/r4700/20000/r4800/20000/r4900/20000/r5000/20000/r5100/20000/r5200/20000/r5300/20000/r5400/20000/r5500/20000/r5600/20000/r5700/20000/r5800/20000/r5900/20000/r6000/20000/r6100/20000/r6200/20000/r6300/20000/r6400/20000/r6500/20000/r6600/20000/r6700/20000/r6800/20000/r6900/20000/r7000/20000/r7100/20000/r7200/20000/r7300/20000/r7400/20000/r7500/20000/r7600/20000/r7700/20000/r7800/20000/r7900/20000/r8000/20000/r8100/20000/r8200/20000/r8300/20000/

In [11]:
dataset = replay_buffer.as_dataset(sample_batch_size=64, num_steps=2, num_parallel_calls=10).prefetch(10)

In [12]:
from tf_agents.eval.metric_utils import log_metrics
import logging
logging.getLogger().setLevel(logging.INFO)
log_metrics(train_metrics)

INFO:absl: 
		 NumberOfEpisodes = 0
		 EnvironmentSteps = 0
		 AverageReturn = 0.0
		 AverageEpisodeLength = 0.0


In [13]:
from tf_agents.utils.common import function

collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f}".format(iteration, train_loss.loss.numpy()), end="")
        if iteration % 100000 == 0:
            log_metrics(train_metrics)

In [14]:
from tf_agents.eval.metric_utils import log_metrics

train_agent(1000000)

INFO:absl: 
		 NumberOfEpisodes = 0
		 EnvironmentSteps = 4
		 AverageReturn = 0.0
		 AverageEpisodeLength = 0.0


99992 loss:0.85718

INFO:absl: 
		 NumberOfEpisodes = 8017
		 EnvironmentSteps = 400004
		 AverageReturn = 152.10000610351562
		 AverageEpisodeLength = 153.10000610351562


199990 loss:0.00497

INFO:absl: 
		 NumberOfEpisodes = 11744
		 EnvironmentSteps = 800004
		 AverageReturn = 165.3000030517578
		 AverageEpisodeLength = 166.3000030517578


299992 loss:0.00713

INFO:absl: 
		 NumberOfEpisodes = 14846
		 EnvironmentSteps = 1200004
		 AverageReturn = 148.10000610351562
		 AverageEpisodeLength = 149.10000610351562


399986 loss:0.01189

INFO:absl: 
		 NumberOfEpisodes = 17596
		 EnvironmentSteps = 1600004
		 AverageReturn = 202.1999969482422
		 AverageEpisodeLength = 203.1999969482422


499996 loss:0.49428

INFO:absl: 
		 NumberOfEpisodes = 20109
		 EnvironmentSteps = 2000004
		 AverageReturn = 110.9000015258789
		 AverageEpisodeLength = 111.9000015258789


599987 loss:0.52577

INFO:absl: 
		 NumberOfEpisodes = 22575
		 EnvironmentSteps = 2400004
		 AverageReturn = 118.0999984741211
		 AverageEpisodeLength = 119.0999984741211


699994 loss:0.51649

INFO:absl: 
		 NumberOfEpisodes = 24952
		 EnvironmentSteps = 2800004
		 AverageReturn = 65.0999984741211
		 AverageEpisodeLength = 66.0999984741211


799998 loss:0.01450

INFO:absl: 
		 NumberOfEpisodes = 27321
		 EnvironmentSteps = 3200004
		 AverageReturn = 153.39999389648438
		 AverageEpisodeLength = 154.39999389648438


899990 loss:0.00489

INFO:absl: 
		 NumberOfEpisodes = 29791
		 EnvironmentSteps = 3600004
		 AverageReturn = 170.1999969482422
		 AverageEpisodeLength = 171.1999969482422


999999 loss:0.53167

NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [16]:
from tf_agents.policies.policy_saver import PolicySaver
saver = PolicySaver(agent.policy, batch_size=None)
saver.save('myPolicyHard')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: myPolicyHard\assets


INFO:tensorflow:Assets written to: myPolicyHard\assets
