In [1]:
# import matplotlib.pyplot as plt
import datetime 

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment, utils
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common


import gym_card_game

In [2]:
from gym_card_game.envs.card_game_env_tf import CardGameEnv

environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

ValueError: Given `time_step`: TimeStep(step_type=array(0, dtype=int32), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, -2,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, -3,  0, -2,  0,  0],
       [ 0,  0,  0,  0, -3,  0,  0, -3,  0,  0],
       [ 0,  0,  0,  0,  0,  0, -2,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0, -2,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])) does not match expected `time_step_spec`: TimeStep(step_type=ArraySpec(shape=(), dtype=dtype('int32'), name='step_type'), reward=ArraySpec(shape=(), dtype=dtype('float32'), name='reward'), discount=BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0), observation=BoundedArraySpec(shape=(10, 10), dtype=dtype('int32'), name='observation', minimum=-10, maximum=10))

In [2]:

tf.compat.v1.enable_v2_behavior()

# Global hyperparams
env_name = 'card_game-v0'  # @param
num_iterations = 2000  # @param

initial_collect_steps = 2000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

fc_layer_params = (100,)

batch_size = 64  # @param
learning_rate = 1e-3  # @param
log_interval = 500  # @param

num_eval_episodes = 1000  # @param
eval_interval = 10  # @param

log_dir = "/tmp/tf/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

max_episode_steps = 100

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [3]:
eval_py_env = suite_gym.load(env_name, max_episode_steps=max_episode_steps)
train_py_env = suite_gym.load(env_name, max_episode_steps=max_episode_steps)


train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [4]:

# Model

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
    train_step_counter=train_step_counter)
tf_agent.initialize()


eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

In [5]:

#@test {"skip": true}

#@test {"skip": true}
def compute_avg_return(environment, policy, num_episodes=10):

    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

    while not time_step.is_last():
        action_step = policy.action(time_step)
        time_step = environment.step(action_step.action)
        episode_return += time_step.reward
    total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]



compute_avg_return(train_env, eval_policy, num_eval_episodes)


-0.195

In [6]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)

In [7]:
#@test {"skip": true}
def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)


random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

In [8]:
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)

iterator = iter(dataset)

In [9]:

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    #collect_step(train_env, tf_agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)

    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

step = 1: Average Return = -0.19499999284744263
step = 2: Average Return = -0.19499999284744263
step = 3: Average Return = -0.19499999284744263
step = 4: Average Return = -0.19499999284744263
step = 5: Average Return = -0.19499999284744263
step = 6: Average Return = -0.19499999284744263
step = 7: Average Return = -0.19499999284744263
step = 8: Average Return = -0.19499999284744263
step = 9: Average Return = -0.19499999284744263
step = 10: Average Return = -0.19499999284744263
step = 11: Average Return = -0.19499999284744263
step = 12: Average Return = -0.19499999284744263
step = 13: Average Return = -0.19499999284744263
step = 14: Average Return = -0.19499999284744263
step = 15: Average Return = -0.19499999284744263
step = 16: Average Return = -0.19499999284744263
step = 17: Average Return = -0.19499999284744263
step = 18: Average Return = -0.19499999284744263
step = 19: Average Return = -0.19499999284744263
step = 20: Average Return = -0.20000000298023224
step = 21: Average Return = -

step = 170: Average Return = -0.19699999690055847
step = 171: Average Return = -0.19699999690055847
step = 172: Average Return = -0.19699999690055847
step = 173: Average Return = -0.19699999690055847
step = 174: Average Return = -0.19699999690055847
step = 175: Average Return = -0.19699999690055847
step = 176: Average Return = -0.19699999690055847
step = 177: Average Return = -0.19699999690055847
step = 178: Average Return = -0.19699999690055847
step = 179: Average Return = -0.19699999690055847
step = 180: Average Return = 0.09600000083446503
step = 181: Average Return = 0.09600000083446503
step = 182: Average Return = 0.09600000083446503
step = 183: Average Return = 0.09600000083446503
step = 184: Average Return = 0.09600000083446503
step = 185: Average Return = 0.09600000083446503
step = 186: Average Return = 0.09600000083446503
step = 187: Average Return = 0.09600000083446503
step = 188: Average Return = 0.09600000083446503
step = 189: Average Return = 0.09600000083446503
step = 190

KeyboardInterrupt: 