In [1]:
%matplotlib inline
import tensorflow as tf
import numpy as np
import tensorflow_probability as tfp
tfd = tfp.distributions
import matplotlib.pyplot as plt
from tensorflow.keras import layers


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [2]:
import gym
from gym import spaces
import gym_dynamic_set_packing
import time

In [3]:
# We want eager execution, because that's the norm for TF 2.0.
# `tf.enable_eager_execution` only exists in the TF 1.x top-level namespace, so
# call it through `tf.compat.v1` and skip it when eager mode is already active;
# that way this cell also runs unchanged on the 2.0 alpha.
# Don't forget to test occasionally to see how it works in the alpha, too.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

In [4]:
# Example environment instance; a later cell reads its observation_space.shape
# to size the MLP agent's input layer.
env_example = gym.make('DynamicSetPacking-silly-v0')


In [5]:
def test_silly_env(agent, episode_count, max_steps):
    env = gym.make('DynamicSetPacking-silly-v0')
    reward = 0.0
    done = False
    for i in range(episode_count):
        print('episode {}'.format(i))
        ob = env.reset()
        total_reward = 0.0
        for i in range(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            total_reward += reward
            print('action taken: {}, reward: {}, new state: {}'.format(action, reward, env.render()))
    print('total episode reward: {}'.format(total_reward))

## MLP example (note: not training yet, the network is just randomly initialized)

In [60]:
class MLPMatchAgent:
    """Agent whose policy is a small MLP ending in a TFP OneHotCategorical layer.

    The network is only constructed here, never trained, so actions come from
    whatever distribution the initial weights happen to define.
    """

    def __init__(self, observation_shape, num_actions=2):
        """Build the action space and the policy network.

        Args:
            observation_shape: shape of a single observation (without the
                batch dimension); passed to the first Dense layer as
                `input_shape`.
            num_actions: number of discrete actions. Defaults to 2, the value
                that used to be hard-coded for the match / don't-match problem.
        """
        self.action_space = spaces.Discrete(num_actions)
        # policy network uses TFP layer at the end: the last Dense layer emits
        # `num_actions` values that parameterize the OneHotCategorical layer.
        self.policy = tf.keras.Sequential([
            layers.Dense(32, activation='relu', input_shape=observation_shape),
            layers.Dense(32, activation='relu'),
            layers.Dense(num_actions),
            tfp.layers.OneHotCategorical(num_actions)
        ])
        ## TODO: need to compile for non-eager mode?

    def act(self, observation, reward, done):
        """Act on a single observation, return a single action index.

        Args:
            observation: one observation, without a batch dimension.
            reward: unused; accepted so the agent fits the rollout loop.
            done: unused; accepted so the agent fits the rollout loop.

        Returns:
            The index of the sampled action, as produced by `.numpy()`.
        """
        # The network expects a batch, so wrap the observation as a batch of 1.
        observation_as_batch = np.expand_dims(observation, 0)
        # Calling the model yields a distribution; sample one one-hot row.
        action_sample = self.policy(observation_as_batch).sample()
        # One-hot row -> index; [0] strips the batch dimension again.
        return tf.math.argmax(action_sample, axis=1)[0].numpy()

In [61]:
# Run 1 episode of at most 10 steps with the (untrained) MLP agent, sizing its
# input layer from the example environment's observation shape.
test_silly_env(MLPMatchAgent(env_example.observation_space.shape), 1, 10)

episode 0
action taken: 0, reward: 0.0, new state: [0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0.]
action taken: 1, reward: 5.0, new state: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 1. 2. 0. 0. 1. 1. 0. 0. 1. 0. 2. 0. 0. 0.]
action taken: 1, reward: 8.0, new state: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.]
action taken: 1, reward: 3.0, new state: [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
action taken: 1, reward: 2.0, new state: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
action taken: 1, reward: 1.0, new state: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
total episode reward: 19.0


## random agent example

In [73]:
class RandomMatchAgent:
    """A simple agent for the 0/1 problem that picks action 1 with a fixed probability.

    The policy ignores the observation entirely: every call to `act` draws
    from the same two-outcome OneHotCategorical distribution, whose second
    outcome (index 1) has probability `match_prob`.
    """

    def __init__(self, match_prob):
        """Store the fixed policy distribution.

        Args:
            match_prob: probability assigned to action index 1; index 0 gets
                the remaining `1.0 - match_prob`.
        """
        no_match_prob = 1.0 - match_prob
        self.policy_dist = tfd.OneHotCategorical(probs=[no_match_prob, match_prob])
        self.action_space = spaces.Discrete(2)

    def act(self, observation, reward, done):
        """Sample an action index; `observation`, `reward` and `done` are unused."""
        one_hot = self.policy_dist.sample()
        # Convert the one-hot draw to its index and unwrap it from the tensor.
        return tf.math.argmax(one_hot).numpy()

In [75]:
# Run 1 episode of at most 10 steps with an agent constructed with match_prob=0.3.
test_silly_env(RandomMatchAgent(0.3), 1, 10)

episode 0
action taken: 0, reward: 0.0, new state: [0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0.]
action taken: 1, reward: 5.0, new state: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 1. 2. 0. 0. 1. 1. 0. 0. 1. 0. 2. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 2. 1. 1. 0. 0. 0.]
action taken: 1, reward: 9.0, new state: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
action taken: 0, reward: 0.0, new state: [0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
total episode reward: 14.0


## notes below

# General pattern for deep RL algorithms

OpenAI has a "Model" class that creates two policy networks, the "act_model" and the "train_model".

The "act_model" has a "step" function (note that model.step is set to model.act_model.step at some point), which takes one step at a time. It is run in the usual way, in a Python loop, to collect one or more trajectories. These trajectories are then used as inputs to the train_model to make an update.

The act_model and the train_model share weights. In other words there is a core "policy" network, shared between both of them. It's just that in the act_model, it gets applied to single states, and there are never any gradients, while in the train_model, it gets applied to batches and there are losses.


# Example of using Keras models, with TFP layer at the end (this is a nice way to represent a policy)

One could also use just the Keras model and manually construct TFP distribution objects from its outputs. But why not do things this way instead? It makes everything really simple.

In effect, calling the model (Module) returns a distribution object, from which you can sample or compute the log_prob.

For this categorical case, passing in a batch of 1 gives you a single sample of shape 1 × actions; passing in a batch of n gives you a sample of shape n × actions. In both cases the distribution is correctly parameterized by the inputs. You can also call sample(n) to get many of these (i.i.d. draws from the distribution).

In [28]:
def create_policy_mlp(action_dim, observation_dim):
    """Build a two-hidden-layer MLP policy ending in a TFP OneHotCategorical layer.

    Args:
        action_dim: number of discrete actions; used as the width of the last
            Dense layer and as the event size of the OneHotCategorical layer.
        observation_dim: length of a single (flat) observation vector.

    Returns:
        A `tf.keras.Sequential` model made of the four layers below.
    """
    hidden_in = layers.Dense(32, activation='relu', input_shape=(observation_dim,))
    hidden_mid = layers.Dense(32, activation='relu')
    # The last Dense layer's outputs parameterize the distribution layer.
    action_params = layers.Dense(action_dim)
    action_dist = tfp.layers.OneHotCategorical(action_dim)
    layer_stack = [hidden_in, hidden_mid, action_params, action_dist]
    return tf.keras.Sequential(layer_stack)

In [29]:
# Example policy with action_dim=2 and observation_dim=5.
test = create_policy_mlp(2, 5)

In [30]:
# A single 5-element observation, wrapped as a batch of 1 (shape 1x5).
act_input = tf.constant([[1.0,2.0,3.0,4.0,5.0]])

In [31]:
# Calling the model on the batch returns the object that is sampled from in the next cell.
act_result=test(act_input)

In [48]:
# Draw one sample; the recorded output has shape (1, 2), i.e. one one-hot row for the single input.
act_result.sample()

<tf.Tensor: id=1040, shape=(1, 2), dtype=float32, numpy=array([[0., 1.]], dtype=float32)>

In [43]:
# A batch of 3 observations (shape 3x5); the last two rows are identical.
obs_input = tf.constant([[1.0,2.0,3.0,4.0,5.0],[1.0,2.0,1.0,4.0,5.0],[1.0,2.0,1.0,4.0,5.0]])

In [44]:
# Same model applied to the whole batch of 3 observations at once.
obs_dist = test(obs_input)