In [2]:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Open problems:
- Use multiprocessing to collect several trajectories at the same time.

In [3]:
def collect_data(sess, batch_size, gamma=0.99, debug=False):
    """Roll out the current policy and gather one training batch.

    Relies on the notebook-level names ``env``, ``actions``, ``obs_ph`` and
    ``state_value`` as well as ``compute_rewards_to_go``.

    Args:
        sess: session used to evaluate ``actions`` and ``state_value``.
        batch_size: collection stops at the first episode boundary at which
            strictly more than ``batch_size`` observations have been stored,
            so only whole episodes are returned.
        gamma: discount factor handed to ``compute_rewards_to_go``.
        debug: accepted for call compatibility; not used by this function.

    Returns:
        Tuple ``(batch_obs, batch_acts, batch_weights, batch_rets, batch_lens)``:
        per-step observations, per-step actions, per-step discounted
        rewards-to-go, per-episode undiscounted returns and per-episode lengths.
    """
    # Per-step buffers.
    observations, chosen_actions, weights = [], [], []
    # Per-episode statistics.
    returns, lengths = [], []

    # The first observation comes from the environment's start distribution.
    state = env.reset()
    rewards = []  # rewards of the episode currently in progress

    while True:
        # Store the observation the action is about to be chosen from.
        observations.append(state.copy())

        # Sample an action from the policy and advance the environment.
        action = sess.run(actions, {obs_ph: state.reshape(1, -1)})[0]
        state, reward, finished, _ = env.step(action)

        chosen_actions.append(action)
        rewards.append(reward)

        if not finished:
            continue

        # Episode is over: log its return and length.
        returns.append(sum(rewards))
        lengths.append(len(rewards))

        # The weight of each logprob(a_t|s_t) is the reward-to-go from t,
        # bootstrapped with the value estimate of the last observation.
        # NOTE(review): the bootstrap is applied at every episode end, whatever
        # the reason the environment reported done - confirm this is intended.
        last_value = sess.run(state_value, {obs_ph: state.reshape(1, -1)})[0][0]
        weights.extend(compute_rewards_to_go(rewards, gamma, last_value))

        # Start a fresh episode.
        state = env.reset()
        rewards = []

        # Enough experience collected: hand the batch back.
        if len(observations) > batch_size:
            return observations, chosen_actions, weights, returns, lengths

def compute_rewards_to_go(rewards, gamma, bootstrap_value):
    """Return the discounted reward-to-go for every step of one episode.

    The value for step ``t`` is
    ``rewards[t] + gamma * rewards[t+1] + ... + gamma**(T-t) * bootstrap_value``
    where ``T`` is the number of rewards, computed by a single backward pass.

    Args:
        rewards: sequence of per-step rewards in chronological order.
        gamma: discount factor.
        bootstrap_value: value credited after the last reward (discounted once
            relative to the final step).

    Returns:
        A list with one reward-to-go per entry of ``rewards``, in chronological
        order. An empty ``rewards`` yields an empty list instead of raising.
    """
    running = bootstrap_value
    rewards_to_go = []
    # Walk the episode backwards so each step reuses the value of its successor.
    for rew in rewards[::-1]:
        running = rew + gamma * running
        rewards_to_go.append(running)
    rewards_to_go.reverse()
    return rewards_to_go



## Build Computational Graph

In [6]:
# Problem size: length of one observation vector and number of discrete actions.
obs_dim = 4
n_acts = 2

# Graph inputs: observations, the actions that were taken, and the per-step
# weights that multiply the log-probabilities in the policy loss.
obs_ph = tf.placeholder(dtype=tf.float32, shape=(None, obs_dim))
act_ph = tf.placeholder(dtype=tf.int32, shape=(None,))
weights_ph = tf.placeholder(dtype=tf.float32, shape=(None,))

# Policy network: one tanh hidden layer followed by one logit per action.
mlp = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation='tanh'),
    tf.keras.layers.Dense(n_acts),
])
logits = mlp(obs_ph)

# Disabled experiment: state-action-value network.
# mlp_action_val = tf.keras.models.Sequential()
# mlp_action_val.add(tf.keras.layers.Dense(50, activation='tanh'))
# mlp_action_val.add(tf.keras.layers.Dense(50, activation='tanh'))
# mlp_action_val.add(tf.keras.layers.Dense(n_acts))
# state_action_values = mlp_action_val(obs_ph)

# State-value network: two relu hidden layers and a single scalar output per
# observation, so state_value has one column.
mlp_val = tf.keras.models.Sequential([
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1),
])
state_value = mlp_val(obs_ph)

# Action selection op: draw one integer action per observation from the
# categorical distribution defined by the logits.
actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)

# Surrogate loss whose gradient, for the right data, is the policy gradient:
# pick out log pi(a_t|s_t) with a one-hot mask and weight it per step.
action_masks = tf.one_hot(act_ph, n_acts)
log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
# action_values = tf.reduce_sum(action_masks * state_action_values, axis=1)
# baseline = tf.reduce_sum(state_action_values * tf.nn.softmax(logits))
loss = -tf.reduce_mean(weights_ph * log_probs)



# State-value loss: mean squared error between the value estimate of each
# observation and the target fed through weights_ph for that same step.
# state_value has shape (batch, 1) while weights_ph has shape (batch,);
# subtracting them directly broadcasts to a (batch, batch) matrix and averages
# the error over every state/target pair, so the trailing axis is squeezed
# away here (state_value itself keeps its (batch, 1) shape for other users).
# loss_action_value = tf.reduce_mean((action_values - weights_ph)**2)
loss_state_value = tf.reduce_mean((tf.squeeze(state_value, axis=1) - weights_ph)**2)

In [7]:
%%time
# main
env = gym.make('CartPole-v0')

# global_step = tf.Variable(0, trainable=False)
# starter_learning_rate = 0.01
# learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
#                                            50, 0.90, staircase=True)
# # Passing global_step to minimize() will increment it at each step.
# policy_learning_step = (
#     tf.train.AdamOptimizer(learning_rate)
#     .minimize(loss, global_step=global_step)
# )

optimizer = tf.train.AdamOptimizer(0.001)
# optimizer_action_value = tf.train.AdamOptimizer(0.001)
optimizer_state_value = tf.train.AdamOptimizer(0.001)
train = optimizer.minimize(loss)
# train_action_value = optimizer_action_value.minimize(loss_action_value)
train_state_value = optimizer_state_value.minimize(loss_state_value)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

n_runs = 10
episode_returns = np.zeros((n_runs, n_epochs))



for i in range(3000):
    tmp1, tmp2, tmp3, batch_rets, batch_len = collect_data(sess, 4000 ,debug=False)
    episode_returns.extend(batch_rets)
    print(i, np.mean(batch_len), np.min(batch_len), np.max(batch_len))
    sess.run([train],feed_dict={
                                    obs_ph: np.array(tmp1),
                                    act_ph: np.array(tmp2),
                                    weights_ph: np.array(tmp3)
                                 })
    for _ in range(10):
        sess.run([train_state_value],feed_dict={
                                obs_ph: np.array(tmp1),
                                act_ph: np.array(tmp2),
                                weights_ph: np.array(tmp3)
                             })
    v = sess.run([state_value], feed_dict={
                                    obs_ph: np.array(tmp1),
                                    act_ph: np.array(tmp2),
                                    weights_ph: np.array(tmp3)
                                 })
    print("Value function mean and std:", np.mean(v), np.std(v))
    #print('Optimized')
print('Evaluation')
tmp1, tmp2, tmp3, batch_rets, batch_len = collect_data(sess, 5, debug=True)
print(np.mean(batch_len), np.min(batch_len), np.max(batch_len))
print()

NameError: name 'n_runs' is not defined

## Visualization

In [5]:
# Moving average of the episode returns over windows of t consecutive episodes.
t = 100
# Entry k averages episodes [k, k + t). The upper bound is inclusive so the
# window ending at the most recent episode is plotted as well (and exactly t
# episodes still produce one point).
episode_mean_returns = [
    np.mean(episode_returns[i - t:i])
    for i in range(t, len(episode_returns) + 1)
]

fig, ax = plt.subplots(figsize=(12, 12))
# x is the number of episodes completed at the end of each window.
ax.plot(
    range(t, t + len(episode_mean_returns)),
    episode_mean_returns,
    color=np.random.rand(3),  # random RGB colour, differs on every run
)
ax.set_xlabel('Episodes')
ax.set_ylabel('Mean Episode Return')

NameError: name 'episode_returns' is not defined

## OpenAI Implementation 