In [1]:
import tensorflow as tf
import gym
import time
import pybullet_envs

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

open problems:
    - use multiprocessing to compute several trajectories at the same time
    - find number actions/ observations automatically

In [2]:
def collect_data(sess, batch_size, gamma = 0.99, render=False):
    # make some empty lists for logging.
    batch_obs = []          # for observations
    batch_acts = []         # for actions
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables
    obs = env.reset()       # first obs comes from starting distribution
    done = False            # signal from environment that episode is over
    ep_rews = []            # list for rewards accrued throughout ep
    # collect experience by acting in the environment with current policy
    while True:
        if render:
            env.render()
            time.sleep(0.01)
        
        obs[2] /= 8
        batch_obs.append(obs.copy())

        # act in the environment
        act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0]
        obs, rew, done, _ = env.step(act)
        
        # save action, reward
        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            env.close()
            render = False
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)
            
            # the weight for each logprob(a_t|s_t) is reward-to-go from t
#             batch_weights += list(np.cumsum(ep_rews[::-1])[::-1])
            obs[2] /= 8
            bootstrap_value = sess.run(state_values, {obs_ph:obs.reshape(1,-1)})[0][0]
            batch_weights += compute_rewards_to_go(ep_rews, gamma, bootstrap_value)
            
            # reset episode-specific variables
            obs, done, ep_rews = env.reset(), False, []

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break
    return batch_obs, batch_acts, batch_weights, batch_rets, batch_lens

def compute_rewards_to_go(rewards, gamma, bootstrap_value):
    rewards_to_go = [rewards[-1] + gamma*bootstrap_value]
    for rew in rewards[:-1][::-1]:
        tmp = rewards_to_go[-1]
        rewards_to_go.append(rew + gamma * tmp)
    return rewards_to_go[::-1]



In [3]:
def test_compute_rewards_to_go():
    rewards = [128,1024,8]
    gamma = 0.5
    bootstrap_value = 32
    expected_output = [128+512+2+4, 1024+4+8, 8+16]
    assert(compute_rewards_to_go(rewards, gamma, bootstrap_value) == expected_output)

test_compute_rewards_to_go()

## Build Computational Graph

In [9]:
# env = gym.make('Pendulum-v0')
env = gym.make('HalfCheetahBulletEnv-v0')
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.shape[0]

# placeholder
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None,n_acts), dtype=tf.float32)
weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)

# network for gaussian means
mlp = tf.keras.models.Sequential()
mlp.add(tf.keras.layers.Dense(50, activation='tanh'))
mlp.add(tf.keras.layers.Dense(50, activation='tanh'))
mlp.add(tf.keras.layers.Dense(n_acts))
means = mlp(obs_ph)

# value function network
state_value_mlp = tf.keras.models.Sequential()
state_value_mlp.add(tf.keras.layers.Dense(50, activation='relu'))
state_value_mlp.add(tf.keras.layers.Dense(50, activation='relu'))
state_value_mlp.add(tf.keras.layers.Dense(1))
state_values = state_value_mlp(obs_ph)

# variances
log_std = tf.Variable(-0.5)
std = tf.math.exp(log_std)
# compute actions
actions = tf.random.normal((1,1), mean=means, stddev=std)

# make loss function whose gradient, for the right data, is policy gradient
first_summand = ((act_ph - means) / std)**2 + 2*log_std
log_probs = -0.5*(first_summand + n_acts * tf.math.log(2*np.pi))
loss = -tf.reduce_mean((weights_ph - state_values) * log_probs)

state_value_loss = tf.reduce_mean((weights_ph - state_values)**2)

In [13]:
optimizer = tf.train.AdamOptimizer(0.0003)
train = optimizer.minimize(loss)
state_value_optimizer = tf.train.AdamOptimizer(0.001)
state_value_train = state_value_optimizer.minimize(state_value_loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

episode_returns = []

print('Iteration, min, max, mean, std\n')
for i in range(200):
    observations, acts, weights, batch_rets, batch_len = collect_data(sess, 4000 ,render=False)
    print(i, np.mean(batch_rets), np.std(batch_rets), np.min(batch_rets), np.max(batch_rets))
    sess.run([train],feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 })
    for _ in range(50):
        sess.run([state_value_train],feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 })
    print(sess.run(state_values ,feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 }))
#     print(sess.run(means,feed_dict={
#                                     obs_ph: np.array([[0,0,0]]),
#                                  }))
    print(sess.run(state_value_loss, feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
        }))
    print()
#     print('State value function loss:', svloss)
#     print(tmp2)
    #print(sess.run([means,std],feed_dict={
#                                     obs_ph: np.array(tmp1),
#                                     act_ph: np.array(tmp2),
#                                     weights_ph: np.array(tmp3)
#                         }))
print('Evaluation')
tmp1, tmp2, tmp3, batch_rets, batch_len = collect_data(sess, 5, render=True)
print(np.mean(batch_len), np.min(batch_len), np.max(batch_len))
print()

Iteration, min, max, mean, std

0 -1244.1410748328035 271.2528695359224 -1704.3896281260375 -907.7400132953625
[[-1.1375622]
 [-1.1322482]
 [-1.1166079]
 ...
 [-1.0574688]
 [-1.076395 ]
 [-1.1256673]]
154646.66

1 -1144.4083095243432 246.00289137219207 -1751.9342003078361 -899.896243535347
[[-3.9953663]
 [-4.0279245]
 [-4.028385 ]
 ...
 [-3.831115 ]
 [-3.8464181]
 [-3.867359 ]]
129697.37

2 -1182.6971419534743 255.16845798204582 -1720.558745042707 -860.4227137964326
[[-11.613259]
 [-11.577782]
 [-11.551296]
 ...
 [ -9.9947  ]
 [ -9.635342]
 [ -9.687998]]
132121.44

3 -1192.4621304042541 246.16323564785466 -1692.3891410448553 -858.3199745832555
[[-27.070093]
 [-27.096394]
 [-27.186115]
 ...
 [-15.266867]
 [-14.834696]
 [-14.501535]]
126243.586

4 -1212.058715272066 230.5399645583135 -1749.352137335977 -943.0653187772937
[[-58.063797]
 [-57.772823]
 [-57.421497]
 ...
 [-51.755924]
 [-53.095146]
 [-53.873165]]
117018.01

5 -1146.950043262634 219.13225779212206 -1666.1530709256888 -857.361

[[-548.0501 ]
 [-547.3443 ]
 [-548.2689 ]
 ...
 [-578.9619 ]
 [-588.3869 ]
 [-597.18866]]
4932.5845

46 -1217.915945273938 279.65566442169893 -1730.559919597074 -873.2943066357492
[[-580.2357 ]
 [-575.1222 ]
 [-573.58575]
 ...
 [-566.92896]
 [-567.37775]
 [-567.6644 ]]
7333.022

47 -1210.1528876841596 273.5700817915437 -1711.7782974131937 -792.54439236837
[[-598.12695]
 [-596.56854]
 [-592.59674]
 ...
 [-597.7301 ]
 [-600.35443]
 [-605.6297 ]]
7597.14

48 -1211.1189012182979 242.98610602936884 -1704.771617898357 -903.7016519618112
[[-592.0892 ]
 [-593.38196]
 [-594.8047 ]
 ...
 [-600.0952 ]
 [-598.72943]
 [-597.0111 ]]
5957.906

49 -1216.4123510143716 245.3192930610616 -1676.0273392495117 -975.6320665972835
[[-607.4302 ]
 [-608.44244]
 [-609.3229 ]
 ...
 [-607.61096]
 [-606.1074 ]
 [-606.3717 ]]
6026.8833

50 -1144.4775552135882 210.77746539409932 -1728.8132769493038 -886.6802570779892
[[-578.881  ]
 [-578.2092 ]
 [-577.22046]
 ...
 [-588.2299 ]
 [-590.9144 ]
 [-590.5542 ]]
4665.0615



91 -1247.0408789338692 256.6440252785296 -1711.7704730459059 -888.6621368196915
[[-610.6186 ]
 [-610.9183 ]
 [-611.17584]
 ...
 [-606.9095 ]
 [-612.71375]
 [-616.1913 ]]
6130.731

92 -1180.4843259984295 215.29514162640302 -1715.028564871696 -896.849830646062
[[-592.8406 ]
 [-593.40173]
 [-592.94104]
 ...
 [-593.15173]
 [-599.79016]
 [-599.52893]]
4779.524

93 -1183.9635302413933 180.44314830321855 -1577.9923289239607 -1007.7398738784727
[[-592.682  ]
 [-592.83777]
 [-592.31604]
 ...
 [-584.44293]
 [-587.6633 ]
 [-594.1415 ]]
3282.7522

94 -1169.023760365021 244.00742995123454 -1671.6740634352786 -822.3640189265562
[[-583.53564]
 [-585.92163]
 [-588.58655]
 ...
 [-585.80286]
 [-583.1093 ]
 [-584.6576 ]]
5644.711

95 -1205.6185119827012 276.07086352159575 -1649.5343413608723 -862.0289372502214
[[-590.92694]
 [-591.0093 ]
 [-591.00903]
 ...
 [-596.0536 ]
 [-593.6063 ]
 [-590.6642 ]]
7500.4653

96 -1202.5713611950355 268.69521906099527 -1733.794831288497 -853.5716280332633
[[-603.1374 ]
 [

KeyboardInterrupt: 

## Visualization

In [None]:
t = 100
episode_mean_returns = [np.mean(episode_returns[i-t:i]) for i in range(t, len(episode_returns))]

fig = plt.figure(figsize=(12,12))
plt.plot(range(t,len(episode_mean_returns)+t), episode_mean_returns, color=np.random.rand(3))
plt.xlabel('Episodes')
plt.ylabel('Mean Episode Return')

In [None]:
# obs_dim = 4
# n_acts = 2
# env = gym.make('CartPole-v0')

# # placeholder
# obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
# act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
# weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)

# # make core of policy network
# mlp = tf.keras.models.Sequential()
# mlp.add(tf.keras.layers.Dense(30, activation='tanh'))
# mlp.add(tf.keras.layers.Dense(n_acts))
# logits = mlp(obs_ph)

# # # make state-value function network
# # mlp_val = tf.keras.models.Sequential()
# # mlp_val.add(tf.keras.layers.Dense(50, activation='relu'))
# # mlp_val.add(tf.keras.layers.Dense(50, activation='relu'))
# # mlp_val.add(tf.keras.layers.Dense(1))
# # state_values = mlp_val(obs_ph)

# # make action selection op (outputs int actions, sampled from policy)
# actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1)

# # make loss function whose gradient, for the right data, is policy gradient
# action_masks = tf.one_hot(act_ph, n_acts)
# log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
# loss = -tf.reduce_mean(weights_ph * log_probs)


# state_value_loss = tf.reduce_mean(((weights_ph - state_values) - state_values)**2)

## OpenAI Implementation 