In [1]:
import tensorflow as tf
import gym
import time
import pybullet_envs

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from src.data_collection import collect_data

open problems:
    - use multiprocessing to compute several trajectories at the same time
    - find number actions/ observations automatically

In [2]:
def collect_data(sess, batch_size, gamma = 0.99, render=False):
    # make some empty lists for logging.
    batch_obs = []          # for observations
    batch_acts = []         # for actions
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables
    obs = env.reset()       # first obs comes from starting distribution
    done = False            # signal from environment that episode is over
    ep_rews = []            # list for rewards accrued throughout ep
    # collect experience by acting in the environment with current policy
    while True:
        if render:
            env.render()
            time.sleep(0.01)
        
        obs[2] /= 8
        batch_obs.append(obs.copy())

        # act in the environment
        act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0]
        obs, rew, done, _ = env.step(act)
        
        # save action, reward
        batch_acts.append(act)
        ep_rews.append(rew)

        if done:
            env.close()
            render = False
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(ep_rews), len(ep_rews)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)
            
            # the weight for each logprob(a_t|s_t) is reward-to-go from t
#             batch_weights += list(np.cumsum(ep_rews[::-1])[::-1])
            obs[2] /= 8
            bootstrap_value = sess.run(state_values, {obs_ph:obs.reshape(1,-1)})[0][0]
            batch_weights += compute_rewards_to_go(ep_rews, gamma, bootstrap_value)
            
            # reset episode-specific variables
            obs, done, ep_rews = env.reset(), False, []

            # end experience loop if we have enough of it
            if len(batch_obs) > batch_size:
                break
    return batch_obs, batch_acts, batch_weights, batch_rets, batch_lens

def compute_rewards_to_go(rewards, gamma, bootstrap_value):
    rewards_to_go = [rewards[-1] + gamma*bootstrap_value]
    for rew in rewards[:-1][::-1]:
        tmp = rewards_to_go[-1]
        rewards_to_go.append(rew + gamma * tmp)
    return rewards_to_go[::-1]



In [3]:
def test_compute_rewards_to_go():
    rewards = [128,1024,8]
    gamma = 0.5
    bootstrap_value = 32
    expected_output = [128+512+2+4, 1024+4+8, 8+16]
    assert(compute_rewards_to_go(rewards, gamma, bootstrap_value) == expected_output)

test_compute_rewards_to_go()

## Build Computational Graph

In [6]:
# env = gym.make('Pendulum-v0')
# env = gym.make('HalfCheetahBulletEnv-v0')
import pybullet_envs.bullet.minitaur_gym_env as e
env = e.MinitaurBulletEnv(render=True)
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.shape[0]

# placeholder
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
act_ph = tf.placeholder(shape=(None,n_acts), dtype=tf.float32)
weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)

# network for gaussian means
mlp = tf.keras.models.Sequential()
mlp.add(tf.keras.layers.Dense(50, activation='tanh'))
mlp.add(tf.keras.layers.Dense(50, activation='tanh'))
mlp.add(tf.keras.layers.Dense(n_acts))
means = mlp(obs_ph)

# value function network
state_value_mlp = tf.keras.models.Sequential()
state_value_mlp.add(tf.keras.layers.Dense(50, activation='relu'))
state_value_mlp.add(tf.keras.layers.Dense(50, activation='relu'))
state_value_mlp.add(tf.keras.layers.Dense(1))
state_values = state_value_mlp(obs_ph)

# variances
log_std = tf.Variable(-0.5)
std = tf.math.exp(log_std)
# compute actions
actions = tf.random.normal((1,1), mean=means, stddev=std)

# make loss function whose gradient, for the right data, is policy gradient
first_summand = tf.reduce_sum(((act_ph - means) / std)**2 + 2*log_std)
log_probs = -0.5*(first_summand + n_acts * tf.math.log(2*np.pi))
loss = -tf.reduce_mean((weights_ph - state_values) * log_probs)

state_value_loss = tf.reduce_mean((weights_ph - state_values)**2)

current_dir=/home/research/repos/rl-zoo/venv/lib/python3.6/site-packages/pybullet_envs/bullet
urdf_root=/home/research/repos/rl-zoo/venv/lib/python3.6/site-packages/pybullet_data
options= 
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [7]:
optimizer = tf.train.AdamOptimizer(0.0003)
train = optimizer.minimize(loss)
state_value_optimizer = tf.train.AdamOptimizer(0.001)
state_value_train = state_value_optimizer.minimize(state_value_loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

episode_returns = []

print('Iteration, min, max, mean, std\n')
for i in range(200):
    observations, acts, weights, batch_rets, batch_len = collect_data(sess, 4000 ,render=True)
    print(i, np.mean(batch_rets), np.std(batch_rets), np.min(batch_rets), np.max(batch_rets))
    sess.run([train],feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 })
    for _ in range(50):
        sess.run([state_value_train],feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 })
    print(sess.run(state_values ,feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
                                 }))
#     print(sess.run(means,feed_dict={
#                                     obs_ph: np.array([[0,0,0]]),
#                                  }))
    print(sess.run(state_value_loss, feed_dict={
                                    obs_ph: np.array(observations),
                                    act_ph: np.array(acts),
                                    weights_ph: np.array(weights)
        }))
    print()
#     print('State value function loss:', svloss)
#     print(tmp2)
    #print(sess.run([means,std],feed_dict={
#                                     obs_ph: np.array(tmp1),
#                                     act_ph: np.array(tmp2),
#                                     weights_ph: np.array(tmp3)
#                         }))
print('Evaluation')
tmp1, tmp2, tmp3, batch_rets, batch_len = collect_data(sess, 5, render=True)
print(np.mean(batch_len), np.min(batch_len), np.max(batch_len))
print()

Iteration, min, max, mean, std



error: Not connected to physics server.

## Visualization

In [None]:
t = 100
episode_mean_returns = [np.mean(episode_returns[i-t:i]) for i in range(t, len(episode_returns))]

fig = plt.figure(figsize=(12,12))
plt.plot(range(t,len(episode_mean_returns)+t), episode_mean_returns, color=np.random.rand(3))
plt.xlabel('Episodes')
plt.ylabel('Mean Episode Return')

In [None]:
# obs_dim = 4
# n_acts = 2
# env = gym.make('CartPole-v0')

# # placeholder
# obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
# act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)
# weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)

# # make core of policy network
# mlp = tf.keras.models.Sequential()
# mlp.add(tf.keras.layers.Dense(30, activation='tanh'))
# mlp.add(tf.keras.layers.Dense(n_acts))
# logits = mlp(obs_ph)

# # # make state-value function network
# # mlp_val = tf.keras.models.Sequential()
# # mlp_val.add(tf.keras.layers.Dense(50, activation='relu'))
# # mlp_val.add(tf.keras.layers.Dense(50, activation='relu'))
# # mlp_val.add(tf.keras.layers.Dense(1))
# # state_values = mlp_val(obs_ph)

# # make action selection op (outputs int actions, sampled from policy)
# actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1)

# # make loss function whose gradient, for the right data, is policy gradient
# action_masks = tf.one_hot(act_ph, n_acts)
# log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
# loss = -tf.reduce_mean(weights_ph * log_probs)


# state_value_loss = tf.reduce_mean(((weights_ph - state_values) - state_values)**2)