In [None]:
import tensorflow as tf
import numpy as np
import gym

# Hyperparameters

In [None]:
# --- Environment -----------------------------------------------------------
env_name = "CartPole-v0"   # id handed to gym.make
state_size = 4             # width of the observation placeholder X
n_actions = 2              # width of the policy head / one-hot action labels
num_envs = 3               # number of environment factories given to the vectorized env

# --- Rollout schedule ------------------------------------------------------
n_steps = 50               # environment steps collected before each update
episodes = 20000           # number of outer collect-then-update iterations

# --- Return computation ----------------------------------------------------
gamma = 0.99               # discount factor passed to discount_with_dones
lam = 0.95                 # not referenced by the visible cells (presumably a GAE lambda -- TODO confirm)

# --- Optimisation ----------------------------------------------------------
learning_rate = 0.002      # Adam step size
vf_coef = 0.1              # weight of the value-function loss in the total loss
entropy_coef = 0           # weight of the entropy term in the total loss

# Make environments

In [None]:
from multiprocessing_env import SubprocVecEnv
def make_env():
    """Return a zero-argument factory that builds a new `env_name` gym environment.

    The environment itself is not created here; it is created each time the
    returned callable is invoked.
    """
    def _build():
        return gym.make(env_name)

    return _build
# Stand-alone (non-vectorized) environment; the training cell resets and steps
# it directly for its periodic single-episode rollout.
env = gym.make(env_name)

In [None]:
# One environment factory per worker, all wrapped in a single SubprocVecEnv.
envs = SubprocVecEnv([make_env() for _ in range(num_envs)])

# A2C Network Architecture

In [None]:
# Input placeholders
# X          : batch of observations, one row per sample.
# y          : one-hot encoding of the action taken for each sample.
# advantages : per-sample advantage used to weight the policy-gradient term.
# rewards    : per-sample regression target for the value head
#              (the training cell feeds discounted returns here).
X = tf.placeholder(tf.float32, shape=(None, state_size), name="X")
y = tf.placeholder(tf.int32, shape=(None, n_actions), name="y")
advantages = tf.placeholder(tf.float32, [None], name="advantages")
rewards = tf.placeholder(tf.float32, [None], name="rewards")

# Hidden layers: a shared trunk X -> fc1 -> fc2 -> fc3.
fc1 = tf.layers.dense(
    X, 32, activation=tf.nn.relu,
    kernel_initializer=tf.contrib.layers.xavier_initializer())
fc2 = tf.layers.dense(
    fc1, 64, activation=tf.nn.relu,
    kernel_initializer=tf.contrib.layers.xavier_initializer())
# fc3 must consume fc2; wiring it to fc1 left fc2 as a dead layer whose
# variables never received a gradient.
fc3 = tf.layers.dense(
    fc2, 10, activation=tf.nn.relu,
    kernel_initializer=tf.contrib.layers.xavier_initializer())
# Policy logits (no activation; softmax is applied separately below).
fc4 = tf.layers.dense(fc3, n_actions, activation=None)

# Output heads
action_distribution = tf.nn.softmax(fc4)
value = tf.layers.dense(
    fc3, 1, activation=None,
    kernel_initializer=tf.contrib.layers.xavier_initializer())

# Loss
# Cross-entropy between the logits and the one-hot action equals
# -log pi(a|s) for the action that was taken.
neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc4, labels=y)
pg_loss = tf.reduce_mean(advantages * neg_log_prob)
# Squeeze only the trailing unit axis so a batch of one stays rank-1, and pass
# the target/prediction by keyword so they land in the right slots.
vf_loss = tf.losses.mean_squared_error(
    labels=rewards, predictions=tf.squeeze(value, axis=1))

# Mean policy entropy, H = -sum_a pi(a|s) * log pi(a|s).
# log_softmax on the logits keeps the log finite even when a probability
# underflows to zero, so the product below cannot become NaN.
log_probs = tf.nn.log_softmax(fc4)
entropy = tf.reduce_mean(-tf.reduce_sum(action_distribution * log_probs, axis=1))
loss = pg_loss - entropy * entropy_coef + vf_loss * vf_coef

# Gradient clipping: clip the global norm of all gradients to 0.5.
params = tf.trainable_variables()

grads = tf.gradients(loss, params)
grads, grad_norm = tf.clip_by_global_norm(grads, 0.5)
grads = list(zip(grads, params))

# Training op
trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train = trainer.apply_gradients(grads)

# Tensorboard

In [None]:
# Event-file writer for the scalar summaries emitted by the training cell.
writer = tf.summary.FileWriter(logdir="./tensorboard/5")

In [None]:
# Scalar placeholders: the training cell computes these values in Python and
# feeds them in purely so they can be logged.
reward_mean, policy_loss, value_loss = (
    tf.placeholder(tf.float32, name=node_name)
    for node_name in ("reward_mean", "policy_loss", "value_loss")
)

# Attach one scalar summary per placeholder, then merge them into a single op.
for summary_tag, summary_input in (
    ("Reward_mean", reward_mean),
    ("Policy_loss", policy_loss),
    ("Value_loss", value_loss),
):
    tf.summary.scalar(summary_tag, summary_input)
write_op = tf.summary.merge_all()

# Discounted sum of rewards 

In [None]:
def discount_with_dones(next_value, rewards, masks, gamma):
    """Compute discounted returns backwards from a bootstrap value.

    Args:
        next_value: array whose first axis is flattened to seed the recursion
            (it is reshaped to ``(next_value.shape[0],)``).
        rewards: sequence indexed by step; ``rewards[t]`` is the reward at step t.
        masks: sequence indexed by step; ``masks[t]`` multiplies the carried
            return, so a 0 entry cuts the bootstrap at that step.
        gamma: discount factor.

    Returns:
        A list with one entry per step, in chronological order, where entry t
        is ``rewards[t] + gamma * return[t + 1] * masks[t]``.
    """
    running = next_value.reshape(next_value.shape[0])
    newest_first = []
    # Walk from the last step to the first, carrying the running return.
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running * masks[t]
        newest_first.append(running)
    # Collected back-to-front; flip into chronological order.
    return newest_first[::-1]

# Main

In [None]:
# Initial batch of observations from the vectorized environments; the training
# cell reads this as its starting `states` and overwrites it after every step.
states = envs.reset()

In [None]:
# Training loop: repeatedly collect `n_steps` transitions from the vectorized
# environments, turn them into returns/advantages, run one optimizer step, and
# log scalars to TensorBoard. Every 10th iteration one episode is also played
# on the stand-alone `env` and its total reward is logged.
print(states.shape)
with tf.Session() as sess:
    reward_mean_ = 0
    sess.run(tf.global_variables_initializer())

    # Row i is the one-hot vector for action i; built once instead of per step.
    one_hot = np.eye(n_actions)

    for ep in range(episodes):
        mb_states, mb_actions, mb_rewards, mb_values, mb_masks = [], [], [], [], []
        for _ in range(n_steps):
            # Feed forward current states
            action_dist, values = sess.run([action_distribution, value], {X: states})
            # Sample one action per environment from its predicted distribution.
            # n_actions replaces the previously hard-coded 2.
            action = [np.random.choice(n_actions, p=p) for p in action_dist]
            mb_states.append(states)

            states, reward, done, _ = envs.step(action)

            mb_actions.append(one_hot[action])
            mb_values.append(values.flatten())
            mb_rewards.append(reward)
            # Mask is 0 where an episode ended, so the return is not
            # bootstrapped across that boundary.
            mb_masks.append(1 - done)

        # Bootstrap from the value estimate of the state after the last step.
        next_values = sess.run(value, {X: states})
        mb_rewards = np.asarray(mb_rewards)
        mb_returns = np.asarray(
            discount_with_dones(next_values, mb_rewards, mb_masks, gamma))

        mb_actions = np.asarray(mb_actions)
        mb_states = np.asarray(mb_states)
        mb_values = np.asarray(mb_values)

        mb_advs = mb_returns - mb_values

        # Merge the leading (step, env) axes into a single batch axis.
        mb_states = mb_states.reshape(-1, mb_states.shape[-1])
        mb_advs = mb_advs.reshape(-1)
        mb_returns = mb_returns.reshape(-1)
        mb_actions = mb_actions.reshape(-1, mb_actions.shape[-1])

        feed_dict = {X: mb_states,
            y: mb_actions,
            advantages: mb_advs, # Use to calculate our policy loss
            rewards: mb_returns}

        policy_loss_, value_loss_, loss_, _ = sess.run([pg_loss, vf_loss, loss, train], feed_dict)
        print("Value:",value_loss_)
        print("Policy:",policy_loss_)
        print("Loss:",loss_)

        if ep % 10 == 0:
            # Play one episode on the stand-alone env, sampling from the
            # policy. Dedicated names keep this rollout from overwriting the
            # training loop's variables.
            tstate = env.reset()
            eval_done = False
            rewards_ = []
            while not eval_done:
                #env.render()
                eval_dist = sess.run(action_distribution, {X: tstate.reshape(1, tstate.shape[0])})
                eval_action = np.random.choice(n_actions, p=eval_dist[0])
                tstate, eval_reward, eval_done, _ = env.step(eval_action)
                rewards_.append(eval_reward)
            # Note: despite the name, this is the episode's total reward.
            reward_mean_ = np.sum(rewards_)
        # Between evaluations the most recent evaluation reward is logged again.
        summary = sess.run(write_op, feed_dict={reward_mean: reward_mean_,policy_loss: policy_loss_, value_loss: value_loss_})
        writer.add_summary(summary, ep)
        writer.flush()