In [96]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers

In [97]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input, output_size, scope, n_layers, size, output_activation=None):
    '''
    Build a feed-forward network (MLP) inside the variable scope `scope`.

    The network is `n_layers - 1` ReLU dense layers of width `size`
    (biases initialised to 1.0) followed by one fully connected output
    layer with `output_size` units.

    Args:
        mlp_input: input tensor fed to the first layer.
        output_size: number of units in the output layer.
        scope: variable-scope name under which all layers are created.
        n_layers: total layer count, including the output layer.
        size: width of every hidden layer.
        output_activation: activation for the output layer; None leaves it linear.

    Returns:
        The output tensor of the final fully connected layer.
    '''
    hidden = mlp_input
    with tf.variable_scope(scope):
        # Hidden stack: everything except the last layer.
        for _ in range(n_layers - 1):
            hidden = tf.layers.dense(
                inputs=hidden,
                units=size,
                activation=tf.nn.relu,
                bias_initializer=tf.constant_initializer(1.0),
            )
        # Output layer; the activation is passed explicitly because
        # fully_connected would otherwise apply its own default.
        return layers.fully_connected(
            inputs=hidden,
            num_outputs=output_size,
            activation_fn=output_activation,
        )

In [1058]:
# Environment setup
# Creates the Gym environment and derives the module-level constants that the
# rest of the notebook reads as globals.
env = gym.make("CartPole-v0")
# True when actions are integer indices (gym.spaces.Discrete).
discrete = isinstance(env.action_space, gym.spaces.Discrete)
observation_dim = env.observation_space.shape[0]
# Number of discrete actions, or dimensionality of a continuous action vector.
action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 100   # steps recorded per trajectory
num_traj = 100     # trajectories sampled per batch
# Length of one trajectory once every per-step row is flattened end to end.
traj_length = max_ep_len*(observation_dim + 2)
latent_size = 6    # width of the encoder output
use_baseline = True
# Width of one per-step row: the observation plus one action and one reward
# entry (this matches the rows built by stack_trajectories).
feature_size = observation_dim + 2

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Initial Policy 

In [285]:
# builds the initial policy
def build_policy_explore(observation_placeholder,action_placeholder,action_dim, scope = "policy_explore"):
    """
    Build the exploration policy and return ops to sample from it and score actions.

    Reads the module-level flag `discrete` to choose the action distribution:
      * discrete   -> categorical distribution over `action_dim` logits;
      * continuous -> diagonal Gaussian whose mean comes from the MLP and whose
                      log standard deviation is a trainable variable.

    Args:
        observation_placeholder: observations fed to the policy network.
        action_placeholder: actions whose log-probability is evaluated.
        action_dim: number of discrete actions, or size of the continuous action vector.
        scope: variable scope for the policy MLP.

    Returns:
        (sampled_action, logprob): an op that samples one action per observation
        row, and the log-probability of `action_placeholder` under the policy.
    """
    if discrete:
        action_logits = build_mlp(observation_placeholder, action_dim, scope=scope,
                                  n_layers=3, size=60, output_activation=None)
        # tf.multinomial returns shape (batch, 1); drop the sample axis.
        sampled_action = tf.squeeze(tf.multinomial(action_logits, 1), axis=1)
        # The cross-entropy is the negative log-likelihood of the given labels.
        logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=action_logits, labels=action_placeholder)
    else:
        action_means = build_mlp(observation_placeholder, action_dim, scope,
                                 n_layers=3, size=60, output_activation=None)
        log_std = tf.get_variable("log_std", [action_dim])
        std = tf.exp(log_std)
        # The noise must have the same shape as the means so that each batch row
        # gets its own independent sample. (The previous (action_dim, 1) noise
        # shape did not line up with the means, and the result was stored under
        # a misspelled name, so this branch raised NameError on return.)
        noise = tf.random_normal(shape=tf.shape(action_means), mean=0.0, stddev=1.0)
        sampled_action = action_means + std * noise
        mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, std)
        logprob = mvn.log_prob(value=action_placeholder, name='log_prob')

    return sampled_action, logprob


## Encoder 

In [286]:
#TODO Try RNN
def Encoder(encoder_input_placeholder, scope = "encoder"):
    """
    Encode a batch of rows into a single latent vector.

    Each input row is mapped through an MLP to `latent_size` values (a
    module-level constant); the per-row codes are then averaged over axis 0.

    Returns:
        Tensor of shape (1, latent_size).
    """
    per_row_codes = build_mlp(
        encoder_input_placeholder,
        latent_size,
        scope=scope,
        n_layers=3,
        size=60,
        output_activation=None,
    )
    pooled_code = tf.reduce_mean(per_row_codes, axis=0)
    return tf.reshape(pooled_code, (1, latent_size))
     

## Encoder LSTM

In [893]:
# Stand-alone experiment: an LSTM whose per-step output is projected to
# `output_size` values. The default graph is cleared first.
tf.reset_default_graph()
# NOTE(review): this rebinds the module-level `feature_size` that the
# environment-setup cell defines as observation_dim + 2; re-run that cell
# before using `feature_size` elsewhere.
feature_size = 1
hidden_size = 64
output_size = 6
# Input sequences; both leading dimensions are left dynamic.
x = tf.placeholder(tf.float32, [None, None, feature_size])
# Per-sequence valid lengths handed to dynamic_rnn.
index = tf.placeholder(tf.int32, [None, ])

initializer = tf.random_uniform_initializer(-1, 1)
# Only the unit count is passed positionally. The second positional parameter
# of LSTMCell is `use_peepholes` in TF >= 1.x (a deprecated `input_size` in
# older releases), so passing `feature_size` there either silently enabled
# peephole connections or was ignored.
cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer)
cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, output_size)
# NOTE(review): time_major=True means `x` is read as [time, batch, feature];
# confirm that this matches how the data is laid out when it is fed.
outputs, _ = tf.nn.dynamic_rnn(cell_out, x, sequence_length=index, dtype=tf.float32,
                                       time_major=True)

In [892]:
# Display the symbolic RNN output tensor (static shape/dtype only; nothing is run).
outputs

<tf.Tensor 'rnn/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 6) dtype=float32>

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.]])

## Decoder 

In [287]:

def Decoder(decoder_input_placeholder,decoder_out_dim, scope):
    """
    Map the decoder input to a flat vector of `decoder_out_dim` values.

    Thin wrapper over build_mlp (3 layers, hidden width 60, linear output)
    created under the variable scope `scope`.
    """
    decoded = build_mlp(
        decoder_input_placeholder,
        decoder_out_dim,
        scope=scope,
        n_layers=3,
        size=60,
        output_activation=None,
    )
    return decoded

### Baseline and Advantage

In [324]:
def calculate_advantage(returns, observations):
    """
    Turn returns into advantages for the exploration policy.

    Reads the module-level names `use_baseline`, `sess`, `baseline` (the
    baseline network's output tensor) and `observation_placeholder_explore`.

    Args:
        returns: 1-D array of per-step returns.
        observations: observations aligned with `returns`, fed to the baseline network.

    Returns:
        `returns` unchanged when `use_baseline` is false; otherwise
        (returns - baseline prediction), normalised to zero mean and unit std.
    """
    if not use_baseline:
        return returns

    # Use a distinct local name: assigning to `baseline` here would make it a
    # local variable and the read on the right-hand side would raise
    # UnboundLocalError.
    baseline_values = sess.run(baseline, {observation_placeholder_explore: observations})
    # The network emits shape (N, 1); flatten so the subtraction stays
    # element-wise instead of broadcasting to (N, N).
    adv = returns - np.reshape(baseline_values, -1)
    adv_std = np.std(adv)
    # Guard against a constant advantage vector (std == 0).
    return (adv - np.mean(adv)) / (adv_std if adv_std > 0 else 1.0)

In [322]:
def update_baseline(returns, observations):
    """
    Run one optimisation step of the baseline network.

    Reads the module-level `sess`, `update_baseline_op`,
    `observation_placeholder_explore` and `baseline_target_placeholder`.

    Args:
        returns: regression targets for the baseline.
        observations: observations aligned with `returns`.
    """
    # This is a plain function, so there is no `self`; use the notebook-level
    # session and the placeholder the baseline network was actually built on.
    sess.run(update_baseline_op, feed_dict={
                observation_placeholder_explore : observations,
                baseline_target_placeholder : returns})

## Sample Path

In [1123]:
def sample_paths_explore(sess, env, num_traj,placeholder, num_episodes = None):
    """
    Roll out the exploration policy and return fixed-length trajectories.

    Every trajectory has exactly `max_ep_len` steps: once the environment
    reports `done`, the remaining steps are filled with zero observations,
    action 0 and reward 0. Reads the module-level `max_ep_len`,
    `observation_dim` and `explore_action`.

    Args:
        sess: TensorFlow session used to evaluate the policy.
        env: Gym environment to step.
        num_traj: number of trajectories to collect.
        placeholder: observation placeholder the policy was built on.
        num_episodes: accepted but not used.

    Returns:
        List of dicts with keys "observation", "reward" and "action",
        each holding a NumPy array.
    """
    collected = []
    for _ in range(num_traj):
        state = env.reset()
        states, actions, rewards = [], [], []
        episode_over = False
        for _step in range(max_ep_len):
            if not episode_over:
                states.append(state)
                # Add a leading batch axis for the network, then unwrap the action.
                feed = {placeholder : states[-1][None]}
                action = sess.run(explore_action, feed_dict=feed)[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                if done:
                    episode_over = True
            else:
                # Padding after termination; the environment is not stepped again.
                states.append([0]*observation_dim)
                actions.append(0)
                rewards.append(0)

        collected.append({"observation" : np.array(states),
                          "reward" : np.array(rewards),
                          "action" : np.array(actions)})
    return collected

In [1114]:
def sample_paths_exploit(sess, env, num_traj, num_episodes = None):
    """
    Roll out the exploitation policy and return variable-length trajectories.

    Each rollout stops at the first `done` or after `max_ep_len` steps, with no
    padding. Reads the module-level `max_ep_len`, `exploit_action`,
    `observation_placeholder_exploit`, `decoder_input_placeholder` and the
    latent code `Z`, which must already be defined when this is called.

    Args:
        sess: TensorFlow session used to evaluate the policy.
        env: Gym environment to step.
        num_traj: number of trajectories to collect.
        num_episodes: accepted but not used.

    Returns:
        List of dicts with keys "observation", "reward" and "action",
        each holding a NumPy array.
    """
    collected = []
    for _ in range(num_traj):
        state = env.reset()
        states, actions, rewards = [], [], []
        for _step in range(max_ep_len):
            states.append(state)
            feed = {observation_placeholder_exploit : state[None],
                    decoder_input_placeholder: Z}
            action = sess.run(exploit_action, feed_dict=feed)[0]
            state, reward, done, info = env.step(action)
            actions.append(action)
            rewards.append(reward)
            if done:
                break
        collected.append({"observation" : np.array(states),
                          "reward" : np.array(rewards),
                          "action" : np.array(actions)})
    return collected

In [1115]:
def get_returns(paths, gamma=1.0):
    """
    Compute the discounted reward-to-go for every step of every path.

    For a path with rewards r_0 .. r_{T-1} the return at step t is
    sum_{k >= 0} gamma**k * r_{t+k}. It is computed in a single backward pass
    (O(T) per path) with the recurrence G_t = r_t + gamma * G_{t+1}.

    Args:
        paths: iterable of dicts, each with a "reward" sequence.
        gamma: discount factor; the default 1.0 gives undiscounted returns.

    Returns:
        1-D float array with the returns of all paths concatenated in order.
        An empty array is returned when `paths` is empty.
    """
    all_returns = []
    for path in paths:
        rewards = np.asarray(path["reward"], dtype=np.float64)
        returns = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        # Walk backwards so each return reuses the one that follows it.
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        all_returns.append(returns)

    # np.concatenate raises on an empty list, so handle "no paths" explicitly.
    if not all_returns:
        return np.zeros(0, dtype=np.float64)
    return np.concatenate(all_returns)

In [1116]:
def stack_trajectories(paths):
    """
    Pack paths into an array of per-step [state..., action, reward] rows.

    Args:
        paths: iterable of dicts with "observation", "action" and "reward"
            sequences of equal length.

    Returns:
        np.array built from one list of rows per path. When all paths have
        the same number of steps this has shape
        (num_paths, num_steps, state_dim + 2).
    """
    def _rows_for(path):
        rewards = path["reward"]
        states = path["observation"]
        actions = path["action"]
        # One row per recorded state: the state values, then action, then reward.
        return [list(states[t]) + [actions[t]] + [rewards[t]]
                for t in range(len(states))]

    return np.array([_rows_for(path) for path in paths])

In [1172]:
# Start from an empty default graph and declare every placeholder that the
# explore policy, baseline, encoder and decoder-generated exploit policy use.
tf.reset_default_graph()
latent_size = 6
observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,observation_dim))
# Note: `shape=(None)` is just `shape=None` (the parentheses do not make a
# tuple), so these placeholders accept a tensor of any shape.
if(discrete):
    # Integer action indices: one for the explore policy, one for the exploit policy.
    action_placeholder = tf.placeholder(tf.int32, shape=(None))
    output_action_PH = tf.placeholder(tf.int32, shape=(None))
else:
    action_placeholder = tf.placeholder(tf.float32, shape=(None,action_dim))
    output_action_PH = tf.placeholder(tf.float32, shape=(None,action_dim))

# Regression target for the baseline network and advantage weights for the explore loss.
baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))

# Encoder input: one flattened trajectory (traj_length values) per row.
encoder_input_placeholder = tf.placeholder(tf.float32, shape= (num_traj,traj_length))
# Decoder input: a single latent vector.
decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,latent_size))
observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,observation_dim))
advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))


In [1173]:
# --- Exploration policy ------------------------------------------------------
explore_action, explore_logprob = build_policy_explore(observation_placeholder_explore,action_placeholder,action_dim,scope = "explore")
explore_policy_loss = -tf.reduce_sum(explore_logprob * advantage_placeholder_explore)
loss_grads_explore = explore_logprob * advantage_placeholder_explore
# Snapshot of the trainable variables taken before any other network is built.
tvars_explore = tf.trainable_variables()
gradients_explore = tf.gradients(loss_grads_explore,tvars_explore)

# --- Baseline ----------------------------------------------------------------
baseline = build_mlp(observation_placeholder_explore,1,scope = "baseline",n_layers=3, size = 60,output_activation=None)
# The network output has shape (N, 1) while targets are fed as (N,). Squeeze the
# prediction so the squared difference is element-wise rather than a silently
# broadcast (N, N) matrix.
baseline_loss = tf.losses.mean_squared_error(baseline_target_placeholder,
                                             tf.squeeze(baseline, axis=1),
                                             scope = "baseline")
baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = 0.1)
# Use the optimizer defined on the line above (`adam_optimizer` does not exist).
update_baseline_op = baseline_adam_optimizer.minimize(baseline_loss)

# --- MLP encoder -------------------------------------------------------------
Encoded = Encoder(encoder_input_placeholder,scope = "encode2")

# --- LSTM encoder ------------------------------------------------------------
hidden_size = 64
output_size = 6
x = tf.placeholder(tf.float32, [None, None, feature_size])
index = tf.placeholder(tf.int32, [None, ])

initializer = tf.random_uniform_initializer(-1, 1)
# Only the unit count is positional: the second positional parameter of LSTMCell
# is `use_peepholes` in TF >= 1.x (a deprecated `input_size` before that), so
# passing `feature_size` there was never meaningful.
cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer)
cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, output_size)
# The data fed to `x` is laid out as [num_traj, max_ep_len, feature_size] with
# one sequence length per trajectory, i.e. batch-major, so time_major is False.
outputs, _ = tf.nn.dynamic_rnn(cell_out, x, sequence_length=index, dtype=tf.float32,
                               time_major=False)

# --- Decoder: generates the weights and biases of the exploit policy ---------
# decoder output weights
d_W1 = Decoder(decoder_input_placeholder,observation_dim*traj_length, scope = "W1")
d_W2 = Decoder(decoder_input_placeholder,traj_length*traj_length, scope = "W2")
d_W3 = Decoder(decoder_input_placeholder,traj_length*action_dim, scope = "W3")
d_W1 = tf.reshape(d_W1, [observation_dim, traj_length])
d_W2 = tf.reshape(d_W2, [traj_length, traj_length])
d_W3 = tf.reshape(d_W3, [traj_length, action_dim])

# decoder output bias
d_B1 = tf.reshape(Decoder(decoder_input_placeholder,traj_length, scope = "B1"), [traj_length])
d_B2 = tf.reshape(Decoder(decoder_input_placeholder,traj_length, scope = "B2"), [traj_length])
d_B3 = tf.reshape(Decoder(decoder_input_placeholder,action_dim, scope = "B3"), [action_dim])

# --- Exploit policy: a 3-layer ReLU MLP whose parameters are decoder outputs -
exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(observation_placeholder_exploit,d_W1) + d_B1), d_W2) + d_B2),d_W3) + d_B3
exploit_action = tf.multinomial(exploit_action_logits,1)
exploit_action = tf.squeeze(exploit_action, axis=1)
exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = exploit_action_logits, labels = output_action_PH)
exploit_policy_loss = -tf.reduce_sum(exploit_logprob * advantage_placeholder_exploit)
loss_grads_exploit = exploit_logprob * advantage_placeholder_exploit

# Gradients are taken with respect to the generated parameter tensors themselves.
d = [d_W1, d_B1, d_W2, d_B2, d_W3, d_B3]

gradients_exploit = tf.gradients(loss_grads_exploit,d)


# train encoder and decoder
adam_optimizer_exploit =  tf.train.AdamOptimizer(0.1)
output_train_op = adam_optimizer_exploit.minimize(exploit_policy_loss)
# train original network
adam_optimizer_explore = tf.train.AdamOptimizer(0.1)
input_train_op = adam_optimizer_explore.minimize(explore_policy_loss)

In [1174]:
# Open a session on the current default graph and initialise all its variables.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [1175]:
# Collect num_traj padded rollouts from the exploration policy.
explore_paths = sample_paths_explore(sess, env, num_traj,observation_placeholder_explore, num_episodes = None)

In [1159]:
# Flatten the per-path arrays into single step-indexed arrays, in path order.
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
# Per-step reward-to-go, concatenated in the same path order.
returns_explore = get_returns(explore_paths)

In [1160]:
# Evaluate the baseline network on the sampled observations. The result is
# stored under its own name: rebinding `baseline` would replace the graph
# tensor with a NumPy array and break every later `sess.run(baseline, ...)`,
# including a re-run of this cell.
baseline_explore = sess.run(baseline, {observation_placeholder_explore:observations_explore})
adv = returns_explore - np.squeeze(baseline_explore)
# Normalise to zero mean / unit std; skip the scaling if all values are equal.
adv_std = np.std(adv)
advantages_explore = (adv - np.mean(adv))/(adv_std if adv_std > 0 else 1.0)

In [1161]:

# One optimiser step fitting the baseline network to the sampled returns.
sess.run(update_baseline_op, {observation_placeholder_explore:observations_explore, 
                             baseline_target_placeholder : returns_explore})

In [1162]:
# Evaluate the gradients of the advantage-weighted log-probabilities with
# respect to the variables listed in tvars_explore (one array per variable).
grads_explore = sess.run(gradients_explore, feed_dict={
                    observation_placeholder_explore : observations_explore,
                    action_placeholder : actions_explore,
                    advantage_placeholder_explore : advantages_explore})

In [1163]:
# Build the trajectory array: one [state..., action, reward] row per step.
M = stack_trajectories(explore_paths)

In [1164]:
# (This bare expression is not the cell's last line, so its value is not displayed.)
np.array(M).shape
# Force the layout [trajectory, step, feature].
M = np.reshape(np.array(M), [num_traj,max_ep_len,feature_size])

In [1165]:
# Sanity check of the reshaped trajectory array.
M.shape

(100, 100, 6)

In [1166]:
# Scratch: a length-1 array holding num_traj (not referenced by the cells shown here).
a = np.array([num_traj]).reshape(1)
a

array([100])

In [1167]:
# Run the LSTM over the stacked trajectories; every sequence is declared to be
# max_ep_len steps long, one length entry per trajectory.
o = sess.run(outputs, feed_dict = {x:M, index:[max_ep_len]*num_traj})

In [1168]:
# Shape of the LSTM outputs evaluated above.
o.shape

(100, 100, 64)

In [412]:
# Collect rollouts from the decoder-generated exploit policy.
# NOTE(review): sample_paths_exploit reads a global latent code `Z`, which no
# cell above this one assigns; it has to exist in the kernel already.
paths= sample_paths_exploit(sess, env, num_traj )

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

In [413]:
# Flatten the exploit rollouts into step-indexed arrays, in path order.
observations_exploit = np.concatenate([path["observation"] for path in paths])
actions_exploit = np.concatenate([path["action"] for path in paths])
rewards_exploit = np.concatenate([path["reward"] for path in paths])
returns_exploit = get_returns(paths)
# Baseline-corrected advantages are disabled here; the raw returns are used instead.
#advantages_exploit = np.float32(calculate_advantage(returns_exploit, observations_exploit))

In [415]:
# Gradients of the return-weighted log-probabilities with respect to the
# decoder-generated parameter tensors in `d` (one array per tensor).
# NOTE(review): `Z` must already be defined in the kernel; no cell above assigns it.
grads_exploit = sess.run(gradients_exploit,feed_dict={
                    observation_placeholder_exploit : observations_exploit,
                    output_action_PH : actions_exploit,
                    advantage_placeholder_exploit : returns_exploit,
                    decoder_input_placeholder: Z})

In [278]:
# Shape of the gradient with respect to the first generated weight matrix (d_W1).
grads_exploit[0].shape

(4, 60)

In [279]:
# Scalar exploit-policy loss on the sampled batch (raw returns used as weights).
Policy_loss = sess.run(exploit_policy_loss, feed_dict={
                    observation_placeholder_exploit : observations_exploit,
                    output_action_PH : actions_exploit,
                    advantage_placeholder_exploit : returns_exploit,
                    decoder_input_placeholder: Z})

In [416]:
# One Adam step minimising the exploit-policy loss on the sampled batch.
sess.run(output_train_op, feed_dict={
                    observation_placeholder_exploit : observations_exploit,
                    output_action_PH : actions_exploit,
                    advantage_placeholder_exploit : returns_exploit,
                    decoder_input_placeholder: Z})

In [417]:
# Calculate the advantage for the explore policy: the inner product between the
# exploit-policy gradients and the explore-policy gradients, summed over all
# parameter tensors. Each pair must hold the same number of elements.
advantage_explore = 0
for grad_exploit, grad_explore in zip(grads_exploit, grads_explore):
    # Pair each exploit gradient with the explore gradient at the same position
    # and accumulate into the variable initialised above. (Previously both
    # operands came from grads_exploit and the accumulator referenced an
    # undefined name.)
    advantage_explore = advantage_explore + np.dot(grad_exploit.flatten(), grad_explore.flatten())

In [418]:
# One Adam step on the explore policy. `advantage_explore` is a single scalar,
# so the same weight applies to every step's log-probability.
sess.run(input_train_op, feed_dict={
                    observation_placeholder_explore : observations_explore,
                    action_placeholder : actions_explore,
                    advantage_placeholder_explore : advantage_explore})

## Setup

In [765]:
# Start from an empty default graph and declare every placeholder used by the
# networks built in the next cell.
tf.reset_default_graph()
latent_size = 6
observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,observation_dim))
# Note: `shape=(None)` is just `shape=None` (the parentheses do not make a
# tuple), so these placeholders accept a tensor of any shape.
if(discrete):
    # Integer action indices: one for the explore policy, one for the exploit policy.
    action_placeholder = tf.placeholder(tf.int32, shape=(None))
    output_action_PH = tf.placeholder(tf.int32, shape=(None))
else:
    action_placeholder = tf.placeholder(tf.float32, shape=(None,action_dim))
    output_action_PH = tf.placeholder(tf.float32, shape=(None,action_dim))

# Regression target for the baseline network and advantage weights for the explore loss.
baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))

# Encoder input: one flattened trajectory (traj_length values) per row.
encoder_input_placeholder = tf.placeholder(tf.float32, shape= (num_traj,traj_length))
# Decoder input: a single latent vector.
decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,latent_size))
observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,observation_dim))
advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))



In [766]:
# --- Exploration policy ------------------------------------------------------
# Built inline (rather than via build_policy_explore) so that the logits tensor
# is available for inspection during training.
explore_action_logits = build_mlp(observation_placeholder_explore,action_dim,scope = "explore",n_layers=3,size = 60,output_activation=None)
explore_action = tf.multinomial(explore_action_logits,1)
explore_action = tf.squeeze(explore_action, axis=1)
explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = explore_action_logits, labels = action_placeholder)

explore_policy_loss = -tf.reduce_sum(explore_logprob * advantage_placeholder_explore)
loss_grads_explore = explore_logprob * advantage_placeholder_explore
# Snapshot of the trainable variables taken before any other network is built.
tvars_explore = tf.trainable_variables()
gradients_explore = tf.gradients(loss_grads_explore,tvars_explore)

# --- Baseline ----------------------------------------------------------------
baseline = build_mlp(observation_placeholder_explore,1,scope = "baseline",n_layers=3, size = 60,output_activation=None)
# The network output has shape (N, 1) while targets are fed as (N,). Squeeze the
# prediction so the squared difference is element-wise rather than a silently
# broadcast (N, N) matrix.
baseline_loss = tf.losses.mean_squared_error(baseline_target_placeholder,
                                             tf.squeeze(baseline, axis=1),
                                             scope = "baseline")
baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = 0.1)
# Use the optimizer defined on the line above (`adam_optimizer` does not exist).
update_baseline_op = baseline_adam_optimizer.minimize(baseline_loss)

# --- Encoder -----------------------------------------------------------------
Encoded = Encoder(encoder_input_placeholder,scope = "encode")

# --- Decoder: generates the weights and biases of the exploit policy ---------
# Hidden width of the generated exploit network.
decoder_len = 60
# decoder output weights
d_W1 = Decoder(decoder_input_placeholder,observation_dim*decoder_len, scope = "W1")
d_W2 = Decoder(decoder_input_placeholder,decoder_len*decoder_len, scope = "W2")
d_W3 = Decoder(decoder_input_placeholder,decoder_len*action_dim, scope = "W3")
# Moments are computed but currently unused; they belong to a standardisation
# variant ((w - mean) / var) that is disabled in favour of range scaling below.
mean_dW1, var_dW1 = tf.nn.moments(d_W1, axes=[1])
mean_dW2, var_dW2 = tf.nn.moments(d_W2, axes=[1])
mean_dW3, var_dW3 = tf.nn.moments(d_W3, axes=[1])

# Scale each generated tensor by its own value range (max - min).
d_W1 = d_W1 / (tf.reduce_max(d_W1) - tf.reduce_min(d_W1))
d_W2 = d_W2 / (tf.reduce_max(d_W2) - tf.reduce_min(d_W2))
d_W3 = d_W3 / (tf.reduce_max(d_W3) - tf.reduce_min(d_W3))

d_W1 = tf.reshape(d_W1, [observation_dim, decoder_len])
d_W2 = tf.reshape(d_W2, [decoder_len, decoder_len])
d_W3 = tf.reshape(d_W3, [decoder_len, action_dim])
# decoder output bias
d_B1 = Decoder(decoder_input_placeholder,decoder_len, scope = "B1")
d_B2 = Decoder(decoder_input_placeholder,decoder_len, scope = "B2")
d_B3 = Decoder(decoder_input_placeholder,action_dim, scope = "B3")
# Unused, same reason as the weight moments above.
mean_dB1, var_dB1 = tf.nn.moments(d_B1, axes=[1])
mean_dB2, var_dB2 = tf.nn.moments(d_B2, axes=[1])
mean_dB3, var_dB3 = tf.nn.moments(d_B3, axes=[1])
d_B1 = d_B1 / (tf.reduce_max(d_B1) - tf.reduce_min(d_B1))
d_B2 = d_B2 / (tf.reduce_max(d_B2) - tf.reduce_min(d_B2))
d_B3 = d_B3 / (tf.reduce_max(d_B3) - tf.reduce_min(d_B3))
d_B1 = tf.reshape(d_B1, [decoder_len])
d_B2 = tf.reshape(d_B2, [decoder_len])
d_B3 = tf.reshape(d_B3, [action_dim])

# --- Exploit policy: a 3-layer ReLU MLP whose parameters are decoder outputs -
exploit_action_logits = ((tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(observation_placeholder_exploit,d_W1) + d_B1), d_W2) + d_B2),d_W3) + d_B3))
exploit_action = tf.multinomial(exploit_action_logits,1)
exploit_action = tf.squeeze(exploit_action, axis=1)
exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = exploit_action_logits, labels = output_action_PH)
exploit_policy_loss = -tf.reduce_sum(exploit_logprob * advantage_placeholder_exploit)
loss_grads_exploit = exploit_logprob * advantage_placeholder_exploit

# Gradients are taken with respect to the generated parameter tensors themselves.
d = [d_W1, d_B1, d_W2, d_B2, d_W3, d_B3]

gradients_exploit = tf.gradients(loss_grads_exploit,d)


# train encoder and decoder
adam_optimizer_exploit =  tf.train.AdamOptimizer(0.1)
output_train_op = adam_optimizer_exploit.minimize(exploit_policy_loss)
# train original network
adam_optimizer_explore = tf.train.AdamOptimizer(0.1)
input_train_op = adam_optimizer_explore.minimize(explore_policy_loss)

# Train

In [817]:
# Sample T Trajectories
num_traj = 100
num_batches = 2  # NOTE: not used anywhere in this cell


# Tensorflow session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

for t in range(5):
    # ---- Explore phase -------------------------------------------------------
    # sample paths
    explore_paths = sample_paths_explore(sess, env, num_traj,observation_placeholder_explore, num_episodes = None)

    # get observations, actions and rewards
    observations_explore = np.concatenate([path["observation"] for path in explore_paths])
    actions_explore = np.concatenate([path["action"] for path in explore_paths])
    rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
    returns_explore = get_returns(explore_paths)

    # update baseline
    if (use_baseline):
        baseline_explore = sess.run(baseline, {observation_placeholder_explore:observations_explore})
        adv = returns_explore - np.squeeze(baseline_explore)
        # Normalise; skip the scaling when every advantage is identical (std == 0).
        adv_std = np.std(adv)
        advantages_explore = (adv - np.mean(adv))/(adv_std if adv_std > 0 else 1.0)
        # The baseline predicts returns, so it is fitted to the returns, not to
        # the normalised advantages derived from its own predictions.
        sess.run(update_baseline_op, {observation_placeholder_explore:observations_explore,
                                      baseline_target_placeholder : returns_explore})
    else:
        advantages_explore = returns_explore

    explore_feed = {
        observation_placeholder_explore : observations_explore,
        action_placeholder : actions_explore,
        advantage_placeholder_explore : advantages_explore}

    # calculate explore gradients
    grads_explore = sess.run(gradients_explore, feed_dict=explore_feed)

    # ---- Encode the explore trajectories ------------------------------------
    # form trajectory matrix
    M = stack_trajectories(explore_paths)
    # stack_trajectories yields [num_traj, max_ep_len, features]; the encoder
    # placeholder is 2-D, so flatten each trajectory into one row.
    M = np.reshape(M, (num_traj, traj_length))

    # encode
    Z = sess.run(Encoded, feed_dict={encoder_input_placeholder : M})

    # ---- Exploit phase -------------------------------------------------------
    # sample paths (sample_paths_exploit reads the global Z assigned just above)
    exploit_paths= sample_paths_exploit(sess, env, num_traj )
    # get observations, actions and rewards
    observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
    actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
    rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
    returns_exploit = get_returns(exploit_paths)

    # Raw returns are used as the exploit advantages.
    exploit_feed = {
        observation_placeholder_exploit : observations_exploit,
        output_action_PH : actions_exploit,
        advantage_placeholder_exploit : returns_exploit,
        decoder_input_placeholder: Z}

    #### Debugging starts
    explore_loss = sess.run(loss_grads_explore, feed_dict=explore_feed)
    print("explore loss", explore_loss)

    d1 = sess.run(d, feed_dict=exploit_feed)
    print("d", d1)

    explore_log = sess.run(explore_action_logits, feed_dict={
                    observation_placeholder_explore : observations_explore,
                    action_placeholder : actions_explore,})

    exploit_log= sess.run(exploit_action_logits,feed_dict={
                    observation_placeholder_exploit : observations_exploit,
                    output_action_PH : actions_exploit,
                    decoder_input_placeholder: Z})
    print("explore logprob",explore_log )
    print("exploit logprob",exploit_log )
    #### Debugging ends

    # calculate exploit gradients
    grads_exploit = sess.run(gradients_exploit, feed_dict=exploit_feed)

    # train encoder and decoder network
    sess.run(output_train_op, feed_dict=exploit_feed)

    # ---- Explore-policy update ----------------------------------------------
    # Advantage for the explore policy: inner product of the exploit and explore
    # gradients, summed over parameter tensors. It accumulates into the variable
    # initialised here (the previous code read an undefined `advantage_exploit`).
    advantage_explore = 0
    for grad_exploit, grad_explore in zip(grads_exploit, grads_explore):
        advantage_explore = advantage_explore + np.dot(grad_exploit.flatten(), grad_explore.flatten())

    # train input policy
    sess.run(input_train_op, feed_dict={
                    observation_placeholder_explore : observations_explore,
                    action_placeholder : actions_explore,
                    advantage_placeholder_explore : advantage_explore})

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.[0m
[33mWARN: You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'do

In [658]:
# Spot-check a single entry of explore_log: display the element at index 1000.
# NOTE(review): 1000 is a hard-coded index; explore_log is defined in an
# earlier cell, so this only works if that cell has been run and the log has
# more than 1000 entries -- confirm after Restart & Run All.
explore_log[1000]

array([ 0.62517935, -4.304047  ], dtype=float32)

In [826]:
# Display the full contents of d1 (defined in an earlier cell).
# NOTE(review): the rendered value is a list of float arrays with 60-wide
# rows -- presumably layer weights of the size-60 MLP; TODO confirm. Consider
# showing shapes or a small slice instead of dumping every value.
d1

[array([[-0.3367864 , -0.37071374,  0.3222602 , -0.31405738,  0.40182817,
         -0.40131673, -0.41546643,  0.41359502,  0.39926302,  0.347711  ,
         -0.39415154, -0.36900234,  0.32596463,  0.36722067,  0.3508077 ,
          0.3621695 , -0.46341336,  0.33221322, -0.33733925, -0.03466514,
          0.03413202, -0.37971878,  0.3958258 ,  0.42392245,  0.36070007,
         -0.3366391 ,  0.34625268,  0.34326658,  0.35818484,  0.3760201 ,
          0.3842534 ,  0.33813226, -0.32993266, -0.3432932 ,  0.33798438,
          0.37984085,  0.37711948, -0.4039098 ,  0.37722802, -0.41839033,
         -0.37025505, -0.00941446, -0.35742834, -0.38739645,  0.4320519 ,
          0.33478022, -0.37497407,  0.44614288,  0.33183503,  0.37057355,
         -0.34153974, -0.40127838, -0.38128167,  0.34640878,  0.395489  ,
         -0.36325818, -0.2886933 ,  0.38642433,  0.39998463, -0.41894725],
        [ 0.2927451 , -0.32312667,  0.43424875, -0.34544805,  0.31045353,
         -0.34116352, -0.39567804,  0