In [27]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
#env="InvertedPendulum-v2"
env = "HalfCheetah-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 20
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 100
use_baseline = True



In [28]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [29]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 2
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.95
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards = [], [], [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        episode_rewards.append(episode_reward)
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.3)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,i)
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["reward"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["observation"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,i)
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.sum(exploit_rewards) / num_traj, len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env, Test = True)
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z, Test = True)
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            print("epoch number: ", epoch)
            self.train_step()
            

In [30]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [32]:
a.train()

epoch number:  0
average reward explore: -5310.066074312663
average reward exploit: MDP 0 -565.7242126885885 20
average reward exploit: MDP 0 -596.2405979875969 20
average reward exploit: MDP 0 -639.0962299746558 20
average reward exploit: MDP 0 -496.9236126208666 20
average reward exploit: MDP 0 -570.0544014836529 20
average reward exploit: MDP 0 -590.2367972001124 20
average reward exploit: MDP 0 -511.40432451515505 20
average reward exploit: MDP 0 -572.8674025740679 20
average reward exploit: MDP 0 -475.9333887833629 20
average reward exploit: MDP 0 -561.9618459192357 20
average reward explore: -4224.515114259144
average reward exploit: MDP 1 -577.7034491840056 20
average reward exploit: MDP 1 -537.3051393006888 20
average reward exploit: MDP 1 -440.9168775826106 20
average reward exploit: MDP 1 -439.8913469319731 20
average reward exploit: MDP 1 -447.118194790489 20
average reward exploit: MDP 1 -459.51644214006984 20
average reward exploit: MDP 1 -456.9382182586727 20
average rewa

average reward exploit: MDP 4 -185.09705901700306 20
average reward exploit: MDP 4 -242.8686390561496 20
average reward exploit: MDP 4 -201.71609382615088 20
average reward exploit: MDP 4 -218.9580284791955 20
average reward exploit: MDP 4 -225.86485535338588 20
average reward exploit: MDP 4 -242.9028797764312 20
average reward explore: -1022.5150677493039
average reward exploit: MDP 5 -519.840886429588 20
average reward exploit: MDP 5 -520.6857440284555 20
average reward exploit: MDP 5 -486.26501719215656 20
average reward exploit: MDP 5 -487.4817548430389 20
average reward exploit: MDP 5 -536.7860305461011 20
average reward exploit: MDP 5 -645.6008131170476 20
average reward exploit: MDP 5 -684.1156670527625 20
average reward exploit: MDP 5 -657.8501333639175 20
average reward exploit: MDP 5 -612.1220915703265 20
average reward exploit: MDP 5 -584.402583610452 20
average reward explore: -897.7388162918751
average reward exploit: MDP 6 -450.2039997435535 20
average reward exploit: MDP

average reward exploit: MDP 8 -188.72477918245465 20
average reward explore: -554.9175471418664
average reward exploit: MDP 9 -302.0863833509806 20
average reward exploit: MDP 9 -323.8163123094188 20
average reward exploit: MDP 9 -309.44753085481057 20
average reward exploit: MDP 9 -282.5189885002032 20
average reward exploit: MDP 9 -253.54968724442068 20
average reward exploit: MDP 9 -265.70743044031116 20
average reward exploit: MDP 9 -272.71650311895905 20
average reward exploit: MDP 9 -249.68790836003026 20
average reward exploit: MDP 9 -256.5943324757556 20
average reward exploit: MDP 9 -267.6888449863956 20
epoch number:  3
average reward explore: -752.9147316572859
average reward exploit: MDP 0 -130.09866203753134 20
average reward exploit: MDP 0 -79.87851054902767 20
average reward exploit: MDP 0 -49.47386328638167 20
average reward exploit: MDP 0 -195.0161806751928 20
average reward exploit: MDP 0 -164.6121461484387 20
average reward exploit: MDP 0 -236.35859295301097 20
avera

average reward exploit: MDP 3 -11.680235627462778 20
average reward exploit: MDP 3 85.65250944707701 20
average reward exploit: MDP 3 -92.12980421101615 20
average reward exploit: MDP 3 -198.32526890566015 20
average reward exploit: MDP 3 -150.60785022408982 20
average reward exploit: MDP 3 -231.95965997991115 20
average reward exploit: MDP 3 -210.31946173530278 20
average reward explore: -619.0967437853076
average reward exploit: MDP 4 -169.94509168888965 20
average reward exploit: MDP 4 -133.0468290275124 20
average reward exploit: MDP 4 -83.86852989482215 20
average reward exploit: MDP 4 -12.185795456839358 20
average reward exploit: MDP 4 52.94505043933985 20
average reward exploit: MDP 4 101.14870110568857 20
average reward exploit: MDP 4 198.7523195626554 20
average reward exploit: MDP 4 170.2653736132917 20
average reward exploit: MDP 4 88.72110378349993 20
average reward exploit: MDP 4 106.79390574512222 20
average reward explore: -523.2839326044259
average reward exploit: MDP 

average reward exploit: MDP 7 -125.29256723123312 20
average reward explore: -398.0583558613416
average reward exploit: MDP 8 -207.01998576959167 20
average reward exploit: MDP 8 -217.28011179963988 20
average reward exploit: MDP 8 -161.84510774687493 20
average reward exploit: MDP 8 -147.21082305294382 20
average reward exploit: MDP 8 -131.5542997092473 20
average reward exploit: MDP 8 -121.94776472034297 20
average reward exploit: MDP 8 -97.78408945375482 20
average reward exploit: MDP 8 -78.75199799677434 20
average reward exploit: MDP 8 -70.71649743756805 20
average reward exploit: MDP 8 -50.13888296192127 20
average reward explore: -371.5653889140272
average reward exploit: MDP 9 -515.6641099874676 20
average reward exploit: MDP 9 -341.51106031306875 20
average reward exploit: MDP 9 -296.61041043817386 20
average reward exploit: MDP 9 -278.99986372847354 20
average reward exploit: MDP 9 -268.8073133026989 20
average reward exploit: MDP 9 -259.4556982324367 20
average reward exploi

average reward exploit: MDP 2 225.6495094513475 20
average reward exploit: MDP 2 136.99805024890242 20
average reward exploit: MDP 2 38.61885044211275 20
average reward exploit: MDP 2 -138.43002523978004 20
average reward exploit: MDP 2 -147.6057551518246 20
average reward exploit: MDP 2 -149.74672304783687 20
average reward exploit: MDP 2 -98.9658825315124 20
average reward explore: -431.84363381494984
average reward exploit: MDP 3 264.88996590521646 20
average reward exploit: MDP 3 193.13519912693485 20
average reward exploit: MDP 3 182.422696544133 20
average reward exploit: MDP 3 138.75658878706383 20
average reward exploit: MDP 3 78.56036205804075 20
average reward exploit: MDP 3 20.855085341950122 20
average reward exploit: MDP 3 -85.6255960413927 20
average reward exploit: MDP 3 -77.69742707654474 20
average reward exploit: MDP 3 -125.26916697667973 20
average reward exploit: MDP 3 -132.1700980976598 20
average reward explore: -446.0618360774641
average reward exploit: MDP 4 -64

average reward exploit: MDP 6 533.8362368499784 20
average reward explore: -344.70792415405543
average reward exploit: MDP 7 -221.17143915545904 20
average reward exploit: MDP 7 -187.8742146888963 20
average reward exploit: MDP 7 -186.68316456392273 20
average reward exploit: MDP 7 -159.0766388489643 20
average reward exploit: MDP 7 -130.3685205805036 20
average reward exploit: MDP 7 -148.71889339016246 20
average reward exploit: MDP 7 -107.7731808773564 20
average reward exploit: MDP 7 -105.98083903186833 20
average reward exploit: MDP 7 -53.552866394879025 20
average reward exploit: MDP 7 -61.497244665058304 20
average reward explore: -332.6981015005434
average reward exploit: MDP 8 -28.663180819482182 20
average reward exploit: MDP 8 -21.138832537608167 20
average reward exploit: MDP 8 -60.13000046816967 20
average reward exploit: MDP 8 -29.308057472794918 20
average reward exploit: MDP 8 -34.853228284183146 20
average reward exploit: MDP 8 13.043813818706614 20
average reward explo

average reward exploit: MDP 1 -834.5263318022577 20
average reward exploit: MDP 1 -721.6425053871951 20
average reward exploit: MDP 1 -779.474046191777 20
average reward exploit: MDP 1 -680.9972576162043 20
average reward exploit: MDP 1 -651.3249849302666 20
average reward exploit: MDP 1 -771.7296491943647 20
average reward exploit: MDP 1 -751.9259834011982 20
average reward explore: -435.3659957951935
average reward exploit: MDP 2 167.69838723991867 20
average reward exploit: MDP 2 162.8021528073688 20
average reward exploit: MDP 2 181.52439093786728 20
average reward exploit: MDP 2 167.38865734281268 20
average reward exploit: MDP 2 183.17912968873338 20
average reward exploit: MDP 2 187.27316972064548 20
average reward exploit: MDP 2 209.5486644500676 20
average reward exploit: MDP 2 226.20078344932477 20
average reward exploit: MDP 2 211.66681269015166 20
average reward exploit: MDP 2 222.02919255451388 20
average reward explore: -404.56357468395015
average reward exploit: MDP 3 -9

average reward exploit: MDP 5 -32.02989237595621 20
average reward explore: -275.89741419912275
average reward exploit: MDP 6 78.2913577737958 20
average reward exploit: MDP 6 109.23761363994586 20
average reward exploit: MDP 6 78.36743467409886 20
average reward exploit: MDP 6 137.44423408172864 20
average reward exploit: MDP 6 50.830238628346585 20
average reward exploit: MDP 6 116.59074072605388 20
average reward exploit: MDP 6 108.54936953040408 20
average reward exploit: MDP 6 66.15468713418515 20
average reward exploit: MDP 6 78.91164110145871 20
average reward exploit: MDP 6 11.511617645979493 20
average reward explore: -245.15545886583837
average reward exploit: MDP 7 -334.95590420839704 20
average reward exploit: MDP 7 -282.9690185113608 20
average reward exploit: MDP 7 -238.32372035854732 20
average reward exploit: MDP 7 -250.4182508420261 20
average reward exploit: MDP 7 -261.0403413955147 20
average reward exploit: MDP 7 -219.37794076202363 20
average reward exploit: MDP 7 

average reward exploit: MDP 0 196.76552617249843 20
average reward exploit: MDP 0 346.60880780667276 20
average reward exploit: MDP 0 568.0143298056058 20
average reward exploit: MDP 0 627.8123277478004 20
average reward exploit: MDP 0 573.4169967955891 20
average reward exploit: MDP 0 509.1758997164379 20
average reward explore: -370.5788581172589
average reward exploit: MDP 1 355.31622393500373 20
average reward exploit: MDP 1 357.8425127183133 20
average reward exploit: MDP 1 362.9946140644673 20
average reward exploit: MDP 1 397.41308048746725 20
average reward exploit: MDP 1 407.0308599333628 20
average reward exploit: MDP 1 447.5605452515476 20
average reward exploit: MDP 1 444.04184674158284 20
average reward exploit: MDP 1 522.017944233375 20
average reward exploit: MDP 1 551.0190990820009 20
average reward exploit: MDP 1 507.83833691170213 20
average reward explore: -359.29159070552765
average reward exploit: MDP 2 64.58549506549252 20
average reward exploit: MDP 2 96.43432967

average reward exploit: MDP 5 108.15413055096755 20
average reward exploit: MDP 5 84.82853450035404 20
average reward exploit: MDP 5 145.32321316944154 20
average reward exploit: MDP 5 171.7926416212853 20
average reward exploit: MDP 5 78.23286401225955 20
average reward exploit: MDP 5 106.68028184751938 20
average reward exploit: MDP 5 134.45836085821136 20
average reward exploit: MDP 5 107.59830295369696 20
average reward exploit: MDP 5 127.02585936316711 20
average reward exploit: MDP 5 142.51063209164957 20
average reward explore: -202.27486433605088
average reward exploit: MDP 6 100.27023684310618 20
average reward exploit: MDP 6 96.30159114122702 20
average reward exploit: MDP 6 32.144300135529924 20
average reward exploit: MDP 6 59.58854480965514 20
average reward exploit: MDP 6 132.16153257985485 20
average reward exploit: MDP 6 178.3174333690149 20
average reward exploit: MDP 6 368.5532579203245 20
average reward exploit: MDP 6 488.6520276380089 20
average reward exploit: MDP 

average reward exploit: MDP 9 -40.858678072649475 20
average reward exploit: MDP 9 -52.99534060139316 20
average reward exploit: MDP 9 -21.54156240217721 20
epoch number:  16
average reward explore: -328.8339309716919
average reward exploit: MDP 0 -240.3110694833108 20
average reward exploit: MDP 0 -122.60226468584752 20
average reward exploit: MDP 0 37.74587669837379 20
average reward exploit: MDP 0 58.87542291355735 20
average reward exploit: MDP 0 -22.27896521807048 20
average reward exploit: MDP 0 -122.77822565524939 20
average reward exploit: MDP 0 -206.63296636666163 20
average reward exploit: MDP 0 -249.0597182750543 20
average reward exploit: MDP 0 -349.6602526822978 20
average reward exploit: MDP 0 -215.59332538105755 20
average reward explore: -278.7153508579836
average reward exploit: MDP 1 -120.70205255409493 20
average reward exploit: MDP 1 -84.88367106514465 20
average reward exploit: MDP 1 -13.067390379038963 20
average reward exploit: MDP 1 -39.59912393581389 20
average

average reward exploit: MDP 4 -34.54162375807259 20
average reward exploit: MDP 4 -44.96918133274287 20
average reward exploit: MDP 4 -36.47129214772198 20
average reward exploit: MDP 4 -22.12767097258841 20
average reward exploit: MDP 4 -20.994607790366256 20
average reward exploit: MDP 4 -34.56844040660855 20
average reward exploit: MDP 4 -34.55688722024341 20
average reward exploit: MDP 4 -70.04898063262974 20
average reward explore: -191.89947307707115
average reward exploit: MDP 5 114.00450664483515 20
average reward exploit: MDP 5 78.08642260097508 20
average reward exploit: MDP 5 152.01906735811042 20
average reward exploit: MDP 5 135.6581069629055 20
average reward exploit: MDP 5 134.29583334170297 20
average reward exploit: MDP 5 168.07441061642515 20
average reward exploit: MDP 5 183.12871108934542 20
average reward exploit: MDP 5 182.32508713649287 20
average reward exploit: MDP 5 194.0880849302075 20
average reward exploit: MDP 5 136.48027685144118 20
average reward explore

average reward exploit: MDP 8 -153.8726386374805 20
average reward explore: -71.64857677403116
average reward exploit: MDP 9 -131.32315774930314 20
average reward exploit: MDP 9 -122.59642226181882 20
average reward exploit: MDP 9 -130.2405678922706 20
average reward exploit: MDP 9 -129.07249132792538 20
average reward exploit: MDP 9 -128.64044185413258 20
average reward exploit: MDP 9 -124.95638542950931 20
average reward exploit: MDP 9 -128.78343453497212 20
average reward exploit: MDP 9 -128.18493738336883 20
average reward exploit: MDP 9 -135.08581585084397 20
average reward exploit: MDP 9 -127.5865023368724 20
epoch number:  19
average reward explore: -244.27410133507192
average reward exploit: MDP 0 -115.9688070751259 20
average reward exploit: MDP 0 -60.29008279189335 20
average reward exploit: MDP 0 -105.60998741384644 20
average reward exploit: MDP 0 -77.45397084992382 20
average reward exploit: MDP 0 -189.75430629197135 20
average reward exploit: MDP 0 -103.887273341626 20
av

average reward exploit: MDP 3 -142.67009564437294 20
average reward exploit: MDP 3 -57.216182050952376 20
average reward exploit: MDP 3 -340.3290847403327 20
average reward exploit: MDP 3 -254.97107785058515 20
average reward exploit: MDP 3 -299.3126892659722 20
average reward exploit: MDP 3 -224.8580041975823 20
average reward explore: -128.3594999374622
average reward exploit: MDP 4 -27.892818135887133 20
average reward exploit: MDP 4 -19.612637347538588 20
average reward exploit: MDP 4 167.35281267711946 20
average reward exploit: MDP 4 68.37951784869514 20
average reward exploit: MDP 4 51.46947718113201 20
average reward exploit: MDP 4 21.31148326374979 20
average reward exploit: MDP 4 61.00779860056135 20
average reward exploit: MDP 4 21.28607979252117 20
average reward exploit: MDP 4 70.62206193844813 20
average reward exploit: MDP 4 5.813243572504689 20
average reward explore: -97.87773148460457
average reward exploit: MDP 5 -171.30799592465416 20
average reward exploit: MDP 5 -

average reward explore: -7.942765173046553
average reward exploit: MDP 8 -71.60227726693017 20
average reward exploit: MDP 8 -91.29888280257036 20
average reward exploit: MDP 8 -101.35424928696725 20
average reward exploit: MDP 8 -100.42479850246595 20
average reward exploit: MDP 8 -82.4616954228716 20
average reward exploit: MDP 8 -77.86115646655207 20
average reward exploit: MDP 8 -81.91586693338188 20
average reward exploit: MDP 8 -67.44675408434398 20
average reward exploit: MDP 8 -85.80840329646013 20
average reward exploit: MDP 8 -53.7372105519947 20
average reward explore: -2.116605093751857
average reward exploit: MDP 9 213.77979264166106 20
average reward exploit: MDP 9 195.44608790984088 20
average reward exploit: MDP 9 221.2585734777778 20
average reward exploit: MDP 9 233.0572886325424 20
average reward exploit: MDP 9 246.97031603809145 20
average reward exploit: MDP 9 296.34108622691605 20
average reward exploit: MDP 9 313.13871815635787 20
average reward exploit: MDP 9 44

average reward exploit: MDP 2 272.04050535224536 20
average reward exploit: MDP 2 178.93960131764152 20
average reward exploit: MDP 2 155.24706017380203 20
average reward exploit: MDP 2 64.96729370870976 20
average reward exploit: MDP 2 69.01711513788328 20
average reward explore: 53.01655939053418
average reward exploit: MDP 3 59.481251561596686 20
average reward exploit: MDP 3 30.39457572301219 20
average reward exploit: MDP 3 -16.959803614092735 20
average reward exploit: MDP 3 -16.583677272701042 20
average reward exploit: MDP 3 -40.83119542689339 20
average reward exploit: MDP 3 17.92755329814268 20
average reward exploit: MDP 3 27.483715568085653 20


KeyboardInterrupt: 

In [None]:
a.lr = 3e-3
num_epochs = 200
for epoch in range(num_epochs):
    print("epoch number: ", epoch)
    a.train_step()

epoch number:  0
average reward explore: 235.43240868895236
average reward exploit: MDP 0 -56.196823765934234 20
average reward exploit: MDP 0 -40.35286801557989 20
average reward exploit: MDP 0 -24.02129771140376 20
average reward exploit: MDP 0 5.6154089875947735 20
average reward exploit: MDP 0 2.361316171514381 20
average reward exploit: MDP 0 1.8780362510601762 20
average reward exploit: MDP 0 6.468609164977151 20
average reward exploit: MDP 0 15.634511581327839 20
average reward exploit: MDP 0 19.544960570362772 20
average reward exploit: MDP 0 16.576887624604538 20
average reward explore: 228.65107311880274
average reward exploit: MDP 1 -307.84492334345975 20
average reward exploit: MDP 1 -383.96434798348656 20
average reward exploit: MDP 1 -390.60449242570064 20
average reward exploit: MDP 1 -279.28742585685853 20
average reward exploit: MDP 1 -296.0112527865921 20
average reward exploit: MDP 1 -427.442436974707 20
average reward exploit: MDP 1 -266.4491958554944 20
average rew

In [89]:
for i in range(20):
    explore_paths, explore_rewards = a.sample_paths_explore(a.env,i)
    print("gravity:", a.sim.model.opt.gravity)
    print("average reward explore:", np.mean(explore_rewards))
    M = a.stack_trajectories(explore_paths)
    Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
    Z = np.reshape(Z,[1,len(Z)])
    exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)
    print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
    

KeyboardInterrupt: 

In [111]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)
print("average reward explore", np.sum(explore_rewards) / len(explore_rewards))

average reward explore 10096.764246907856


In [112]:
M = a.stack_trajectories(explore_paths)
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [113]:
Z = np.reshape(Z,[1,len(Z)])

In [114]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [115]:
print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit 1275.3226317979522 400


In [67]:
a.sim.model.opt.gravity[-1] = -18

In [None]:
a.num_traj = 800

In [167]:
a.sim.model.opt.gravity[-1] = -15

In [None]:
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [None]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [None]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

In [None]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1000)

In [None]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()