In [47]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
#env="InvertedPendulum-v2"
env = "Hopper-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 20
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 100
use_baseline = True



In [48]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [49]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 2
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.95
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards,new_rewards = [], [], [], [], [],[]
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    new_rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions),"new_rewards":np.array(new_rewards)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.3)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,10*np.random.rand())
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["new_rewards"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["new_obs"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            #print(advantages_explore.shape,new_observations_explore.shape)
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,10*np.random.rand())
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.sum(exploit_rewards) / num_traj, len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env, Test = True)
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z, Test = True)
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            print("epoch number: ", epoch)
            self.train_step()
            

In [50]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [51]:
a.train()

epoch number:  0
average reward explore: 28.170619892816614
average reward exploit: MDP 0 31.914395718928073 20
average reward exploit: MDP 0 62.82781149449869 20
average reward exploit: MDP 0 50.643726090773676 20
average reward exploit: MDP 0 52.813951758847075 20
average reward exploit: MDP 0 54.60915002274434 20
average reward exploit: MDP 0 60.17411719875082 20
average reward exploit: MDP 0 68.87978534891036 20
average reward exploit: MDP 0 78.1526570588571 20
average reward exploit: MDP 0 72.96218068589374 20
average reward exploit: MDP 0 67.3822658643837 20
average reward explore: 42.317980030223566
average reward exploit: MDP 1 60.26339791594768 20
average reward exploit: MDP 1 56.38519248925555 20
average reward exploit: MDP 1 61.461082184795885 20
average reward exploit: MDP 1 66.08101130647945 20
average reward exploit: MDP 1 66.8969520600822 20
average reward exploit: MDP 1 65.79469469916972 20
average reward exploit: MDP 1 67.52985774760852 20
average reward exploit: MDP 1

average reward exploit: MDP 4 85.20741123226767 20
average reward exploit: MDP 4 86.40620079585884 20
average reward exploit: MDP 4 87.45895003232837 20
average reward explore: 145.98720285510072
average reward exploit: MDP 5 105.45789048164973 20
average reward exploit: MDP 5 103.01736988434081 20
average reward exploit: MDP 5 105.17577760254558 20
average reward exploit: MDP 5 101.52456296367052 20
average reward exploit: MDP 5 90.9269567591885 20
average reward exploit: MDP 5 90.93021958476808 20
average reward exploit: MDP 5 78.47681268541625 20
average reward exploit: MDP 5 76.88972726772367 20
average reward exploit: MDP 5 73.92149973047407 20
average reward exploit: MDP 5 87.4630209148943 20
average reward explore: 164.7718515942167
average reward exploit: MDP 6 60.69560612403789 20
average reward exploit: MDP 6 59.51886915895217 20
average reward exploit: MDP 6 74.813936165702 20
average reward exploit: MDP 6 88.87808570832745 20
average reward exploit: MDP 6 95.41012010372991 

average reward exploit: MDP 9 49.865323222387694 20
average reward exploit: MDP 9 56.00314241035786 20
average reward exploit: MDP 9 59.94592000753412 20
average reward exploit: MDP 9 64.7791997329783 20
average reward exploit: MDP 9 67.25339061082158 20
average reward exploit: MDP 9 72.12328902487253 20
epoch number:  3
average reward explore: 157.12206379952744
average reward exploit: MDP 0 97.36541631599383 20
average reward exploit: MDP 0 93.19267065955873 20
average reward exploit: MDP 0 94.99821672743144 20
average reward exploit: MDP 0 95.755312864754 20
average reward exploit: MDP 0 94.02585529086927 20
average reward exploit: MDP 0 93.41987546296231 20
average reward exploit: MDP 0 94.82918382441449 20
average reward exploit: MDP 0 93.63139786764876 20
average reward exploit: MDP 0 95.04210612193825 20
average reward exploit: MDP 0 90.51305801561344 20
average reward explore: 160.97037610331523
average reward exploit: MDP 1 97.47001091110283 20
average reward exploit: MDP 1 99

average reward exploit: MDP 4 138.2967260886786 20
average reward exploit: MDP 4 142.4935680510243 20
average reward exploit: MDP 4 152.03210624411906 20
average reward exploit: MDP 4 158.4952694011985 20
average reward exploit: MDP 4 153.9556499297061 20
average reward exploit: MDP 4 175.5362459838215 20
average reward exploit: MDP 4 194.30017638453458 20
average reward exploit: MDP 4 219.7449114123051 20
average reward exploit: MDP 4 225.05724911639646 20
average reward explore: 172.93359094216734
average reward exploit: MDP 5 233.6135918062293 20
average reward exploit: MDP 5 239.79444114912798 20
average reward exploit: MDP 5 210.14749210701763 20
average reward exploit: MDP 5 175.22579683075804 20
average reward exploit: MDP 5 196.39619559497007 20
average reward exploit: MDP 5 190.7093161203045 20
average reward exploit: MDP 5 159.54814901595472 20
average reward exploit: MDP 5 195.87492563997745 20
average reward exploit: MDP 5 176.66607964073336 20
average reward exploit: MDP 5

average reward exploit: MDP 8 183.34811631268548 20
average reward explore: 180.51311738972063
average reward exploit: MDP 9 176.94008806466974 20
average reward exploit: MDP 9 182.32556999444768 20
average reward exploit: MDP 9 175.70696522223986 20
average reward exploit: MDP 9 178.86499285526446 20
average reward exploit: MDP 9 179.93634208597751 20
average reward exploit: MDP 9 172.1474849054686 20
average reward exploit: MDP 9 163.1489067848699 20
average reward exploit: MDP 9 166.4098211543165 20
average reward exploit: MDP 9 162.29909186877293 20
average reward exploit: MDP 9 148.7137873903528 20
epoch number:  6
average reward explore: 167.10556076767298
average reward exploit: MDP 0 215.66079965458465 20
average reward exploit: MDP 0 208.56077248258822 20
average reward exploit: MDP 0 203.73825383017314 20
average reward exploit: MDP 0 229.36364665703059 20
average reward exploit: MDP 0 214.92334940646407 20
average reward exploit: MDP 0 193.30310357175435 20
average reward ex

average reward exploit: MDP 3 204.79697716893932 20
average reward exploit: MDP 3 194.50658628157663 20
average reward exploit: MDP 3 197.05178725459243 20
average reward exploit: MDP 3 186.72710212395873 20
average reward exploit: MDP 3 186.9290110592714 20
average reward explore: 167.5530679147592
average reward exploit: MDP 4 195.5672115567215 20
average reward exploit: MDP 4 196.91636132524576 20
average reward exploit: MDP 4 193.4764712108156 20
average reward exploit: MDP 4 187.21496763059082 20
average reward exploit: MDP 4 176.08109577011413 20
average reward exploit: MDP 4 171.14043403272765 20
average reward exploit: MDP 4 160.9747884874726 20
average reward exploit: MDP 4 159.89180128289442 20
average reward exploit: MDP 4 157.03578971141872 20
average reward exploit: MDP 4 152.2542744991848 20
average reward explore: 163.20733079861367
average reward exploit: MDP 5 159.95913417436861 20
average reward exploit: MDP 5 179.48629589663284 20
average reward exploit: MDP 5 165.23

average reward exploit: MDP 8 181.89434152494462 20
average reward exploit: MDP 8 173.5738376247405 20
average reward exploit: MDP 8 175.50949714817256 20
average reward exploit: MDP 8 179.22312661668008 20
average reward exploit: MDP 8 179.90636356726029 20
average reward exploit: MDP 8 172.03427060120345 20
average reward exploit: MDP 8 174.6373191210534 20
average reward exploit: MDP 8 179.1351543288054 20
average reward exploit: MDP 8 174.6542126562294 20
average reward explore: 184.79988603813553
average reward exploit: MDP 9 167.37483866522015 20
average reward exploit: MDP 9 164.6811874207612 20
average reward exploit: MDP 9 152.84607588270165 20
average reward exploit: MDP 9 158.9743717164851 20
average reward exploit: MDP 9 147.40882249948825 20
average reward exploit: MDP 9 122.55980502997643 20
average reward exploit: MDP 9 138.0509543850458 20
average reward exploit: MDP 9 139.32177703638075 20
average reward exploit: MDP 9 150.44039539049314 20
average reward exploit: MDP 

average reward exploit: MDP 2 223.9125378491919 20
average reward exploit: MDP 2 234.89539272483927 20
average reward explore: 168.8265333758724
average reward exploit: MDP 3 226.3136169325986 20
average reward exploit: MDP 3 224.2245765809317 20
average reward exploit: MDP 3 225.43914790123677 20
average reward exploit: MDP 3 221.8763380575368 20
average reward exploit: MDP 3 224.22350683457944 20
average reward exploit: MDP 3 217.25521568984541 20
average reward exploit: MDP 3 223.29913463535735 20
average reward exploit: MDP 3 224.2392253386118 20
average reward exploit: MDP 3 219.3267958264693 20
average reward exploit: MDP 3 224.90606651269226 20
average reward explore: 174.64222115108032
average reward exploit: MDP 4 217.3781582200483 20
average reward exploit: MDP 4 216.90245001253024 20
average reward exploit: MDP 4 211.97518679144218 20
average reward exploit: MDP 4 217.30183828775444 20
average reward exploit: MDP 4 210.3028174196464 20
average reward exploit: MDP 4 213.32599

average reward exploit: MDP 7 206.72790701774733 20
average reward exploit: MDP 7 185.28338711513996 20
average reward exploit: MDP 7 182.6159230026138 20
average reward exploit: MDP 7 189.21342579925974 20
average reward exploit: MDP 7 186.97736048915826 20
average reward explore: 172.1283151025663
average reward exploit: MDP 8 174.62961493544003 20
average reward exploit: MDP 8 173.29474208242084 20
average reward exploit: MDP 8 169.5645448750575 20
average reward exploit: MDP 8 154.61504047038366 20
average reward exploit: MDP 8 165.67343013000362 20
average reward exploit: MDP 8 160.08127330845326 20
average reward exploit: MDP 8 155.75623895641826 20
average reward exploit: MDP 8 158.23807160273967 20
average reward exploit: MDP 8 150.18927772639248 20
average reward exploit: MDP 8 139.540588577317 20
average reward explore: 176.73792138180661
average reward exploit: MDP 9 141.10280852976229 20
average reward exploit: MDP 9 146.8759208361493 20
average reward exploit: MDP 9 134.48

average reward exploit: MDP 2 172.2004076982512 20
average reward exploit: MDP 2 197.68249701426058 20
average reward exploit: MDP 2 192.3673629588021 20
average reward exploit: MDP 2 200.9021380761952 20
average reward exploit: MDP 2 190.9840806393185 20
average reward exploit: MDP 2 220.42210177212033 20
average reward exploit: MDP 2 189.8124337294356 20
average reward exploit: MDP 2 205.66930228206496 20
average reward exploit: MDP 2 215.17872454402504 20
average reward explore: 163.86636194433558
average reward exploit: MDP 3 184.6968977136715 20
average reward exploit: MDP 3 177.70324131119497 20
average reward exploit: MDP 3 196.53731713194094 20
average reward exploit: MDP 3 173.0014688616942 20
average reward exploit: MDP 3 163.74009063012403 20
average reward exploit: MDP 3 175.53382162606846 20
average reward exploit: MDP 3 173.81534938287572 20
average reward exploit: MDP 3 169.16046216525137 20
average reward exploit: MDP 3 170.26617891395344 20
average reward exploit: MDP 

average reward exploit: MDP 6 203.1449648457177 20
average reward explore: 185.0047903932587
average reward exploit: MDP 7 197.40764329747668 20
average reward exploit: MDP 7 195.02566159460568 20
average reward exploit: MDP 7 198.53448076722844 20
average reward exploit: MDP 7 197.0157677252447 20
average reward exploit: MDP 7 196.61745976993925 20
average reward exploit: MDP 7 192.86294794670494 20
average reward exploit: MDP 7 197.5710438056621 20
average reward exploit: MDP 7 195.07201755886143 20
average reward exploit: MDP 7 196.67154587798308 20
average reward exploit: MDP 7 191.630581768327 20
average reward explore: 197.17210692799773
average reward exploit: MDP 8 232.9749044005069 20
average reward exploit: MDP 8 229.9127207063429 20
average reward exploit: MDP 8 229.68535147921912 20
average reward exploit: MDP 8 208.9344894608028 20
average reward exploit: MDP 8 219.24603505857948 20
average reward exploit: MDP 8 189.22140201156802 20
average reward exploit: MDP 8 197.45169

KeyboardInterrupt: 

In [112]:
a.lr = 3e-3
num_epochs = 50
for epoch in range(num_epochs):
        print("epoch number: ", epoch)
        a.train_step()

epoch number:  0
average reward explore: 210.21803166108037
average reward exploit: MDP 0 1.2102014499978386 20
average reward exploit: MDP 0 0.453440734965816 20
average reward exploit: MDP 0 0.4610745693248276 20
average reward exploit: MDP 0 0.9814852631956976 20
average reward exploit: MDP 0 0.6795388032283648 20
average reward exploit: MDP 0 0.6245450751715281 20
average reward exploit: MDP 0 1.4510826378393429 20
average reward exploit: MDP 0 1.810710538956704 20
average reward exploit: MDP 0 1.130289546216216 20
average reward exploit: MDP 0 2.4941984260498913 20
average reward explore: 527.9931821043476
average reward exploit: MDP 1 1.028904755019719 20
average reward exploit: MDP 1 1.0591985770693284 20
average reward exploit: MDP 1 1.6336284834311225 20
average reward exploit: MDP 1 1.8515118683949563 20
average reward exploit: MDP 1 1.4947950404341592 20
average reward exploit: MDP 1 0.9785628041426131 20
average reward exploit: MDP 1 1.0039998948866873 20
average reward exp

average reward exploit: MDP 4 1.892341254804031 20
average reward exploit: MDP 4 0.6151933375807191 20
average reward explore: 408.90876026130803
average reward exploit: MDP 5 0.9620166050930704 20
average reward exploit: MDP 5 1.0893781696267157 20
average reward exploit: MDP 5 1.3744874584530453 20
average reward exploit: MDP 5 0.6693206634728739 20
average reward exploit: MDP 5 1.95903062017022 20
average reward exploit: MDP 5 1.455284422400464 20
average reward exploit: MDP 5 0.6530484610954044 20
average reward exploit: MDP 5 1.2788438626498153 20
average reward exploit: MDP 5 2.088974852083587 20
average reward exploit: MDP 5 1.472029407726338 20
average reward explore: 496.7743838206603
average reward exploit: MDP 6 0.7208058948727898 20
average reward exploit: MDP 6 0.9969632769076637 20
average reward exploit: MDP 6 1.5613579126256885 20
average reward exploit: MDP 6 0.7319661465892877 20
average reward exploit: MDP 6 1.170060604405784 20
average reward exploit: MDP 6 0.959735

average reward exploit: MDP 9 2.4280427411122627 20
average reward exploit: MDP 9 2.365473188044566 20
average reward exploit: MDP 9 2.43380922705365 20
average reward exploit: MDP 9 2.401722032051028 20
epoch number:  3
average reward explore: 169.6350110264886
average reward exploit: MDP 0 1.6400464883268522 20
average reward exploit: MDP 0 1.8321292551815058 20
average reward exploit: MDP 0 1.6451016185255996 20
average reward exploit: MDP 0 1.9103526700646682 20
average reward exploit: MDP 0 1.7174099616282994 20
average reward exploit: MDP 0 2.1370213293939146 20
average reward exploit: MDP 0 1.819549388835044 20
average reward exploit: MDP 0 1.469480682007362 20
average reward exploit: MDP 0 1.8258458032351068 20
average reward exploit: MDP 0 1.8046657084381206 20
average reward explore: 159.30675700069006
average reward exploit: MDP 1 2.4268073633930602 20
average reward exploit: MDP 1 2.5402566873495127 20
average reward exploit: MDP 1 2.4519620311807246 20
average reward explo

average reward exploit: MDP 4 3.403125997536732 20
average reward exploit: MDP 4 3.5566032539101053 20
average reward exploit: MDP 4 3.3834263741079207 20
average reward exploit: MDP 4 3.3619894270478445 20
average reward exploit: MDP 4 3.511480233629831 20
average reward exploit: MDP 4 3.4149560002468275 20
average reward exploit: MDP 4 3.441350477792862 20
average reward exploit: MDP 4 3.3447540304983128 20
average reward explore: 342.3038140197932
average reward exploit: MDP 5 2.3451874150980387 20
average reward exploit: MDP 5 2.2958163434104217 20
average reward exploit: MDP 5 2.305713690001167 20
average reward exploit: MDP 5 2.2358238324216275 20
average reward exploit: MDP 5 2.321048256171623 20
average reward exploit: MDP 5 2.2905110954139842 20
average reward exploit: MDP 5 2.3416156092194536 20
average reward exploit: MDP 5 2.3000127655649196 20
average reward exploit: MDP 5 2.322802796725919 20
average reward exploit: MDP 5 2.337615426999576 20
average reward explore: 320.4

average reward explore: 495.5990284194425
average reward exploit: MDP 9 20.38862433010374 20
average reward exploit: MDP 9 22.077216988779412 20
average reward exploit: MDP 9 22.523367773751854 20
average reward exploit: MDP 9 23.487766451758453 20
average reward exploit: MDP 9 24.09179129343089 20
average reward exploit: MDP 9 37.202614366173705 20
average reward exploit: MDP 9 31.72998691533694 20
average reward exploit: MDP 9 48.98214812092432 20
average reward exploit: MDP 9 52.92493657926849 20
average reward exploit: MDP 9 53.614368219538925 20
epoch number:  6
average reward explore: 194.9523024613585
average reward exploit: MDP 0 43.25874818638217 20
average reward exploit: MDP 0 56.15548733544739 20
average reward exploit: MDP 0 46.82735025538089 20
average reward exploit: MDP 0 56.46956778350277 20
average reward exploit: MDP 0 56.17843339960486 20
average reward exploit: MDP 0 22.511503168935995 20
average reward exploit: MDP 0 2.3490682225514274 20
average reward exploit: M

average reward exploit: MDP 3 1.2746015386143759 20
average reward exploit: MDP 3 1.2806859365011802 20
average reward exploit: MDP 3 1.304001899234302 20
average reward exploit: MDP 3 1.2554677626432904 20
average reward explore: 412.4223886744538
average reward exploit: MDP 4 1.6243700531051188 20
average reward exploit: MDP 4 1.6805583815329572 20
average reward exploit: MDP 4 1.7545495385549021 20
average reward exploit: MDP 4 1.6812349895741867 20
average reward exploit: MDP 4 1.714189653305579 20
average reward exploit: MDP 4 1.6340835569545853 20
average reward exploit: MDP 4 1.7716763065219971 20
average reward exploit: MDP 4 1.734883170322334 20
average reward exploit: MDP 4 1.7266156876719219 20
average reward exploit: MDP 4 1.7543126721679072 20
average reward explore: 450.38404283095207
average reward exploit: MDP 5 0.7485747389503835 20
average reward exploit: MDP 5 0.6952514283511276 20
average reward exploit: MDP 5 0.673971686402865 20
average reward exploit: MDP 5 0.725

average reward exploit: MDP 8 0.9846505977216358 20
average reward exploit: MDP 8 1.0368306651560855 20
average reward exploit: MDP 8 1.076781329019567 20
average reward exploit: MDP 8 1.0494972077975029 20
average reward exploit: MDP 8 1.061259112230238 20
average reward exploit: MDP 8 0.9940957748304584 20
average reward exploit: MDP 8 1.069406370137734 20
average reward exploit: MDP 8 1.1427779453125395 20
average reward explore: 457.54085467259574
average reward exploit: MDP 9 2.0172817066712314 20
average reward exploit: MDP 9 1.950507261553034 20
average reward exploit: MDP 9 2.0743022669568356 20
average reward exploit: MDP 9 2.1630025097893033 20
average reward exploit: MDP 9 2.273478082961082 20
average reward exploit: MDP 9 2.3489860657984236 20
average reward exploit: MDP 9 2.445148903578305 20
average reward exploit: MDP 9 2.42506509457763 20
average reward exploit: MDP 9 2.4874419355374124 20
average reward exploit: MDP 9 2.5166190991307693 20
epoch number:  9
average rewa

average reward exploit: MDP 2 2.6730572252012506 20
average reward explore: 378.6837831408881
average reward exploit: MDP 3 2.6444020905555368 20
average reward exploit: MDP 3 2.6240383755412684 20
average reward exploit: MDP 3 2.717641978805227 20
average reward exploit: MDP 3 2.7028159571732315 20
average reward exploit: MDP 3 2.717649539228572 20
average reward exploit: MDP 3 2.8000397933438417 20
average reward exploit: MDP 3 2.9061827384067014 20
average reward exploit: MDP 3 2.9032471413162173 20
average reward exploit: MDP 3 2.670976248906167 20
average reward exploit: MDP 3 2.909482096152418 20
average reward explore: 482.7668487263536
average reward exploit: MDP 4 2.4843336386157153 20
average reward exploit: MDP 4 2.528408688634645 20
average reward exploit: MDP 4 2.5240636232206737 20
average reward exploit: MDP 4 2.6290099691082705 20
average reward exploit: MDP 4 2.6633508767338703 20
average reward exploit: MDP 4 2.655123993733436 20
average reward exploit: MDP 4 2.703963

average reward exploit: MDP 7 3.1599947784904368 20
average reward exploit: MDP 7 3.2398893065757632 20
average reward exploit: MDP 7 3.372783051269092 20
average reward exploit: MDP 7 3.3842376855181144 20
average reward explore: 344.24214390558626
average reward exploit: MDP 8 3.003881087279281 20
average reward exploit: MDP 8 3.033039169862197 20
average reward exploit: MDP 8 3.0994154367323796 20
average reward exploit: MDP 8 3.001458012609007 20
average reward exploit: MDP 8 2.9530395884976737 20
average reward exploit: MDP 8 3.0107008719903527 20
average reward exploit: MDP 8 3.0687795487392173 20
average reward exploit: MDP 8 2.9926643079858977 20
average reward exploit: MDP 8 3.1755292312073853 20
average reward exploit: MDP 8 2.992924600987851 20
average reward explore: 465.37704273993313
average reward exploit: MDP 9 3.625313764703062 20
average reward exploit: MDP 9 3.8367369982121695 20
average reward exploit: MDP 9 3.6882473385858296 20
average reward exploit: MDP 9 3.8220

KeyboardInterrupt: 

In [103]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)

In [104]:
print("average reward explore:", np.mean(explore_rewards), len(explore_rewards))
M = a.stack_trajectories(explore_paths)

average reward explore: 427.84420643869714 40


In [105]:
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [106]:
Z = np.reshape(Z,[1,len(Z)])

In [107]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [108]:
print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit 22.049929724727424 20


In [111]:
a.num_traj = 20

In [83]:
for i in range(10):
    maxrev2 = 0
    maxrev1 = 0
    traj = np.random.rand()*(i+1)
    
    for i in range(4):
        explore_paths, explore_rewards = a.sample_paths_explore(a.env,traj)
        #print("average reward explore:", np.mean(explore_rewards))
        M = a.stack_trajectories(explore_paths)
        Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
        Z = np.reshape(Z,[1,len(Z)])
        exploit_paths, explore_rewards = a.sample_paths_exploit(a.env,Z,traj)
        if(np.mean(explore_rewards) > maxrev1):
            maxrev1 = np.mean(explore_rewards) / num_traj
        if(np.mean(exploit_rewards)>maxrev2):
            maxrev2 = np.mean(exploit_rewards) / num_traj
    print("gravity:", a.sim.model.opt.gravity)
    print("average reward explore", maxrev1)
    print("average reward exploit", maxrev2)
    

gravity: [ 0.         0.        -5.4325712]
average reward explore 3.5619071147766848
average reward exploit 2.7301699712255383


KeyboardInterrupt: 

In [90]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,5)
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [91]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [92]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

average reward explore: MDP 20488.85105031593 800


In [93]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,5)

In [94]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit: MDP 644.4290518284578 400


In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()

In [7]:
a.sim.model.opt

<mujoco_py.cymj.PyMjOption at 0x1c23ba8ac8>