In [136]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
env="InvertedPendulum-v2"
#env = "Hopper-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 10
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 10
use_baseline = True



In [137]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [138]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 2
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.95
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards,new_rewards = [], [], [], [], [],[]
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    new_rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions),"new_rewards":np.array(new_rewards)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.3)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,10*np.random.rand())
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["new_rewards"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["new_obs"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            #print(advantages_explore.shape,new_observations_explore.shape)
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,10*np.random.rand())
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.sum(exploit_rewards) / num_traj, len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env,20*np.random.rand())
        print("gravity: ",self.sim.model.opt.gravity )
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,20*np.random.rand())
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            self.num_traj = 20
            print("epoch number: ", epoch)
            self.train_step()
            print("evaluating on new MDPs")
            self.num_traj = 400
            self.test()

            
            

In [139]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [140]:
a.train()

epoch number:  0
average reward explore: 7.433057022770922
average reward exploit: MDP 0 13.146825934476633 20
average reward exploit: MDP 0 37.918129129707715 20
average reward exploit: MDP 0 54.49108160179552 20
average reward exploit: MDP 0 66.36915720000206 20
average reward exploit: MDP 0 67.02055270531001 20
average reward exploit: MDP 0 77.99376205030164 20
average reward exploit: MDP 0 76.33559053099799 20
average reward exploit: MDP 0 76.20290403605632 20
average reward exploit: MDP 0 72.80569838767053 20
average reward exploit: MDP 0 83.09025679225665 20
average reward explore: 10.023697805549068
average reward exploit: MDP 1 35.174874416523984 20
average reward exploit: MDP 1 48.30036971181886 20
average reward exploit: MDP 1 63.37136103682455 20
average reward exploit: MDP 1 68.72944317678463 20
average reward exploit: MDP 1 64.58220323068957 20
average reward exploit: MDP 1 59.802738744713636 20
average reward exploit: MDP 1 66.75063745115361 20
average reward exploit: MDP

average reward exploit: MDP 4 37.91554004365383 20
average reward exploit: MDP 4 37.715788154869905 20
average reward exploit: MDP 4 37.820646269472604 20
average reward exploit: MDP 4 37.58602610761451 20
average reward exploit: MDP 4 37.49671001778866 20
average reward explore: 64.17152019505104
average reward exploit: MDP 5 37.645485733304554 20
average reward exploit: MDP 5 37.99572518308127 20
average reward exploit: MDP 5 37.591439671608335 20
average reward exploit: MDP 5 37.021231365880325 20
average reward exploit: MDP 5 37.27947072460943 20
average reward exploit: MDP 5 37.37662795296653 20
average reward exploit: MDP 5 37.97903007381281 20
average reward exploit: MDP 5 36.71246626474816 20
average reward exploit: MDP 5 37.10821975800601 20
average reward exploit: MDP 5 36.988151847088076 20
average reward explore: 69.45749829925246
average reward exploit: MDP 6 35.329494812075524 20
average reward exploit: MDP 6 34.657552096243755 20
average reward exploit: MDP 6 35.17051327

average reward exploit: MDP 9 20.23383545617598 20
average reward exploit: MDP 9 19.64334830450773 20
average reward exploit: MDP 9 17.02804363865916 20
average reward exploit: MDP 9 16.939879336205166 20
average reward exploit: MDP 9 16.318936640052762 20
average reward exploit: MDP 9 15.899041052240563 20
average reward exploit: MDP 9 14.667128640043842 20
average reward exploit: MDP 9 15.018694753192978 20
average reward exploit: MDP 9 17.068805135736483 20
average reward exploit: MDP 9 17.3951878496376 20
evaluating on new MDPs
gravity:  [  0.           0.         -24.39416799]
average reward exploit 211.6998216426823 400
epoch number:  3
average reward explore: 112.7451024771945
average reward exploit: MDP 0 18.562053713463197 20
average reward exploit: MDP 0 19.082665946633718 20
average reward exploit: MDP 0 18.699506109862877 20
average reward exploit: MDP 0 18.183753284597355 20
average reward exploit: MDP 0 18.959293833461096 20
average reward exploit: MDP 0 18.26636036219680

average reward exploit: MDP 3 80.93431584595687 20
average reward exploit: MDP 3 92.89986743201814 20
average reward exploit: MDP 3 99.60624486824688 20
average reward exploit: MDP 3 88.5217702360836 20
average reward exploit: MDP 3 96.04849541178245 20
average reward exploit: MDP 3 94.91055003952154 20
average reward exploit: MDP 3 113.23605585941645 20
average reward explore: 153.5405946075899
average reward exploit: MDP 4 96.63746993672339 20
average reward exploit: MDP 4 74.76682710460129 20
average reward exploit: MDP 4 34.62500942589418 20
average reward exploit: MDP 4 14.8043309941804 20
average reward exploit: MDP 4 13.110294125128817 20
average reward exploit: MDP 4 15.592147836972671 20
average reward exploit: MDP 4 7.5305573960200975 20
average reward exploit: MDP 4 11.164417031091023 20
average reward exploit: MDP 4 11.127456254661942 20
average reward exploit: MDP 4 9.469068954105817 20
average reward explore: 168.28501487964817
average reward exploit: MDP 5 11.38027039480

average reward exploit: MDP 7 63.39326254962327 20
average reward explore: 163.05340067943007
average reward exploit: MDP 8 64.9038545157216 20
average reward exploit: MDP 8 70.18926469979716 20
average reward exploit: MDP 8 65.61775274956702 20
average reward exploit: MDP 8 74.52719346971966 20
average reward exploit: MDP 8 67.97620484065095 20
average reward exploit: MDP 8 69.37637511096732 20
average reward exploit: MDP 8 73.28756493026331 20
average reward exploit: MDP 8 68.96333998834203 20
average reward exploit: MDP 8 73.63853272191345 20
average reward exploit: MDP 8 75.50361112040578 20
average reward explore: 158.9359764327874
average reward exploit: MDP 9 75.02561677245143 20
average reward exploit: MDP 9 80.42993609178875 20
average reward exploit: MDP 9 76.6863869952492 20
average reward exploit: MDP 9 80.0377971701111 20
average reward exploit: MDP 9 86.14947624059423 20
average reward exploit: MDP 9 77.16424439279231 20
average reward exploit: MDP 9 79.54499151937912 20


average reward exploit: MDP 2 75.40597769430477 20
average reward exploit: MDP 2 65.00176830640075 20
average reward exploit: MDP 2 81.77708674353967 20
average reward exploit: MDP 2 79.74556582737848 20
average reward exploit: MDP 2 78.15959890670413 20
average reward exploit: MDP 2 75.10446747065653 20
average reward exploit: MDP 2 75.7937385839902 20
average reward explore: 153.72514669603453
average reward exploit: MDP 3 63.76684199899235 20
average reward exploit: MDP 3 51.45255515682025 20
average reward exploit: MDP 3 61.60676724683282 20
average reward exploit: MDP 3 51.63027086992231 20
average reward exploit: MDP 3 56.445098359742225 20
average reward exploit: MDP 3 53.307958697749044 20
average reward exploit: MDP 3 59.70686672965193 20
average reward exploit: MDP 3 53.838538093306 20
average reward exploit: MDP 3 49.50498520158534 20
average reward exploit: MDP 3 51.987430251036315 20
average reward explore: 159.3200909252706
average reward exploit: MDP 4 41.3918828518801 2

average reward explore: 166.104156574672
average reward exploit: MDP 7 66.80026759699506 20
average reward exploit: MDP 7 68.47037623268004 20
average reward exploit: MDP 7 66.49481103219196 20
average reward exploit: MDP 7 72.47893359446893 20
average reward exploit: MDP 7 76.69304610841579 20
average reward exploit: MDP 7 69.0023595104888 20
average reward exploit: MDP 7 81.68693120286396 20
average reward exploit: MDP 7 78.72842162640528 20
average reward exploit: MDP 7 81.54920530040214 20
average reward exploit: MDP 7 89.64945370898263 20
average reward explore: 162.10086979235658
average reward exploit: MDP 8 72.86583959613259 20
average reward exploit: MDP 8 72.00885646550665 20
average reward exploit: MDP 8 70.34823701018128 20
average reward exploit: MDP 8 73.34716752931556 20
average reward exploit: MDP 8 70.07578970493783 20
average reward exploit: MDP 8 91.90950482458827 20
average reward exploit: MDP 8 94.5213676036898 20
average reward exploit: MDP 8 105.27465486353523 20

average reward exploit: MDP 1 43.54155927630619 20
average reward exploit: MDP 1 42.34960180072886 20
average reward exploit: MDP 1 42.357442074009064 20
average reward exploit: MDP 1 40.260911521019096 20
average reward exploit: MDP 1 38.0847162735134 20
average reward exploit: MDP 1 38.312731365598296 20
average reward exploit: MDP 1 38.642568768567195 20
average reward explore: 161.48604277905332
average reward exploit: MDP 2 41.48969032171456 20
average reward exploit: MDP 2 43.87234327303268 20
average reward exploit: MDP 2 43.895252865387164 20
average reward exploit: MDP 2 44.500894433445936 20
average reward exploit: MDP 2 44.546465864403274 20
average reward exploit: MDP 2 45.41875580365498 20
average reward exploit: MDP 2 45.72150797357911 20
average reward exploit: MDP 2 46.13662821057208 20
average reward exploit: MDP 2 47.302518788408776 20
average reward exploit: MDP 2 47.8169321469213 20
average reward explore: 156.07326409004216
average reward exploit: MDP 3 45.94733652

average reward exploit: MDP 5 41.67720956268418 20
average reward explore: 159.77027655323437
average reward exploit: MDP 6 34.26261775356174 20
average reward exploit: MDP 6 34.42151919204476 20
average reward exploit: MDP 6 34.43296111760626 20
average reward exploit: MDP 6 34.80252422571202 20
average reward exploit: MDP 6 35.06856634274046 20
average reward exploit: MDP 6 35.16961271053604 20
average reward exploit: MDP 6 35.70149717666867 20
average reward exploit: MDP 6 35.30525621873711 20
average reward exploit: MDP 6 36.720593731009686 20
average reward exploit: MDP 6 36.95946420198648 20
average reward explore: 164.69066710852036
average reward exploit: MDP 7 39.015376714233746 20
average reward exploit: MDP 7 37.32299435780098 20
average reward exploit: MDP 7 38.91971311498771 20
average reward exploit: MDP 7 38.57600505591135 20
average reward exploit: MDP 7 38.95012118479484 20
average reward exploit: MDP 7 41.490237865256006 20
average reward exploit: MDP 7 40.21305092187

average reward exploit: MDP 0 57.32505027088281 20
average reward exploit: MDP 0 62.09785805627555 20
average reward exploit: MDP 0 63.32713488617774 20
average reward exploit: MDP 0 61.363736425123 20
average reward exploit: MDP 0 59.60153482360867 20
average reward exploit: MDP 0 59.637010086865565 20
average reward exploit: MDP 0 59.80903533083212 20
average reward exploit: MDP 0 59.118056061941864 20
average reward explore: 163.8729871960482
average reward exploit: MDP 1 58.479836641594076 20
average reward exploit: MDP 1 64.55849288169813 20
average reward exploit: MDP 1 61.49830112774531 20
average reward exploit: MDP 1 60.39648953001292 20
average reward exploit: MDP 1 63.924806257703494 20
average reward exploit: MDP 1 62.482502541379134 20
average reward exploit: MDP 1 65.19547255102653 20
average reward exploit: MDP 1 64.04856939668068 20
average reward exploit: MDP 1 66.7924702799043 20
average reward exploit: MDP 1 68.53359174194487 20
average reward explore: 155.9037293649

average reward exploit: MDP 4 54.2743159687213 20
average reward explore: 156.83778349410423
average reward exploit: MDP 5 63.67374456414243 20
average reward exploit: MDP 5 61.46414565660702 20
average reward exploit: MDP 5 64.59370253900397 20
average reward exploit: MDP 5 64.53061102415498 20
average reward exploit: MDP 5 59.70284854753157 20
average reward exploit: MDP 5 61.47556995812513 20
average reward exploit: MDP 5 65.19452151559908 20
average reward exploit: MDP 5 63.38512971893895 20
average reward exploit: MDP 5 67.95623910869729 20
average reward exploit: MDP 5 62.50392783098795 20
average reward explore: 167.75481085690018
average reward exploit: MDP 6 67.04221442412323 20
average reward exploit: MDP 6 70.92395751297435 20
average reward exploit: MDP 6 71.99878827238055 20
average reward exploit: MDP 6 68.37019130028376 20
average reward exploit: MDP 6 77.34405349954008 20
average reward exploit: MDP 6 69.18807450304743 20
average reward exploit: MDP 6 77.61105962508029 

average reward exploit: MDP 9 49.82148196760513 20
average reward exploit: MDP 9 46.8035057538476 20
average reward exploit: MDP 9 44.81026886803018 20
average reward exploit: MDP 9 42.80955072336758 20
average reward exploit: MDP 9 40.47747947096521 20
evaluating on new MDPs
gravity:  [ 0.          0.         -8.77579698]
average reward exploit 761.8175648742592 400
epoch number:  16
average reward explore: 168.49479789132803
average reward exploit: MDP 0 39.70560338048385 20
average reward exploit: MDP 0 38.205233516801734 20
average reward exploit: MDP 0 37.393125785929406 20
average reward exploit: MDP 0 37.23245094200529 20
average reward exploit: MDP 0 35.66383018747951 20
average reward exploit: MDP 0 37.729724519474175 20
average reward exploit: MDP 0 38.67197663463912 20
average reward exploit: MDP 0 37.418577784892804 20
average reward exploit: MDP 0 36.60291035357863 20
average reward exploit: MDP 0 36.51634640593029 20
average reward explore: 168.8105820344918
average rewar

average reward exploit: MDP 3 24.93066857144666 20
average reward exploit: MDP 3 26.331458327608665 20
average reward explore: 167.98540533777722
average reward exploit: MDP 4 27.36690389232873 20
average reward exploit: MDP 4 28.33669429972404 20
average reward exploit: MDP 4 27.2637282708469 20
average reward exploit: MDP 4 28.448053798616332 20
average reward exploit: MDP 4 26.909894848765237 20
average reward exploit: MDP 4 27.725098991554926 20
average reward exploit: MDP 4 27.103020903678196 20
average reward exploit: MDP 4 26.96985962991935 20
average reward exploit: MDP 4 27.874483725410396 20
average reward exploit: MDP 4 27.205867096986953 20
average reward explore: 167.2950678035543
average reward exploit: MDP 5 27.640687085494772 20
average reward exploit: MDP 5 26.44228539657082 20
average reward exploit: MDP 5 26.934356814649487 20
average reward exploit: MDP 5 26.963415042794814 20
average reward exploit: MDP 5 27.689184939534254 20
average reward exploit: MDP 5 27.90942

average reward exploit: MDP 8 34.33291139576389 20
average reward exploit: MDP 8 34.60073962684692 20
average reward exploit: MDP 8 34.517342058045 20
average reward exploit: MDP 8 35.38892978872557 20
average reward exploit: MDP 8 34.20193776597195 20
average reward exploit: MDP 8 34.08133904033718 20
average reward exploit: MDP 8 34.87433734163741 20
average reward explore: 171.93268196416798
average reward exploit: MDP 9 35.637854144088784 20
average reward exploit: MDP 9 34.9972063652993 20
average reward exploit: MDP 9 34.92062216035944 20
average reward exploit: MDP 9 35.28021645431549 20
average reward exploit: MDP 9 35.1874060346006 20
average reward exploit: MDP 9 35.134201015618025 20
average reward exploit: MDP 9 35.47569476232679 20
average reward exploit: MDP 9 36.11832293739291 20
average reward exploit: MDP 9 35.44517831527182 20
average reward exploit: MDP 9 35.76982364129117 20
evaluating on new MDPs
gravity:  [  0.           0.         -20.84715559]
average reward exp

average reward exploit: MDP 2 41.22760326967004 20
average reward exploit: MDP 2 41.23027986878323 20
average reward exploit: MDP 2 41.53967102796727 20
average reward exploit: MDP 2 41.723696201035104 20
average reward explore: 157.24035693607732
average reward exploit: MDP 3 37.5299027491751 20
average reward exploit: MDP 3 36.23722676149635 20
average reward exploit: MDP 3 37.25596720226796 20
average reward exploit: MDP 3 38.52649897858381 20
average reward exploit: MDP 3 37.6747709009005 20
average reward exploit: MDP 3 38.617530357331695 20
average reward exploit: MDP 3 37.77767971839188 20
average reward exploit: MDP 3 37.46153754403341 20
average reward exploit: MDP 3 38.195273697844485 20
average reward exploit: MDP 3 38.66291360615311 20
average reward explore: 168.16992850487804
average reward exploit: MDP 4 39.592773697466264 20
average reward exploit: MDP 4 40.18641046103089 20
average reward exploit: MDP 4 39.51097384815236 20
average reward exploit: MDP 4 39.496943977255

average reward exploit: MDP 7 39.02140593446299 20
average reward exploit: MDP 7 38.503723320403154 20
average reward exploit: MDP 7 38.299259352015014 20
average reward exploit: MDP 7 38.30161839373763 20
average reward exploit: MDP 7 38.85739236529387 20
average reward exploit: MDP 7 39.19123708737833 20
average reward exploit: MDP 7 39.916509321696395 20
average reward exploit: MDP 7 40.01219560130402 20
average reward exploit: MDP 7 40.73189420352311 20
average reward explore: 170.85958984304528
average reward exploit: MDP 8 39.378973957789356 20
average reward exploit: MDP 8 38.80592120347238 20
average reward exploit: MDP 8 38.220119804589004 20
average reward exploit: MDP 8 35.74861540874578 20
average reward exploit: MDP 8 34.59829464165317 20
average reward exploit: MDP 8 33.28436790606996 20
average reward exploit: MDP 8 32.5858970329668 20
average reward exploit: MDP 8 32.595461424619955 20
average reward exploit: MDP 8 32.028682470992706 20
average reward exploit: MDP 8 31.

average reward exploit: MDP 1 41.4784159752034 20
average reward exploit: MDP 1 41.349546070343315 20
average reward exploit: MDP 1 42.97673044861262 20
average reward exploit: MDP 1 43.36440908054181 20
average reward exploit: MDP 1 43.058322715077836 20
average reward explore: 156.53830628330124
average reward exploit: MDP 2 41.69536791457493 20
average reward exploit: MDP 2 44.275685019566694 20
average reward exploit: MDP 2 43.81041615771379 20
average reward exploit: MDP 2 40.5471655311143 20
average reward exploit: MDP 2 41.571859219089454 20
average reward exploit: MDP 2 40.88285648345364 20
average reward exploit: MDP 2 43.761617642362886 20
average reward exploit: MDP 2 49.50725538177778 20
average reward exploit: MDP 2 41.63323483581625 20
average reward exploit: MDP 2 41.77994131065187 20
average reward explore: 169.35224881381765
average reward exploit: MDP 3 35.68182948716016 20
average reward exploit: MDP 3 38.55684425592642 20
average reward exploit: MDP 3 35.57543385487

average reward exploit: MDP 6 1.505425672763748 20
average reward exploit: MDP 6 0.2869987453055206 20
average reward exploit: MDP 6 0.6341107306335163 20
average reward exploit: MDP 6 2.2722880853234595 20
average reward exploit: MDP 6 1.0705042206002748 20
average reward exploit: MDP 6 4.680474046338613 20
average reward exploit: MDP 6 2.444033043163931 20
average reward exploit: MDP 6 0.5251714377000923 20
average reward exploit: MDP 6 2.2943074717882084 20
average reward exploit: MDP 6 1.9815142093784979 20
average reward explore: 158.81304146745845
average reward exploit: MDP 7 11.615742239893711 20
average reward exploit: MDP 7 14.58413848887183 20
average reward exploit: MDP 7 14.915196678098695 20
average reward exploit: MDP 7 13.527014555804675 20
average reward exploit: MDP 7 15.825676114219018 20
average reward exploit: MDP 7 15.681833303178866 20
average reward exploit: MDP 7 17.73759299855023 20
average reward exploit: MDP 7 21.18604274430023 20
average reward exploit: MDP

average reward exploit: MDP 0 34.99698033274989 20
average reward exploit: MDP 0 34.2050878522696 20
average reward exploit: MDP 0 34.3899758196884 20
average reward exploit: MDP 0 34.49595033678408 20
average reward exploit: MDP 0 35.29325723600045 20
average reward exploit: MDP 0 35.7943511157108 20
average reward exploit: MDP 0 35.632575981467944 20
average reward exploit: MDP 0 35.40702248676756 20
average reward explore: 287.4936147264124
average reward exploit: MDP 1 35.6268933449884 20
average reward exploit: MDP 1 35.239420287051566 20
average reward exploit: MDP 1 36.053457118177114 20
average reward exploit: MDP 1 38.80110985412615 20
average reward exploit: MDP 1 38.46403959059414 20
average reward exploit: MDP 1 37.44576007848054 20
average reward exploit: MDP 1 39.365901182384064 20
average reward exploit: MDP 1 42.197674051347676 20
average reward exploit: MDP 1 43.54385351549777 20
average reward exploit: MDP 1 46.286447046537994 20
average reward explore: 297.7987546047

average reward exploit: MDP 4 34.280011885303935 20
average reward exploit: MDP 4 33.642341131263755 20
average reward explore: 326.45503183934255
average reward exploit: MDP 5 33.32999027351874 20
average reward exploit: MDP 5 33.49627059463729 20
average reward exploit: MDP 5 31.63142541661646 20
average reward exploit: MDP 5 34.508094691440746 20
average reward exploit: MDP 5 33.99291332640284 20
average reward exploit: MDP 5 32.0089918882078 20
average reward exploit: MDP 5 32.70191936694836 20
average reward exploit: MDP 5 32.910423941106956 20
average reward exploit: MDP 5 32.33777935184147 20
average reward exploit: MDP 5 32.07336931082394 20
average reward explore: 272.27178374711104
average reward exploit: MDP 6 33.858261368085415 20
average reward exploit: MDP 6 33.00589305769782 20
average reward exploit: MDP 6 33.285796798124615 20
average reward exploit: MDP 6 33.10537501647458 20
average reward exploit: MDP 6 32.968112760441 20
average reward exploit: MDP 6 32.72006304988

average reward exploit: MDP 9 34.1499808718619 20
average reward exploit: MDP 9 35.67213309536241 20
average reward exploit: MDP 9 35.31763101564226 20
average reward exploit: MDP 9 33.76400108620416 20
average reward exploit: MDP 9 34.89221369017465 20
average reward exploit: MDP 9 33.00006659366564 20
average reward exploit: MDP 9 32.31623271547929 20
evaluating on new MDPs
gravity:  [ 0.          0.         -5.62448345]
average reward exploit 510.6607101514108 400
epoch number:  29
average reward explore: 357.1884911610629
average reward exploit: MDP 0 31.663024720285808 20
average reward exploit: MDP 0 31.626005188189236 20
average reward exploit: MDP 0 30.707646120392116 20
average reward exploit: MDP 0 28.69473810764016 20
average reward exploit: MDP 0 28.125159942298193 20
average reward exploit: MDP 0 28.062893822818257 20
average reward exploit: MDP 0 27.26961436999079 20
average reward exploit: MDP 0 28.537344805029978 20
average reward exploit: MDP 0 27.142868159571275 20
av

average reward exploit: MDP 3 28.064709371642767 20
average reward exploit: MDP 3 27.75780705183744 20
average reward exploit: MDP 3 28.8276741444434 20
average reward exploit: MDP 3 28.299898553859105 20
average reward explore: 316.92172788746046
average reward exploit: MDP 4 30.339546057667672 20
average reward exploit: MDP 4 31.28743431502546 20
average reward exploit: MDP 4 31.229234019147462 20
average reward exploit: MDP 4 30.330928290770487 20
average reward exploit: MDP 4 32.22531774940582 20
average reward exploit: MDP 4 31.72567200559659 20
average reward exploit: MDP 4 31.07968338258478 20
average reward exploit: MDP 4 31.823804866011024 20
average reward exploit: MDP 4 32.30139886345727 20
average reward exploit: MDP 4 31.96404094158866 20
average reward explore: 351.00701960139713
average reward exploit: MDP 5 32.425240839619065 20
average reward exploit: MDP 5 31.619036277849155 20
average reward exploit: MDP 5 31.92734324472866 20
average reward exploit: MDP 5 32.9421613

average reward exploit: MDP 8 23.3195925574937 20
average reward exploit: MDP 8 23.617595327066834 20
average reward exploit: MDP 8 23.726778742802054 20
average reward exploit: MDP 8 24.944856386225446 20
average reward exploit: MDP 8 26.371635663336928 20
average reward exploit: MDP 8 27.55467703165619 20
average reward exploit: MDP 8 29.909192866848986 20
average reward exploit: MDP 8 30.16099356262103 20
average reward exploit: MDP 8 31.495535864646126 20
average reward explore: 268.85563109908236
average reward exploit: MDP 9 25.148183654805035 20
average reward exploit: MDP 9 24.501950724121198 20
average reward exploit: MDP 9 23.01285110374491 20
average reward exploit: MDP 9 25.20793139725519 20
average reward exploit: MDP 9 27.198328970731364 20
average reward exploit: MDP 9 26.68603238650615 20
average reward exploit: MDP 9 28.758153181360548 20
average reward exploit: MDP 9 31.471180100085043 20
average reward exploit: MDP 9 32.03214268836436 20
average reward exploit: MDP 9

average reward exploit: MDP 2 23.94667728430923 20
average reward exploit: MDP 2 23.09405546604201 20
average reward exploit: MDP 2 23.374577582397684 20
average reward exploit: MDP 2 21.955005142023396 20
average reward exploit: MDP 2 22.25694305409942 20
average reward explore: 333.73386485634063
average reward exploit: MDP 3 34.1136012111176 20
average reward exploit: MDP 3 33.473572687414446 20
average reward exploit: MDP 3 32.685860886340535 20
average reward exploit: MDP 3 32.45352040112314 20
average reward exploit: MDP 3 32.0527190706443 20
average reward exploit: MDP 3 31.166429189851225 20
average reward exploit: MDP 3 32.44313024128881 20
average reward exploit: MDP 3 32.570949817473576 20
average reward exploit: MDP 3 32.42994142818515 20
average reward exploit: MDP 3 30.2391660732195 20
average reward explore: 327.36590491872437
average reward exploit: MDP 4 28.656274174880004 20
average reward exploit: MDP 4 28.414528583479136 20
average reward exploit: MDP 4 27.965810604

average reward exploit: MDP 7 43.208404435778526 20
average reward exploit: MDP 7 42.32729784079149 20
average reward exploit: MDP 7 40.67202569727151 20
average reward exploit: MDP 7 42.7839487701895 20
average reward exploit: MDP 7 42.74666462567579 20
average reward exploit: MDP 7 41.37216931344923 20
average reward exploit: MDP 7 43.64292943830511 20
average reward exploit: MDP 7 41.357098316373694 20
average reward exploit: MDP 7 42.74262156815323 20
average reward exploit: MDP 7 43.109872067025734 20
average reward explore: 258.539003042417
average reward exploit: MDP 8 38.053741567463625 20
average reward exploit: MDP 8 38.64984645252467 20
average reward exploit: MDP 8 38.704228099447846 20
average reward exploit: MDP 8 38.00133628609101 20
average reward exploit: MDP 8 38.894605003000876 20
average reward exploit: MDP 8 39.14109743398012 20
average reward exploit: MDP 8 38.52250161785337 20
average reward exploit: MDP 8 39.849258023170684 20
average reward exploit: MDP 8 39.05

average reward exploit: MDP 1 33.63961703973821 20
average reward exploit: MDP 1 32.33534995821789 20
average reward exploit: MDP 1 32.730598367674546 20
average reward exploit: MDP 1 31.6630322430644 20
average reward exploit: MDP 1 32.01238543102393 20
average reward exploit: MDP 1 30.89262553483067 20
average reward exploit: MDP 1 30.881059479165582 20
average reward explore: 309.7190749312962
average reward exploit: MDP 2 31.976365585916376 20
average reward exploit: MDP 2 31.127222453176756 20
average reward exploit: MDP 2 30.06191881323469 20
average reward exploit: MDP 2 29.661739403453925 20
average reward exploit: MDP 2 26.95973151079395 20
average reward exploit: MDP 2 29.064692487421958 20
average reward exploit: MDP 2 27.895574780437055 20
average reward exploit: MDP 2 27.807236825275766 20
average reward exploit: MDP 2 27.960468001487335 20
average reward exploit: MDP 2 27.030852313091977 20
average reward explore: 267.45749853355466
average reward exploit: MDP 3 21.633615

average reward exploit: MDP 5 49.25037309320554 20
average reward explore: 391.14882467062523
average reward exploit: MDP 6 43.03221525008071 20
average reward exploit: MDP 6 49.17080562473016 20
average reward exploit: MDP 6 49.10616292859036 20
average reward exploit: MDP 6 48.84742303204819 20
average reward exploit: MDP 6 44.70434131074494 20
average reward exploit: MDP 6 45.53504143783515 20
average reward exploit: MDP 6 46.43291172544383 20
average reward exploit: MDP 6 47.625533231762724 20
average reward exploit: MDP 6 48.17148782096545 20
average reward exploit: MDP 6 45.5734192462349 20
average reward explore: 348.48682686003303
average reward exploit: MDP 7 44.9967914468679 20
average reward exploit: MDP 7 47.23718924221847 20
average reward exploit: MDP 7 42.08718066059485 20
average reward exploit: MDP 7 41.4714519770573 20
average reward exploit: MDP 7 41.632682191983186 20
average reward exploit: MDP 7 43.615721425512696 20
average reward exploit: MDP 7 42.56861502936682

average reward exploit: MDP 0 122.51018390151982 20
average reward exploit: MDP 0 141.43411868137534 20
average reward exploit: MDP 0 117.59292436961114 20
average reward exploit: MDP 0 116.4500601913754 20
average reward exploit: MDP 0 115.06405489772389 20
average reward exploit: MDP 0 107.42589585448252 20
average reward exploit: MDP 0 101.06931089742275 20
average reward exploit: MDP 0 105.45326529908667 20
average reward explore: 388.45338326762624
average reward exploit: MDP 1 105.31619329488922 20
average reward exploit: MDP 1 105.64191486096736 20
average reward exploit: MDP 1 97.0694723536904 20
average reward exploit: MDP 1 96.88299793778252 20
average reward exploit: MDP 1 100.99063229866972 20
average reward exploit: MDP 1 90.42406098193725 20
average reward exploit: MDP 1 88.82769034613548 20
average reward exploit: MDP 1 86.38351773743237 20
average reward exploit: MDP 1 82.30721756015691 20
average reward exploit: MDP 1 79.80651695376903 20
average reward explore: 447.78

average reward exploit: MDP 4 -16.714239375962944 20
average reward exploit: MDP 4 -17.475093074169884 20
average reward explore: 342.49711978772984
average reward exploit: MDP 5 -10.776981168841838 20
average reward exploit: MDP 5 -10.852106437805968 20
average reward exploit: MDP 5 -11.35614093751806 20
average reward exploit: MDP 5 -11.606888586096009 20
average reward exploit: MDP 5 -11.89800273087885 20
average reward exploit: MDP 5 -11.584020412327376 20
average reward exploit: MDP 5 -11.722619867166683 20
average reward exploit: MDP 5 -11.39612187566767 20
average reward exploit: MDP 5 -11.324672965845373 20
average reward exploit: MDP 5 -11.517683476842057 20
average reward explore: 414.07511456058245
average reward exploit: MDP 6 -17.73975040185954 20
average reward exploit: MDP 6 -18.61765206435374 20
average reward exploit: MDP 6 -19.290048992440568 20
average reward exploit: MDP 6 -20.36622565573694 20
average reward exploit: MDP 6 -20.20108871555045 20
average reward explo

average reward exploit: MDP 9 67.38876735204036 20
average reward exploit: MDP 9 71.45200501521761 20
average reward exploit: MDP 9 71.5673182423391 20
average reward exploit: MDP 9 67.28072628390402 20
average reward exploit: MDP 9 65.29764742044586 20
average reward exploit: MDP 9 62.555109475955234 20
average reward exploit: MDP 9 62.09447983473787 20
average reward exploit: MDP 9 62.86317931677196 20
evaluating on new MDPs
gravity:  [  0.           0.         -15.09671125]
average reward exploit 1413.5740180843538 400
epoch number:  42
average reward explore: 351.5389104795836
average reward exploit: MDP 0 73.50725140905135 20
average reward exploit: MDP 0 63.72337935126808 20
average reward exploit: MDP 0 62.353060826573824 20
average reward exploit: MDP 0 55.87313352197914 20
average reward exploit: MDP 0 59.01120122337606 20
average reward exploit: MDP 0 54.961112719172135 20
average reward exploit: MDP 0 52.60554698827315 20
average reward exploit: MDP 0 52.29052200865141 20
av

average reward exploit: MDP 3 24.37794126999054 20
average reward exploit: MDP 3 23.655631387942854 20
average reward exploit: MDP 3 22.859637320490158 20
average reward exploit: MDP 3 20.61923121262915 20
average reward explore: 511.24631686530813
average reward exploit: MDP 4 4.4923564290536735 20
average reward exploit: MDP 4 4.457317844326011 20
average reward exploit: MDP 4 1.6267410396190383 20
average reward exploit: MDP 4 4.392009430140379 20
average reward exploit: MDP 4 3.8307654541698617 20
average reward exploit: MDP 4 2.5676250960382347 20
average reward exploit: MDP 4 5.280229472404045 20
average reward exploit: MDP 4 4.899900664295847 20
average reward exploit: MDP 4 6.306080608091214 20
average reward exploit: MDP 4 9.193766127824382 20
average reward explore: 461.03646545202116
average reward exploit: MDP 5 -7.23181357936885 20
average reward exploit: MDP 5 -1.200175058713691 20
average reward exploit: MDP 5 -0.9590371807052571 20
average reward exploit: MDP 5 -2.65572

average reward exploit: MDP 8 140.64381732529094 20
average reward exploit: MDP 8 119.78769980221382 20
average reward exploit: MDP 8 100.06300469537726 20
average reward exploit: MDP 8 81.17854052011624 20
average reward exploit: MDP 8 95.811313892009 20
average reward exploit: MDP 8 109.03746053349096 20
average reward exploit: MDP 8 82.07949189506411 20
average reward exploit: MDP 8 81.15988081963039 20
average reward exploit: MDP 8 96.70995749250292 20
average reward explore: 517.83841783794
average reward exploit: MDP 9 86.45335461408055 20
average reward exploit: MDP 9 91.7777790659305 20
average reward exploit: MDP 9 73.9560247766394 20
average reward exploit: MDP 9 83.70378279536752 20
average reward exploit: MDP 9 73.40431122824212 20
average reward exploit: MDP 9 92.75076345360733 20
average reward exploit: MDP 9 84.91682642929688 20
average reward exploit: MDP 9 83.33779445645658 20
average reward exploit: MDP 9 87.63057551428207 20
average reward exploit: MDP 9 87.155433239

average reward exploit: MDP 2 70.45736683262287 20
average reward exploit: MDP 2 64.3791996984303 20
average reward exploit: MDP 2 71.74894385122775 20
average reward exploit: MDP 2 77.4465794985985 20
average reward exploit: MDP 2 65.0467702708676 20
average reward exploit: MDP 2 67.21189889242669 20
average reward explore: 363.62818840544435
average reward exploit: MDP 3 82.70193832769678 20
average reward exploit: MDP 3 64.9935379697047 20
average reward exploit: MDP 3 92.24839120085983 20
average reward exploit: MDP 3 63.166383802901386 20
average reward exploit: MDP 3 66.66333118206707 20
average reward exploit: MDP 3 90.58962748599205 20
average reward exploit: MDP 3 80.96005692611524 20
average reward exploit: MDP 3 85.85959734356446 20
average reward exploit: MDP 3 95.81047051438978 20
average reward exploit: MDP 3 66.34056403199311 20
average reward explore: 450.03030777285994
average reward exploit: MDP 4 58.906809158709294 20
average reward exploit: MDP 4 59.231438559913364 

average reward explore: 481.80309441360004
average reward exploit: MDP 7 65.74244849004194 20
average reward exploit: MDP 7 67.51172257086992 20
average reward exploit: MDP 7 75.60945244662445 20
average reward exploit: MDP 7 92.33608794107808 20
average reward exploit: MDP 7 98.40850760404085 20
average reward exploit: MDP 7 69.20108257693585 20
average reward exploit: MDP 7 77.19439404889836 20
average reward exploit: MDP 7 104.11635064209649 20
average reward exploit: MDP 7 103.00339585739975 20
average reward exploit: MDP 7 117.31968753095039 20
average reward explore: 669.9099145351696
average reward exploit: MDP 8 71.77422797773895 20
average reward exploit: MDP 8 77.43603308326298 20
average reward exploit: MDP 8 70.87024295209235 20
average reward exploit: MDP 8 84.795936517407 20
average reward exploit: MDP 8 74.56221084970977 20
average reward exploit: MDP 8 71.66923006611948 20
average reward exploit: MDP 8 85.43362251726468 20
average reward exploit: MDP 8 81.00410156714294

average reward exploit: MDP 1 30.21184578711805 20
average reward exploit: MDP 1 33.57550790837801 20
average reward exploit: MDP 1 32.32559522385186 20
average reward exploit: MDP 1 32.64905240366422 20
average reward exploit: MDP 1 32.68557190303833 20
average reward exploit: MDP 1 34.28473195928727 20
average reward exploit: MDP 1 31.709150867715397 20
average reward explore: 649.0918764624114
average reward exploit: MDP 2 35.34575377883914 20
average reward exploit: MDP 2 36.97265810505013 20
average reward exploit: MDP 2 34.94303522911105 20
average reward exploit: MDP 2 36.53952713904228 20
average reward exploit: MDP 2 38.5869471207982 20
average reward exploit: MDP 2 36.38238310165716 20
average reward exploit: MDP 2 35.65062314140838 20
average reward exploit: MDP 2 36.31712467466927 20
average reward exploit: MDP 2 38.336745815764225 20
average reward exploit: MDP 2 39.16434414971124 20
average reward explore: 526.3840222650786
average reward exploit: MDP 3 35.34764579897988 

average reward exploit: MDP 5 58.816891427043785 20
average reward explore: 161.89834391900462
average reward exploit: MDP 6 42.27014613114167 20
average reward exploit: MDP 6 39.06771842885614 20
average reward exploit: MDP 6 38.231073589644566 20
average reward exploit: MDP 6 38.02561033008347 20
average reward exploit: MDP 6 38.70675364396827 20
average reward exploit: MDP 6 37.931312570976004 20
average reward exploit: MDP 6 40.914662377816434 20
average reward exploit: MDP 6 39.893417884911955 20
average reward exploit: MDP 6 35.77036171567227 20
average reward exploit: MDP 6 34.56041147832908 20
average reward explore: 369.6935166951017
average reward exploit: MDP 7 60.81527016464743 20
average reward exploit: MDP 7 57.24815368510092 20
average reward exploit: MDP 7 56.78651392742351 20
average reward exploit: MDP 7 60.18622708440809 20
average reward exploit: MDP 7 56.403883907694535 20
average reward exploit: MDP 7 65.34488725353269 20
average reward exploit: MDP 7 59.955523838

average reward exploit: MDP 0 25.801518171038282 20
average reward exploit: MDP 0 25.78606054990591 20
average reward exploit: MDP 0 25.14980576395776 20
average reward exploit: MDP 0 26.74936985878539 20
average reward exploit: MDP 0 24.556328920108154 20
average reward exploit: MDP 0 24.516896441952195 20
average reward exploit: MDP 0 22.888838942794617 20
average reward exploit: MDP 0 22.93395014202998 20
average reward explore: 337.76558944349824
average reward exploit: MDP 1 35.88177789148384 20
average reward exploit: MDP 1 40.503644821507315 20
average reward exploit: MDP 1 34.66599572974133 20
average reward exploit: MDP 1 37.540648223083515 20
average reward exploit: MDP 1 35.50468076570805 20
average reward exploit: MDP 1 36.595306093128805 20
average reward exploit: MDP 1 37.07897564226071 20
average reward exploit: MDP 1 36.68930891169188 20
average reward exploit: MDP 1 36.62335066805471 20
average reward exploit: MDP 1 36.866483908675015 20
average reward explore: 343.837

average reward exploit: MDP 4 26.720905469525302 20
average reward exploit: MDP 4 25.555825478524806 20
average reward explore: 338.3717942339789
average reward exploit: MDP 5 53.877374810289915 20
average reward exploit: MDP 5 48.570992685862635 20
average reward exploit: MDP 5 50.718408969679295 20
average reward exploit: MDP 5 49.46000855859971 20
average reward exploit: MDP 5 54.70845583176689 20
average reward exploit: MDP 5 54.16971098969842 20
average reward exploit: MDP 5 47.16191923540494 20
average reward exploit: MDP 5 55.471901784184254 20
average reward exploit: MDP 5 62.74822147098139 20
average reward exploit: MDP 5 53.53496156194912 20
average reward explore: 328.9819915548077
average reward exploit: MDP 6 52.55774965478329 20
average reward exploit: MDP 6 54.20587807905888 20
average reward exploit: MDP 6 59.1323231004544 20
average reward exploit: MDP 6 46.0890852142108 20
average reward exploit: MDP 6 46.36891743736233 20
average reward exploit: MDP 6 56.173580700039

average reward exploit: MDP 9 46.81588326758414 20
average reward exploit: MDP 9 48.579034879191866 20
average reward exploit: MDP 9 44.6690593258934 20
average reward exploit: MDP 9 42.09279679025169 20
average reward exploit: MDP 9 47.72713269722696 20
average reward exploit: MDP 9 43.13045957313908 20
average reward exploit: MDP 9 40.6613855261489 20
evaluating on new MDPs
gravity:  [  0.           0.         -15.77824809]
average reward exploit 831.0845796465865 400
epoch number:  55
average reward explore: 525.0214096583078
average reward exploit: MDP 0 49.51527700613577 20
average reward exploit: MDP 0 44.292585912295486 20
average reward exploit: MDP 0 43.07340318812135 20
average reward exploit: MDP 0 45.20582468394863 20
average reward exploit: MDP 0 53.07372484862917 20
average reward exploit: MDP 0 47.2852181618351 20
average reward exploit: MDP 0 50.330737189737256 20
average reward exploit: MDP 0 49.34004201285008 20
average reward exploit: MDP 0 46.918372549811394 20
aver

average reward exploit: MDP 3 29.318109088481158 20
average reward exploit: MDP 3 30.16603159306685 20
average reward exploit: MDP 3 29.49404334836971 20
average reward explore: 524.8775797975928
average reward exploit: MDP 4 34.86671103282056 20
average reward exploit: MDP 4 33.30645337153166 20
average reward exploit: MDP 4 34.53923718832175 20
average reward exploit: MDP 4 35.22373878572675 20
average reward exploit: MDP 4 34.22047004480765 20
average reward exploit: MDP 4 32.90377478245242 20
average reward exploit: MDP 4 34.289010614357736 20
average reward exploit: MDP 4 36.85992529971308 20
average reward exploit: MDP 4 36.052719382073164 20
average reward exploit: MDP 4 36.20412469027299 20
average reward explore: 722.6775275274267
average reward exploit: MDP 5 36.05114280522849 20
average reward exploit: MDP 5 38.62061339014479 20
average reward exploit: MDP 5 38.00259257565092 20
average reward exploit: MDP 5 36.62089886424802 20
average reward exploit: MDP 5 39.6421741991391

average reward exploit: MDP 8 39.5956748968366 20
average reward exploit: MDP 8 37.6083898441091 20
average reward exploit: MDP 8 39.8140481508323 20
average reward exploit: MDP 8 36.52361965442602 20
average reward exploit: MDP 8 36.74138116853838 20
average reward exploit: MDP 8 36.99574298512745 20
average reward exploit: MDP 8 37.68434378878812 20
average reward exploit: MDP 8 34.23476862040945 20
average reward explore: 497.85571159563796
average reward exploit: MDP 9 35.606429764926204 20
average reward exploit: MDP 9 34.816749916060154 20
average reward exploit: MDP 9 35.83804889680184 20
average reward exploit: MDP 9 38.177381264444975 20
average reward exploit: MDP 9 36.62789166853152 20
average reward exploit: MDP 9 37.44237245462865 20
average reward exploit: MDP 9 37.447945677851045 20
average reward exploit: MDP 9 34.56230310252478 20
average reward exploit: MDP 9 36.183152762477754 20
average reward exploit: MDP 9 35.16212568368472 20
evaluating on new MDPs
gravity:  [  0

average reward exploit: MDP 2 36.7218085012345 20
average reward exploit: MDP 2 36.47343357052457 20
average reward exploit: MDP 2 37.12378644653852 20
average reward exploit: MDP 2 36.22732385747398 20
average reward exploit: MDP 2 36.13799765633041 20
average reward explore: 353.9895849438625
average reward exploit: MDP 3 37.04662277323705 20
average reward exploit: MDP 3 38.56363950889577 20
average reward exploit: MDP 3 38.53422815154339 20
average reward exploit: MDP 3 37.524781886494985 20
average reward exploit: MDP 3 37.340486695256814 20
average reward exploit: MDP 3 37.83602427488584 20
average reward exploit: MDP 3 37.815557798228205 20
average reward exploit: MDP 3 37.56303999365431 20
average reward exploit: MDP 3 37.83044429200841 20
average reward exploit: MDP 3 36.94980120840084 20
average reward explore: 516.8197032682554
average reward exploit: MDP 4 20.62391379072188 20
average reward exploit: MDP 4 25.302223175316687 20
average reward exploit: MDP 4 27.4035767414443

average reward exploit: MDP 7 49.97628715172859 20
average reward exploit: MDP 7 50.052705577522126 20
average reward exploit: MDP 7 48.174344710414694 20
average reward exploit: MDP 7 50.80329733615062 20
average reward exploit: MDP 7 49.79986400981221 20
average reward exploit: MDP 7 49.817538636069955 20
average reward exploit: MDP 7 48.43497070236243 20
average reward exploit: MDP 7 47.390380793266196 20
average reward exploit: MDP 7 45.21678350774226 20
average reward exploit: MDP 7 45.42286537569696 20
average reward explore: 474.5622502387647
average reward exploit: MDP 8 38.363964222535344 20
average reward exploit: MDP 8 36.68514380949948 20
average reward exploit: MDP 8 37.882383648715376 20
average reward exploit: MDP 8 36.41412432390227 20
average reward exploit: MDP 8 33.42062189277144 20
average reward exploit: MDP 8 32.163093616849274 20
average reward exploit: MDP 8 30.735622727617265 20
average reward exploit: MDP 8 33.2918356657043 20
average reward exploit: MDP 8 30.

average reward exploit: MDP 1 32.700675342241134 20
average reward exploit: MDP 1 33.565735166807755 20
average reward exploit: MDP 1 33.65884574496565 20
average reward exploit: MDP 1 28.674112438256923 20
average reward exploit: MDP 1 30.918609444929796 20
average reward exploit: MDP 1 32.778307608887516 20
average reward explore: 550.5744144165794
average reward exploit: MDP 2 36.92258399039347 20
average reward exploit: MDP 2 34.164717807741035 20
average reward exploit: MDP 2 36.11049491915675 20
average reward exploit: MDP 2 36.77150126211538 20
average reward exploit: MDP 2 35.156720724534196 20
average reward exploit: MDP 2 34.88650004054686 20
average reward exploit: MDP 2 34.17371618661081 20
average reward exploit: MDP 2 34.22015911154224 20
average reward exploit: MDP 2 36.414445261706945 20
average reward exploit: MDP 2 36.494444450830294 20
average reward explore: 237.98619591185712
average reward exploit: MDP 3 35.55193111482863 20
average reward exploit: MDP 3 36.462930

average reward explore: 259.0611704956443
average reward exploit: MDP 6 34.65615881273809 20
average reward exploit: MDP 6 34.85634130035286 20
average reward exploit: MDP 6 33.876050466067326 20
average reward exploit: MDP 6 34.2434074109735 20
average reward exploit: MDP 6 34.790532732647364 20
average reward exploit: MDP 6 34.4825374837226 20
average reward exploit: MDP 6 34.98392849919477 20
average reward exploit: MDP 6 34.31144252576196 20
average reward exploit: MDP 6 33.66087761639074 20
average reward exploit: MDP 6 34.03240376080555 20
average reward explore: 389.4145251136484
average reward exploit: MDP 7 35.229453786132204 20
average reward exploit: MDP 7 34.96561985454873 20
average reward exploit: MDP 7 34.69235785393222 20
average reward exploit: MDP 7 34.04482666475646 20
average reward exploit: MDP 7 33.509262389035406 20
average reward exploit: MDP 7 33.7556825549136 20
average reward exploit: MDP 7 35.87290332269536 20
average reward exploit: MDP 7 33.06976425438207 

average reward exploit: MDP 0 14.331497491976517 20
average reward exploit: MDP 0 13.195362448660408 20
average reward exploit: MDP 0 14.115975468023105 20
average reward exploit: MDP 0 14.604508647103458 20
average reward exploit: MDP 0 14.43072746932901 20
average reward exploit: MDP 0 11.970690763801352 20
average reward exploit: MDP 0 15.67061350207259 20
average reward exploit: MDP 0 10.507216812394661 20
average reward explore: 567.9369301463005
average reward exploit: MDP 1 13.748456433863035 20
average reward exploit: MDP 1 10.850833937658605 20
average reward exploit: MDP 1 16.4375455390875 20
average reward exploit: MDP 1 12.926749368968595 20
average reward exploit: MDP 1 12.559045434296461 20
average reward exploit: MDP 1 13.247711958856353 20
average reward exploit: MDP 1 17.00545258457292 20
average reward exploit: MDP 1 15.759339963360626 20
average reward exploit: MDP 1 17.785031987620187 20
average reward exploit: MDP 1 18.741264620703948 20
average reward explore: 659

average reward exploit: MDP 4 31.990416826586248 20
average reward exploit: MDP 4 34.0847769770403 20
average reward explore: 356.8682486791846
average reward exploit: MDP 5 36.44759378316651 20
average reward exploit: MDP 5 36.87443093344556 20
average reward exploit: MDP 5 37.02821934286732 20
average reward exploit: MDP 5 35.61128047826917 20
average reward exploit: MDP 5 36.10839604329041 20
average reward exploit: MDP 5 37.039785760893885 20
average reward exploit: MDP 5 36.5953863996071 20
average reward exploit: MDP 5 35.4329877569684 20
average reward exploit: MDP 5 36.367815503228215 20
average reward exploit: MDP 5 35.450561655134415 20
average reward explore: 417.15239416452914
average reward exploit: MDP 6 35.595006006778604 20
average reward exploit: MDP 6 33.81471767113099 20
average reward exploit: MDP 6 34.98568268747903 20
average reward exploit: MDP 6 35.39879272028545 20
average reward exploit: MDP 6 35.0413382524148 20
average reward exploit: MDP 6 34.93375384318951

average reward exploit: MDP 9 38.66334581363155 20
average reward exploit: MDP 9 38.5433767860712 20
average reward exploit: MDP 9 38.374782835354715 20
average reward exploit: MDP 9 37.02711295086443 20
average reward exploit: MDP 9 36.52405432771653 20
average reward exploit: MDP 9 36.49050045675248 20
average reward exploit: MDP 9 37.854017966930115 20
evaluating on new MDPs
gravity:  [ 0.          0.         -6.90227485]
average reward exploit 763.0263286142522 400
epoch number:  68
average reward explore: 372.9373809231763
average reward exploit: MDP 0 35.56999468083882 20
average reward exploit: MDP 0 33.88002931793574 20
average reward exploit: MDP 0 34.716381523093986 20
average reward exploit: MDP 0 33.78218429850678 20
average reward exploit: MDP 0 34.0367644614709 20
average reward exploit: MDP 0 35.26345782108796 20
average reward exploit: MDP 0 33.85774631229651 20
average reward exploit: MDP 0 33.72545920322765 20
average reward exploit: MDP 0 34.20739612057737 20
average

average reward exploit: MDP 3 -75.36413344887154 20
average reward exploit: MDP 3 -72.74686544999619 20
average reward exploit: MDP 3 -72.5803338175858 20
average reward exploit: MDP 3 -72.2060774456836 20
average reward explore: 350.4011090341524
average reward exploit: MDP 4 -94.66310964353944 20
average reward exploit: MDP 4 -91.15538130670254 20
average reward exploit: MDP 4 -101.72027636674552 20
average reward exploit: MDP 4 -106.8212460745244 20
average reward exploit: MDP 4 -96.29710565583416 20
average reward exploit: MDP 4 -83.5329264302967 20
average reward exploit: MDP 4 -75.79806305661685 20
average reward exploit: MDP 4 -67.77059224367721 20
average reward exploit: MDP 4 -59.21632478218203 20
average reward exploit: MDP 4 -51.03959790011949 20
average reward explore: 339.92824299870415
average reward exploit: MDP 5 -29.507903465232953 20
average reward exploit: MDP 5 -26.929124072522 20
average reward exploit: MDP 5 -25.710982321684533 20
average reward exploit: MDP 5 -24

average reward explore: 192.11367061985072
average reward exploit: MDP 8 1.5036375752613447 20
average reward exploit: MDP 8 6.997958786945498 20
average reward exploit: MDP 8 5.892738957717631 20
average reward exploit: MDP 8 6.5994518519964505 20
average reward exploit: MDP 8 6.9143889724271785 20
average reward exploit: MDP 8 10.094043876172176 20
average reward exploit: MDP 8 8.94008734372614 20
average reward exploit: MDP 8 12.537465196891434 20
average reward exploit: MDP 8 13.944842468482346 20
average reward exploit: MDP 8 16.306450658559665 20
average reward explore: 412.5217944607997
average reward exploit: MDP 9 2.173726930876951 20
average reward exploit: MDP 9 5.3831375410287166 20
average reward exploit: MDP 9 6.667714384372014 20
average reward exploit: MDP 9 4.604558367298542 20
average reward exploit: MDP 9 7.2787795702589735 20
average reward exploit: MDP 9 10.113295453247135 20
average reward exploit: MDP 9 9.78906821136343 20
average reward exploit: MDP 9 9.44763084

average reward exploit: MDP 2 20.635007623710706 20
average reward exploit: MDP 2 19.44857639497503 20
average reward exploit: MDP 2 15.813282591001803 20
average reward exploit: MDP 2 17.81463766479946 20
average reward exploit: MDP 2 19.298820120483278 20
average reward exploit: MDP 2 18.393512613079857 20
average reward exploit: MDP 2 14.333178088962095 20
average reward exploit: MDP 2 13.006892396510645 20
average reward exploit: MDP 2 15.916709662280564 20
average reward explore: 591.5823133679095
average reward exploit: MDP 3 16.22255837578161 20
average reward exploit: MDP 3 15.402622498255514 20
average reward exploit: MDP 3 15.234021333712931 20
average reward exploit: MDP 3 15.418608404594584 20
average reward exploit: MDP 3 14.488449258137758 20
average reward exploit: MDP 3 13.423616052334006 20
average reward exploit: MDP 3 10.355264351876333 20
average reward exploit: MDP 3 13.233052386648922 20
average reward exploit: MDP 3 13.053680005756329 20
average reward exploit: M

average reward exploit: MDP 6 38.16357148952478 20
average reward exploit: MDP 6 37.9280542354865 20
average reward exploit: MDP 6 37.25562737687567 20
average reward explore: 390.7080103963225
average reward exploit: MDP 7 39.36004976517666 20
average reward exploit: MDP 7 38.10410384255044 20
average reward exploit: MDP 7 36.575471023251595 20
average reward exploit: MDP 7 38.33313236409473 20
average reward exploit: MDP 7 36.915635188783725 20
average reward exploit: MDP 7 37.537047162300524 20
average reward exploit: MDP 7 38.196157044333596 20
average reward exploit: MDP 7 38.19648574121646 20
average reward exploit: MDP 7 36.8630409632483 20
average reward exploit: MDP 7 38.63951859606225 20
average reward explore: 428.213371074072
average reward exploit: MDP 8 38.122223467900085 20
average reward exploit: MDP 8 37.307177018845806 20
average reward exploit: MDP 8 36.47238106516964 20
average reward exploit: MDP 8 37.26132924834264 20
average reward exploit: MDP 8 36.8233982896040

average reward explore: 402.3915998219426
average reward exploit: MDP 1 29.68196169711569 20
average reward exploit: MDP 1 29.562007405042202 20
average reward exploit: MDP 1 30.31313323320369 20
average reward exploit: MDP 1 28.865856960499666 20
average reward exploit: MDP 1 28.559571579543586 20
average reward exploit: MDP 1 28.17263905992285 20
average reward exploit: MDP 1 27.38064275856026 20
average reward exploit: MDP 1 26.708776317551532 20
average reward exploit: MDP 1 28.11808892745435 20
average reward exploit: MDP 1 27.43703796173407 20
average reward explore: 464.5135953610576
average reward exploit: MDP 2 21.854009031639492 20
average reward exploit: MDP 2 21.241947620138014 20
average reward exploit: MDP 2 20.60417743845671 20
average reward exploit: MDP 2 21.598435111685607 20
average reward exploit: MDP 2 22.642043425710035 20
average reward exploit: MDP 2 20.020247320095674 20
average reward exploit: MDP 2 21.638273184323094 20
average reward exploit: MDP 2 22.286916

average reward exploit: MDP 5 40.13296305065578 20
average reward exploit: MDP 5 39.404661209418606 20
average reward exploit: MDP 5 40.237851165071525 20
average reward exploit: MDP 5 41.52763936497434 20
average reward exploit: MDP 5 40.14015417201489 20
average reward explore: 429.75661871094263
average reward exploit: MDP 6 41.44094853249773 20
average reward exploit: MDP 6 40.75067202928494 20
average reward exploit: MDP 6 40.81672775851352 20
average reward exploit: MDP 6 42.16016436248183 20
average reward exploit: MDP 6 39.590896908406634 20
average reward exploit: MDP 6 39.671099344582856 20
average reward exploit: MDP 6 42.05131707227315 20
average reward exploit: MDP 6 43.250501839788754 20
average reward exploit: MDP 6 42.05710136253173 20
average reward exploit: MDP 6 41.822936502335935 20
average reward explore: 397.66489670909107
average reward exploit: MDP 7 38.05869253536035 20
average reward exploit: MDP 7 38.270752905733346 20
average reward exploit: MDP 7 37.7577228

gravity:  [  0.           0.         -16.31083106]
average reward exploit 735.2798244307932 400
epoch number:  78
average reward explore: 345.13854053776646
average reward exploit: MDP 0 40.255280258387906 20
average reward exploit: MDP 0 38.392663436218726 20
average reward exploit: MDP 0 39.92705243094365 20
average reward exploit: MDP 0 39.18138614877407 20
average reward exploit: MDP 0 40.79654018143125 20
average reward exploit: MDP 0 40.812848517164944 20
average reward exploit: MDP 0 41.21652869348749 20
average reward exploit: MDP 0 41.18283341523304 20
average reward exploit: MDP 0 42.50454093304213 20
average reward exploit: MDP 0 41.06864260753833 20
average reward explore: 380.3240810859887
average reward exploit: MDP 1 41.84061548702947 20
average reward exploit: MDP 1 43.04997813944455 20
average reward exploit: MDP 1 40.63761827002195 20
average reward exploit: MDP 1 41.36174156415412 20
average reward exploit: MDP 1 43.719676939660175 20
average reward exploit: MDP 1 44

average reward exploit: MDP 4 39.107724550278206 20
average reward exploit: MDP 4 40.46164704754941 20
average reward exploit: MDP 4 41.658600385891596 20
average reward exploit: MDP 4 38.814273267715805 20
average reward exploit: MDP 4 40.07619097733276 20
average reward exploit: MDP 4 39.918320907084166 20
average reward exploit: MDP 4 40.85545160946658 20
average reward explore: 503.48699042595126
average reward exploit: MDP 5 33.51917999118659 20
average reward exploit: MDP 5 34.34252687698202 20
average reward exploit: MDP 5 32.447668688685155 20
average reward exploit: MDP 5 31.610090430134203 20
average reward exploit: MDP 5 29.59003184370338 20
average reward exploit: MDP 5 30.067701764959473 20
average reward exploit: MDP 5 25.64981777966657 20
average reward exploit: MDP 5 29.76551044436444 20
average reward exploit: MDP 5 29.693268651487863 20
average reward exploit: MDP 5 34.42166597654693 20
average reward explore: 205.68216281000232
average reward exploit: MDP 6 42.134642

average reward exploit: MDP 8 28.282272585507133 20
average reward explore: 605.7543762922581
average reward exploit: MDP 9 30.987915069218616 20
average reward exploit: MDP 9 32.4212723080586 20
average reward exploit: MDP 9 31.915753346622722 20
average reward exploit: MDP 9 32.42959308069246 20
average reward exploit: MDP 9 33.481759532472026 20
average reward exploit: MDP 9 33.41089744340023 20
average reward exploit: MDP 9 33.730333182452895 20
average reward exploit: MDP 9 33.81382350476576 20
average reward exploit: MDP 9 33.12364724991622 20
average reward exploit: MDP 9 34.19589678819613 20
evaluating on new MDPs
gravity:  [  0.          0.        -24.9432616]
average reward exploit 495.4917118549235 400
epoch number:  81
average reward explore: 698.111974824366
average reward exploit: MDP 0 26.97465865112316 20
average reward exploit: MDP 0 27.7292353916592 20
average reward exploit: MDP 0 28.434158417211115 20
average reward exploit: MDP 0 29.49006781847977 20
average reward

average reward exploit: MDP 3 35.26733123121928 20
average reward exploit: MDP 3 36.23504763725587 20
average reward exploit: MDP 3 39.00123971104057 20
average reward exploit: MDP 3 35.24313942455955 20
average reward exploit: MDP 3 36.98171191680188 20
average reward exploit: MDP 3 37.5660465086274 20
average reward exploit: MDP 3 38.60034160311937 20
average reward exploit: MDP 3 39.074545328334764 20
average reward exploit: MDP 3 37.62159934445111 20
average reward explore: 661.3353318971048
average reward exploit: MDP 4 36.7761742713716 20
average reward exploit: MDP 4 35.320362079172085 20
average reward exploit: MDP 4 36.558824404284444 20
average reward exploit: MDP 4 35.262913168857025 20
average reward exploit: MDP 4 37.74862877455413 20
average reward exploit: MDP 4 37.29494375960586 20
average reward exploit: MDP 4 36.92757710102992 20
average reward exploit: MDP 4 36.39154542291804 20
average reward exploit: MDP 4 36.38910191015379 20
average reward exploit: MDP 4 37.09995

average reward exploit: MDP 7 35.605314426346325 20
average reward exploit: MDP 7 36.253986513936205 20
average reward exploit: MDP 7 36.60600632896799 20
average reward explore: 512.5000541748444
average reward exploit: MDP 8 35.5459071540176 20
average reward exploit: MDP 8 36.92031445346015 20
average reward exploit: MDP 8 36.11878991647765 20
average reward exploit: MDP 8 36.217523864533895 20
average reward exploit: MDP 8 37.10101457048161 20
average reward exploit: MDP 8 37.382440682227454 20
average reward exploit: MDP 8 37.542451627179965 20
average reward exploit: MDP 8 38.31360822416643 20
average reward exploit: MDP 8 38.099292221714315 20
average reward exploit: MDP 8 39.835015930367796 20
average reward explore: 603.0063755630965
average reward exploit: MDP 9 43.38515897607233 20
average reward exploit: MDP 9 41.658651270443116 20
average reward exploit: MDP 9 44.53475344574626 20
average reward exploit: MDP 9 46.189629007279514 20
average reward exploit: MDP 9 46.95432696

average reward exploit: MDP 2 113.2411187896067 20
average reward exploit: MDP 2 84.98834586488336 20
average reward exploit: MDP 2 83.36144330911146 20
average reward exploit: MDP 2 101.1714247431327 20
average reward exploit: MDP 2 104.11910242269616 20
average reward exploit: MDP 2 89.64933304435064 20
average reward exploit: MDP 2 98.92681000219831 20
average reward exploit: MDP 2 112.16437354442829 20
average reward exploit: MDP 2 101.62784421850317 20
average reward exploit: MDP 2 102.87186174215648 20
average reward explore: 318.836877511507
average reward exploit: MDP 3 72.82025403198534 20
average reward exploit: MDP 3 73.18437655904819 20
average reward exploit: MDP 3 76.60705720881786 20
average reward exploit: MDP 3 78.53066250140725 20
average reward exploit: MDP 3 73.33187440226837 20
average reward exploit: MDP 3 75.54743788162185 20
average reward exploit: MDP 3 72.81062520172694 20
average reward exploit: MDP 3 68.89977358036137 20
average reward exploit: MDP 3 67.8539

KeyboardInterrupt: 

In [None]:
a.lr = 3e-3
num_epochs = 50
for epoch in range(num_epochs):
        print("epoch number: ", epoch)
        a.train_step()

In [123]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)

In [124]:
print("average reward explore:", np.mean(explore_rewards), len(explore_rewards))
M = a.stack_trajectories(explore_paths)

average reward explore: 151.57677928797034 400


In [125]:
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [126]:
Z = np.reshape(Z,[1,len(Z)])

In [127]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [128]:
print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit 18.577710931432883 400


In [122]:
a.num_traj = 400

In [83]:
for i in range(10):
    maxrev2 = 0
    maxrev1 = 0
    traj = np.random.rand()*(i+1)
    
    for i in range(4):
        explore_paths, explore_rewards = a.sample_paths_explore(a.env,traj)
        #print("average reward explore:", np.mean(explore_rewards))
        M = a.stack_trajectories(explore_paths)
        Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
        Z = np.reshape(Z,[1,len(Z)])
        exploit_paths, explore_rewards = a.sample_paths_exploit(a.env,Z,traj)
        if(np.mean(explore_rewards) > maxrev1):
            maxrev1 = np.mean(explore_rewards) / num_traj
        if(np.mean(exploit_rewards)>maxrev2):
            maxrev2 = np.mean(exploit_rewards) / num_traj
    print("gravity:", a.sim.model.opt.gravity)
    print("average reward explore", maxrev1)
    print("average reward exploit", maxrev2)
    

gravity: [ 0.         0.        -5.4325712]
average reward explore 3.5619071147766848
average reward exploit 2.7301699712255383


KeyboardInterrupt: 

In [90]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,5)
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [91]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [92]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

average reward explore: MDP 20488.85105031593 800


In [93]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,5)

In [94]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit: MDP 644.4290518284578 400


In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()

In [7]:
a.sim.model.opt

<mujoco_py.cymj.PyMjOption at 0x1c23ba8ac8>