In [60]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
#env="InvertedPendulum-v2"
env = "HalfCheetah-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 20
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 100
use_baseline = True



In [61]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [62]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 2
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.95
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        #self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards = [], [], [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        episode_rewards.append(episode_reward)
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        #self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.3)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,i)
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["reward"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["observation"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,i)
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.sum(exploit_rewards) / num_traj, len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env, Test = True)
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z, Test = True)
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            print("epoch number: ", epoch)
            self.train_step()
            

In [63]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [64]:
a.train()

epoch number:  0
average reward explore: -3662.672117041522
average reward exploit: MDP 0 -1094.3352855059734 20
average reward exploit: MDP 0 -923.1838206040175 20
average reward exploit: MDP 0 -1012.6563000137643 20
average reward exploit: MDP 0 -1027.7397086157866 20
average reward exploit: MDP 0 -785.7132634476591 20
average reward exploit: MDP 0 -727.1258788278433 20
average reward exploit: MDP 0 -653.2006160224764 20
average reward exploit: MDP 0 -553.0010214735477 20
average reward exploit: MDP 0 -557.4145369736119 20
average reward exploit: MDP 0 -500.3180935135112 20
average reward explore: -3003.5069068266525
average reward exploit: MDP 1 -782.798932411737 20
average reward exploit: MDP 1 -689.9480456156092 20
average reward exploit: MDP 1 -655.2196221780739 20
average reward exploit: MDP 1 -700.7474503770436 20
average reward exploit: MDP 1 -708.5027673316515 20
average reward exploit: MDP 1 -755.562741896361 20
average reward exploit: MDP 1 -693.6899605017755 20
average rew

average reward exploit: MDP 4 -606.5333750089306 20
average reward exploit: MDP 4 -589.4726165552904 20
average reward exploit: MDP 4 -642.9023925933739 20
average reward exploit: MDP 4 -592.737447206868 20
average reward exploit: MDP 4 -637.4433559154246 20
average reward explore: -917.3916115977958
average reward exploit: MDP 5 -470.189605782103 20
average reward exploit: MDP 5 -454.46677719301096 20
average reward exploit: MDP 5 -448.32525021032086 20
average reward exploit: MDP 5 -499.1548469549043 20
average reward exploit: MDP 5 -499.6712102341071 20
average reward exploit: MDP 5 -447.8534640308173 20
average reward exploit: MDP 5 -420.9335255995411 20
average reward exploit: MDP 5 -459.4254163862635 20
average reward exploit: MDP 5 -366.19815553549887 20
average reward exploit: MDP 5 -392.03444081378234 20
average reward explore: -931.8216391364543
average reward exploit: MDP 6 -614.3955064877956 20
average reward exploit: MDP 6 -605.6966374302078 20
average reward exploit: MDP 

average reward explore: -630.1094843323232
average reward exploit: MDP 9 -88.47537596169886 20
average reward exploit: MDP 9 -227.57534097718408 20
average reward exploit: MDP 9 -243.67396324167999 20
average reward exploit: MDP 9 -243.90661562744708 20
average reward exploit: MDP 9 -352.0522056491178 20
average reward exploit: MDP 9 -478.5716193861484 20
average reward exploit: MDP 9 -485.57511473628546 20
average reward exploit: MDP 9 -449.6669167631377 20
average reward exploit: MDP 9 -328.53048947318365 20
average reward exploit: MDP 9 -342.7320802677542 20
epoch number:  3
average reward explore: -684.9278764264384
average reward exploit: MDP 0 -59.66195704758803 20
average reward exploit: MDP 0 94.78476796391028 20
average reward exploit: MDP 0 23.71134560387805 20
average reward exploit: MDP 0 24.208684763405913 20
average reward exploit: MDP 0 2.470835928797436 20
average reward exploit: MDP 0 13.854073706278314 20
average reward exploit: MDP 0 -56.71868632740599 20
average rew

average reward exploit: MDP 3 -272.72698582027544 20
average reward exploit: MDP 3 -283.4368726243052 20
average reward exploit: MDP 3 -292.70426270596386 20
average reward exploit: MDP 3 -219.9741667791186 20
average reward exploit: MDP 3 -288.65020469007806 20
average reward exploit: MDP 3 -303.9515717932624 20
average reward explore: -572.2962007682002
average reward exploit: MDP 4 -136.80661945124248 20
average reward exploit: MDP 4 -175.30254342065922 20
average reward exploit: MDP 4 -133.3938364014972 20
average reward exploit: MDP 4 -106.56969778733969 20
average reward exploit: MDP 4 -94.73653884430543 20
average reward exploit: MDP 4 -116.71446833717069 20
average reward exploit: MDP 4 -141.01414397018468 20
average reward exploit: MDP 4 -249.3080375393297 20
average reward exploit: MDP 4 -199.9873421091906 20
average reward exploit: MDP 4 -246.3970889224157 20
average reward explore: -564.5478377843383
average reward exploit: MDP 5 -541.3515878910437 20
average reward exploit

average reward exploit: MDP 7 -215.45858742934402 20
average reward explore: -477.07489758898464
average reward exploit: MDP 8 -140.41959447664016 20
average reward exploit: MDP 8 -64.38168701685359 20
average reward exploit: MDP 8 -133.93236135464645 20
average reward exploit: MDP 8 13.92846053800412 20
average reward exploit: MDP 8 26.524268621401575 20
average reward exploit: MDP 8 172.11991348514715 20
average reward exploit: MDP 8 238.52653616226266 20
average reward exploit: MDP 8 164.45319619777746 20
average reward exploit: MDP 8 -124.67767345392556 20
average reward exploit: MDP 8 -160.37975450654784 20
average reward explore: -477.9669058162056
average reward exploit: MDP 9 -297.60888707739645 20
average reward exploit: MDP 9 -304.16933547624143 20
average reward exploit: MDP 9 -317.6650515856875 20
average reward exploit: MDP 9 -274.03524024566394 20
average reward exploit: MDP 9 -271.31765537546835 20
average reward exploit: MDP 9 -262.35093689540656 20
average reward explo

average reward exploit: MDP 2 -205.8940313260197 20
average reward exploit: MDP 2 -155.94852831097376 20
average reward exploit: MDP 2 -146.30677960764555 20
average reward exploit: MDP 2 -85.90482168867665 20
average reward exploit: MDP 2 -88.37970343037243 20
average reward exploit: MDP 2 7.928901941574867 20
average reward exploit: MDP 2 41.93098743726865 20
average reward explore: -409.0217211206156
average reward exploit: MDP 3 250.05240098552548 20
average reward exploit: MDP 3 155.7339911202857 20
average reward exploit: MDP 3 182.33672290839158 20
average reward exploit: MDP 3 288.8255832087125 20
average reward exploit: MDP 3 201.68327635487384 20
average reward exploit: MDP 3 137.70273620065194 20
average reward exploit: MDP 3 84.18399149684194 20
average reward exploit: MDP 3 30.186152919596275 20
average reward exploit: MDP 3 50.39244905342804 20
average reward exploit: MDP 3 119.4480966281221 20
average reward explore: -404.8957905637093
average reward exploit: MDP 4 69.11

average reward exploit: MDP 6 -84.98062466125944 20
average reward explore: -371.79603926391655
average reward exploit: MDP 7 -59.07044905580592 20
average reward exploit: MDP 7 -87.78270086902552 20
average reward exploit: MDP 7 -31.668394483896048 20
average reward exploit: MDP 7 -52.5137525889386 20
average reward exploit: MDP 7 -109.49723896483104 20
average reward exploit: MDP 7 -37.6532081246167 20
average reward exploit: MDP 7 -100.71191126434314 20
average reward exploit: MDP 7 -92.65263033707338 20
average reward exploit: MDP 7 -162.3057221751467 20
average reward exploit: MDP 7 -118.79642653787641 20
average reward explore: -366.4070777277455
average reward exploit: MDP 8 22.44981381113073 20
average reward exploit: MDP 8 -32.01101342999784 20
average reward exploit: MDP 8 44.37620882529891 20
average reward exploit: MDP 8 -6.016671091526333 20
average reward exploit: MDP 8 124.52561588483556 20
average reward exploit: MDP 8 107.91115119620603 20
average reward exploit: MDP 8

average reward exploit: MDP 1 -140.82209032264367 20
average reward exploit: MDP 1 -141.62885831664062 20
average reward exploit: MDP 1 -140.56944033538383 20
average reward exploit: MDP 1 -95.97334601360554 20
average reward exploit: MDP 1 -121.45599128979765 20
average reward exploit: MDP 1 -130.3989419215173 20
average reward exploit: MDP 1 -112.2779499721972 20
average reward explore: -305.22917854393705
average reward exploit: MDP 2 -73.11171206036433 20
average reward exploit: MDP 2 -58.72276200599335 20
average reward exploit: MDP 2 -24.899692550083294 20
average reward exploit: MDP 2 -18.32041078847782 20
average reward exploit: MDP 2 36.203967298964116 20
average reward exploit: MDP 2 71.48450275396723 20
average reward exploit: MDP 2 10.852906319874982 20
average reward exploit: MDP 2 89.92244009063987 20
average reward exploit: MDP 2 64.90643477924884 20
average reward exploit: MDP 2 32.95691920108238 20
average reward explore: -320.85760113350807
average reward exploit: MDP

average reward exploit: MDP 5 606.4025223838063 20
average reward explore: -265.41753177694085
average reward exploit: MDP 6 64.2264760381976 20
average reward exploit: MDP 6 171.99159005225826 20
average reward exploit: MDP 6 121.30942050225656 20
average reward exploit: MDP 6 146.86246581003167 20
average reward exploit: MDP 6 216.19695985925256 20
average reward exploit: MDP 6 165.05349774194417 20
average reward exploit: MDP 6 246.419605691954 20
average reward exploit: MDP 6 286.7820387697576 20
average reward exploit: MDP 6 255.13688144144658 20
average reward exploit: MDP 6 198.0770149598855 20
average reward explore: -250.38453375466378
average reward exploit: MDP 7 56.30725041996059 20
average reward exploit: MDP 7 81.793721755312 20
average reward exploit: MDP 7 54.33393909173407 20
average reward exploit: MDP 7 -29.01611913225903 20
average reward exploit: MDP 7 -67.04515924351604 20
average reward exploit: MDP 7 -39.689852107991854 20
average reward exploit: MDP 7 -98.90330

average reward exploit: MDP 0 275.4216576745813 20
average reward exploit: MDP 0 278.7063798676573 20
average reward exploit: MDP 0 217.25922773764236 20
average reward exploit: MDP 0 156.30698008202597 20
average reward exploit: MDP 0 137.0444819200943 20
average reward exploit: MDP 0 123.00571525844187 20
average reward explore: -217.5515689759035
average reward exploit: MDP 1 -64.55895061817398 20
average reward exploit: MDP 1 -2.4376297060095027 20
average reward exploit: MDP 1 68.80483934232569 20
average reward exploit: MDP 1 128.45340499828248 20
average reward exploit: MDP 1 112.58039093624582 20
average reward exploit: MDP 1 169.7919297900694 20
average reward exploit: MDP 1 125.20091938692542 20
average reward exploit: MDP 1 164.87384271211255 20
average reward exploit: MDP 1 139.88929850723258 20
average reward exploit: MDP 1 150.39139153537727 20
average reward explore: -217.53691491071953
average reward exploit: MDP 2 3.5158930879523673 20
average reward exploit: MDP 2 53.

average reward exploit: MDP 5 -129.2919254275348 20
average reward exploit: MDP 5 -113.48861337459716 20
average reward exploit: MDP 5 -132.07625593906727 20
average reward exploit: MDP 5 -128.25579285719314 20
average reward exploit: MDP 5 -114.23729098796017 20
average reward exploit: MDP 5 -153.54258343359388 20
average reward exploit: MDP 5 -138.9364373176135 20
average reward exploit: MDP 5 -153.4474898755147 20
average reward exploit: MDP 5 -95.89596139160633 20
average reward exploit: MDP 5 -166.51994868984667 20
average reward explore: -158.7642514068284
average reward exploit: MDP 6 -75.73911286303515 20
average reward exploit: MDP 6 -129.17468183449722 20
average reward exploit: MDP 6 -90.21620057069939 20
average reward exploit: MDP 6 -92.6804816919248 20
average reward exploit: MDP 6 -112.46542490834659 20
average reward exploit: MDP 6 -139.99813462289163 20
average reward exploit: MDP 6 -66.85264349545545 20
average reward exploit: MDP 6 -24.57804279761847 20
average rewar

average reward exploit: MDP 9 139.05240405438457 20
average reward exploit: MDP 9 147.91193430957838 20
average reward exploit: MDP 9 121.70609716959434 20
average reward exploit: MDP 9 126.70968144793676 20
epoch number:  16
average reward explore: -60.02866710727004
average reward exploit: MDP 0 361.9120174617263 20
average reward exploit: MDP 0 369.0919590409649 20
average reward exploit: MDP 0 369.37062839898846 20
average reward exploit: MDP 0 368.3915227690411 20
average reward exploit: MDP 0 370.64450824972 20
average reward exploit: MDP 0 362.92003779893275 20
average reward exploit: MDP 0 361.5367597614359 20
average reward exploit: MDP 0 364.97802023216457 20
average reward exploit: MDP 0 355.5743709079626 20
average reward exploit: MDP 0 340.7744347802401 20
average reward explore: -45.556153329248026
average reward exploit: MDP 1 24.1619170665958 20
average reward exploit: MDP 1 53.701050510008734 20
average reward exploit: MDP 1 88.70534602049764 20
average reward exploit:

average reward exploit: MDP 4 343.0454025252307 20
average reward exploit: MDP 4 347.03635490919754 20
average reward exploit: MDP 4 361.94555700741694 20
average reward exploit: MDP 4 359.31411672342466 20
average reward exploit: MDP 4 373.5166036139949 20
average reward exploit: MDP 4 365.92449058903196 20
average reward exploit: MDP 4 381.40886010953477 20
average reward exploit: MDP 4 390.04009398630023 20
average reward explore: -39.231121189988826
average reward exploit: MDP 5 205.9609038233428 20
average reward exploit: MDP 5 198.9672811136391 20
average reward exploit: MDP 5 193.05176546456724 20
average reward exploit: MDP 5 187.845346091544 20
average reward exploit: MDP 5 148.90100175011804 20
average reward exploit: MDP 5 166.1183684952992 20
average reward exploit: MDP 5 135.23044963374016 20
average reward exploit: MDP 5 104.06575866251987 20
average reward exploit: MDP 5 82.98901032492316 20
average reward exploit: MDP 5 34.64280580577493 20
average reward explore: 23.69

average reward exploit: MDP 8 -397.3815262026518 20
average reward exploit: MDP 8 -338.9272714579367 20
average reward explore: 108.10055168874842
average reward exploit: MDP 9 -432.2061909238344 20
average reward exploit: MDP 9 -444.79281247892243 20
average reward exploit: MDP 9 -311.04538372379545 20
average reward exploit: MDP 9 -257.4415199912619 20
average reward exploit: MDP 9 -204.2793265487435 20
average reward exploit: MDP 9 -58.36596646101018 20
average reward exploit: MDP 9 41.09817291462753 20
average reward exploit: MDP 9 47.7411434414909 20
average reward exploit: MDP 9 50.241564514259906 20
average reward exploit: MDP 9 49.565255207557094 20
epoch number:  19
average reward explore: 102.60279706081107
average reward exploit: MDP 0 -344.70303263380106 20
average reward exploit: MDP 0 -294.35637186903045 20
average reward exploit: MDP 0 -241.1864748234678 20
average reward exploit: MDP 0 -274.344611000927 20
average reward exploit: MDP 0 -249.12768486912296 20
average rew

average reward exploit: MDP 3 -39.05914742941072 20
average reward exploit: MDP 3 98.48125089234652 20
average reward exploit: MDP 3 107.3366229991716 20
average reward exploit: MDP 3 128.17242661876384 20
average reward exploit: MDP 3 126.25796643218322 20
average reward exploit: MDP 3 152.13643289647592 20
average reward explore: 173.18123898485433
average reward exploit: MDP 4 -469.64231165558976 20
average reward exploit: MDP 4 -441.8034002596055 20
average reward exploit: MDP 4 -416.8566038485659 20
average reward exploit: MDP 4 -346.5853390485843 20
average reward exploit: MDP 4 -344.36152255774806 20
average reward exploit: MDP 4 -286.95520497460086 20
average reward exploit: MDP 4 -276.62591060672673 20
average reward exploit: MDP 4 -317.6284958676467 20
average reward exploit: MDP 4 -306.0892641787019 20
average reward exploit: MDP 4 -292.95518682400575 20
average reward explore: 200.7395595905913
average reward exploit: MDP 5 65.86879554463178 20
average reward exploit: MDP 5

average reward exploit: MDP 8 -527.7684985469128 20
average reward exploit: MDP 8 -458.77674546377165 20
average reward exploit: MDP 8 -423.75396006602494 20
average reward exploit: MDP 8 -289.12602391194787 20
average reward exploit: MDP 8 -264.1080973984735 20
average reward exploit: MDP 8 -283.20377140282795 20
average reward exploit: MDP 8 -231.2963371052224 20
average reward exploit: MDP 8 -198.61684954268907 20
average reward exploit: MDP 8 -210.24101144591253 20
average reward exploit: MDP 8 -179.8904117947928 20
average reward explore: 318.83814553272185
average reward exploit: MDP 9 -265.45768436829417 20
average reward exploit: MDP 9 -169.6399607721334 20
average reward exploit: MDP 9 -96.36228109303669 20
average reward exploit: MDP 9 -27.422454829440618 20
average reward exploit: MDP 9 -39.91739036022432 20
average reward exploit: MDP 9 -76.48139599351009 20
average reward exploit: MDP 9 -124.34032716864769 20
average reward exploit: MDP 9 -149.3139851998382 20
average rewa

average reward exploit: MDP 2 41.245447188887056 20
average reward exploit: MDP 2 -178.38270224742882 20
average reward exploit: MDP 2 -720.1051372029161 20
average reward exploit: MDP 2 -226.715501526279 20
average reward explore: 333.6993149592121
average reward exploit: MDP 3 284.4952200443476 20
average reward exploit: MDP 3 299.04552991115065 20
average reward exploit: MDP 3 272.1013754025055 20
average reward exploit: MDP 3 308.0807961865001 20
average reward exploit: MDP 3 315.3103928764987 20
average reward exploit: MDP 3 355.5175085949513 20
average reward exploit: MDP 3 332.3787685927837 20
average reward exploit: MDP 3 401.26244022393155 20
average reward exploit: MDP 3 395.0242148802723 20
average reward exploit: MDP 3 471.0708915150817 20
average reward explore: 295.395438959228
average reward exploit: MDP 4 -1147.3143705946968 20
average reward exploit: MDP 4 -1289.0209997184927 20
average reward exploit: MDP 4 -1005.7895450610691 20
average reward exploit: MDP 4 -950.993

average reward exploit: MDP 7 271.5632903894042 20
average reward exploit: MDP 7 266.1154385595467 20
average reward exploit: MDP 7 263.3113748283863 20
average reward exploit: MDP 7 255.7037318365325 20
average reward exploit: MDP 7 266.9990527255169 20
average reward exploit: MDP 7 280.6751779773864 20
average reward exploit: MDP 7 267.8297788109376 20
average reward exploit: MDP 7 273.4712529742749 20
average reward explore: 421.21858346674
average reward exploit: MDP 8 -128.58356478658044 20
average reward exploit: MDP 8 -138.32746471672112 20
average reward exploit: MDP 8 -118.22119133320908 20
average reward exploit: MDP 8 -112.69215176111118 20
average reward exploit: MDP 8 -101.33121102057802 20
average reward exploit: MDP 8 -96.22017172590573 20
average reward exploit: MDP 8 -95.16690423247626 20
average reward exploit: MDP 8 -59.215708135920934 20
average reward exploit: MDP 8 -75.0287344340656 20
average reward exploit: MDP 8 -49.945532291558315 20
average reward explore: 47

average reward exploit: MDP 1 -616.8137874265711 20
average reward explore: 506.94474666658846
average reward exploit: MDP 2 -632.3503464397521 20
average reward exploit: MDP 2 -575.9756446662774 20
average reward exploit: MDP 2 -480.18287391299316 20
average reward exploit: MDP 2 -490.5786414193429 20
average reward exploit: MDP 2 -552.0198575088973 20
average reward exploit: MDP 2 -434.0792393567801 20
average reward exploit: MDP 2 -564.3322250042881 20
average reward exploit: MDP 2 -532.3274079114174 20
average reward exploit: MDP 2 -903.0436631197924 20
average reward exploit: MDP 2 -610.1173476815472 20
average reward explore: 425.9334893264636
average reward exploit: MDP 3 -216.35170484893138 20
average reward exploit: MDP 3 -329.38529811335906 20
average reward exploit: MDP 3 -221.02195206767632 20
average reward exploit: MDP 3 -236.3839428454766 20
average reward exploit: MDP 3 -297.33841552737033 20
average reward exploit: MDP 3 -371.18420002413194 20
average reward exploit: M

average reward exploit: MDP 6 640.9517735297593 20
average reward exploit: MDP 6 594.7281904938768 20
average reward exploit: MDP 6 545.9380297604811 20
average reward exploit: MDP 6 514.2828635254672 20
average reward exploit: MDP 6 520.7519381500127 20
average reward explore: 602.9911309610624
average reward exploit: MDP 7 -820.4900652257118 20
average reward exploit: MDP 7 -445.3413030293189 20
average reward exploit: MDP 7 21.264296606863287 20
average reward exploit: MDP 7 313.60163268242707 20
average reward exploit: MDP 7 466.55087392637154 20
average reward exploit: MDP 7 476.0074712368561 20
average reward exploit: MDP 7 435.8801638912444 20
average reward exploit: MDP 7 387.6445542299574 20
average reward exploit: MDP 7 359.0856688929579 20
average reward exploit: MDP 7 321.1643770298996 20
average reward explore: 559.1566508074043
average reward exploit: MDP 8 -679.3683194935785 20
average reward exploit: MDP 8 -589.2969746180331 20
average reward exploit: MDP 8 -541.5639697

average reward exploit: MDP 1 260.94593807650614 20
average reward exploit: MDP 1 265.6749575397431 20
average reward exploit: MDP 1 270.88002580455657 20
average reward exploit: MDP 1 265.3596769588902 20
average reward exploit: MDP 1 265.8467805849067 20
average reward exploit: MDP 1 261.29113287878806 20
average reward exploit: MDP 1 271.29804757633286 20
average reward exploit: MDP 1 275.04722496661276 20
average reward exploit: MDP 1 279.68251591768257 20
average reward exploit: MDP 1 279.1040320933777 20
average reward explore: 577.7542371388156
average reward exploit: MDP 2 236.52517084544155 20
average reward exploit: MDP 2 236.9189711219563 20
average reward exploit: MDP 2 241.01931143841153 20
average reward exploit: MDP 2 246.14356229478258 20
average reward exploit: MDP 2 243.39898133122702 20
average reward exploit: MDP 2 247.61606402063438 20
average reward exploit: MDP 2 247.47558641998603 20
average reward exploit: MDP 2 250.39060093115722 20
average reward exploit: MDP

average reward exploit: MDP 5 23.906163780573273 20
average reward exploit: MDP 5 5.289779067559101 20
average reward explore: 643.0291845200335
average reward exploit: MDP 6 -368.7912615977282 20
average reward exploit: MDP 6 -233.21467017950084 20
average reward exploit: MDP 6 -193.03410757573164 20
average reward exploit: MDP 6 154.27657989051986 20
average reward exploit: MDP 6 156.76161035876248 20
average reward exploit: MDP 6 195.34740838391994 20
average reward exploit: MDP 6 497.97552254759694 20
average reward exploit: MDP 6 630.9305496606687 20
average reward exploit: MDP 6 724.7879378276435 20
average reward exploit: MDP 6 785.8134555773266 20
average reward explore: 629.8699679862791
average reward exploit: MDP 7 175.10194455727546 20
average reward exploit: MDP 7 153.2398736636138 20
average reward exploit: MDP 7 345.93874660305124 20
average reward exploit: MDP 7 517.7770698965057 20
average reward exploit: MDP 7 680.7023738172077 20
average reward exploit: MDP 7 737.132

average reward exploit: MDP 0 285.31997101095095 20
average reward exploit: MDP 0 325.0712579349066 20
average reward exploit: MDP 0 384.45148575325635 20
average reward exploit: MDP 0 417.41914317641897 20
average reward exploit: MDP 0 486.5641598654127 20
average reward explore: 615.7709051716431
average reward exploit: MDP 1 365.9048441236513 20
average reward exploit: MDP 1 378.9632195733509 20
average reward exploit: MDP 1 403.3380148087907 20
average reward exploit: MDP 1 396.49061462430325 20
average reward exploit: MDP 1 431.1777831281556 20
average reward exploit: MDP 1 441.5873222870165 20
average reward exploit: MDP 1 461.3942601132402 20
average reward exploit: MDP 1 456.2635034134199 20
average reward exploit: MDP 1 479.28245661764356 20
average reward exploit: MDP 1 463.3815243568841 20
average reward explore: 745.0176945475057
average reward exploit: MDP 2 -837.8338498114557 20
average reward exploit: MDP 2 -567.302104937936 20
average reward exploit: MDP 2 -104.80464868

average reward exploit: MDP 5 -55.58366054850203 20
average reward exploit: MDP 5 -83.97080905835706 20
average reward exploit: MDP 5 -58.45488555168409 20
average reward exploit: MDP 5 -27.21434344498207 20
average reward exploit: MDP 5 -69.86304523074881 20
average reward exploit: MDP 5 -132.92315089529046 20
average reward exploit: MDP 5 26.82960267344378 20
average reward exploit: MDP 5 29.936370374595548 20
average reward exploit: MDP 5 27.883298172959485 20
average reward explore: 775.291779274901
average reward exploit: MDP 6 -393.63105664939576 20
average reward exploit: MDP 6 -358.6357839997439 20
average reward exploit: MDP 6 -266.1715195566587 20
average reward exploit: MDP 6 -600.4390126815072 20
average reward exploit: MDP 6 -472.96344602491143 20
average reward exploit: MDP 6 -179.75713200618796 20
average reward exploit: MDP 6 -155.35150959402395 20
average reward exploit: MDP 6 -99.80297288698563 20
average reward exploit: MDP 6 -89.2222783630725 20
average reward explo

average reward exploit: MDP 9 100.62263243317147 20
average reward exploit: MDP 9 122.76819453839872 20
epoch number:  35
average reward explore: 791.3488488426248
average reward exploit: MDP 0 731.3088608787843 20
average reward exploit: MDP 0 748.1777601329744 20
average reward exploit: MDP 0 765.0154513938104 20
average reward exploit: MDP 0 780.2927288658941 20
average reward exploit: MDP 0 780.9591908372124 20
average reward exploit: MDP 0 697.3832350300218 20
average reward exploit: MDP 0 533.142031251531 20
average reward exploit: MDP 0 670.0955388628229 20
average reward exploit: MDP 0 540.5151464160714 20
average reward exploit: MDP 0 361.417963890166 20
average reward explore: 824.3217092319876
average reward exploit: MDP 1 -81.25560129024697 20
average reward exploit: MDP 1 -134.84219311065223 20
average reward exploit: MDP 1 -118.99741812906132 20
average reward exploit: MDP 1 -112.70782657672683 20
average reward exploit: MDP 1 -150.25482818321842 20
average reward exploit

average reward exploit: MDP 4 -153.58552108400744 20
average reward exploit: MDP 4 -145.99968582417503 20
average reward exploit: MDP 4 -131.48865655587673 20
average reward exploit: MDP 4 -125.60744167513799 20
average reward exploit: MDP 4 -111.53135157330378 20
average reward exploit: MDP 4 -120.88987289749852 20
average reward exploit: MDP 4 -93.67605556220693 20
average reward explore: 796.9755914253975
average reward exploit: MDP 5 -47.97114418472043 20
average reward exploit: MDP 5 -13.610198364520523 20
average reward exploit: MDP 5 -14.259543335460894 20
average reward exploit: MDP 5 -0.544026175536662 20
average reward exploit: MDP 5 12.710408125590357 20
average reward exploit: MDP 5 49.08249370319894 20
average reward exploit: MDP 5 12.501197325858342 20
average reward exploit: MDP 5 58.62839219444538 20
average reward exploit: MDP 5 92.5042745644834 20
average reward exploit: MDP 5 86.28141013146212 20
average reward explore: 796.5457432634679
average reward exploit: MDP 6

KeyboardInterrupt: 

In [175]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)

In [176]:
print("average reward explore:", np.mean(explore_rewards))
M = a.stack_trajectories(explore_paths)

average reward explore: 188.03284965532836


In [177]:
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [178]:
Z = np.reshape(Z,[1,len(Z)])

In [179]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [180]:
print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit 562.4916582849902 400


In [174]:
a.num_traj = 400

In [167]:
a.sim.model.opt.gravity[-1] = -15

In [None]:
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [None]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [None]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

In [None]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1000)

In [None]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()