In [131]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
#env="InvertedPendulum-v2"
env = "Walker2d-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 10
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 64
use_baseline = True



In [132]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [133]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 1
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.95
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        #self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards,new_rewards = [], [], [], [], [],[]
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    new_rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        episode_rewards.append(episode_reward)
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions),"new_rewards":np.array(new_rewards)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        #self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.05)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,i)
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["new_rewards"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["new_obs"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            #print(advantages_explore.shape,new_observations_explore.shape)
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,i)
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.mean(exploit_rewards), len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env,20*np.random.rand())
        print("gravity: ",self.sim.model.opt.gravity )
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,20*np.random.rand())
        print(exploit_rewards)
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            self.train_step()
            

In [134]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [135]:
a.train()

average reward explore: -1.1320883480610526
average reward exploit: MDP 0 -14.274819753628254 10
average reward exploit: MDP 0 -10.553365483602764 10
average reward exploit: MDP 0 -11.346338262858957 10
average reward exploit: MDP 0 -8.224616854545815 10
average reward exploit: MDP 0 -9.017437227343319 10
average reward exploit: MDP 0 -4.743108064870849 10
average reward exploit: MDP 0 1.7078127597622044 10
average reward exploit: MDP 0 -1.5675674377568076 10
average reward exploit: MDP 0 -0.04365109831180627 10
average reward exploit: MDP 0 27.830806276660887 10
average reward explore: 4.139010475981371
average reward exploit: MDP 1 -3.6213362374930327 10
average reward exploit: MDP 1 -2.4307151615649247 10
average reward exploit: MDP 1 13.441927546832195 10
average reward exploit: MDP 1 19.830845333912784 10
average reward exploit: MDP 1 7.475000930805192 10
average reward exploit: MDP 1 14.063242649530755 10
average reward exploit: MDP 1 20.371221182048668 10
average reward exploit:

average reward exploit: MDP 4 82.00699509636527 10
average reward exploit: MDP 4 42.56875557085026 10
average reward exploit: MDP 4 28.438812643865326 10
average reward exploit: MDP 4 2.0509939393040155 10
average reward explore: -34.143440759462095
average reward exploit: MDP 5 147.40140127786077 10
average reward exploit: MDP 5 113.4154967574539 10
average reward exploit: MDP 5 159.03550011949247 10
average reward exploit: MDP 5 105.96277305480149 10
average reward exploit: MDP 5 152.7440850781664 10
average reward exploit: MDP 5 104.89180728359501 10
average reward exploit: MDP 5 127.34296143871313 10
average reward exploit: MDP 5 114.29431869690393 10
average reward exploit: MDP 5 88.20632834963514 10
average reward exploit: MDP 5 133.02742581966686 10
average reward explore: 13.996243520075314
average reward exploit: MDP 6 175.101716381618 10
average reward exploit: MDP 6 110.19954596677584 10
average reward exploit: MDP 6 115.99902880163809 10
average reward exploit: MDP 6 123.24

average reward exploit: MDP 9 288.36964686191084 10
average reward exploit: MDP 9 175.36694652123043 10
average reward exploit: MDP 9 161.7153435475266 10
average reward exploit: MDP 9 291.0762214706276 10
average reward exploit: MDP 9 256.27718753357055 10
average reward exploit: MDP 9 258.8243773464753 10
average reward exploit: MDP 9 169.2719555051965 10
average reward explore: -13.7426299894252
average reward exploit: MDP 0 179.4042903433643 10
average reward exploit: MDP 0 154.89259078961877 10
average reward exploit: MDP 0 188.52241196951906 10
average reward exploit: MDP 0 166.9568846948272 10
average reward exploit: MDP 0 175.82130005001198 10
average reward exploit: MDP 0 195.66053809873068 10
average reward exploit: MDP 0 94.08344602086149 10
average reward exploit: MDP 0 128.41327266964836 10
average reward exploit: MDP 0 127.6273164755199 10
average reward exploit: MDP 0 115.46715390609397 10
average reward explore: -3.08339826499814
average reward exploit: MDP 1 113.543839

average reward exploit: MDP 4 196.22240835954457 10
average reward exploit: MDP 4 88.63342911490504 10
average reward exploit: MDP 4 160.4660419396227 10
average reward exploit: MDP 4 60.76773474649009 10
average reward exploit: MDP 4 70.78729849094175 10
average reward exploit: MDP 4 109.811350668225 10
average reward exploit: MDP 4 52.93318072423293 10
average reward exploit: MDP 4 85.91496036302182 10
average reward exploit: MDP 4 124.17965611745899 10
average reward exploit: MDP 4 203.06388061998786 10
average reward explore: 36.92407939259631
average reward exploit: MDP 5 177.88189256986806 10
average reward exploit: MDP 5 306.55589309964444 10
average reward exploit: MDP 5 245.9281410600507 10
average reward exploit: MDP 5 146.52593808604712 10
average reward exploit: MDP 5 288.23848326007067 10
average reward exploit: MDP 5 282.2335706074474 10
average reward exploit: MDP 5 225.76581961992466 10
average reward exploit: MDP 5 225.1241643603202 10
average reward exploit: MDP 5 247

average reward exploit: MDP 8 201.56511549826752 10
average reward exploit: MDP 8 187.5370850486926 10
average reward explore: 48.71943351980506
average reward exploit: MDP 9 248.98626916401577 10
average reward exploit: MDP 9 253.38956416701257 10
average reward exploit: MDP 9 256.93052197221175 10
average reward exploit: MDP 9 296.9734607596448 10
average reward exploit: MDP 9 240.8743362856033 10
average reward exploit: MDP 9 227.28287502959915 10
average reward exploit: MDP 9 270.63708844168383 10
average reward exploit: MDP 9 233.16394207324396 10
average reward exploit: MDP 9 282.5251183718673 10
average reward exploit: MDP 9 237.24381377211066 10
average reward explore: 23.834448658346993
average reward exploit: MDP 0 289.63239398913413 10
average reward exploit: MDP 0 212.65995508141182 10
average reward exploit: MDP 0 286.131303339244 10
average reward exploit: MDP 0 307.26328049067064 10
average reward exploit: MDP 0 263.42521927742257 10
average reward exploit: MDP 0 222.405

average reward exploit: MDP 3 257.5189786709919 10
average reward exploit: MDP 3 268.64427273326544 10
average reward exploit: MDP 3 237.63698998057856 10
average reward exploit: MDP 3 225.86757601561084 10
average reward exploit: MDP 3 232.30133112552352 10
average reward explore: -6.676414244988034
average reward exploit: MDP 4 264.71362833249566 10
average reward exploit: MDP 4 278.4564349577682 10
average reward exploit: MDP 4 265.0597543751296 10
average reward exploit: MDP 4 246.16270333044707 10
average reward exploit: MDP 4 258.02763878445325 10
average reward exploit: MDP 4 223.92852867575976 10
average reward exploit: MDP 4 268.80391404212855 10
average reward exploit: MDP 4 280.21998868833356 10
average reward exploit: MDP 4 245.74325520283293 10
average reward exploit: MDP 4 293.15830568860764 10
average reward explore: 22.262149739557575
average reward exploit: MDP 5 161.3654883718687 10
average reward exploit: MDP 5 125.85259938159375 10
average reward exploit: MDP 5 132.

average reward exploit: MDP 8 262.55616116114396 10
average reward exploit: MDP 8 245.47362002410756 10
average reward exploit: MDP 8 245.04988818708185 10
average reward exploit: MDP 8 248.73403256019765 10
average reward exploit: MDP 8 285.94149960773376 10
average reward exploit: MDP 8 231.56048712594142 10
average reward exploit: MDP 8 278.081003193156 10
average reward exploit: MDP 8 263.6888901241282 10
average reward explore: -36.94300081017557
average reward exploit: MDP 9 371.29612536276613 10
average reward exploit: MDP 9 315.6018171296738 10
average reward exploit: MDP 9 302.1016412161464 10
average reward exploit: MDP 9 245.255996527746 10
average reward exploit: MDP 9 279.70367647982926 10
average reward exploit: MDP 9 378.2549871599465 10
average reward exploit: MDP 9 275.3964529737515 10
average reward exploit: MDP 9 358.9859494526351 10
average reward exploit: MDP 9 399.7784777337567 10
average reward exploit: MDP 9 336.6409541003967 10
average reward explore: 9.7965271

average reward explore: 184.0678763797145
average reward exploit: MDP 3 654.0070354328022 10
average reward exploit: MDP 3 493.92844536038353 10
average reward exploit: MDP 3 831.0426375404091 10
average reward exploit: MDP 3 759.8729045062302 10
average reward exploit: MDP 3 652.9093029794001 10
average reward exploit: MDP 3 715.6992694431071 10
average reward exploit: MDP 3 515.5742617330773 10
average reward exploit: MDP 3 598.4293248452459 10
average reward exploit: MDP 3 453.26334630966295 10
average reward exploit: MDP 3 735.8646066976953 10
average reward explore: 266.8520034490108
average reward exploit: MDP 4 227.42521160548722 10
average reward exploit: MDP 4 491.7112193420438 10
average reward exploit: MDP 4 374.1268956479205 10
average reward exploit: MDP 4 482.84122791720927 10
average reward exploit: MDP 4 421.35659373403576 10
average reward exploit: MDP 4 476.0087796086832 10
average reward exploit: MDP 4 336.52110889982754 10
average reward exploit: MDP 4 395.592028365

average reward exploit: MDP 7 278.2519340668513 10
average reward exploit: MDP 7 271.61373138760257 10
average reward exploit: MDP 7 255.88214932898777 10
average reward explore: 166.17819674967024
average reward exploit: MDP 8 284.2902722537583 10
average reward exploit: MDP 8 319.2710037207212 10
average reward exploit: MDP 8 307.4508824341391 10
average reward exploit: MDP 8 402.8503470374392 10
average reward exploit: MDP 8 320.7538119351505 10
average reward exploit: MDP 8 351.4490872588834 10
average reward exploit: MDP 8 323.5025955680809 10
average reward exploit: MDP 8 326.61570006604165 10
average reward exploit: MDP 8 323.289116622726 10
average reward exploit: MDP 8 273.48536647270095 10
average reward explore: 158.25162223496383
average reward exploit: MDP 9 336.9905099010859 10
average reward exploit: MDP 9 356.71436247088536 10
average reward exploit: MDP 9 306.44688188369463 10
average reward exploit: MDP 9 323.918302308355 10
average reward exploit: MDP 9 325.506421566

average reward exploit: MDP 2 313.876789045397 10
average reward exploit: MDP 2 308.7091905088147 10
average reward exploit: MDP 2 295.6726828255163 10
average reward exploit: MDP 2 343.89630441138024 10
average reward exploit: MDP 2 316.170352349797 10
average reward explore: 296.7556908467018
average reward exploit: MDP 3 249.6285161019013 10
average reward exploit: MDP 3 217.6666097150789 10
average reward exploit: MDP 3 263.1749125554919 10
average reward exploit: MDP 3 229.89741876959448 10
average reward exploit: MDP 3 234.4205007450004 10
average reward exploit: MDP 3 234.60097975403704 10
average reward exploit: MDP 3 232.99017914445153 10
average reward exploit: MDP 3 240.88500971310296 10
average reward exploit: MDP 3 238.2436509689897 10
average reward exploit: MDP 3 288.669852465366 10
average reward explore: 249.8681134849221
average reward exploit: MDP 4 279.2485154649226 10
average reward exploit: MDP 4 239.77022489873357 10
average reward exploit: MDP 4 291.357229096824

average reward exploit: MDP 7 286.22066789347593 10
average reward exploit: MDP 7 277.0500382559602 10
average reward exploit: MDP 7 281.484813522777 10
average reward exploit: MDP 7 271.2272348065364 10
average reward exploit: MDP 7 239.2166323468503 10
average reward exploit: MDP 7 266.98048474073755 10
average reward exploit: MDP 7 250.95474569562376 10
average reward exploit: MDP 7 278.35035194489285 10
average reward explore: 197.13838345029112
average reward exploit: MDP 8 275.34669852311174 10
average reward exploit: MDP 8 297.0526755442549 10
average reward exploit: MDP 8 234.83930118826075 10
average reward exploit: MDP 8 267.9787282826063 10
average reward exploit: MDP 8 235.71539218605335 10
average reward exploit: MDP 8 271.69123902881057 10
average reward exploit: MDP 8 265.2700011532135 10
average reward exploit: MDP 8 273.58332409543317 10
average reward exploit: MDP 8 264.1753786376936 10
average reward exploit: MDP 8 217.1106914596955 10
average reward explore: 279.508

average reward explore: 218.82133691129107
average reward exploit: MDP 2 277.6394839670967 10
average reward exploit: MDP 2 238.81840345503656 10
average reward exploit: MDP 2 275.17930004933123 10
average reward exploit: MDP 2 297.0722019282358 10
average reward exploit: MDP 2 245.23079950262672 10
average reward exploit: MDP 2 256.97881775647136 10
average reward exploit: MDP 2 322.7472707797695 10
average reward exploit: MDP 2 287.17050426351733 10
average reward exploit: MDP 2 280.02275617141663 10
average reward exploit: MDP 2 292.22308951252603 10
average reward explore: 238.05457634445628
average reward exploit: MDP 3 227.45648166309576 10
average reward exploit: MDP 3 214.800812419211 10
average reward exploit: MDP 3 305.7504947667075 10
average reward exploit: MDP 3 231.55952768796683 10
average reward exploit: MDP 3 216.33067045285384 10
average reward exploit: MDP 3 206.3532123442032 10
average reward exploit: MDP 3 296.4031949263977 10
average reward exploit: MDP 3 236.3900

average reward exploit: MDP 6 259.02568561494627 10
average reward exploit: MDP 6 222.4946608702682 10
average reward exploit: MDP 6 276.5438173352274 10
average reward explore: 221.8086118853395
average reward exploit: MDP 7 235.5280883198671 10
average reward exploit: MDP 7 225.1025411763102 10
average reward exploit: MDP 7 265.58058627545404 10
average reward exploit: MDP 7 145.53457676520418 10
average reward exploit: MDP 7 188.3963013831171 10
average reward exploit: MDP 7 223.04745963770375 10
average reward exploit: MDP 7 144.52553291910075 10
average reward exploit: MDP 7 205.28360020191312 10
average reward exploit: MDP 7 313.4450588644607 10
average reward exploit: MDP 7 231.89484686968294 10
average reward explore: 224.79242522930136
average reward exploit: MDP 8 288.93465885109674 10
average reward exploit: MDP 8 184.55061450482657 10
average reward exploit: MDP 8 255.82947931124522 10
average reward exploit: MDP 8 212.68781675042078 10
average reward exploit: MDP 8 220.224

average reward exploit: MDP 1 298.8944807267965 10
average reward exploit: MDP 1 277.43203981795403 10
average reward exploit: MDP 1 254.43731793748344 10
average reward exploit: MDP 1 289.5099426747143 10
average reward exploit: MDP 1 275.957948281035 10
average reward exploit: MDP 1 257.892630279517 10
average reward explore: 178.27983870282577
average reward exploit: MDP 2 266.76929480353164 10
average reward exploit: MDP 2 239.85964494086346 10
average reward exploit: MDP 2 225.12642200661645 10
average reward exploit: MDP 2 235.58364166613578 10
average reward exploit: MDP 2 226.92742669870873 10
average reward exploit: MDP 2 234.67335991739213 10
average reward exploit: MDP 2 253.5313807954295 10
average reward exploit: MDP 2 243.08006368812295 10
average reward exploit: MDP 2 211.86976653368538 10
average reward exploit: MDP 2 242.17700916539837 10
average reward explore: 209.03905917292724
average reward exploit: MDP 3 163.23843634095664 10
average reward exploit: MDP 3 180.982

average reward exploit: MDP 6 236.27592516442846 10
average reward exploit: MDP 6 235.74476430970353 10
average reward exploit: MDP 6 271.6252068515617 10
average reward exploit: MDP 6 229.11125612660607 10
average reward exploit: MDP 6 221.2730700465442 10
average reward exploit: MDP 6 290.54277268872045 10
average reward exploit: MDP 6 261.7211445683875 10
average reward exploit: MDP 6 310.6939789273797 10
average reward exploit: MDP 6 258.3833555096572 10
average reward explore: 238.5749243973994
average reward exploit: MDP 7 250.18592060585502 10
average reward exploit: MDP 7 257.42071492788943 10
average reward exploit: MDP 7 252.64827855994457 10
average reward exploit: MDP 7 263.08289407382017 10
average reward exploit: MDP 7 207.46966609323576 10
average reward exploit: MDP 7 253.5186125694182 10
average reward exploit: MDP 7 215.20576056533181 10
average reward exploit: MDP 7 258.20870098963013 10
average reward exploit: MDP 7 280.65782260767264 10
average reward exploit: MDP 

average reward exploit: MDP 0 234.34511786060642 10
average reward explore: 248.429198668316
average reward exploit: MDP 1 263.6506153312038 10
average reward exploit: MDP 1 277.8602165004542 10
average reward exploit: MDP 1 228.35380373644975 10
average reward exploit: MDP 1 208.69976222376445 10
average reward exploit: MDP 1 262.840245032827 10
average reward exploit: MDP 1 243.7501283911406 10
average reward exploit: MDP 1 235.74274353718866 10
average reward exploit: MDP 1 176.70661415759537 10
average reward exploit: MDP 1 233.61081628320716 10
average reward exploit: MDP 1 281.3805156440241 10
average reward explore: 262.8252193633617
average reward exploit: MDP 2 231.90202348479116 10
average reward exploit: MDP 2 265.7287087275502 10
average reward exploit: MDP 2 285.8284435774719 10
average reward exploit: MDP 2 250.34819848335388 10
average reward exploit: MDP 2 242.62608086300648 10
average reward exploit: MDP 2 243.21213837095797 10
average reward exploit: MDP 2 281.7558091

average reward exploit: MDP 5 249.9702656975243 10
average reward exploit: MDP 5 179.86039088521346 10
average reward exploit: MDP 5 287.94336113101724 10
average reward exploit: MDP 5 246.95075992237466 10
average reward explore: 257.66846746573015
average reward exploit: MDP 6 255.21562348752755 10
average reward exploit: MDP 6 273.99692750889324 10
average reward exploit: MDP 6 263.05314774368765 10
average reward exploit: MDP 6 250.4296821531588 10
average reward exploit: MDP 6 264.16463150468815 10
average reward exploit: MDP 6 260.6360224210599 10
average reward exploit: MDP 6 240.66670498530203 10
average reward exploit: MDP 6 266.75967174957844 10
average reward exploit: MDP 6 273.98258059856914 10
average reward exploit: MDP 6 266.2317181655159 10
average reward explore: 258.31460891640523
average reward exploit: MDP 7 266.9237760092998 10
average reward exploit: MDP 7 216.17222151799174 10
average reward exploit: MDP 7 245.42045382167308 10
average reward exploit: MDP 7 240.4

average reward exploit: MDP 0 144.23335249257283 10
average reward exploit: MDP 0 192.96582176411957 10
average reward exploit: MDP 0 172.730821464307 10
average reward exploit: MDP 0 162.00063056740612 10
average reward exploit: MDP 0 104.93507390789918 10
average reward exploit: MDP 0 235.1074456799012 10
average reward exploit: MDP 0 101.62173196484979 10
average reward explore: 268.4647194886103
average reward exploit: MDP 1 159.02732936622644 10
average reward exploit: MDP 1 104.16378682159748 10
average reward exploit: MDP 1 124.3260959701225 10
average reward exploit: MDP 1 125.7904050791544 10
average reward exploit: MDP 1 141.8902052266295 10
average reward exploit: MDP 1 107.21329815822878 10
average reward exploit: MDP 1 113.82589372700491 10
average reward exploit: MDP 1 172.93059866401396 10
average reward exploit: MDP 1 116.05962495500776 10
average reward exploit: MDP 1 141.64241444528375 10
average reward explore: 278.49379213563543
average reward exploit: MDP 2 190.338

average reward explore: 202.7943083988427
average reward exploit: MDP 5 85.37636547406849 10
average reward exploit: MDP 5 109.91348712667373 10
average reward exploit: MDP 5 168.41461051666522 10
average reward exploit: MDP 5 130.98764807664082 10
average reward exploit: MDP 5 168.42943900133176 10
average reward exploit: MDP 5 115.79989151706108 10
average reward exploit: MDP 5 112.85270754294814 10
average reward exploit: MDP 5 197.4922635185527 10
average reward exploit: MDP 5 166.52789382858776 10
average reward exploit: MDP 5 101.9597922458701 10
average reward explore: 235.4471446594301
average reward exploit: MDP 6 81.49760444806749 10
average reward exploit: MDP 6 61.477682730030004 10
average reward exploit: MDP 6 64.33960145227525 10
average reward exploit: MDP 6 84.5613589274385 10
average reward exploit: MDP 6 94.25870500513822 10
average reward exploit: MDP 6 59.32024852991469 10
average reward exploit: MDP 6 126.85534238616432 10
average reward exploit: MDP 6 76.13460010

average reward exploit: MDP 9 62.86711551684149 10
average reward exploit: MDP 9 69.57307105702488 10
average reward explore: 194.34603706920444
average reward exploit: MDP 0 104.38294469020225 10
average reward exploit: MDP 0 105.52066418890904 10
average reward exploit: MDP 0 104.06392220462325 10
average reward exploit: MDP 0 65.41662558279839 10
average reward exploit: MDP 0 90.93426018576483 10
average reward exploit: MDP 0 93.57341011712461 10
average reward exploit: MDP 0 73.98380000648011 10
average reward exploit: MDP 0 119.43291235428129 10
average reward exploit: MDP 0 138.15585885852587 10
average reward exploit: MDP 0 70.43946241605435 10
average reward explore: 232.94335182247582
average reward exploit: MDP 1 93.1031416433173 10
average reward exploit: MDP 1 61.73160416442521 10
average reward exploit: MDP 1 56.76807707411498 10
average reward exploit: MDP 1 69.34664635107546 10
average reward exploit: MDP 1 67.14030589506953 10
average reward exploit: MDP 1 71.2776940776

average reward exploit: MDP 4 76.46719462863696 10
average reward exploit: MDP 4 103.80406647516202 10
average reward exploit: MDP 4 98.08170704803072 10
average reward exploit: MDP 4 74.43165837954919 10
average reward explore: 225.1527829123426
average reward exploit: MDP 5 75.30398000886677 10
average reward exploit: MDP 5 52.30029983590684 10
average reward exploit: MDP 5 68.66866230270787 10
average reward exploit: MDP 5 58.74997740391609 10
average reward exploit: MDP 5 62.14203288212501 10
average reward exploit: MDP 5 101.42903113665311 10
average reward exploit: MDP 5 64.04470253484934 10
average reward exploit: MDP 5 56.80088088301583 10
average reward exploit: MDP 5 90.67630676954337 10
average reward exploit: MDP 5 96.54870567081245 10
average reward explore: 280.57759944367723
average reward exploit: MDP 6 85.37410613782667 10
average reward exploit: MDP 6 117.01720211471756 10
average reward exploit: MDP 6 80.39335400131992 10
average reward exploit: MDP 6 100.48419973682

average reward exploit: MDP 9 116.35630347939505 10
average reward exploit: MDP 9 96.57756669402931 10
average reward exploit: MDP 9 95.84677593567888 10
average reward exploit: MDP 9 99.59240528943431 10
average reward exploit: MDP 9 153.33373431394054 10
average reward exploit: MDP 9 106.15373197917188 10
average reward explore: 291.1879030592888
average reward exploit: MDP 0 215.8291130438576 10
average reward exploit: MDP 0 281.67645353187515 10
average reward exploit: MDP 0 279.68092885691806 10
average reward exploit: MDP 0 254.34142168245427 10
average reward exploit: MDP 0 195.21401510941118 10
average reward exploit: MDP 0 298.3206556867838 10
average reward exploit: MDP 0 250.9885769664357 10
average reward exploit: MDP 0 241.27447764725025 10
average reward exploit: MDP 0 286.5432333796612 10
average reward exploit: MDP 0 247.65989480401632 10
average reward explore: 230.34954617578705
average reward exploit: MDP 1 113.11227918995601 10
average reward exploit: MDP 1 109.2057

average reward exploit: MDP 4 200.79353137799305 10
average reward exploit: MDP 4 223.3182092638953 10
average reward exploit: MDP 4 229.6854276862585 10
average reward exploit: MDP 4 182.14909935313074 10
average reward exploit: MDP 4 231.28379658096668 10
average reward exploit: MDP 4 126.1552058916345 10
average reward exploit: MDP 4 181.0090769234759 10
average reward exploit: MDP 4 140.49636706656253 10
average reward exploit: MDP 4 179.59110293521306 10
average reward explore: 243.2210652750116
average reward exploit: MDP 5 119.79147205531952 10
average reward exploit: MDP 5 135.97278912604867 10
average reward exploit: MDP 5 162.37426610719615 10
average reward exploit: MDP 5 119.87326522520686 10
average reward exploit: MDP 5 185.29058156824254 10
average reward exploit: MDP 5 181.52248817108878 10
average reward exploit: MDP 5 127.24087146417773 10
average reward exploit: MDP 5 191.08802236874254 10
average reward exploit: MDP 5 153.0293444820952 10
average reward exploit: MDP

average reward exploit: MDP 8 115.34593102471828 10
average reward explore: 217.53173303604308
average reward exploit: MDP 9 216.45955577713875 10
average reward exploit: MDP 9 159.16225400455127 10
average reward exploit: MDP 9 134.171045736555 10
average reward exploit: MDP 9 166.55484723219544 10
average reward exploit: MDP 9 108.05510651061395 10
average reward exploit: MDP 9 85.34238626605972 10
average reward exploit: MDP 9 164.18771424864792 10
average reward exploit: MDP 9 155.07072641896207 10
average reward exploit: MDP 9 87.16669528245443 10
average reward exploit: MDP 9 212.37537639541603 10
average reward explore: 219.94865749423965
average reward exploit: MDP 0 115.74693989731774 10
average reward exploit: MDP 0 172.26836811918642 10
average reward exploit: MDP 0 101.77503800421457 10
average reward exploit: MDP 0 94.49013529010008 10
average reward exploit: MDP 0 118.88314779114975 10
average reward exploit: MDP 0 69.89807163396986 10
average reward exploit: MDP 0 108.63

average reward exploit: MDP 3 175.64342362732788 10
average reward exploit: MDP 3 189.71256946389977 10
average reward exploit: MDP 3 183.59980161058823 10
average reward exploit: MDP 3 180.79753739061817 10
average reward explore: 240.4507926280618
average reward exploit: MDP 4 70.60981526730393 10
average reward exploit: MDP 4 78.73503337853724 10
average reward exploit: MDP 4 115.80491665957042 10
average reward exploit: MDP 4 67.90816600283864 10
average reward exploit: MDP 4 110.20131231801895 10
average reward exploit: MDP 4 147.49535599252994 10
average reward exploit: MDP 4 130.34458035291226 10
average reward exploit: MDP 4 67.0975010727012 10
average reward exploit: MDP 4 68.07888737830127 10
average reward exploit: MDP 4 139.2109752631819 10
average reward explore: 214.3473930573416
average reward exploit: MDP 5 95.47587735701934 10
average reward exploit: MDP 5 123.83864238450994 10
average reward exploit: MDP 5 111.54922764012267 10
average reward exploit: MDP 5 98.0348278

average reward exploit: MDP 8 145.34402385245943 10
average reward exploit: MDP 8 190.84276150431862 10
average reward exploit: MDP 8 235.59666259961742 10
average reward exploit: MDP 8 195.55430781912645 10
average reward exploit: MDP 8 180.85661914090664 10
average reward exploit: MDP 8 245.35108741347887 10
average reward exploit: MDP 8 203.6179159621933 10
average reward explore: 204.52296880867715
average reward exploit: MDP 9 126.73519198726278 10
average reward exploit: MDP 9 215.78239653925465 10
average reward exploit: MDP 9 141.32840627711042 10
average reward exploit: MDP 9 188.55808594155897 10
average reward exploit: MDP 9 164.1310504318073 10
average reward exploit: MDP 9 232.07305938373938 10
average reward exploit: MDP 9 201.5926408066244 10
average reward exploit: MDP 9 123.38254966912879 10
average reward exploit: MDP 9 206.65259014140398 10
average reward exploit: MDP 9 171.78546066549353 10
average reward explore: 227.01865358797113
average reward exploit: MDP 0 278

average reward exploit: MDP 3 237.3366905107026 10
average reward exploit: MDP 3 230.05758199604435 10
average reward exploit: MDP 3 244.6579756283323 10
average reward exploit: MDP 3 262.4238356428053 10
average reward exploit: MDP 3 267.85586298299074 10
average reward exploit: MDP 3 266.2209805380169 10
average reward exploit: MDP 3 245.39945284545075 10
average reward exploit: MDP 3 218.68345662882135 10
average reward exploit: MDP 3 187.8791306832564 10
average reward exploit: MDP 3 258.0721327269449 10
average reward explore: 238.537646509752
average reward exploit: MDP 4 220.64668294394323 10
average reward exploit: MDP 4 174.10406757079565 10
average reward exploit: MDP 4 141.58611381358307 10
average reward exploit: MDP 4 294.4182663023572 10
average reward exploit: MDP 4 265.3071052461096 10
average reward exploit: MDP 4 261.8079908694201 10
average reward exploit: MDP 4 242.3493096939131 10
average reward exploit: MDP 4 211.11353373170203 10
average reward exploit: MDP 4 134

average reward exploit: MDP 7 182.38693261245015 10
average reward exploit: MDP 7 158.2813049675694 10
average reward explore: 247.99293693515884
average reward exploit: MDP 8 159.76572243220252 10
average reward exploit: MDP 8 201.4128282091593 10
average reward exploit: MDP 8 137.12078473128068 10
average reward exploit: MDP 8 93.95329181280081 10
average reward exploit: MDP 8 138.32991021446438 10
average reward exploit: MDP 8 169.3717062831859 10
average reward exploit: MDP 8 140.4164881446207 10
average reward exploit: MDP 8 181.85074302816014 10
average reward exploit: MDP 8 85.71477404962684 10
average reward exploit: MDP 8 172.55314057601046 10
average reward explore: 191.46970945959376
average reward exploit: MDP 9 189.35575120794698 10
average reward exploit: MDP 9 200.31599861341607 10
average reward exploit: MDP 9 137.53110101296457 10
average reward exploit: MDP 9 130.3429591437815 10
average reward exploit: MDP 9 139.94909544450496 10
average reward exploit: MDP 9 126.515

average reward exploit: MDP 2 103.96970396746921 10
average reward exploit: MDP 2 55.35682537160078 10
average reward exploit: MDP 2 61.15205558144969 10
average reward exploit: MDP 2 100.39056915055605 10
average reward exploit: MDP 2 60.7122645224557 10
average reward explore: 244.28660248945465
average reward exploit: MDP 3 147.08285380413147 10
average reward exploit: MDP 3 59.038785126453476 10
average reward exploit: MDP 3 70.66422523157536 10
average reward exploit: MDP 3 98.79573104254813 10
average reward exploit: MDP 3 90.85297666635527 10
average reward exploit: MDP 3 59.18400359458276 10
average reward exploit: MDP 3 140.9555364865302 10
average reward exploit: MDP 3 76.40269958640796 10
average reward exploit: MDP 3 143.38397462107997 10
average reward exploit: MDP 3 65.99283607902085 10
average reward explore: 255.57334197271751
average reward exploit: MDP 4 132.8921447172412 10
average reward exploit: MDP 4 94.76348180333048 10
average reward exploit: MDP 4 79.5603407010

average reward exploit: MDP 7 122.41911868868583 10
average reward exploit: MDP 7 130.04002203125285 10
average reward exploit: MDP 7 97.11661489380609 10
average reward exploit: MDP 7 101.15030753610594 10
average reward exploit: MDP 7 173.51631817853485 10
average reward exploit: MDP 7 126.70213537495627 10
average reward exploit: MDP 7 87.17671039779239 10
average reward exploit: MDP 7 76.3157595154398 10
average reward explore: 189.94955588372105
average reward exploit: MDP 8 118.52014260713013 10
average reward exploit: MDP 8 122.13542386979766 10
average reward exploit: MDP 8 84.92650403929149 10
average reward exploit: MDP 8 171.58226221337526 10
average reward exploit: MDP 8 118.05054801446985 10
average reward exploit: MDP 8 245.92485258928576 10
average reward exploit: MDP 8 143.39289529717462 10
average reward exploit: MDP 8 118.22304908092758 10
average reward exploit: MDP 8 98.84000254690045 10
average reward exploit: MDP 8 84.14630875711836 10
average reward explore: 203.

average reward explore: 232.25440005451264
average reward exploit: MDP 2 61.13569282872872 10
average reward exploit: MDP 2 125.71351667813383 10
average reward exploit: MDP 2 54.33092110339972 10
average reward exploit: MDP 2 67.13600671819646 10
average reward exploit: MDP 2 80.79857919993925 10
average reward exploit: MDP 2 64.6141161336447 10
average reward exploit: MDP 2 69.31124227087413 10
average reward exploit: MDP 2 60.509017434435464 10
average reward exploit: MDP 2 54.307728415185906 10
average reward exploit: MDP 2 98.25897857557325 10
average reward explore: 210.03233680548448
average reward exploit: MDP 3 233.9359656868446 10
average reward exploit: MDP 3 226.4557485313067 10
average reward exploit: MDP 3 245.31433939775266 10
average reward exploit: MDP 3 230.7113383658102 10
average reward exploit: MDP 3 170.75977736990413 10
average reward exploit: MDP 3 229.51211204723796 10
average reward exploit: MDP 3 243.97920110529077 10
average reward exploit: MDP 3 295.0425169

average reward exploit: MDP 6 239.76450265589557 10
average reward exploit: MDP 6 250.71058274677927 10
average reward exploit: MDP 6 214.1256218233879 10
average reward explore: 220.00473087968743
average reward exploit: MDP 7 249.2473889396988 10
average reward exploit: MDP 7 229.10986318495867 10
average reward exploit: MDP 7 220.78736547606937 10
average reward exploit: MDP 7 310.5205185088603 10
average reward exploit: MDP 7 162.55402336459838 10
average reward exploit: MDP 7 252.9645950952582 10
average reward exploit: MDP 7 241.71711237982717 10
average reward exploit: MDP 7 225.30406298095585 10
average reward exploit: MDP 7 181.41862898310438 10
average reward exploit: MDP 7 267.15191991277663 10
average reward explore: 211.12959998172556
average reward exploit: MDP 8 236.00936645143125 10
average reward exploit: MDP 8 197.43350650210166 10
average reward exploit: MDP 8 199.40412375467733 10
average reward exploit: MDP 8 253.87629037169228 10
average reward exploit: MDP 8 199.

average reward exploit: MDP 1 250.76123780936217 10
average reward exploit: MDP 1 259.75548319614035 10
average reward exploit: MDP 1 259.8415119888595 10
average reward exploit: MDP 1 252.1073171479189 10
average reward exploit: MDP 1 264.06406258582194 10
average reward exploit: MDP 1 259.2085598126228 10
average reward explore: 199.41817125766278
average reward exploit: MDP 2 285.28091882518027 10
average reward exploit: MDP 2 274.8423216583271 10
average reward exploit: MDP 2 279.1135125777361 10
average reward exploit: MDP 2 287.6335518804951 10
average reward exploit: MDP 2 247.5353770994554 10
average reward exploit: MDP 2 236.76006806016252 10
average reward exploit: MDP 2 261.15913424057334 10
average reward exploit: MDP 2 296.5501316419292 10
average reward exploit: MDP 2 281.80668844670646 10
average reward exploit: MDP 2 295.4425571263847 10
average reward explore: 259.7096880828171
average reward exploit: MDP 3 60.99221446951859 10
average reward exploit: MDP 3 146.5114566

average reward exploit: MDP 6 248.4055723068459 10
average reward exploit: MDP 6 271.24846898686394 10
average reward exploit: MDP 6 258.12092515444823 10
average reward exploit: MDP 6 249.75357620126653 10
average reward exploit: MDP 6 278.5950731742888 10
average reward exploit: MDP 6 287.17311128476734 10
average reward exploit: MDP 6 214.9921781628604 10
average reward exploit: MDP 6 255.44585443273155 10
average reward exploit: MDP 6 272.1144578493642 10
average reward explore: 221.2566414235539
average reward exploit: MDP 7 243.9549482590325 10
average reward exploit: MDP 7 253.41735482880458 10
average reward exploit: MDP 7 222.7947233035033 10
average reward exploit: MDP 7 191.02833432186642 10
average reward exploit: MDP 7 264.75084640581264 10
average reward exploit: MDP 7 243.40878690384184 10
average reward exploit: MDP 7 215.19323457356592 10
average reward exploit: MDP 7 254.29145644714959 10
average reward exploit: MDP 7 214.4521454946248 10
average reward exploit: MDP 7

average reward exploit: MDP 0 265.20547273589784 10
average reward explore: 250.599344845158
average reward exploit: MDP 1 278.5386448610267 10
average reward exploit: MDP 1 254.27899315353085 10
average reward exploit: MDP 1 280.59079957705814 10
average reward exploit: MDP 1 266.62791609694983 10
average reward exploit: MDP 1 262.5455076021609 10
average reward exploit: MDP 1 266.12805681453267 10
average reward exploit: MDP 1 265.10047735894335 10
average reward exploit: MDP 1 257.06548213700336 10
average reward exploit: MDP 1 299.3225839290204 10
average reward exploit: MDP 1 275.5423672451952 10
average reward explore: 253.2256148521357
average reward exploit: MDP 2 251.22806457934243 10
average reward exploit: MDP 2 248.39599820079607 10
average reward exploit: MDP 2 262.653726363436 10
average reward exploit: MDP 2 259.44050775798354 10
average reward exploit: MDP 2 255.5355301209009 10
average reward exploit: MDP 2 248.9636578364174 10
average reward exploit: MDP 2 248.6146728

average reward exploit: MDP 5 234.03820736878146 10
average reward exploit: MDP 5 240.3657938770753 10
average reward exploit: MDP 5 236.80705337867442 10
average reward exploit: MDP 5 206.47496993366408 10
average reward explore: 304.8778431016802
average reward exploit: MDP 6 246.79308569015112 10
average reward exploit: MDP 6 258.3037845908665 10
average reward exploit: MDP 6 244.51586473499884 10
average reward exploit: MDP 6 245.85923040118783 10
average reward exploit: MDP 6 236.37823741362553 10
average reward exploit: MDP 6 251.30874117628582 10
average reward exploit: MDP 6 253.15882192896987 10
average reward exploit: MDP 6 263.6819478211088 10
average reward exploit: MDP 6 251.65520778264528 10
average reward exploit: MDP 6 250.69222294534657 10
average reward explore: 210.16209856812839
average reward exploit: MDP 7 232.3147463748091 10
average reward exploit: MDP 7 280.7764561419103 10
average reward exploit: MDP 7 257.44233732079084 10
average reward exploit: MDP 7 200.27

average reward exploit: MDP 0 257.93159997888824 10
average reward exploit: MDP 0 186.9866141395638 10
average reward exploit: MDP 0 204.99772876161637 10
average reward exploit: MDP 0 282.164497198706 10
average reward exploit: MDP 0 252.6778626677505 10
average reward exploit: MDP 0 199.88610069563543 10
average reward exploit: MDP 0 275.31604369447365 10
average reward explore: 255.33500579631527
average reward exploit: MDP 1 298.0139299640071 10
average reward exploit: MDP 1 249.7098287122732 10
average reward exploit: MDP 1 266.5004768543796 10
average reward exploit: MDP 1 233.41912311767987 10
average reward exploit: MDP 1 267.0062661414266 10
average reward exploit: MDP 1 207.4458205233015 10
average reward exploit: MDP 1 267.71015636741487 10
average reward exploit: MDP 1 276.73825875637465 10
average reward exploit: MDP 1 256.365676024043 10
average reward exploit: MDP 1 236.57041314388886 10
average reward explore: 289.8271137545463
average reward exploit: MDP 2 172.67833547

average reward exploit: MDP 5 264.7053362306641 10
average reward exploit: MDP 5 226.78093906607802 10
average reward exploit: MDP 5 268.99081419852746 10
average reward exploit: MDP 5 196.54295123358492 10
average reward exploit: MDP 5 245.01466513537403 10
average reward exploit: MDP 5 244.69904732782402 10
average reward exploit: MDP 5 211.17746091310104 10
average reward exploit: MDP 5 216.54314323593243 10
average reward exploit: MDP 5 247.92293456032593 10
average reward exploit: MDP 5 237.04610765245462 10
average reward explore: 250.4390382708729
average reward exploit: MDP 6 254.90798157087252 10
average reward exploit: MDP 6 290.03800758538665 10
average reward exploit: MDP 6 296.59941143266184 10
average reward exploit: MDP 6 277.00491847243933 10
average reward exploit: MDP 6 281.17888875053137 10
average reward exploit: MDP 6 291.8009276586944 10
average reward exploit: MDP 6 285.5906380008839 10
average reward exploit: MDP 6 262.40760232303955 10
average reward exploit: M

average reward exploit: MDP 9 250.96049762095868 10
average reward exploit: MDP 9 243.16080206229094 10
average reward explore: 272.03945326389623
average reward exploit: MDP 0 281.00583606244913 10
average reward exploit: MDP 0 269.3450645934834 10
average reward exploit: MDP 0 268.3303020935676 10
average reward exploit: MDP 0 290.6332676013494 10
average reward exploit: MDP 0 296.66614007193635 10
average reward exploit: MDP 0 275.55492828137614 10
average reward exploit: MDP 0 270.49102344089977 10
average reward exploit: MDP 0 270.5485208381722 10
average reward exploit: MDP 0 269.074204576512 10
average reward exploit: MDP 0 281.90440120177675 10
average reward explore: 321.15207457078975
average reward exploit: MDP 1 228.24395705606284 10
average reward exploit: MDP 1 201.4678161929982 10
average reward exploit: MDP 1 189.42160657116284 10
average reward exploit: MDP 1 213.32699079858025 10
average reward exploit: MDP 1 204.0131918354415 10
average reward exploit: MDP 1 144.6242

average reward exploit: MDP 4 298.4987695099174 10
average reward exploit: MDP 4 305.4991372474086 10
average reward exploit: MDP 4 271.9990818106124 10
average reward exploit: MDP 4 261.95056524212094 10
average reward exploit: MDP 4 283.2387557002389 10
average reward explore: 279.48622302630054
average reward exploit: MDP 5 281.411184258311 10
average reward exploit: MDP 5 282.7113707854147 10
average reward exploit: MDP 5 285.89909464001516 10
average reward exploit: MDP 5 252.3396863796956 10
average reward exploit: MDP 5 254.78700862262681 10
average reward exploit: MDP 5 254.56225498962803 10
average reward exploit: MDP 5 255.63405394653455 10
average reward exploit: MDP 5 286.7455570222676 10
average reward exploit: MDP 5 260.7695189599684 10
average reward exploit: MDP 5 253.3927029398407 10
average reward explore: 290.8279390875495
average reward exploit: MDP 6 94.04274834349327 10
average reward exploit: MDP 6 234.5239884140276 10
average reward exploit: MDP 6 195.4415184776

average reward exploit: MDP 9 136.53988150755885 10
average reward exploit: MDP 9 126.13794669162235 10
average reward exploit: MDP 9 124.70090475938113 10
average reward exploit: MDP 9 122.6116061796345 10
average reward exploit: MDP 9 123.87801535570233 10
average reward exploit: MDP 9 91.51293444724612 10
average reward exploit: MDP 9 111.36381975797192 10
average reward exploit: MDP 9 111.67014221682739 10
average reward explore: 305.47971460272555
average reward exploit: MDP 0 261.38838913653046 10
average reward exploit: MDP 0 278.5831308648504 10
average reward exploit: MDP 0 264.15955530364715 10
average reward exploit: MDP 0 269.9642837369018 10
average reward exploit: MDP 0 253.0708383181126 10
average reward exploit: MDP 0 280.2092453407992 10
average reward exploit: MDP 0 274.40261918768766 10
average reward exploit: MDP 0 256.245437584406 10
average reward exploit: MDP 0 275.41541241077715 10
average reward exploit: MDP 0 275.73170749062626 10
average reward explore: 244.2

average reward explore: 283.40375910036875
average reward exploit: MDP 4 239.9176247914284 10
average reward exploit: MDP 4 253.96780576842775 10
average reward exploit: MDP 4 234.94765625975782 10
average reward exploit: MDP 4 257.6157803506913 10
average reward exploit: MDP 4 257.0720361049719 10
average reward exploit: MDP 4 276.1633975240305 10
average reward exploit: MDP 4 258.4293984421482 10
average reward exploit: MDP 4 266.859385602732 10
average reward exploit: MDP 4 249.05402743512872 10
average reward exploit: MDP 4 253.24080763705973 10
average reward explore: 325.3827721832848
average reward exploit: MDP 5 193.9603709761413 10
average reward exploit: MDP 5 175.43740268942034 10
average reward exploit: MDP 5 180.81687606347006 10
average reward exploit: MDP 5 241.50549721549237 10
average reward exploit: MDP 5 256.1666317645619 10
average reward exploit: MDP 5 154.2629678297114 10
average reward exploit: MDP 5 149.64270913888873 10
average reward exploit: MDP 5 198.4481424

average reward exploit: MDP 8 247.13645820657962 10
average reward exploit: MDP 8 277.89794441398703 10
average reward exploit: MDP 8 213.8619840816379 10
average reward explore: 287.84557729154443
average reward exploit: MDP 9 242.2883534679212 10
average reward exploit: MDP 9 198.34522489815396 10
average reward exploit: MDP 9 223.8917900065244 10
average reward exploit: MDP 9 218.7472796562807 10
average reward exploit: MDP 9 234.94087925791905 10
average reward exploit: MDP 9 230.7599962846936 10
average reward exploit: MDP 9 209.9894686013976 10
average reward exploit: MDP 9 230.97536144404268 10
average reward exploit: MDP 9 214.1167775638984 10
average reward exploit: MDP 9 212.63559038032955 10
average reward explore: 283.0050464763858
average reward exploit: MDP 0 247.47335634721543 10
average reward exploit: MDP 0 263.2574850888388 10
average reward exploit: MDP 0 259.70216803467264 10
average reward exploit: MDP 0 225.16162668067346 10
average reward exploit: MDP 0 227.75216

average reward exploit: MDP 3 306.58152428984704 10
average reward exploit: MDP 3 273.8394160139393 10
average reward exploit: MDP 3 294.3653083934886 10
average reward exploit: MDP 3 309.6169496865853 10
average reward exploit: MDP 3 289.9507003832294 10
average reward exploit: MDP 3 277.3162709836372 10
average reward explore: 303.71828197697187
average reward exploit: MDP 4 258.29699458054165 10
average reward exploit: MDP 4 267.2799468051563 10
average reward exploit: MDP 4 254.69141281413403 10
average reward exploit: MDP 4 294.3943188038292 10
average reward exploit: MDP 4 264.4002714520025 10
average reward exploit: MDP 4 255.08391079738976 10
average reward exploit: MDP 4 260.9953346748413 10
average reward exploit: MDP 4 286.6988364799517 10
average reward exploit: MDP 4 253.45549646419803 10
average reward exploit: MDP 4 284.09193663312016 10
average reward explore: 307.1366681008859
average reward exploit: MDP 5 270.12647210595196 10
average reward exploit: MDP 5 250.6858025

average reward exploit: MDP 8 237.63953247766312 10
average reward exploit: MDP 8 270.9007544568587 10
average reward exploit: MDP 8 282.39315545477155 10
average reward exploit: MDP 8 226.96309237796822 10
average reward exploit: MDP 8 266.57584895327034 10
average reward exploit: MDP 8 280.40954400289206 10
average reward exploit: MDP 8 261.16777249750254 10
average reward exploit: MDP 8 241.63906234131758 10
average reward exploit: MDP 8 228.1094018401359 10
average reward explore: 341.17613418011524
average reward exploit: MDP 9 193.97226094979385 10
average reward exploit: MDP 9 234.0246892740724 10
average reward exploit: MDP 9 235.19279497350982 10
average reward exploit: MDP 9 262.950979546237 10
average reward exploit: MDP 9 231.05449478384313 10
average reward exploit: MDP 9 231.45739811278355 10
average reward exploit: MDP 9 165.65638310134452 10
average reward exploit: MDP 9 244.51520516607576 10
average reward exploit: MDP 9 207.37642628549852 10
average reward exploit: MD

average reward exploit: MDP 2 77.18499587874726 10
average reward explore: 275.67544022977916
average reward exploit: MDP 3 293.8058573081129 10
average reward exploit: MDP 3 280.9489820549695 10
average reward exploit: MDP 3 224.9717295197394 10
average reward exploit: MDP 3 275.81455874732364 10
average reward exploit: MDP 3 266.17450975352693 10
average reward exploit: MDP 3 286.02117002094775 10
average reward exploit: MDP 3 234.83185245679118 10
average reward exploit: MDP 3 278.46426143011996 10
average reward exploit: MDP 3 273.1143196287041 10
average reward exploit: MDP 3 234.49909256259266 10
average reward explore: 323.46776509578365
average reward exploit: MDP 4 282.73465976343505 10
average reward exploit: MDP 4 292.53771734165946 10
average reward exploit: MDP 4 271.7293143899099 10
average reward exploit: MDP 4 294.7615939998308 10
average reward exploit: MDP 4 251.458641988921 10
average reward exploit: MDP 4 269.022204089348 10
average reward exploit: MDP 4 294.0986153

average reward exploit: MDP 7 235.83526935877438 10
average reward exploit: MDP 7 185.02450005783845 10
average reward exploit: MDP 7 273.36826526092864 10
average reward exploit: MDP 7 275.98170764217616 10
average reward explore: 301.94353723599835
average reward exploit: MDP 8 229.1176676232335 10
average reward exploit: MDP 8 256.26208771622817 10
average reward exploit: MDP 8 242.76948552260856 10
average reward exploit: MDP 8 206.47626241270078 10
average reward exploit: MDP 8 167.76586122723515 10
average reward exploit: MDP 8 205.6432940355994 10
average reward exploit: MDP 8 132.83796315131772 10
average reward exploit: MDP 8 265.5283340236675 10
average reward exploit: MDP 8 230.05085029300972 10
average reward exploit: MDP 8 236.95104439942207 10
average reward explore: 356.4841571807269
average reward exploit: MDP 9 232.746606632386 10
average reward exploit: MDP 9 213.06136984001404 10
average reward exploit: MDP 9 131.38418644108575 10
average reward exploit: MDP 9 194.20

average reward exploit: MDP 2 260.5342768291774 10
average reward exploit: MDP 2 186.1308493846145 10
average reward exploit: MDP 2 222.52639133137527 10
average reward exploit: MDP 2 195.73631611183802 10
average reward exploit: MDP 2 236.45605418364934 10
average reward exploit: MDP 2 221.53374998191765 10
average reward exploit: MDP 2 245.20968023157292 10
average reward explore: 352.63529687133786
average reward exploit: MDP 3 221.6056620792064 10
average reward exploit: MDP 3 239.19022287208253 10
average reward exploit: MDP 3 226.54497450988075 10
average reward exploit: MDP 3 236.86471332430924 10
average reward exploit: MDP 3 206.47220866080244 10
average reward exploit: MDP 3 265.6509409791711 10
average reward exploit: MDP 3 195.8665915438929 10
average reward exploit: MDP 3 256.33548990281236 10
average reward exploit: MDP 3 294.9570022237382 10
average reward exploit: MDP 3 272.08633552635257 10
average reward explore: 364.35771418042407
average reward exploit: MDP 4 276.73

average reward exploit: MDP 7 211.5113863896011 10
average reward exploit: MDP 7 244.1522343990279 10
average reward exploit: MDP 7 297.9498164819389 10
average reward exploit: MDP 7 279.24660170709114 10
average reward exploit: MDP 7 239.14786245748263 10
average reward exploit: MDP 7 254.24299348314906 10
average reward exploit: MDP 7 223.11513829883984 10
average reward exploit: MDP 7 281.4101879961542 10
average reward exploit: MDP 7 230.81877842480398 10
average reward exploit: MDP 7 196.68149358569252 10
average reward explore: 303.4331638047209
average reward exploit: MDP 8 118.58637917649972 10
average reward exploit: MDP 8 124.09193806721433 10
average reward exploit: MDP 8 182.62117804633542 10
average reward exploit: MDP 8 214.0788335254643 10
average reward exploit: MDP 8 102.06665971872253 10
average reward exploit: MDP 8 199.23437657991204 10
average reward exploit: MDP 8 96.75613342341907 10
average reward exploit: MDP 8 127.47413155341374 10
average reward exploit: MDP 

average reward exploit: MDP 1 280.2305631413486 10
average reward exploit: MDP 1 274.2579770178849 10
average reward explore: 309.1693850277343
average reward exploit: MDP 2 196.98937544359137 10
average reward exploit: MDP 2 307.5273975004821 10
average reward exploit: MDP 2 282.7859409340732 10
average reward exploit: MDP 2 226.37706878733132 10
average reward exploit: MDP 2 255.2672908473508 10
average reward exploit: MDP 2 293.0366444955551 10
average reward exploit: MDP 2 268.8532896488608 10
average reward exploit: MDP 2 298.1428890777934 10
average reward exploit: MDP 2 274.2938984989789 10
average reward exploit: MDP 2 252.89516163227555 10
average reward explore: 344.6307396475798
average reward exploit: MDP 3 186.12190064312745 10
average reward exploit: MDP 3 130.65419121861913 10
average reward exploit: MDP 3 194.78026549477818 10
average reward exploit: MDP 3 240.04516810197697 10
average reward exploit: MDP 3 162.87071464600234 10
average reward exploit: MDP 3 188.4762040

average reward exploit: MDP 6 56.58996658803119 10
average reward exploit: MDP 6 102.6123600150225 10
average reward exploit: MDP 6 105.54489178180866 10
average reward exploit: MDP 6 82.23541826196217 10
average reward exploit: MDP 6 22.620095825911953 10
average reward explore: 298.8750563637639
average reward exploit: MDP 7 226.60822230648017 10
average reward exploit: MDP 7 242.56922219318344 10
average reward exploit: MDP 7 234.70088001828012 10
average reward exploit: MDP 7 242.36928621881916 10
average reward exploit: MDP 7 217.8011179870025 10
average reward exploit: MDP 7 261.7598042071829 10
average reward exploit: MDP 7 235.11808367456314 10
average reward exploit: MDP 7 186.01747994800664 10
average reward exploit: MDP 7 222.19730979442016 10
average reward exploit: MDP 7 225.9640546342799 10
average reward explore: 321.5072521126992
average reward exploit: MDP 8 208.45888704836366 10
average reward exploit: MDP 8 239.28693083517118 10
average reward exploit: MDP 8 242.9063

average reward exploit: MDP 1 144.26506069934516 10
average reward exploit: MDP 1 99.920773041427 10
average reward exploit: MDP 1 138.38628325335011 10
average reward exploit: MDP 1 153.2891922669224 10
average reward exploit: MDP 1 222.050156050626 10
average reward exploit: MDP 1 159.52480470957897 10
average reward exploit: MDP 1 143.90063334985012 10
average reward exploit: MDP 1 156.61907489434986 10
average reward explore: 378.22950785852424
average reward exploit: MDP 2 217.49384864798853 10
average reward exploit: MDP 2 232.7648855260411 10
average reward exploit: MDP 2 276.784105921774 10
average reward exploit: MDP 2 214.5158308555554 10
average reward exploit: MDP 2 225.14694641314455 10
average reward exploit: MDP 2 243.4481816551237 10
average reward exploit: MDP 2 226.35908468485573 10
average reward exploit: MDP 2 234.24412679181393 10
average reward exploit: MDP 2 169.6418405832441 10
average reward exploit: MDP 2 201.28807213126748 10
average reward explore: 311.70351

average reward explore: 360.33266992890015
average reward exploit: MDP 6 274.85304789784465 10
average reward exploit: MDP 6 253.8450130268055 10
average reward exploit: MDP 6 245.20426960126946 10
average reward exploit: MDP 6 313.4512101544543 10
average reward exploit: MDP 6 240.93472017820199 10
average reward exploit: MDP 6 248.9302995059912 10
average reward exploit: MDP 6 269.1033757414686 10
average reward exploit: MDP 6 225.89234929982157 10
average reward exploit: MDP 6 233.04189097941835 10
average reward exploit: MDP 6 272.6837917803417 10
average reward explore: 358.39288650098973
average reward exploit: MDP 7 45.47059736371652 10
average reward exploit: MDP 7 68.21974001950625 10
average reward exploit: MDP 7 13.77862108791039 10
average reward exploit: MDP 7 60.27236937183052 10
average reward exploit: MDP 7 155.10419483997774 10
average reward exploit: MDP 7 -10.321814962524122 10
average reward exploit: MDP 7 42.33141976048016 10
average reward exploit: MDP 7 50.700677

average reward exploit: MDP 0 175.99523007769628 10
average reward exploit: MDP 0 104.80894730711444 10
average reward exploit: MDP 0 130.52496163162095 10
average reward explore: 338.84910077701
average reward exploit: MDP 1 202.61738825684378 10
average reward exploit: MDP 1 199.0537705088936 10
average reward exploit: MDP 1 150.35650805270691 10
average reward exploit: MDP 1 134.26434489737213 10
average reward exploit: MDP 1 217.3415903106023 10
average reward exploit: MDP 1 113.83653256444181 10
average reward exploit: MDP 1 104.10922246803133 10
average reward exploit: MDP 1 182.83623356817847 10
average reward exploit: MDP 1 111.52383072935467 10
average reward exploit: MDP 1 186.73346994371 10
average reward explore: 375.1623862615843
average reward exploit: MDP 2 170.30433091195414 10
average reward exploit: MDP 2 142.51635976118092 10
average reward exploit: MDP 2 136.84684113021348 10
average reward exploit: MDP 2 166.51983444107265 10
average reward exploit: MDP 2 91.650195

average reward exploit: MDP 5 101.27696764198302 10
average reward exploit: MDP 5 127.19258586924919 10
average reward exploit: MDP 5 110.029191354424 10
average reward exploit: MDP 5 104.49528110244505 10
average reward exploit: MDP 5 54.543757055949435 10
average reward exploit: MDP 5 76.31332761942195 10
average reward explore: 351.73331272825874
average reward exploit: MDP 6 95.5080167566501 10
average reward exploit: MDP 6 104.44868835484158 10
average reward exploit: MDP 6 44.51404420649654 10
average reward exploit: MDP 6 164.46769119662707 10
average reward exploit: MDP 6 176.56088225273567 10
average reward exploit: MDP 6 153.36050999741792 10
average reward exploit: MDP 6 131.33624055661926 10
average reward exploit: MDP 6 200.4249528076569 10
average reward exploit: MDP 6 93.18055097887984 10
average reward exploit: MDP 6 125.14255641339045 10
average reward explore: 356.23980187294546
average reward exploit: MDP 7 137.52410158586576 10
average reward exploit: MDP 7 89.26273

average reward exploit: MDP 0 164.44640214781776 10
average reward exploit: MDP 0 217.08105528867455 10
average reward exploit: MDP 0 183.02815752060073 10
average reward exploit: MDP 0 156.24526294370796 10
average reward exploit: MDP 0 227.57899303529206 10
average reward exploit: MDP 0 204.43862258131307 10
average reward exploit: MDP 0 164.12155619391086 10
average reward exploit: MDP 0 188.81885022989957 10
average reward exploit: MDP 0 195.92011776198882 10
average reward explore: 364.12357753967433
average reward exploit: MDP 1 103.00773474200916 10
average reward exploit: MDP 1 71.4819291203773 10
average reward exploit: MDP 1 57.59192677710094 10
average reward exploit: MDP 1 80.03761746377833 10
average reward exploit: MDP 1 84.23321832072043 10
average reward exploit: MDP 1 158.227269310729 10
average reward exploit: MDP 1 46.15466669746952 10
average reward exploit: MDP 1 24.625215076616378 10
average reward exploit: MDP 1 89.55003647786965 10
average reward exploit: MDP 1 

average reward exploit: MDP 4 241.71643944213307 10
average reward explore: 337.5512932416483
average reward exploit: MDP 5 211.27158268632712 10
average reward exploit: MDP 5 189.4291899463994 10
average reward exploit: MDP 5 203.90995366431167 10
average reward exploit: MDP 5 190.06915096604126 10
average reward exploit: MDP 5 201.83779010129348 10
average reward exploit: MDP 5 179.9475754536292 10
average reward exploit: MDP 5 165.994354607558 10
average reward exploit: MDP 5 246.8769762316328 10
average reward exploit: MDP 5 231.30773380919473 10
average reward exploit: MDP 5 205.47976166146677 10
average reward explore: 360.2179110130951
average reward exploit: MDP 6 131.2977251654779 10
average reward exploit: MDP 6 197.7402775485811 10
average reward exploit: MDP 6 188.34675077007427 10
average reward exploit: MDP 6 255.30213822258526 10
average reward exploit: MDP 6 144.80121591218335 10
average reward exploit: MDP 6 186.13761235525362 10
average reward exploit: MDP 6 268.26644

average reward exploit: MDP 9 245.35552125277528 10
average reward exploit: MDP 9 195.71486221749134 10
average reward exploit: MDP 9 259.38443095114064 10
average reward exploit: MDP 9 239.21300675522122 10
average reward explore: 314.9871321849096
average reward exploit: MDP 0 280.5271733364699 10
average reward exploit: MDP 0 251.41097457988008 10
average reward exploit: MDP 0 264.6894165292415 10
average reward exploit: MDP 0 283.42175323481746 10
average reward exploit: MDP 0 288.551237218854 10
average reward exploit: MDP 0 284.0334386507915 10
average reward exploit: MDP 0 275.25356182733384 10
average reward exploit: MDP 0 262.7205575098365 10
average reward exploit: MDP 0 283.7938754757497 10
average reward exploit: MDP 0 262.682544942664 10
average reward explore: 356.5252101764556
average reward exploit: MDP 1 172.86855614406164 10
average reward exploit: MDP 1 226.36046294779027 10
average reward exploit: MDP 1 232.1582109127889 10
average reward exploit: MDP 1 241.28796283

average reward exploit: MDP 4 271.7588593100644 10
average reward exploit: MDP 4 239.23750817284795 10
average reward exploit: MDP 4 293.3882177228524 10
average reward exploit: MDP 4 295.8854583494779 10
average reward exploit: MDP 4 237.82465205105018 10
average reward exploit: MDP 4 277.3336947165768 10
average reward exploit: MDP 4 272.9448232925319 10
average reward explore: 366.70825252006364
average reward exploit: MDP 5 195.84349413800558 10
average reward exploit: MDP 5 193.34635847474328 10
average reward exploit: MDP 5 250.29929297345197 10
average reward exploit: MDP 5 206.41637095618776 10
average reward exploit: MDP 5 242.2237023412848 10
average reward exploit: MDP 5 247.4160705319241 10
average reward exploit: MDP 5 193.97725252066428 10
average reward exploit: MDP 5 250.79305098083856 10
average reward exploit: MDP 5 262.76754948775687 10
average reward exploit: MDP 5 260.7185506344556 10
average reward explore: 348.4002826883645
average reward exploit: MDP 6 290.73496

average reward exploit: MDP 9 291.93756926734557 10
average reward exploit: MDP 9 254.38864684018085 10
average reward exploit: MDP 9 261.26923122548027 10
average reward exploit: MDP 9 309.1152745938078 10
average reward exploit: MDP 9 246.88358464418735 10
average reward exploit: MDP 9 279.6324527640209 10
average reward exploit: MDP 9 275.592240782207 10
average reward exploit: MDP 9 220.37035442551723 10
average reward exploit: MDP 9 261.68348409229304 10
average reward exploit: MDP 9 205.72649877594102 10
average reward explore: 367.86387843881755
average reward exploit: MDP 0 237.77280336058234 10
average reward exploit: MDP 0 260.7407047692254 10
average reward exploit: MDP 0 266.7325759033051 10
average reward exploit: MDP 0 319.27384699744164 10
average reward exploit: MDP 0 221.48737310805055 10
average reward exploit: MDP 0 263.96654341343776 10
average reward exploit: MDP 0 283.7858304611218 10
average reward exploit: MDP 0 245.75859130720863 10
average reward exploit: MDP 

average reward exploit: MDP 3 207.92985534421632 10
average reward exploit: MDP 3 254.32527185866184 10
average reward explore: 326.3502938780979
average reward exploit: MDP 4 218.59180875049645 10
average reward exploit: MDP 4 199.64107417966105 10
average reward exploit: MDP 4 202.6071502157253 10
average reward exploit: MDP 4 228.37509621597536 10
average reward exploit: MDP 4 234.11745364335016 10
average reward exploit: MDP 4 215.56236573107225 10
average reward exploit: MDP 4 216.67765659970223 10
average reward exploit: MDP 4 213.45360988959442 10
average reward exploit: MDP 4 209.81049126949975 10
average reward exploit: MDP 4 222.98745305683096 10
average reward explore: 346.479262392715
average reward exploit: MDP 5 264.48956074516457 10
average reward exploit: MDP 5 280.0297398039671 10
average reward exploit: MDP 5 283.64830963530005 10
average reward exploit: MDP 5 227.4233938771516 10
average reward exploit: MDP 5 260.95406219210577 10
average reward exploit: MDP 5 264.61

average reward exploit: MDP 8 241.22957752294866 10
average reward exploit: MDP 8 238.4012961822585 10
average reward exploit: MDP 8 189.5153993887978 10
average reward exploit: MDP 8 244.26535551588591 10
average reward exploit: MDP 8 163.67229167915593 10
average reward explore: 364.5390858355527
average reward exploit: MDP 9 63.465674149237294 10
average reward exploit: MDP 9 31.60839073088335 10
average reward exploit: MDP 9 37.843770618357766 10
average reward exploit: MDP 9 112.14798068040326 10
average reward exploit: MDP 9 75.5186052474299 10
average reward exploit: MDP 9 37.53859848607779 10
average reward exploit: MDP 9 91.0754630309283 10
average reward exploit: MDP 9 32.143309388066385 10
average reward exploit: MDP 9 25.692353738858298 10
average reward exploit: MDP 9 61.19026186219952 10
average reward explore: 360.8006049970313
average reward exploit: MDP 0 111.42468847036926 10
average reward exploit: MDP 0 161.62712467798875 10
average reward exploit: MDP 0 150.4285202

average reward exploit: MDP 3 98.98983841326609 10
average reward exploit: MDP 3 96.77468167390528 10
average reward exploit: MDP 3 30.539748582247643 10
average reward exploit: MDP 3 112.51256008688522 10
average reward exploit: MDP 3 161.09635607916078 10
average reward exploit: MDP 3 92.05646421086135 10
average reward exploit: MDP 3 108.1832915250942 10
average reward exploit: MDP 3 32.65002828915584 10
average reward explore: 360.03824958890084
average reward exploit: MDP 4 36.80574626362041 10
average reward exploit: MDP 4 80.87625752368562 10
average reward exploit: MDP 4 109.57405705851689 10
average reward exploit: MDP 4 92.28955290045872 10
average reward exploit: MDP 4 35.054693748072324 10
average reward exploit: MDP 4 118.4444415493156 10
average reward exploit: MDP 4 73.47244349785377 10
average reward exploit: MDP 4 82.63739405994218 10
average reward exploit: MDP 4 120.08956678206349 10
average reward exploit: MDP 4 98.568550214068 10
average reward explore: 387.8359738

average reward explore: 366.7609324945409
average reward exploit: MDP 8 278.0053245030231 10
average reward exploit: MDP 8 275.3505481507632 10
average reward exploit: MDP 8 279.87631230177385 10
average reward exploit: MDP 8 269.83918335164935 10
average reward exploit: MDP 8 274.5395861852416 10
average reward exploit: MDP 8 268.13261429555706 10
average reward exploit: MDP 8 273.8746650836121 10
average reward exploit: MDP 8 286.67689266434303 10
average reward exploit: MDP 8 279.4104170842267 10
average reward exploit: MDP 8 263.87771784851907 10
average reward explore: 364.2402119652179
average reward exploit: MDP 9 233.2955937800823 10
average reward exploit: MDP 9 133.30119389812543 10
average reward exploit: MDP 9 183.26185656302172 10
average reward exploit: MDP 9 203.62808289116876 10
average reward exploit: MDP 9 184.07488260100575 10
average reward exploit: MDP 9 213.64080148270105 10
average reward exploit: MDP 9 142.28026936312258 10
average reward exploit: MDP 9 143.7226

average reward exploit: MDP 2 290.36455807440063 10
average reward exploit: MDP 2 270.0901770569009 10
average reward exploit: MDP 2 261.94448250531616 10
average reward explore: 400.0529386155113
average reward exploit: MDP 3 269.6245077980963 10
average reward exploit: MDP 3 300.45665508650825 10
average reward exploit: MDP 3 285.38762900152585 10
average reward exploit: MDP 3 299.74212838031076 10
average reward exploit: MDP 3 294.90971890253 10
average reward exploit: MDP 3 297.6153769677654 10
average reward exploit: MDP 3 298.13786764810993 10
average reward exploit: MDP 3 250.24842745903092 10
average reward exploit: MDP 3 303.2610692310172 10
average reward exploit: MDP 3 265.9219411481496 10
average reward explore: 364.22674855586683
average reward exploit: MDP 4 223.66785695221364 10
average reward exploit: MDP 4 260.51813454581077 10
average reward exploit: MDP 4 237.05791131459765 10
average reward exploit: MDP 4 259.92525528159774 10
average reward exploit: MDP 4 247.27890

average reward exploit: MDP 7 201.61817128336207 10
average reward exploit: MDP 7 238.29278230559507 10
average reward exploit: MDP 7 273.64918785820726 10
average reward exploit: MDP 7 216.65581928051944 10
average reward exploit: MDP 7 242.06334051321625 10
average reward exploit: MDP 7 268.13898851788224 10
average reward explore: 410.75658544968394
average reward exploit: MDP 8 221.55867989009374 10
average reward exploit: MDP 8 210.0609744613017 10
average reward exploit: MDP 8 235.27140403571713 10
average reward exploit: MDP 8 179.8945044721702 10
average reward exploit: MDP 8 249.9695799669515 10
average reward exploit: MDP 8 231.53170124637933 10
average reward exploit: MDP 8 243.45689148942023 10
average reward exploit: MDP 8 198.37343735445012 10
average reward exploit: MDP 8 225.1994273308616 10
average reward exploit: MDP 8 217.50664315668192 10
average reward explore: 394.90089632667633
average reward exploit: MDP 9 258.50968224999576 10
average reward exploit: MDP 9 268.

average reward exploit: MDP 2 259.19449570241153 10
average reward exploit: MDP 2 245.90534520111046 10
average reward exploit: MDP 2 262.6047107234248 10
average reward exploit: MDP 2 249.97526522124582 10
average reward exploit: MDP 2 272.9098486173114 10
average reward exploit: MDP 2 265.12931870078756 10
average reward exploit: MDP 2 267.10258556387794 10
average reward exploit: MDP 2 276.169716846667 10
average reward exploit: MDP 2 274.1207286938 10
average reward explore: 364.74773645039033
average reward exploit: MDP 3 207.40056114064518 10
average reward exploit: MDP 3 243.3249809204226 10
average reward exploit: MDP 3 208.68584615712888 10
average reward exploit: MDP 3 248.38098102703552 10
average reward exploit: MDP 3 221.4341476770585 10
average reward exploit: MDP 3 201.4504540899615 10
average reward exploit: MDP 3 239.12358803638662 10
average reward exploit: MDP 3 222.88270141457434 10
average reward exploit: MDP 3 252.99050023531987 10
average reward exploit: MDP 3 18

average reward exploit: MDP 6 245.99225360107593 10
average reward explore: 359.30629183333116
average reward exploit: MDP 7 265.07808092275826 10
average reward exploit: MDP 7 218.66491484446345 10
average reward exploit: MDP 7 286.13891808619104 10
average reward exploit: MDP 7 266.9892662418779 10
average reward exploit: MDP 7 251.34056347250734 10
average reward exploit: MDP 7 285.7093517143891 10
average reward exploit: MDP 7 264.7496172526032 10
average reward exploit: MDP 7 263.58883213914726 10
average reward exploit: MDP 7 280.99577853432766 10
average reward exploit: MDP 7 264.3822084571906 10
average reward explore: 380.7038139713842
average reward exploit: MDP 8 227.8558291821987 10
average reward exploit: MDP 8 245.90712389409555 10
average reward exploit: MDP 8 227.6911132139728 10
average reward exploit: MDP 8 225.83595932641202 10
average reward exploit: MDP 8 294.48651088486685 10
average reward exploit: MDP 8 213.2360327995556 10
average reward exploit: MDP 8 234.2191

average reward exploit: MDP 1 128.56328135357856 10
average reward exploit: MDP 1 151.6677838649444 10
average reward exploit: MDP 1 201.09681781992623 10
average reward exploit: MDP 1 126.86294825809257 10
average reward explore: 365.19521679409246
average reward exploit: MDP 2 291.459517794046 10
average reward exploit: MDP 2 298.57354695901535 10
average reward exploit: MDP 2 314.50357305774344 10
average reward exploit: MDP 2 288.85016844906556 10
average reward exploit: MDP 2 299.26359254362677 10
average reward exploit: MDP 2 289.4583058102977 10
average reward exploit: MDP 2 284.40898355211914 10
average reward exploit: MDP 2 306.99225547113855 10
average reward exploit: MDP 2 291.4806047451483 10
average reward exploit: MDP 2 274.3569470518936 10
average reward explore: 378.92021812322747
average reward exploit: MDP 3 287.41206963769173 10
average reward exploit: MDP 3 309.15710436048687 10
average reward exploit: MDP 3 290.1451784909022 10
average reward exploit: MDP 3 301.036

average reward exploit: MDP 6 157.4562103843935 10
average reward exploit: MDP 6 208.8534299667263 10
average reward exploit: MDP 6 155.43581085444345 10
average reward exploit: MDP 6 273.44606242151633 10
average reward exploit: MDP 6 220.072381134014 10
average reward exploit: MDP 6 154.99387366582798 10
average reward exploit: MDP 6 258.34592830592703 10
average reward explore: 344.71884632629843
average reward exploit: MDP 7 128.13377893292804 10
average reward exploit: MDP 7 161.5437066059964 10
average reward exploit: MDP 7 96.99237824267126 10
average reward exploit: MDP 7 106.497513231214 10
average reward exploit: MDP 7 84.38436743480982 10
average reward exploit: MDP 7 162.25863575225722 10
average reward exploit: MDP 7 151.7844017167798 10
average reward exploit: MDP 7 196.49397511316204 10
average reward exploit: MDP 7 145.22327002121114 10
average reward exploit: MDP 7 153.29487364876226 10
average reward explore: 312.1300315590577
average reward exploit: MDP 8 178.1289771

average reward exploit: MDP 1 280.1686897862096 10
average reward exploit: MDP 1 293.54969428348505 10
average reward exploit: MDP 1 263.2862321189089 10
average reward exploit: MDP 1 287.249709262593 10
average reward exploit: MDP 1 275.8346795192575 10
average reward exploit: MDP 1 296.9829404609259 10
average reward exploit: MDP 1 249.59472938653434 10
average reward exploit: MDP 1 264.69119046659097 10
average reward exploit: MDP 1 238.73708123537327 10
average reward exploit: MDP 1 255.01713334323887 10
average reward explore: 424.376159721446
average reward exploit: MDP 2 273.1683407424743 10
average reward exploit: MDP 2 279.04798065388394 10
average reward exploit: MDP 2 303.28763863689426 10
average reward exploit: MDP 2 309.5846149370307 10
average reward exploit: MDP 2 291.612101095189 10
average reward exploit: MDP 2 247.24075244948185 10
average reward exploit: MDP 2 272.75021236204776 10
average reward exploit: MDP 2 281.4089933262741 10
average reward exploit: MDP 2 287.

average reward exploit: MDP 5 310.379754157765 10
average reward exploit: MDP 5 283.41778791971495 10
average reward explore: 278.36302381670646
average reward exploit: MDP 6 261.3156944937866 10
average reward exploit: MDP 6 258.22374259520785 10
average reward exploit: MDP 6 258.0736367281431 10
average reward exploit: MDP 6 300.2194900961746 10
average reward exploit: MDP 6 274.78071163481275 10
average reward exploit: MDP 6 207.67833516981577 10
average reward exploit: MDP 6 248.7272194812353 10
average reward exploit: MDP 6 271.31417024789323 10
average reward exploit: MDP 6 266.2226568394269 10
average reward exploit: MDP 6 283.93296782980053 10
average reward explore: 356.0976067244639
average reward exploit: MDP 7 306.49341174018497 10
average reward exploit: MDP 7 324.13147556872775 10
average reward exploit: MDP 7 264.80802304945234 10
average reward exploit: MDP 7 233.21706397651286 10
average reward exploit: MDP 7 265.4301142980099 10
average reward exploit: MDP 7 283.87222

average reward exploit: MDP 0 254.361325752833 10
average reward exploit: MDP 0 245.67887052089364 10
average reward exploit: MDP 0 268.72116394107053 10
average reward exploit: MDP 0 251.78188556918366 10
average reward exploit: MDP 0 242.91445557777615 10
average reward explore: 342.33507788449504
average reward exploit: MDP 1 209.80583583745073 10
average reward exploit: MDP 1 222.82101388585897 10
average reward exploit: MDP 1 231.7599352012106 10
average reward exploit: MDP 1 216.8410022282917 10
average reward exploit: MDP 1 193.05636992079042 10
average reward exploit: MDP 1 199.34084817103817 10
average reward exploit: MDP 1 212.1771530073323 10
average reward exploit: MDP 1 237.24387962137524 10
average reward exploit: MDP 1 194.39523383360623 10
average reward exploit: MDP 1 223.36894951837104 10
average reward explore: 332.7999193852489
average reward exploit: MDP 2 270.5733994656964 10
average reward exploit: MDP 2 233.37826281518045 10
average reward exploit: MDP 2 249.676

average reward exploit: MDP 5 340.1555564898169 10
average reward exploit: MDP 5 472.5460214332699 10
average reward exploit: MDP 5 302.3348109371975 10
average reward exploit: MDP 5 334.704364050722 10
average reward exploit: MDP 5 478.2380955803028 10
average reward exploit: MDP 5 185.8029393209643 10
average reward exploit: MDP 5 272.121211892291 10
average reward exploit: MDP 5 240.14028593266517 10
average reward explore: 382.1004834983336
average reward exploit: MDP 6 232.0383574828956 10
average reward exploit: MDP 6 232.3672221472532 10
average reward exploit: MDP 6 187.06080451766192 10
average reward exploit: MDP 6 242.4988784820855 10
average reward exploit: MDP 6 245.926359313996 10
average reward exploit: MDP 6 184.60775568191627 10
average reward exploit: MDP 6 230.57736857196193 10
average reward exploit: MDP 6 241.35491228055918 10
average reward exploit: MDP 6 218.89653043945503 10
average reward exploit: MDP 6 219.76030709859077 10
average reward explore: 400.07541997

average reward explore: 412.60812629081255
average reward exploit: MDP 0 239.9328188108942 10
average reward exploit: MDP 0 310.4881462391666 10
average reward exploit: MDP 0 287.46127313840736 10
average reward exploit: MDP 0 239.43475318289114 10
average reward exploit: MDP 0 274.4907196482031 10
average reward exploit: MDP 0 279.2859119546057 10
average reward exploit: MDP 0 245.0254266726791 10
average reward exploit: MDP 0 264.79746699155453 10
average reward exploit: MDP 0 257.4276910878241 10
average reward exploit: MDP 0 263.51535564836973 10
average reward explore: 369.8868849892665
average reward exploit: MDP 1 219.8491158122169 10
average reward exploit: MDP 1 225.5073461679263 10
average reward exploit: MDP 1 219.07974985806567 10
average reward exploit: MDP 1 242.68762004663054 10
average reward exploit: MDP 1 190.94723580173414 10
average reward exploit: MDP 1 238.78262596028154 10
average reward exploit: MDP 1 235.50569936709263 10
average reward exploit: MDP 1 243.20692

average reward exploit: MDP 4 218.60865987255073 10
average reward exploit: MDP 4 234.80414189739213 10
average reward exploit: MDP 4 239.39127033760246 10
average reward explore: 324.97102062401154
average reward exploit: MDP 5 213.91682357907584 10
average reward exploit: MDP 5 132.97870088823035 10
average reward exploit: MDP 5 188.0956475809457 10
average reward exploit: MDP 5 200.85401719197006 10
average reward exploit: MDP 5 202.51007121271857 10
average reward exploit: MDP 5 145.50792700928076 10
average reward exploit: MDP 5 239.0446285667099 10
average reward exploit: MDP 5 163.0392479841813 10
average reward exploit: MDP 5 221.12869409075606 10
average reward exploit: MDP 5 154.125418263704 10
average reward explore: 320.63510716227705
average reward exploit: MDP 6 179.91100458792863 10
average reward exploit: MDP 6 245.95758836814804 10
average reward exploit: MDP 6 221.90927128281356 10
average reward exploit: MDP 6 222.02591533393678 10
average reward exploit: MDP 6 217.4

average reward exploit: MDP 9 224.38467050119826 10
average reward exploit: MDP 9 255.8080813438168 10
average reward exploit: MDP 9 245.42760633198003 10
average reward exploit: MDP 9 197.8580062804174 10
average reward exploit: MDP 9 179.31663261162146 10
average reward exploit: MDP 9 236.96937103748488 10
average reward explore: 336.185378608257
average reward exploit: MDP 0 178.65292066513524 10
average reward exploit: MDP 0 212.3744294779273 10
average reward exploit: MDP 0 205.8755429505132 10
average reward exploit: MDP 0 204.06096710092447 10
average reward exploit: MDP 0 168.78214032559316 10
average reward exploit: MDP 0 194.0211061534156 10
average reward exploit: MDP 0 178.56084830264166 10
average reward exploit: MDP 0 187.65887827683633 10
average reward exploit: MDP 0 225.85877135114933 10
average reward exploit: MDP 0 175.0930141085141 10
average reward explore: 331.0447180585684
average reward exploit: MDP 1 230.05899217083487 10
average reward exploit: MDP 1 222.37230

average reward exploit: MDP 4 288.02832869882684 10
average reward exploit: MDP 4 283.473084596278 10
average reward exploit: MDP 4 263.3959484749845 10
average reward exploit: MDP 4 277.9403570320069 10
average reward exploit: MDP 4 278.88969105341096 10
average reward exploit: MDP 4 256.4920081806218 10
average reward exploit: MDP 4 278.93908781624543 10
average reward exploit: MDP 4 260.3502116034391 10
average reward exploit: MDP 4 264.439652492145 10
average reward explore: 390.56716271633286
average reward exploit: MDP 5 279.7182466777917 10
average reward exploit: MDP 5 239.81016254436318 10
average reward exploit: MDP 5 257.23921168832186 10
average reward exploit: MDP 5 261.44477965706193 10
average reward exploit: MDP 5 259.3922720618299 10
average reward exploit: MDP 5 291.0217087554928 10
average reward exploit: MDP 5 265.31026990865087 10
average reward exploit: MDP 5 280.765289009113 10
average reward exploit: MDP 5 287.21593511433593 10
average reward exploit: MDP 5 270.

average reward exploit: MDP 8 231.12097459424612 10
average reward explore: 339.30019314227155
average reward exploit: MDP 9 199.22986250326616 10
average reward exploit: MDP 9 207.27880381854612 10
average reward exploit: MDP 9 169.1420919129094 10
average reward exploit: MDP 9 232.4203395849998 10
average reward exploit: MDP 9 265.1089211939187 10
average reward exploit: MDP 9 242.59179938094775 10
average reward exploit: MDP 9 238.4405797443144 10
average reward exploit: MDP 9 185.96515387946968 10
average reward exploit: MDP 9 200.20646475376103 10
average reward exploit: MDP 9 229.76118449928262 10
average reward explore: 337.84900762797395
average reward exploit: MDP 0 263.6682824579627 10
average reward exploit: MDP 0 241.9823380179186 10
average reward exploit: MDP 0 245.3269734405266 10
average reward exploit: MDP 0 274.9564829742589 10
average reward exploit: MDP 0 261.6770824621606 10
average reward exploit: MDP 0 249.70767661195896 10
average reward exploit: MDP 0 236.45786

average reward exploit: MDP 3 189.00877846781856 10
average reward exploit: MDP 3 243.7273789035068 10
average reward exploit: MDP 3 219.68960217882818 10
average reward exploit: MDP 3 238.05597774171284 10
average reward explore: 395.06174407715406
average reward exploit: MDP 4 268.50026661798904 10
average reward exploit: MDP 4 268.7782132396334 10
average reward exploit: MDP 4 278.0313724512643 10
average reward exploit: MDP 4 286.7965849561134 10
average reward exploit: MDP 4 285.3353262258694 10
average reward exploit: MDP 4 268.48028701335204 10
average reward exploit: MDP 4 257.22810530404433 10
average reward exploit: MDP 4 243.8746136471368 10
average reward exploit: MDP 4 228.7617471251693 10
average reward exploit: MDP 4 254.6769612834928 10
average reward explore: 385.3251211514033
average reward exploit: MDP 5 238.588192951863 10
average reward exploit: MDP 5 277.43373817208214 10
average reward exploit: MDP 5 285.1761904683454 10
average reward exploit: MDP 5 253.09007193

average reward exploit: MDP 8 256.2390670951409 10
average reward exploit: MDP 8 241.49801343852755 10
average reward exploit: MDP 8 243.01809268945914 10
average reward exploit: MDP 8 269.9369202966802 10
average reward exploit: MDP 8 266.8196188002864 10
average reward exploit: MDP 8 262.4542362931107 10
average reward exploit: MDP 8 253.42963808388077 10
average reward explore: 381.8913944458244
average reward exploit: MDP 9 253.29555922332344 10
average reward exploit: MDP 9 267.25143122887096 10
average reward exploit: MDP 9 214.6525422119367 10
average reward exploit: MDP 9 198.41124639510855 10
average reward exploit: MDP 9 230.5225515671939 10
average reward exploit: MDP 9 307.35322265834856 10
average reward exploit: MDP 9 250.66728020191763 10
average reward exploit: MDP 9 295.0772479574765 10
average reward exploit: MDP 9 260.2185286358658 10
average reward exploit: MDP 9 278.10303789385927 10
average reward explore: 397.432667663919
average reward exploit: MDP 0 124.3905123

average reward exploit: MDP 3 126.83318624758253 10
average reward exploit: MDP 3 188.57750494081216 10
average reward exploit: MDP 3 94.7824568642958 10
average reward exploit: MDP 3 133.1194716146045 10
average reward exploit: MDP 3 90.09154710276451 10
average reward exploit: MDP 3 98.08847751585449 10
average reward exploit: MDP 3 130.06286406523995 10
average reward exploit: MDP 3 161.11733658806241 10
average reward exploit: MDP 3 120.01703000328033 10
average reward exploit: MDP 3 72.32401437424603 10
average reward explore: 403.3008631742323
average reward exploit: MDP 4 87.33002002213064 10
average reward exploit: MDP 4 70.74733445846951 10
average reward exploit: MDP 4 126.10068079867742 10
average reward exploit: MDP 4 100.5720363542545 10
average reward exploit: MDP 4 61.12261645779129 10
average reward exploit: MDP 4 101.60009548936434 10
average reward exploit: MDP 4 96.70442167416138 10
average reward exploit: MDP 4 114.95981076571324 10
average reward exploit: MDP 4 89.

average reward exploit: MDP 7 224.50762444497997 10
average reward exploit: MDP 7 234.90128679101167 10
average reward explore: 354.93709286236015
average reward exploit: MDP 8 208.49561690663836 10
average reward exploit: MDP 8 238.14757446288564 10
average reward exploit: MDP 8 225.60745628925056 10
average reward exploit: MDP 8 240.64915171634146 10
average reward exploit: MDP 8 215.29492441631504 10
average reward exploit: MDP 8 220.3996380162901 10
average reward exploit: MDP 8 224.7327794117523 10
average reward exploit: MDP 8 221.24397005592874 10
average reward exploit: MDP 8 220.7361088304453 10
average reward exploit: MDP 8 224.638053607637 10
average reward explore: 400.25807770489575
average reward exploit: MDP 9 216.96186385449246 10
average reward exploit: MDP 9 213.756807644091 10
average reward exploit: MDP 9 187.5031718400873 10
average reward exploit: MDP 9 198.99914580988624 10
average reward exploit: MDP 9 207.45081669023457 10
average reward exploit: MDP 9 226.8530

average reward exploit: MDP 2 242.17419683686984 10
average reward exploit: MDP 2 214.27739373094295 10
average reward exploit: MDP 2 258.21385529478505 10
average reward exploit: MDP 2 244.68870372990145 10
average reward exploit: MDP 2 220.9633561616853 10
average reward explore: 389.5793341466439
average reward exploit: MDP 3 207.6754677037473 10
average reward exploit: MDP 3 290.15970686430285 10
average reward exploit: MDP 3 247.90129372882262 10
average reward exploit: MDP 3 259.23171095046996 10
average reward exploit: MDP 3 228.54786484909454 10
average reward exploit: MDP 3 208.1451419069918 10
average reward exploit: MDP 3 229.23947325852546 10
average reward exploit: MDP 3 262.3316077183484 10
average reward exploit: MDP 3 204.0729261555785 10
average reward exploit: MDP 3 185.46782596690554 10
average reward explore: 319.0855877881709
average reward exploit: MDP 4 219.8545767095239 10
average reward exploit: MDP 4 212.35647314796765 10
average reward exploit: MDP 4 234.5125

average reward exploit: MDP 7 257.0344104304465 10
average reward exploit: MDP 7 281.7384629284165 10
average reward exploit: MDP 7 277.7191892975043 10
average reward exploit: MDP 7 234.33421871458353 10
average reward exploit: MDP 7 275.99596806200196 10
average reward exploit: MDP 7 214.59657458035113 10
average reward exploit: MDP 7 243.27676117509162 10
average reward exploit: MDP 7 286.2559946377488 10
average reward explore: 380.4044199260841
average reward exploit: MDP 8 259.39422149963127 10
average reward exploit: MDP 8 263.11926950339864 10
average reward exploit: MDP 8 258.7270389345213 10
average reward exploit: MDP 8 285.06807406526195 10
average reward exploit: MDP 8 261.0211127189234 10
average reward exploit: MDP 8 244.98453537140986 10
average reward exploit: MDP 8 264.9652204975161 10
average reward exploit: MDP 8 269.8278842323342 10
average reward exploit: MDP 8 276.82028092031317 10
average reward exploit: MDP 8 227.85167354189153 10
average reward explore: 349.40

average reward explore: 366.10903285804983
average reward exploit: MDP 2 253.4343253105093 10
average reward exploit: MDP 2 249.77004830245292 10
average reward exploit: MDP 2 242.83650291873306 10
average reward exploit: MDP 2 262.5571352469764 10
average reward exploit: MDP 2 261.1895793769584 10
average reward exploit: MDP 2 253.80718569966848 10
average reward exploit: MDP 2 245.70570879266538 10
average reward exploit: MDP 2 238.41152617465136 10
average reward exploit: MDP 2 223.80139414367144 10
average reward exploit: MDP 2 253.2608704529875 10
average reward explore: 377.9993958795204
average reward exploit: MDP 3 261.584808682455 10
average reward exploit: MDP 3 257.090262706428 10
average reward exploit: MDP 3 255.527521182907 10
average reward exploit: MDP 3 233.64528423714742 10
average reward exploit: MDP 3 256.3722543575895 10
average reward exploit: MDP 3 221.5837075804747 10
average reward exploit: MDP 3 259.1592299191557 10
average reward exploit: MDP 3 255.7056183451

average reward exploit: MDP 6 221.31744668238267 10
average reward exploit: MDP 6 176.39233913377709 10
average reward exploit: MDP 6 205.4287976835984 10
average reward explore: 385.2809345271349
average reward exploit: MDP 7 191.35039459729168 10
average reward exploit: MDP 7 147.78301112537608 10
average reward exploit: MDP 7 184.13823875806355 10
average reward exploit: MDP 7 226.60260124424235 10
average reward exploit: MDP 7 197.1651356059436 10
average reward exploit: MDP 7 189.87162790889016 10
average reward exploit: MDP 7 245.43298530590354 10
average reward exploit: MDP 7 120.71779105496005 10
average reward exploit: MDP 7 184.13788394333284 10
average reward exploit: MDP 7 218.05153679727724 10
average reward explore: 385.13744961053345
average reward exploit: MDP 8 257.8622990716985 10
average reward exploit: MDP 8 220.68902754205027 10
average reward exploit: MDP 8 237.44120481958348 10
average reward exploit: MDP 8 227.75426290719315 10
average reward exploit: MDP 8 224.

average reward exploit: MDP 1 209.9358709245194 10
average reward exploit: MDP 1 207.27726549920598 10
average reward exploit: MDP 1 192.08461293993687 10
average reward exploit: MDP 1 260.0323244138696 10
average reward exploit: MDP 1 189.9183414037604 10
average reward exploit: MDP 1 251.20172706534635 10
average reward explore: 357.529039393897
average reward exploit: MDP 2 271.44980601605005 10
average reward exploit: MDP 2 241.2581588740381 10
average reward exploit: MDP 2 245.47208735711706 10
average reward exploit: MDP 2 230.76926937714407 10
average reward exploit: MDP 2 229.5568890456703 10
average reward exploit: MDP 2 250.0490844252747 10
average reward exploit: MDP 2 227.71312633512434 10
average reward exploit: MDP 2 235.2748435930947 10
average reward exploit: MDP 2 254.11987337854862 10
average reward exploit: MDP 2 242.9341068115175 10
average reward explore: 360.95616140274876
average reward exploit: MDP 3 237.13624175840044 10
average reward exploit: MDP 3 272.702038

average reward exploit: MDP 6 108.5875760277273 10
average reward exploit: MDP 6 166.261759191836 10
average reward exploit: MDP 6 179.6874675567446 10
average reward exploit: MDP 6 130.856793740205 10
average reward exploit: MDP 6 162.04830161802528 10
average reward exploit: MDP 6 93.97139636959037 10
average reward exploit: MDP 6 131.56031371026447 10
average reward exploit: MDP 6 189.9401260461572 10
average reward exploit: MDP 6 208.14490906816476 10
average reward explore: 301.5255463128116
average reward exploit: MDP 7 262.61202994415333 10
average reward exploit: MDP 7 269.8077646144071 10
average reward exploit: MDP 7 296.54253265651766 10
average reward exploit: MDP 7 315.0089295566402 10
average reward exploit: MDP 7 298.7994022688289 10
average reward exploit: MDP 7 249.57533445396493 10
average reward exploit: MDP 7 267.97857299369923 10
average reward exploit: MDP 7 249.32415757139952 10
average reward exploit: MDP 7 258.6184649570793 10
average reward exploit: MDP 7 299.

average reward exploit: MDP 0 194.92983288626368 10
average reward explore: 321.21428997050816
average reward exploit: MDP 1 202.93144599789466 10
average reward exploit: MDP 1 117.0302621020326 10
average reward exploit: MDP 1 146.5724822699487 10
average reward exploit: MDP 1 175.35944254531412 10
average reward exploit: MDP 1 138.9092138046663 10
average reward exploit: MDP 1 255.30392950073383 10
average reward exploit: MDP 1 251.86140032898498 10
average reward exploit: MDP 1 203.78795454564235 10
average reward exploit: MDP 1 227.38059619069298 10
average reward exploit: MDP 1 198.19935729497615 10
average reward explore: 351.2109241596671
average reward exploit: MDP 2 240.03907262960917 10
average reward exploit: MDP 2 268.1779942774539 10
average reward exploit: MDP 2 263.8672491497696 10
average reward exploit: MDP 2 274.5842370397072 10
average reward exploit: MDP 2 310.81317890688774 10
average reward exploit: MDP 2 246.90253278439332 10
average reward exploit: MDP 2 275.807

average reward exploit: MDP 5 181.83497171199414 10
average reward exploit: MDP 5 208.18395397851654 10
average reward exploit: MDP 5 122.88321527152141 10
average reward exploit: MDP 5 247.0142919590293 10
average reward explore: 357.01448397161283
average reward exploit: MDP 6 83.8924982569292 10
average reward exploit: MDP 6 159.98020900041917 10
average reward exploit: MDP 6 107.27752752299925 10
average reward exploit: MDP 6 111.94370340096482 10
average reward exploit: MDP 6 135.51211155380378 10
average reward exploit: MDP 6 165.48997154716437 10
average reward exploit: MDP 6 123.01725487373237 10
average reward exploit: MDP 6 107.70882908257472 10
average reward exploit: MDP 6 152.02442509796168 10
average reward exploit: MDP 6 130.7561085382126 10
average reward explore: 326.6513626149448
average reward exploit: MDP 7 244.9185217498362 10
average reward exploit: MDP 7 237.41728485409863 10
average reward exploit: MDP 7 258.44467026883206 10
average reward exploit: MDP 7 292.76

average reward exploit: MDP 0 136.2607697195645 10
average reward exploit: MDP 0 144.73304749466902 10
average reward exploit: MDP 0 133.9901570133895 10
average reward exploit: MDP 0 126.73390676672577 10
average reward exploit: MDP 0 99.14471651875297 10
average reward exploit: MDP 0 154.40926271315294 10
average reward exploit: MDP 0 184.39783479290594 10
average reward explore: 382.69861082417464
average reward exploit: MDP 1 212.60501676104928 10
average reward exploit: MDP 1 291.4425296636924 10
average reward exploit: MDP 1 185.5115061969239 10
average reward exploit: MDP 1 204.41101492869694 10
average reward exploit: MDP 1 187.29615330086497 10
average reward exploit: MDP 1 218.2634515803789 10
average reward exploit: MDP 1 277.58419029325506 10
average reward exploit: MDP 1 180.49638906811032 10
average reward exploit: MDP 1 275.95166282392904 10
average reward exploit: MDP 1 254.9598413524447 10
average reward explore: 320.53053006667966
average reward exploit: MDP 2 177.684

average reward exploit: MDP 5 170.2980797556279 10
average reward exploit: MDP 5 117.28179642289646 10
average reward exploit: MDP 5 176.6622326236613 10
average reward exploit: MDP 5 147.20328887493048 10
average reward exploit: MDP 5 170.36458275339106 10
average reward exploit: MDP 5 132.68191534903667 10
average reward exploit: MDP 5 171.20567554111648 10
average reward exploit: MDP 5 87.83869770478299 10
average reward exploit: MDP 5 106.43553611901739 10
average reward exploit: MDP 5 124.91890311300423 10
average reward explore: 390.08372145752634
average reward exploit: MDP 6 193.8046080725057 10
average reward exploit: MDP 6 230.79429211717246 10
average reward exploit: MDP 6 257.04844194384094 10
average reward exploit: MDP 6 207.19663072908187 10
average reward exploit: MDP 6 267.597089025139 10
average reward exploit: MDP 6 193.8331201612194 10
average reward exploit: MDP 6 249.22262638126784 10
average reward exploit: MDP 6 260.23391636364533 10
average reward exploit: MDP 

average reward exploit: MDP 9 284.784157720126 10
average reward exploit: MDP 9 271.1538537022535 10
average reward explore: 375.8958970331463
average reward exploit: MDP 0 251.61365341534264 10
average reward exploit: MDP 0 241.18591732331333 10
average reward exploit: MDP 0 224.8143033023367 10
average reward exploit: MDP 0 225.44383028651265 10
average reward exploit: MDP 0 227.24045637537853 10
average reward exploit: MDP 0 198.5263562728305 10
average reward exploit: MDP 0 232.41741035880978 10
average reward exploit: MDP 0 201.2048955956312 10
average reward exploit: MDP 0 218.74295774704984 10
average reward exploit: MDP 0 209.75916691348712 10
average reward explore: 401.787522315317
average reward exploit: MDP 1 233.10177340211317 10
average reward exploit: MDP 1 174.0222196620643 10
average reward exploit: MDP 1 188.92742852011912 10
average reward exploit: MDP 1 215.16739868785012 10
average reward exploit: MDP 1 244.85876903577622 10
average reward exploit: MDP 1 226.825939

average reward exploit: MDP 4 303.64964934406316 10
average reward exploit: MDP 4 203.40598634301585 10
average reward exploit: MDP 4 191.61527348538976 10
average reward exploit: MDP 4 259.62366319983096 10
average reward exploit: MDP 4 201.62470898421975 10
average reward explore: 373.4149756386909
average reward exploit: MDP 5 76.54911111132402 10
average reward exploit: MDP 5 108.10434939453248 10
average reward exploit: MDP 5 99.30283235367727 10
average reward exploit: MDP 5 92.74411452114649 10
average reward exploit: MDP 5 88.94156817246308 10
average reward exploit: MDP 5 101.59938629376265 10
average reward exploit: MDP 5 84.82057481629433 10
average reward exploit: MDP 5 92.56837484872084 10
average reward exploit: MDP 5 107.01247963248753 10
average reward exploit: MDP 5 94.78395596484505 10
average reward explore: 376.5088807970519
average reward exploit: MDP 6 191.2073629866216 10
average reward exploit: MDP 6 278.5689016197155 10
average reward exploit: MDP 6 236.3680575

average reward exploit: MDP 9 149.66603759071185 10
average reward exploit: MDP 9 253.2075263157134 10
average reward exploit: MDP 9 195.76859673216828 10
average reward exploit: MDP 9 166.1746358068489 10
average reward exploit: MDP 9 163.6288600245805 10
average reward exploit: MDP 9 183.62460548583138 10
average reward exploit: MDP 9 177.53886291415023 10
average reward exploit: MDP 9 135.05577159700556 10
average reward explore: 391.15495727049904
average reward exploit: MDP 0 232.56253349652025 10
average reward exploit: MDP 0 199.7271097344353 10
average reward exploit: MDP 0 247.12034469887686 10
average reward exploit: MDP 0 184.16979332015993 10
average reward exploit: MDP 0 214.63625824694276 10
average reward exploit: MDP 0 258.1331682374856 10
average reward exploit: MDP 0 277.30574988149573 10
average reward exploit: MDP 0 162.52460963100742 10
average reward exploit: MDP 0 188.76336821947032 10
average reward exploit: MDP 0 197.7472083449353 10
average reward explore: 370

average reward explore: 379.78504946780794
average reward exploit: MDP 4 233.58414564770678 10
average reward exploit: MDP 4 288.1385699117177 10
average reward exploit: MDP 4 225.63211737348175 10
average reward exploit: MDP 4 288.6947694957559 10
average reward exploit: MDP 4 277.1235278750249 10
average reward exploit: MDP 4 234.79636567639267 10
average reward exploit: MDP 4 244.13854961317693 10
average reward exploit: MDP 4 231.31593438595968 10
average reward exploit: MDP 4 197.5220759796601 10
average reward exploit: MDP 4 237.1055280682674 10
average reward explore: 403.997868290091
average reward exploit: MDP 5 168.29615318859186 10
average reward exploit: MDP 5 189.5866697516273 10
average reward exploit: MDP 5 210.3180244910713 10
average reward exploit: MDP 5 196.59972439386013 10
average reward exploit: MDP 5 216.97520419710435 10
average reward exploit: MDP 5 223.14082090765478 10
average reward exploit: MDP 5 174.13471641413443 10
average reward exploit: MDP 5 163.29770

average reward exploit: MDP 8 183.91788114852528 10
average reward exploit: MDP 8 195.36368342802461 10
average reward exploit: MDP 8 177.62520483838642 10
average reward explore: 398.46350028230756
average reward exploit: MDP 9 209.10042346537483 10
average reward exploit: MDP 9 209.3327333275143 10
average reward exploit: MDP 9 251.8552176511102 10
average reward exploit: MDP 9 212.50899362229643 10
average reward exploit: MDP 9 219.68845349956 10
average reward exploit: MDP 9 212.59418617939622 10
average reward exploit: MDP 9 218.78745620493424 10
average reward exploit: MDP 9 209.38293275714696 10
average reward exploit: MDP 9 242.55065368480808 10
average reward exploit: MDP 9 201.32411190005203 10
average reward explore: 372.89735969159244
average reward exploit: MDP 0 239.8677960729552 10
average reward exploit: MDP 0 209.5314257187262 10
average reward exploit: MDP 0 221.95131009332894 10
average reward exploit: MDP 0 256.34960514229226 10
average reward exploit: MDP 0 253.201

average reward exploit: MDP 3 153.1118181921567 10
average reward exploit: MDP 3 142.0815729666465 10
average reward exploit: MDP 3 217.60067232024335 10
average reward exploit: MDP 3 254.56104368219093 10
average reward exploit: MDP 3 149.28249330155768 10
average reward exploit: MDP 3 190.4192108422605 10
average reward explore: 390.44072852250497
average reward exploit: MDP 4 221.66975639669954 10
average reward exploit: MDP 4 216.06230715242435 10
average reward exploit: MDP 4 226.17160419735728 10
average reward exploit: MDP 4 221.53039825461215 10
average reward exploit: MDP 4 217.7772440215893 10
average reward exploit: MDP 4 222.06023272144662 10
average reward exploit: MDP 4 215.60111493770177 10
average reward exploit: MDP 4 207.85162453558686 10
average reward exploit: MDP 4 220.12035308831574 10
average reward exploit: MDP 4 229.2488885120245 10
average reward explore: 359.7043120289485
average reward exploit: MDP 5 278.10649745734287 10
average reward exploit: MDP 5 223.94

average reward exploit: MDP 8 254.1177919995376 10
average reward exploit: MDP 8 249.90021686657778 10
average reward exploit: MDP 8 242.2368126706409 10
average reward exploit: MDP 8 223.8773533608408 10
average reward exploit: MDP 8 277.8961279827492 10
average reward exploit: MDP 8 297.2925554818879 10
average reward exploit: MDP 8 262.43146338641316 10
average reward exploit: MDP 8 246.40626498285283 10
average reward exploit: MDP 8 240.7994978575348 10
average reward explore: 351.12670938360276
average reward exploit: MDP 9 273.26275855380607 10
average reward exploit: MDP 9 252.86932424888613 10
average reward exploit: MDP 9 260.7827030494153 10
average reward exploit: MDP 9 213.53104378684287 10
average reward exploit: MDP 9 270.82300712151607 10
average reward exploit: MDP 9 230.59748276049226 10
average reward exploit: MDP 9 283.9642956696583 10
average reward exploit: MDP 9 284.6565258406889 10
average reward exploit: MDP 9 264.03660445045654 10
average reward exploit: MDP 9 

average reward exploit: MDP 2 217.26798062747844 10
average reward explore: 362.85207549101926
average reward exploit: MDP 3 242.92709822255452 10
average reward exploit: MDP 3 218.38650082010946 10
average reward exploit: MDP 3 238.54018507142524 10
average reward exploit: MDP 3 215.30738620983175 10
average reward exploit: MDP 3 214.4099228631972 10
average reward exploit: MDP 3 231.43754070687646 10
average reward exploit: MDP 3 238.7538750062259 10
average reward exploit: MDP 3 235.10729279698722 10
average reward exploit: MDP 3 231.8009408744086 10
average reward exploit: MDP 3 229.2702933765018 10
average reward explore: 246.58480507233244
average reward exploit: MDP 4 234.26533981773224 10
average reward exploit: MDP 4 246.51232942832894 10
average reward exploit: MDP 4 267.5338072766935 10
average reward exploit: MDP 4 200.49884998702885 10
average reward exploit: MDP 4 245.5701881708481 10
average reward exploit: MDP 4 224.13489439945988 10
average reward exploit: MDP 4 204.61

average reward exploit: MDP 7 225.42637087351756 10
average reward exploit: MDP 7 91.20281845768166 10
average reward exploit: MDP 7 91.3450016691342 10
average reward exploit: MDP 7 265.42341305241064 10
average reward explore: 388.9203265637284
average reward exploit: MDP 8 87.99235648803392 10
average reward exploit: MDP 8 181.39072100355565 10
average reward exploit: MDP 8 118.08916995210464 10
average reward exploit: MDP 8 188.79400955517127 10
average reward exploit: MDP 8 98.0890776445934 10
average reward exploit: MDP 8 131.19555302216972 10
average reward exploit: MDP 8 114.93799836580224 10
average reward exploit: MDP 8 186.8015080526217 10
average reward exploit: MDP 8 141.01943654025368 10
average reward exploit: MDP 8 139.13604216856783 10
average reward explore: 411.6776895410253
average reward exploit: MDP 9 226.53865421389315 10
average reward exploit: MDP 9 194.81193389712732 10
average reward exploit: MDP 9 177.5022425255427 10
average reward exploit: MDP 9 228.524409

average reward exploit: MDP 2 187.88735190580618 10
average reward exploit: MDP 2 278.12093304623136 10
average reward exploit: MDP 2 230.05924119046549 10
average reward exploit: MDP 2 240.0767428843943 10
average reward exploit: MDP 2 234.1068589925356 10
average reward exploit: MDP 2 265.4702734296974 10
average reward exploit: MDP 2 232.7187259537587 10
average reward explore: 434.7053384291936
average reward exploit: MDP 3 208.65661652907062 10
average reward exploit: MDP 3 139.40688643822827 10
average reward exploit: MDP 3 152.41846626373726 10
average reward exploit: MDP 3 191.8476313491298 10
average reward exploit: MDP 3 170.3174298795746 10
average reward exploit: MDP 3 218.10019956298493 10
average reward exploit: MDP 3 109.9725048551326 10
average reward exploit: MDP 3 197.34021804834163 10
average reward exploit: MDP 3 179.54045844073488 10
average reward exploit: MDP 3 309.45526212305026 10
average reward explore: 421.94273852821846
average reward exploit: MDP 4 228.1374

average reward exploit: MDP 7 214.62934541285773 10
average reward exploit: MDP 7 191.02267812413032 10
average reward exploit: MDP 7 207.7310397539217 10
average reward exploit: MDP 7 211.05439401146614 10
average reward exploit: MDP 7 209.5982732128221 10
average reward exploit: MDP 7 190.27008700817493 10
average reward exploit: MDP 7 205.95674381308828 10
average reward exploit: MDP 7 215.28127423945483 10
average reward exploit: MDP 7 214.77402175188143 10
average reward exploit: MDP 7 201.49587940803792 10
average reward explore: 269.22088627717915
average reward exploit: MDP 8 222.3023196588424 10
average reward exploit: MDP 8 230.95219329333818 10
average reward exploit: MDP 8 239.05893128270117 10
average reward exploit: MDP 8 216.86461679001437 10
average reward exploit: MDP 8 197.38815799097168 10
average reward exploit: MDP 8 224.54928258056697 10
average reward exploit: MDP 8 231.53251284818748 10
average reward exploit: MDP 8 236.3622813883175 10
average reward exploit: M

average reward exploit: MDP 1 234.60788724381987 10
average reward exploit: MDP 1 235.3392197285988 10
average reward explore: 388.96068710269174
average reward exploit: MDP 2 190.92901132027296 10
average reward exploit: MDP 2 198.44479003941026 10
average reward exploit: MDP 2 230.215698280983 10
average reward exploit: MDP 2 199.89194875250104 10
average reward exploit: MDP 2 171.7414388886544 10
average reward exploit: MDP 2 221.40929625840513 10
average reward exploit: MDP 2 250.0830036924652 10
average reward exploit: MDP 2 212.86207170128864 10
average reward exploit: MDP 2 216.6783237971287 10
average reward exploit: MDP 2 214.3270732952239 10
average reward explore: 402.97691391346655
average reward exploit: MDP 3 187.73433983207718 10
average reward exploit: MDP 3 190.34724850609382 10
average reward exploit: MDP 3 175.77630521424118 10
average reward exploit: MDP 3 206.74442589736805 10
average reward exploit: MDP 3 215.51754104277902 10
average reward exploit: MDP 3 146.566

average reward exploit: MDP 6 208.71155040895306 10
average reward exploit: MDP 6 246.57766816841644 10
average reward exploit: MDP 6 234.21938148559153 10
average reward exploit: MDP 6 225.83982133147074 10
average reward exploit: MDP 6 277.5236709486709 10
average reward explore: 373.7784108015491
average reward exploit: MDP 7 224.50386778400784 10
average reward exploit: MDP 7 228.41383284719063 10
average reward exploit: MDP 7 230.50103568470067 10
average reward exploit: MDP 7 257.4720991744456 10
average reward exploit: MDP 7 241.1294648382997 10
average reward exploit: MDP 7 228.1159873803214 10
average reward exploit: MDP 7 243.16878351226984 10
average reward exploit: MDP 7 245.99333358910297 10
average reward exploit: MDP 7 243.12820329310586 10
average reward exploit: MDP 7 228.0800919688706 10
average reward explore: 381.31577549504885
average reward exploit: MDP 8 284.49590283311403 10
average reward exploit: MDP 8 222.17101747087062 10
average reward exploit: MDP 8 220.07

average reward exploit: MDP 1 204.22657179696392 10
average reward exploit: MDP 1 303.87661396219744 10
average reward exploit: MDP 1 244.71458737609655 10
average reward exploit: MDP 1 295.499699020719 10
average reward exploit: MDP 1 270.51866504252666 10
average reward exploit: MDP 1 271.371165562397 10
average reward exploit: MDP 1 245.6753746633367 10
average reward exploit: MDP 1 283.8748154491522 10
average reward explore: 370.50853925593094
average reward exploit: MDP 2 230.6441879222221 10
average reward exploit: MDP 2 268.5186599858868 10
average reward exploit: MDP 2 244.15707368083434 10
average reward exploit: MDP 2 250.3329357873435 10
average reward exploit: MDP 2 231.02173457133387 10
average reward exploit: MDP 2 207.69671544082547 10
average reward exploit: MDP 2 193.801728308458 10
average reward exploit: MDP 2 233.81186769670984 10
average reward exploit: MDP 2 233.1409572262899 10
average reward exploit: MDP 2 237.16628100586144 10
average reward explore: 322.22311

average reward explore: 353.6313195631328
average reward exploit: MDP 6 107.63515090757012 10
average reward exploit: MDP 6 171.09989539068332 10
average reward exploit: MDP 6 148.13612557578313 10
average reward exploit: MDP 6 175.25322660111584 10
average reward exploit: MDP 6 150.81654008299125 10
average reward exploit: MDP 6 131.82764890687793 10
average reward exploit: MDP 6 157.30906665949004 10
average reward exploit: MDP 6 161.34860149773914 10
average reward exploit: MDP 6 152.37807973121707 10
average reward exploit: MDP 6 161.6698158260295 10
average reward explore: 325.4729615174731
average reward exploit: MDP 7 87.58578110193669 10
average reward exploit: MDP 7 154.82401068587455 10
average reward exploit: MDP 7 144.1643634274438 10
average reward exploit: MDP 7 122.11883473841881 10
average reward exploit: MDP 7 160.84947728106357 10
average reward exploit: MDP 7 100.90528104218417 10
average reward exploit: MDP 7 194.13613036862358 10
average reward exploit: MDP 7 70.69

average reward exploit: MDP 0 269.7097124474319 10
average reward exploit: MDP 0 236.71526103938305 10
average reward exploit: MDP 0 280.8690989163554 10
average reward explore: 342.307119602473
average reward exploit: MDP 1 252.27193495632918 10
average reward exploit: MDP 1 263.6067353848077 10
average reward exploit: MDP 1 251.10029153790506 10
average reward exploit: MDP 1 276.1140954279111 10
average reward exploit: MDP 1 272.41207591989763 10
average reward exploit: MDP 1 268.2739689158266 10
average reward exploit: MDP 1 249.5479261275399 10
average reward exploit: MDP 1 274.25294782899033 10
average reward exploit: MDP 1 250.7989989921905 10
average reward exploit: MDP 1 262.66033904156825 10
average reward explore: 369.98963469909614
average reward exploit: MDP 2 78.09993292635494 10
average reward exploit: MDP 2 111.72014223723818 10
average reward exploit: MDP 2 145.74718473710863 10
average reward exploit: MDP 2 123.27694008243805 10
average reward exploit: MDP 2 139.934127

average reward exploit: MDP 5 154.88034402720456 10
average reward exploit: MDP 5 206.6750851482315 10
average reward exploit: MDP 5 163.6546178427047 10
average reward exploit: MDP 5 158.1693673329715 10
average reward exploit: MDP 5 202.4368668689723 10
average reward exploit: MDP 5 180.32391206650345 10
average reward explore: 344.75484209797077
average reward exploit: MDP 6 263.62154829607164 10
average reward exploit: MDP 6 266.3154226366912 10
average reward exploit: MDP 6 304.674311915381 10
average reward exploit: MDP 6 272.9724238734155 10
average reward exploit: MDP 6 262.0703364799786 10
average reward exploit: MDP 6 262.8730263986196 10
average reward exploit: MDP 6 300.93155098255846 10
average reward exploit: MDP 6 296.30188872960844 10
average reward exploit: MDP 6 300.1054772393771 10
average reward exploit: MDP 6 291.4690955473118 10
average reward explore: 404.0678355878912
average reward exploit: MDP 7 252.84625163316565 10
average reward exploit: MDP 7 221.837730134

average reward exploit: MDP 0 263.85057217121545 10
average reward exploit: MDP 0 323.453751655927 10
average reward exploit: MDP 0 289.5260519040436 10
average reward exploit: MDP 0 300.16685097848915 10
average reward exploit: MDP 0 250.72079993785664 10
average reward exploit: MDP 0 282.3452689750178 10
average reward exploit: MDP 0 273.7478480268453 10
average reward exploit: MDP 0 273.3566276164839 10
average reward exploit: MDP 0 242.24151937540555 10
average reward explore: 383.98341947709207
average reward exploit: MDP 1 180.75644667898024 10
average reward exploit: MDP 1 225.95519714025605 10
average reward exploit: MDP 1 172.85617178224436 10
average reward exploit: MDP 1 175.08794400197746 10
average reward exploit: MDP 1 164.4756397309078 10
average reward exploit: MDP 1 192.84816164279158 10
average reward exploit: MDP 1 196.3917362950611 10
average reward exploit: MDP 1 196.9080538104794 10
average reward exploit: MDP 1 168.18613568186686 10
average reward exploit: MDP 1 

average reward exploit: MDP 4 202.98449596537986 10
average reward explore: 346.2887039210872
average reward exploit: MDP 5 242.28639975467883 10
average reward exploit: MDP 5 304.36793507531735 10
average reward exploit: MDP 5 265.2439999171261 10
average reward exploit: MDP 5 255.76129720433784 10
average reward exploit: MDP 5 297.3480050270256 10
average reward exploit: MDP 5 275.2467442477843 10
average reward exploit: MDP 5 295.65143625563917 10
average reward exploit: MDP 5 277.3136426419872 10
average reward exploit: MDP 5 281.55881219850994 10
average reward exploit: MDP 5 270.81718882370996 10
average reward explore: 410.2903589073218
average reward exploit: MDP 6 262.4085451342163 10
average reward exploit: MDP 6 267.71452584781173 10
average reward exploit: MDP 6 299.74420098303074 10
average reward exploit: MDP 6 224.67050729836492 10
average reward exploit: MDP 6 245.7998770278795 10
average reward exploit: MDP 6 275.185363653486 10
average reward exploit: MDP 6 251.735379

average reward exploit: MDP 9 230.09672048485967 10
average reward exploit: MDP 9 237.0108339610807 10
average reward exploit: MDP 9 199.36161772897492 10
average reward exploit: MDP 9 261.74385396401766 10
average reward explore: 421.64799496357546
average reward exploit: MDP 0 279.44701134532704 10
average reward exploit: MDP 0 188.11659987934257 10
average reward exploit: MDP 0 272.1760786239885 10
average reward exploit: MDP 0 265.93047954792496 10
average reward exploit: MDP 0 243.51596975463445 10
average reward exploit: MDP 0 268.7031026645505 10
average reward exploit: MDP 0 239.47660566189808 10
average reward exploit: MDP 0 250.39389779047815 10
average reward exploit: MDP 0 234.7515293398516 10
average reward exploit: MDP 0 232.19017721944752 10
average reward explore: 408.4095658886371
average reward exploit: MDP 1 212.26842380269926 10
average reward exploit: MDP 1 216.27547227603017 10
average reward exploit: MDP 1 229.47755171293883 10
average reward exploit: MDP 1 214.8

average reward exploit: MDP 4 230.74732916734638 10
average reward exploit: MDP 4 167.78964986162552 10
average reward exploit: MDP 4 186.34982903570844 10
average reward exploit: MDP 4 199.59329158503436 10
average reward exploit: MDP 4 181.82558325270332 10
average reward exploit: MDP 4 227.63251215542317 10
average reward exploit: MDP 4 165.0843979825718 10
average reward explore: 414.99278585629907
average reward exploit: MDP 5 229.12156318166382 10
average reward exploit: MDP 5 254.38197607271636 10
average reward exploit: MDP 5 190.84584759903788 10
average reward exploit: MDP 5 286.55700659621675 10
average reward exploit: MDP 5 212.37976320269186 10
average reward exploit: MDP 5 169.96783269995439 10
average reward exploit: MDP 5 181.82077020865523 10
average reward exploit: MDP 5 132.85271928746093 10
average reward exploit: MDP 5 172.22350965010673 10
average reward exploit: MDP 5 263.40951319499396 10
average reward explore: 396.4350181971207
average reward exploit: MDP 6 24

average reward exploit: MDP 9 218.0695301396867 10
average reward exploit: MDP 9 214.71975751943947 10
average reward exploit: MDP 9 215.41501177695304 10
average reward exploit: MDP 9 218.72070317425764 10
average reward exploit: MDP 9 215.25998414737538 10
average reward exploit: MDP 9 213.61259709226238 10
average reward exploit: MDP 9 229.4496766032758 10
average reward exploit: MDP 9 226.7095973885217 10
average reward exploit: MDP 9 222.38781662000378 10
average reward exploit: MDP 9 219.76497108731724 10
average reward explore: 317.8660990731738
average reward exploit: MDP 0 212.91646825236367 10
average reward exploit: MDP 0 210.37736535212247 10
average reward exploit: MDP 0 222.71645460578242 10
average reward exploit: MDP 0 200.68004062858114 10
average reward exploit: MDP 0 176.94624931045274 10
average reward exploit: MDP 0 222.92604341625875 10
average reward exploit: MDP 0 233.913001789998 10
average reward exploit: MDP 0 207.80158323410797 10
average reward exploit: MDP

average reward exploit: MDP 3 224.2589688093495 10
average reward exploit: MDP 3 244.9852919099407 10
average reward explore: 386.1838918190814
average reward exploit: MDP 4 261.0176306764451 10
average reward exploit: MDP 4 261.05248942617305 10
average reward exploit: MDP 4 276.7606074423262 10
average reward exploit: MDP 4 262.0180747562748 10
average reward exploit: MDP 4 266.6820932394767 10
average reward exploit: MDP 4 257.8443431228178 10
average reward exploit: MDP 4 266.3242202929826 10
average reward exploit: MDP 4 263.3394089389201 10
average reward exploit: MDP 4 268.85193936304563 10
average reward exploit: MDP 4 260.22661041450374 10
average reward explore: 360.8715673008939
average reward exploit: MDP 5 211.4444236229239 10
average reward exploit: MDP 5 202.38653787671603 10
average reward exploit: MDP 5 208.07860521110766 10
average reward exploit: MDP 5 210.4405147066369 10
average reward exploit: MDP 5 190.8625306377715 10
average reward exploit: MDP 5 194.7250085893

average reward exploit: MDP 8 269.7881311163283 10
average reward exploit: MDP 8 285.74960645461874 10
average reward exploit: MDP 8 259.96385172054175 10
average reward exploit: MDP 8 273.6187981735858 10
average reward exploit: MDP 8 263.9262518933948 10
average reward explore: 369.5149904224907
average reward exploit: MDP 9 300.73268261922425 10
average reward exploit: MDP 9 233.00125818559732 10
average reward exploit: MDP 9 291.14501777463767 10
average reward exploit: MDP 9 276.2262576141991 10
average reward exploit: MDP 9 240.44632754364937 10
average reward exploit: MDP 9 266.8462359036572 10
average reward exploit: MDP 9 245.48606008308616 10
average reward exploit: MDP 9 266.7968010818625 10
average reward exploit: MDP 9 247.9940557561706 10
average reward exploit: MDP 9 251.76895720856072 10
average reward explore: 340.4512370625058
average reward exploit: MDP 0 277.07068521387134 10
average reward exploit: MDP 0 279.49751599599523 10
average reward exploit: MDP 0 260.77704

average reward exploit: MDP 3 140.94244678415626 10
average reward exploit: MDP 3 139.8966801948093 10
average reward exploit: MDP 3 203.75846953516222 10
average reward exploit: MDP 3 206.52969075970503 10
average reward exploit: MDP 3 160.50710151872636 10
average reward exploit: MDP 3 182.66571865879126 10
average reward exploit: MDP 3 219.78306418575275 10
average reward explore: 328.68154092313733
average reward exploit: MDP 4 228.26359607377367 10
average reward exploit: MDP 4 217.876162631815 10
average reward exploit: MDP 4 209.11839014600423 10
average reward exploit: MDP 4 252.656239411949 10
average reward exploit: MDP 4 244.06142407982375 10
average reward exploit: MDP 4 180.40010694260985 10
average reward exploit: MDP 4 215.91574525769389 10
average reward exploit: MDP 4 203.30927919880304 10
average reward exploit: MDP 4 259.19800727174913 10
average reward exploit: MDP 4 212.67176816840146 10
average reward explore: 355.10989987938535
average reward exploit: MDP 5 246.2

average reward exploit: MDP 8 240.86880461289405 10
average reward exploit: MDP 8 246.11588674328928 10
average reward exploit: MDP 8 176.93441551629064 10
average reward exploit: MDP 8 209.91832817425907 10
average reward exploit: MDP 8 240.20186148604816 10
average reward exploit: MDP 8 215.90942976464376 10
average reward exploit: MDP 8 243.22794040040412 10
average reward exploit: MDP 8 261.8871705377961 10
average reward exploit: MDP 8 241.76772755309676 10
average reward exploit: MDP 8 245.30711865712982 10
average reward explore: 339.2941987780714
average reward exploit: MDP 9 245.44225891241484 10
average reward exploit: MDP 9 218.6207166691608 10
average reward exploit: MDP 9 286.7008106181928 10
average reward exploit: MDP 9 279.27280840229594 10
average reward exploit: MDP 9 275.6154315021785 10
average reward exploit: MDP 9 310.5055375435111 10
average reward exploit: MDP 9 286.47834199414365 10
average reward exploit: MDP 9 256.35486944389214 10
average reward exploit: MDP

average reward exploit: MDP 2 239.44973547819254 10
average reward exploit: MDP 2 218.33037544367608 10
average reward explore: 393.95669551346583
average reward exploit: MDP 3 171.0927992362576 10
average reward exploit: MDP 3 240.9394736945555 10
average reward exploit: MDP 3 232.739009066302 10
average reward exploit: MDP 3 170.84050458279864 10
average reward exploit: MDP 3 228.69397360787138 10
average reward exploit: MDP 3 195.50947582354965 10
average reward exploit: MDP 3 175.2835286533662 10
average reward exploit: MDP 3 194.91068205113635 10
average reward exploit: MDP 3 170.439976109396 10
average reward exploit: MDP 3 175.93963022752197 10
average reward explore: 410.20824056489556
average reward exploit: MDP 4 177.65150209961718 10
average reward exploit: MDP 4 223.17963935766784 10
average reward exploit: MDP 4 197.4032899863477 10
average reward exploit: MDP 4 218.98351363142 10
average reward exploit: MDP 4 196.68977568924907 10
average reward exploit: MDP 4 198.0532998

average reward exploit: MDP 7 174.185688007428 10
average reward exploit: MDP 7 26.25686207347461 10
average reward exploit: MDP 7 177.82040632934456 10
average reward exploit: MDP 7 207.03632204658157 10
average reward exploit: MDP 7 129.5168627982859 10
average reward explore: 363.27679913095056
average reward exploit: MDP 8 113.23390627322365 10
average reward exploit: MDP 8 107.80114092081378 10
average reward exploit: MDP 8 123.32444841304451 10
average reward exploit: MDP 8 83.75421282070616 10
average reward exploit: MDP 8 90.13932091914658 10
average reward exploit: MDP 8 63.223134054164646 10
average reward exploit: MDP 8 83.53806230156812 10
average reward exploit: MDP 8 104.58533542419744 10
average reward exploit: MDP 8 102.08804870209833 10
average reward exploit: MDP 8 118.71836670570417 10
average reward explore: 405.59329541667955
average reward exploit: MDP 9 57.17417622296292 10
average reward exploit: MDP 9 90.16157911207979 10
average reward exploit: MDP 9 108.27715

average reward exploit: MDP 2 -10.54410191840775 10
average reward exploit: MDP 2 2.7154023074880387 10
average reward exploit: MDP 2 -49.2864180012007 10
average reward exploit: MDP 2 -38.48478152027815 10
average reward exploit: MDP 2 -7.934007658213895 10
average reward exploit: MDP 2 -4.824887053368768 10
average reward exploit: MDP 2 -13.972527316839745 10
average reward exploit: MDP 2 24.224821310560905 10
average reward explore: 353.2942004938138
average reward exploit: MDP 3 64.61374842797822 10
average reward exploit: MDP 3 64.67748982234431 10
average reward exploit: MDP 3 62.97514429645081 10
average reward exploit: MDP 3 85.79231248997687 10
average reward exploit: MDP 3 171.5033140430172 10
average reward exploit: MDP 3 52.83370587139357 10
average reward exploit: MDP 3 53.53749678025175 10
average reward exploit: MDP 3 58.004314042634675 10
average reward exploit: MDP 3 58.70417252999384 10
average reward exploit: MDP 3 104.90124684039843 10
average reward explore: 337.45

average reward exploit: MDP 7 92.72264666117162 10
average reward exploit: MDP 7 53.83189851960003 10
average reward exploit: MDP 7 59.90863152740709 10
average reward exploit: MDP 7 93.46406794279615 10
average reward exploit: MDP 7 54.34554239101068 10
average reward exploit: MDP 7 130.93309266146872 10
average reward exploit: MDP 7 61.23746022913555 10
average reward exploit: MDP 7 94.90280407993754 10
average reward exploit: MDP 7 103.91197685013515 10
average reward exploit: MDP 7 68.64930143407963 10
average reward explore: 307.9455497522107
average reward exploit: MDP 8 69.45771846134903 10
average reward exploit: MDP 8 71.42075755796552 10
average reward exploit: MDP 8 65.64691484946408 10
average reward exploit: MDP 8 73.76718275833608 10
average reward exploit: MDP 8 80.42024995179062 10
average reward exploit: MDP 8 68.49196701390785 10
average reward exploit: MDP 8 67.32350590618734 10
average reward exploit: MDP 8 72.05189112766672 10
average reward exploit: MDP 8 95.44123

average reward exploit: MDP 1 94.379938212064 10
average reward explore: 304.3979067746029
average reward exploit: MDP 2 102.64605061606005 10
average reward exploit: MDP 2 75.50196805464655 10
average reward exploit: MDP 2 69.15569590142913 10
average reward exploit: MDP 2 71.58482403727032 10
average reward exploit: MDP 2 96.72768349109629 10
average reward exploit: MDP 2 59.96854855074871 10
average reward exploit: MDP 2 88.84173073709727 10
average reward exploit: MDP 2 81.50663057722754 10
average reward exploit: MDP 2 77.79038617493407 10
average reward exploit: MDP 2 68.35544395797011 10
average reward explore: 366.36106074368524
average reward exploit: MDP 3 63.231610981349306 10
average reward exploit: MDP 3 69.70558090722389 10
average reward exploit: MDP 3 88.58011514076529 10
average reward exploit: MDP 3 111.34084453108059 10
average reward exploit: MDP 3 102.48435897468639 10
average reward exploit: MDP 3 68.34711486219996 10
average reward exploit: MDP 3 60.6485442739698

average reward exploit: MDP 6 180.1834017751427 10
average reward exploit: MDP 6 190.7262816747096 10
average reward exploit: MDP 6 257.4402301534589 10
average reward explore: 300.08535124811846
average reward exploit: MDP 7 134.35235735103217 10
average reward exploit: MDP 7 164.44091608502097 10
average reward exploit: MDP 7 248.0298737269914 10
average reward exploit: MDP 7 211.56009040980788 10
average reward exploit: MDP 7 196.28986692779748 10
average reward exploit: MDP 7 160.37593383443308 10
average reward exploit: MDP 7 153.35943753323448 10
average reward exploit: MDP 7 168.14525554712333 10
average reward exploit: MDP 7 217.8049705248029 10
average reward exploit: MDP 7 179.3072285249362 10
average reward explore: 361.0974304290438
average reward exploit: MDP 8 149.3612139459305 10
average reward exploit: MDP 8 144.64444516931536 10
average reward exploit: MDP 8 92.58490471387405 10
average reward exploit: MDP 8 119.29222681404868 10
average reward exploit: MDP 8 205.14818

average reward exploit: MDP 1 64.53519911963328 10
average reward exploit: MDP 1 95.78415127290529 10
average reward exploit: MDP 1 98.80980625314496 10
average reward exploit: MDP 1 99.42932726825006 10
average reward exploit: MDP 1 110.48218047333808 10
average reward exploit: MDP 1 66.80572349376963 10
average reward explore: 410.56963221074255
average reward exploit: MDP 2 159.02219063119827 10
average reward exploit: MDP 2 222.9945512757543 10
average reward exploit: MDP 2 144.58563728673613 10
average reward exploit: MDP 2 215.87471287688885 10
average reward exploit: MDP 2 166.4396639471696 10
average reward exploit: MDP 2 102.84413005936078 10
average reward exploit: MDP 2 195.49886984075152 10
average reward exploit: MDP 2 178.9927109872664 10
average reward exploit: MDP 2 184.05173600708028 10
average reward exploit: MDP 2 234.34849917572086 10
average reward explore: 421.01054413692435
average reward exploit: MDP 3 70.4588230217847 10
average reward exploit: MDP 3 140.432043

average reward exploit: MDP 6 245.45901060496794 10
average reward exploit: MDP 6 157.82820833376817 10
average reward exploit: MDP 6 202.19491299316482 10
average reward exploit: MDP 6 230.61705019176438 10
average reward exploit: MDP 6 219.01861593935286 10
average reward exploit: MDP 6 224.24067486801897 10
average reward exploit: MDP 6 167.5845198362906 10
average reward exploit: MDP 6 139.87626473594455 10
average reward exploit: MDP 6 171.3048704548794 10
average reward explore: 374.76808439924537
average reward exploit: MDP 7 69.42585394092136 10
average reward exploit: MDP 7 60.9221623653098 10
average reward exploit: MDP 7 113.43278440277246 10
average reward exploit: MDP 7 95.63906413025822 10
average reward exploit: MDP 7 67.8580342141698 10
average reward exploit: MDP 7 66.3604876352201 10
average reward exploit: MDP 7 127.14612735793052 10
average reward exploit: MDP 7 93.40461277039762 10
average reward exploit: MDP 7 98.85006514197981 10
average reward exploit: MDP 7 105

average reward exploit: MDP 0 276.7643010459021 10
average reward explore: 312.1718165092764
average reward exploit: MDP 1 280.02670643951456 10
average reward exploit: MDP 1 241.93550949475576 10
average reward exploit: MDP 1 257.46048311147695 10
average reward exploit: MDP 1 284.7070098154272 10
average reward exploit: MDP 1 210.26115488996828 10
average reward exploit: MDP 1 245.67669380014422 10
average reward exploit: MDP 1 281.7476709657034 10
average reward exploit: MDP 1 196.25065572048047 10
average reward exploit: MDP 1 196.94393780465774 10
average reward exploit: MDP 1 228.53982204458754 10
average reward explore: 357.99266719664894
average reward exploit: MDP 2 66.28749672488428 10
average reward exploit: MDP 2 22.635051524978543 10
average reward exploit: MDP 2 60.279708652137366 10
average reward exploit: MDP 2 100.28818576165766 10
average reward exploit: MDP 2 61.3489145251007 10
average reward exploit: MDP 2 41.11871085679395 10
average reward exploit: MDP 2 93.07364

average reward exploit: MDP 5 137.2818006861705 10
average reward exploit: MDP 5 120.2721746624572 10
average reward exploit: MDP 5 150.37097796104825 10
average reward exploit: MDP 5 243.3353794896047 10
average reward explore: 373.46352068732006
average reward exploit: MDP 6 185.339519196578 10
average reward exploit: MDP 6 163.6817915744324 10
average reward exploit: MDP 6 216.6262320331967 10
average reward exploit: MDP 6 130.61993633086658 10
average reward exploit: MDP 6 197.70114199644982 10
average reward exploit: MDP 6 171.55927451075917 10
average reward exploit: MDP 6 234.15144641870444 10
average reward exploit: MDP 6 199.21324438093933 10
average reward exploit: MDP 6 237.67736422524314 10
average reward exploit: MDP 6 271.0679294398958 10
average reward explore: 356.4133618345375
average reward exploit: MDP 7 80.40990170928187 10
average reward exploit: MDP 7 157.40455309626537 10
average reward exploit: MDP 7 128.7177564212681 10
average reward exploit: MDP 7 94.59307000

average reward exploit: MDP 0 244.91898133970625 10
average reward exploit: MDP 0 212.05153003775848 10
average reward exploit: MDP 0 228.48029389293623 10
average reward exploit: MDP 0 231.539018601013 10
average reward exploit: MDP 0 273.6759150766507 10
average reward exploit: MDP 0 258.84508234879553 10
average reward exploit: MDP 0 256.20397217378695 10
average reward explore: 332.07197877747063
average reward exploit: MDP 1 207.71824820459022 10
average reward exploit: MDP 1 245.5784602654857 10
average reward exploit: MDP 1 218.82881452419974 10
average reward exploit: MDP 1 219.90659316934813 10
average reward exploit: MDP 1 248.72230658405624 10
average reward exploit: MDP 1 198.06508825376577 10
average reward exploit: MDP 1 278.61661041459575 10
average reward exploit: MDP 1 175.13983742367026 10
average reward exploit: MDP 1 223.15539634327723 10
average reward exploit: MDP 1 135.2970059871776 10
average reward explore: 360.79912588554106
average reward exploit: MDP 2 149.6

average reward exploit: MDP 5 228.3532194708526 10
average reward exploit: MDP 5 229.49647057949892 10
average reward exploit: MDP 5 227.23992830748324 10
average reward exploit: MDP 5 226.970927664996 10
average reward exploit: MDP 5 231.71835510821097 10
average reward exploit: MDP 5 223.1262042630529 10
average reward exploit: MDP 5 234.14654617591918 10
average reward exploit: MDP 5 240.25765330364047 10
average reward exploit: MDP 5 222.467910936927 10
average reward exploit: MDP 5 215.9902298905451 10
average reward explore: 360.4080098447497
average reward exploit: MDP 6 207.90530714810362 10
average reward exploit: MDP 6 237.1404810889998 10
average reward exploit: MDP 6 178.3099660457949 10
average reward exploit: MDP 6 240.26032184943725 10
average reward exploit: MDP 6 218.48014585220503 10
average reward exploit: MDP 6 229.37569433289772 10
average reward exploit: MDP 6 234.60241518387852 10
average reward exploit: MDP 6 196.0306517162345 10
average reward exploit: MDP 6 13

average reward exploit: MDP 9 308.8974358207264 10
average reward exploit: MDP 9 282.68983596675156 10
average reward explore: 394.01372002019326
average reward exploit: MDP 0 313.4649951687296 10
average reward exploit: MDP 0 303.70003597654454 10
average reward exploit: MDP 0 315.5852662838438 10
average reward exploit: MDP 0 299.92037289012745 10
average reward exploit: MDP 0 273.51152100025763 10
average reward exploit: MDP 0 310.753718129234 10
average reward exploit: MDP 0 263.6310392372264 10
average reward exploit: MDP 0 235.29613822711326 10
average reward exploit: MDP 0 236.37437531344193 10
average reward exploit: MDP 0 221.804237310833 10
average reward explore: 390.4624830661151
average reward exploit: MDP 1 240.71085715756894 10
average reward exploit: MDP 1 234.7629971106009 10
average reward exploit: MDP 1 184.8013442551175 10
average reward exploit: MDP 1 236.79790423092723 10
average reward exploit: MDP 1 204.77116365248594 10
average reward exploit: MDP 1 195.4472183

average reward exploit: MDP 4 200.15090112437127 10
average reward exploit: MDP 4 156.29415734449347 10
average reward exploit: MDP 4 228.08160801311692 10
average reward exploit: MDP 4 170.09524153468044 10
average reward exploit: MDP 4 205.0122182126075 10
average reward explore: 322.28725894978487
average reward exploit: MDP 5 175.74174829387704 10
average reward exploit: MDP 5 244.54397606117942 10
average reward exploit: MDP 5 182.8450060333756 10
average reward exploit: MDP 5 153.14170673342062 10
average reward exploit: MDP 5 207.910406596526 10
average reward exploit: MDP 5 127.16461179507981 10
average reward exploit: MDP 5 219.02621658165975 10
average reward exploit: MDP 5 246.03026068194262 10
average reward exploit: MDP 5 170.47372910026715 10
average reward exploit: MDP 5 109.17748369232629 10
average reward explore: 335.20187302966076
average reward exploit: MDP 6 216.08789557956487 10
average reward exploit: MDP 6 206.23060764225306 10
average reward exploit: MDP 6 228.

average reward exploit: MDP 9 153.6242580103804 10
average reward exploit: MDP 9 240.55439287182827 10
average reward exploit: MDP 9 195.13062131211535 10
average reward exploit: MDP 9 266.6392324228933 10
average reward exploit: MDP 9 277.01711858740794 10
average reward exploit: MDP 9 256.35960951304804 10
average reward exploit: MDP 9 291.1952088346031 10
average reward exploit: MDP 9 284.06903056987585 10
average reward explore: 356.6099636275333
average reward exploit: MDP 0 213.2977499402453 10
average reward exploit: MDP 0 229.87616948157455 10
average reward exploit: MDP 0 246.5770052792067 10
average reward exploit: MDP 0 233.9142821264848 10
average reward exploit: MDP 0 247.55686192240654 10
average reward exploit: MDP 0 222.37462446835775 10
average reward exploit: MDP 0 245.0160325019744 10
average reward exploit: MDP 0 251.04437496048257 10
average reward exploit: MDP 0 232.0556091286559 10
average reward exploit: MDP 0 217.02145634020985 10
average reward explore: 393.35

average reward explore: 337.2592155480123
average reward exploit: MDP 4 260.0847143459662 10
average reward exploit: MDP 4 246.44845240514223 10
average reward exploit: MDP 4 237.2597553592453 10
average reward exploit: MDP 4 241.18521562294455 10
average reward exploit: MDP 4 269.14327123340234 10
average reward exploit: MDP 4 284.8174798856461 10
average reward exploit: MDP 4 233.24185479527395 10
average reward exploit: MDP 4 253.32018821182368 10
average reward exploit: MDP 4 278.38750376538536 10
average reward exploit: MDP 4 273.0648106508973 10
average reward explore: 339.77793171033835
average reward exploit: MDP 5 156.78819648900148 10
average reward exploit: MDP 5 207.48431502027037 10
average reward exploit: MDP 5 152.4108249729101 10
average reward exploit: MDP 5 234.5826191574326 10
average reward exploit: MDP 5 264.6973702789875 10
average reward exploit: MDP 5 304.0158343761658 10
average reward exploit: MDP 5 194.69314701973875 10
average reward exploit: MDP 5 226.96510

average reward exploit: MDP 8 281.87529624400923 10
average reward exploit: MDP 8 175.13180823800528 10
average reward exploit: MDP 8 190.6755676573381 10
average reward explore: 386.65280290154084
average reward exploit: MDP 9 289.5162800143727 10
average reward exploit: MDP 9 254.25637419683645 10
average reward exploit: MDP 9 258.44056761666667 10
average reward exploit: MDP 9 290.0711635591882 10
average reward exploit: MDP 9 269.9752269914449 10
average reward exploit: MDP 9 267.44777779474487 10
average reward exploit: MDP 9 281.36677520903237 10
average reward exploit: MDP 9 267.0488853999072 10
average reward exploit: MDP 9 273.31565245899407 10
average reward exploit: MDP 9 298.0881142098388 10
average reward explore: 365.18588171806266
average reward exploit: MDP 0 234.77597753543495 10
average reward exploit: MDP 0 227.78969730090267 10
average reward exploit: MDP 0 259.35215677306945 10
average reward exploit: MDP 0 234.23658390854433 10
average reward exploit: MDP 0 257.56

AttributeError: 'MetaLearner' object has no attribute 'exploit_rewards'

In [92]:
for epoch in range(200):
        print("epoch number: ", epoch)
        a.train_step()

epoch number:  0
average reward explore: 197.4367190503391
average reward exploit: MDP 0 182.6423977032496 200
average reward exploit: MDP 0 177.83136573437966 200
average reward exploit: MDP 0 186.4590380054437 200
average reward explore: 208.62477861875507
average reward exploit: MDP 1 200.1499529335926 200
average reward exploit: MDP 1 203.96533608881452 200
average reward exploit: MDP 1 203.2714773634698 200
average reward explore: 206.44248577942912
average reward exploit: MDP 2 173.34431477504702 200
average reward exploit: MDP 2 180.10862083826888 200
average reward exploit: MDP 2 175.50692178789933 200
average reward explore: 214.38352005067472


KeyboardInterrupt: 

In [58]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)

In [59]:
print("average reward explore:", np.mean(explore_rewards))
M = a.stack_trajectories(explore_paths)

average reward explore: 46.3447291282569


In [60]:
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [61]:
Z = np.reshape(Z,[1,len(Z)])

In [62]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [63]:
print("average reward exploit", np.mean(exploit_rewards), len(exploit_rewards))

average reward exploit 152.95256858651558 800


In [91]:
a.num_traj = 200

In [40]:
for i in range(20):
    explore_paths, explore_rewards = a.sample_paths_explore(a.env,30*np.random.rand())
    print("gravity:", a.sim.model.opt.gravity)
    print("average reward explore:", np.mean(explore_rewards))
    M = a.stack_trajectories(explore_paths)
    Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
    Z = np.reshape(Z,[1,len(Z)])
    exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)
    print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
    

gravity: [  0.           0.         -11.78211953]
average reward explore: 1304.1956304786172
average reward exploit 8011.892223745158 800
gravity: [  0.           0.         -19.76739288]
average reward explore: 513.5210888464073
average reward exploit 8236.849973173716 800
gravity: [ 0.          0.         -5.67745507]
average reward explore: 1733.9094007923966
average reward exploit 9397.641993205756 800
gravity: [  0.         0.       -33.888614]
average reward explore: 290.24741591480824
average reward exploit 9128.682414141625 800
gravity: [  0.           0.         -14.66682571]
average reward explore: 892.629266461097
average reward exploit 9177.183052697012 800
gravity: [  0.           0.         -16.68177216]
average reward explore: 690.623843804044
average reward exploit 9046.119152302825 800
gravity: [  0.           0.         -11.42352458]
average reward explore: 1286.0432377482339
average reward exploit 6973.652444899903 800
gravity: [  0.           0.         -34.55160267

KeyboardInterrupt: 

In [None]:
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [None]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [None]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

In [None]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1000)

In [None]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()

In [7]:
a.sim.model.opt

<mujoco_py.cymj.PyMjOption at 0x1c23ba8ac8>

In [41]:
exploit_rewards

[276.1102081018689,
 96.70334286691545,
 85.08844921445562,
 269.074506860484,
 157.6271624984381,
 98.24853913162067,
 285.81629053901,
 47.571903330221204,
 305.7646644475844,
 214.54603825106128,
 318.79671274020046,
 108.78849606101353,
 295.16834090059507,
 304.4609969983021,
 114.21098544869861,
 319.95648068618993,
 381.6618264988649,
 304.1486178093876,
 82.12237601439787,
 266.6843573219772,
 246.96691589862735,
 93.80999550171873,
 317.11151807815327,
 280.0507518589623,
 88.88056611395263,
 56.41848653966207,
 103.50173763966043,
 60.028258282030876,
 110.03073741847325,
 78.99782323464933,
 204.20943212350457,
 63.15781328295692,
 69.76077023209112,
 302.7306806301403,
 372.0257841968678,
 94.7188407595137,
 286.5314683913839,
 292.27559707101716,
 69.19350133789911,
 331.6337563028473,
 49.59573762857002,
 293.9671743156787,
 92.63588041783703,
 338.77067309601244,
 309.0596304951951,
 49.45457787549484,
 115.98180376950395,
 109.69929221601465,
 254.3979814832278,
 310.49