In [1]:
# Include Libraries
import numpy as np
import tensorflow as tf
import gym
import tensorflow.contrib.layers as layers
from mujoco_py import load_model_from_xml, MjSim, MjViewer
import policies
import value_functions as vfuncs
import utils_pg as utils

# Environment setup
#env = "CartPole-v0"
#env="InvertedPendulum-v2"
env = "Humanoid-v2"

# discrete = isinstance(env.action_space, gym.spaces.Discrete)
# observation_dim = env.observation_space.shape[0]
# action_dim = env.action_space.n if discrete else env.action_space.shape[0]
max_ep_len = 1000
num_traj = 30
#traj_length = max_ep_len*(observation_dim + 2)
latent_size = 150
use_baseline = True



  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
# Feed forward network (multi-layer-perceptron, or mlp)

def build_mlp(mlp_input,output_size,scope,n_layers,size,output_activation=None):
    '''
    Build a feed forward network
    '''
    Input = mlp_input
    with tf.variable_scope(scope):
        # Dense Layers
        for i in range(n_layers-1):
            dense = tf.layers.dense(inputs = Input, units = size, activation = tf.nn.relu, bias_initializer=tf.constant_initializer(1.0))
            Input = dense
        # Fully Connected Layer
        out = layers.fully_connected(inputs = Input, num_outputs = output_size, activation_fn=output_activation)
    return out

In [5]:
class MetaLearner():
    def __init__(self, env, max_ep_len, num_traj,latent_size ):
        self.env = gym.make(env)
        self.discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n if self.discrete else self.env.action_space.shape[0]
        self.max_ep_len = max_ep_len
        self.num_traj = num_traj
        self.traj_length = self.max_ep_len*(self.observation_dim + 2) # TO Change
        self.use_baseline = True
        self.latent_size = latent_size
        self.feature_size = self.observation_dim + 1 + self.env.action_space.shape[0] # HC
        self.lr = 3e-3
        self.num_layers = 2
        self.layers_size = 32
        self.num_mdps = 10
        self.model = self.env.env.model
        self.sim = MjSim(self.model)
        self.gamma = 0.97
        self.decoder_len = 32

        # build model
        self.ConstructGraph()
    
    def add_placeholders(self):
        self.observation_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        if(self.discrete):
            self.action_placeholder_explore = tf.placeholder(tf.int32, shape=(None))
            self.action_placeholder_exploit = tf.placeholder(tf.int32, shape=(None))
        else:
            self.action_placeholder_explore = tf.placeholder(tf.float32, shape=(None,self.action_dim))
            self.action_placeholder_exploit= tf.placeholder(tf.float32, shape=(None,self.action_dim))

        self.baseline_target_placeholder = tf.placeholder(tf.float32, shape= None)
        self.advantage_placeholder_explore = tf.placeholder(tf.float32, shape=(None))
        
        #self.encoder_input_placeholder = tf.placeholder(tf.float32, shape= (self.num_traj,self.traj_length))
        self.encoder_input_placeholder = tf.placeholder(tf.float32, [None, None, self.feature_size])
        self.decoder_input_placeholder = tf.placeholder(tf.float32, shape= (1,self.latent_size))
        self.sequence_length_placeholder = tf.placeholder(tf.int32, [None, ])
        
        self.observation_placeholder_exploit = tf.placeholder(tf.float32, shape=(None,self.observation_dim))
        #TODO
        self.advantage_placeholder_exploit = tf.placeholder(tf.float32, shape=(None))
        
        
    def build_policy_explore(self, scope = "policy_explore"):
        if (self.discrete):
            self.action_logits = build_mlp(self.observation_placeholder_explore,self.action_dim,scope = scope,n_layers=self.num_layers,size = self.layers_size,output_activation=None)
            self.explore_action = tf.multinomial(self.action_logits,1)
            self.explore_action = tf.squeeze(self.explore_action, axis=1)
            self.explore_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.action_logits, labels = self.action_placeholder_explore)

        else:   
            action_means = build_mlp(self.observation_placeholder_explore,self.action_dim,scope,n_layers=self.num_layers, size = self.layers_size,output_activation=None)
            init = tf.constant(np.random.rand(1, 2))
            #log_std = tf.get_variable("log_std", [self.action_dim])
            log_std = tf.get_variable(
                'log_std',
                shape=[1, self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
                trainable=True
            )
            log_std = tf.log(tf.exp(log_std) + 1.)
            log_std = tf.tile(log_std,
                                   [tf.shape(action_means)[0], 1])
            self.explore_action = tf.random_normal((1,),
                                                   action_means,
                                                   log_std)
            dist = tf.contrib.distributions.MultivariateNormalDiag(action_means, log_std)
            self.explore_logprob = dist.prob(self.action_placeholder_explore, name = 'log_prob')
            #self.explore_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            #mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
            #self.explore_logprob =  mvn.log_prob(value = self.action_placeholder_explore, name='log_prob')

    
    
    def build_policy_exploit(self, scope = "policy_exploit"):
        if(self.discrete):
            #self.exploit_action_logits = (tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W1) + self.d_B1), self.d_W2) + self.d_B2),self.d_W3) + self.d_B3)
            #self.exploit_action_logits = tf.matmul(self.observation_placeholder_exploit,self.d_W3) + self.d_B3
            self.exploit_action_logits = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            self.exploit_action = tf.multinomial(self.exploit_action_logits,1)
            self.exploit_action = tf.squeeze(self.exploit_action, axis=1)
            self.exploit_logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.exploit_action_logits, labels = self.action_placeholder_exploit)
        else:
            action_means = tf.matmul(tf.nn.relu(tf.matmul(self.observation_placeholder_exploit,self.d_W2) + self.d_B2), self.d_W3) + self.d_B3
            init = tf.constant(np.random.rand(1, 2))
            log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
            self.exploit_action = action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
            self.exploit_logprob = utils.gauss_log_prob(mu=action_means, logstd=log_std, x=self.action_placeholder_exploit)
            
#             init = tf.constant(np.random.rand(1, 2))
#             log_std = tf.get_variable("exploit_log_prob", [self.action_dim])
#             self.exploit_action =   action_means + tf.multiply(tf.exp(log_std),tf.random_normal(shape = (self.action_dim,1),mean=0,stddev=1))
#             mvn = tf.contrib.distributions.MultivariateNormalDiag(action_means, tf.exp(log_std))
#             self.exploit_logprob =  mvn.log_prob(value = self.action_placeholder_exploit, name='exploit_log_prob')

        #self.loss_grads_exploit = self.exploit_logprob * self.advantage_placeholder_exploit
        
    def NNEncoder(self, scope = "NNEncoder"):
        self.Z = build_mlp(self.encoder_input_placeholder,self.latent_size,scope = scope,n_layers=3,size = 60,output_activation=None)
    
 

    # input [num_traj, length, features (obs + action + reward) ]
    def LSTMEncoder(self, scope = "LSTMEncoder"):
        self.hidden_size = 64
        initializer = tf.random_uniform_initializer(-1, 1)
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size, self.feature_size, initializer=initializer)
        cell_out = tf.contrib.rnn.OutputProjectionWrapper(cell, self.latent_size)
        self.output, _ = tf.nn.dynamic_rnn(cell_out,self.encoder_input_placeholder,self.sequence_length_placeholder,dtype=tf.float32,)
        batch_size = tf.shape(self.output)[0]
        max_length = tf.shape(self.output)[1]
        out_size = int(self.output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (self.sequence_length_placeholder - 1)
        flat = tf.reshape(self.output, [-1, out_size])
        self.Z = tf.reduce_mean(tf.gather(flat, index), axis = 0)
        
    
    def Decoder(self, decoder_out_dim, scope):
        return build_mlp(self.decoder_input_placeholder,decoder_out_dim,scope = scope,n_layers=1,size = 64,output_activation=None)
    
    
    def sample_paths_explore(self, env,mdp_num,Test = False, num_episodes = None):
        paths = []
        self.length = []
        episode_rewards = []
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        for i in range(self.num_traj):
            pad = False
            state = env.reset()
            new_states,states,new_actions, actions, rewards,new_rewards = [], [], [], [], [],[]
            episode_reward = 0
            for step in range(self.max_ep_len):
                if (pad):
                    states.append([0]*self.observation_dim)
                    if(self.discrete):
                        actions.append(0)
                    else:
                        actions.append([0]*self.action_dim)
                    rewards.append(0)
                else:
                    states.append(state)
                    new_states.append(state)
                    #action = self.sess.run(self.explore_action, feed_dict={self.observation_placeholder_explore : states[-1][None]})[0]
                    action = self.policyfn.sample_action(state) #NEW
                    state, reward, done, info = env.step(action)
                    actions.append(action)
                    new_actions.append(action)
                    rewards.append(reward)
                    new_rewards.append(reward)
                    episode_reward += reward
                    if (done or step == self.max_ep_len-1):
                        episode_rewards.append(episode_reward)
                        self.length.append(step + 1)
                        pad = True 
                        
            episode_rewards.append(episode_reward)            
            #print("explore",np.array(actions))
            path = {"new_obs" : np.array(new_states),"observation" : np.array(states),
                                "reward" : np.array(rewards),"new_acs" : np.array(new_actions),
                                "action" : np.array(actions),"new_rewards":np.array(new_rewards)}
            paths.append(path)
        return paths, episode_rewards
    
    def sample_paths_exploit(self, env,Z,mdp_num,Test = False, num_episodes = None):
        self.sim.model.opt.gravity[-1] = -5 - mdp_num
        
        paths = []
        num = 0
        episode_rewards = []
        for i in range(self.num_traj):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0
            for step in range(self.max_ep_len):
                states.append(state)
                action = self.sess.run(self.exploit_action, feed_dict={self.observation_placeholder_exploit : state[None], self.decoder_input_placeholder: Z})[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
            #print("exploit",np.array(actions))
            path = {"observation" : np.array(states),
                                "reward" : np.array(rewards),
                                "action" : np.array(actions)}
            paths.append(path)
 
        #print("exploit success: ", num)
        return paths, episode_rewards
    
    def get_returns(self,paths, explore = False):
        all_returns = []
        m = 0
        for path in paths:
            rewards = path["reward"]
            returns = []
            if(explore):
                length = self.length[m]
            else:
                length = len(rewards)
            for i in range(length):
                path_returns = 0
                k = 0
                for j in range(i,length):
                    path_returns = path_returns + rewards[j]*(self.gamma)**k
                    k = k+1
                returns.append(path_returns)
            all_returns.append(returns)
            m+=1
        returns = np.concatenate(all_returns)
        return returns
    
    def stack_trajectories(self,paths):
        trajectories = []
        for path in paths:
            rewards = path["reward"]
            states = path["observation"]
            action = path["action"]
            
            SAR = []
            for i in range(len(states)):
                SAR.append(list(states[i]) + list(action[i]) + [rewards[i]])
            trajectories.append(SAR)
        return np.array(trajectories)
    
    def addBaseline(self):
        self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=self.num_layers, size = self.layers_size,output_activation=None)
        self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
        baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
        self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)

    def calculate_advantage(self,returns, observations):
        if (self.use_baseline):
            baseline = self.sess.run(self.baseline, {input_placeholder:observations})
            adv = returns - baseline
            adv = (adv - np.mean(adv))/np.std(adv)
        else:
            adv = returns
        return adv
    

        
    def ConstructGraph(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        
        self.vf = vfuncs.NnValueFunction(session=self.sess,ob_dim=self.observation_dim)
        self.policyfn = policies.GaussianPolicy(self.sess,self.observation_dim, self.action_dim)
        
        self.add_placeholders()
        
#         self.add_placeholders()
        
#         self.build_policy_explore()
#         self.explore_policy_loss = -tf.reduce_sum(self.explore_logprob * self.advantage_placeholder_explore)
#         self.loss_grads_explore = -self.explore_logprob * self.advantage_placeholder_explore
#         self.tvars_explore = tf.trainable_variables()
#         self.gradients_explore = tf.gradients(self.explore_policy_loss,self.tvars_explore)
        
#         #self.addBaseline()
        
#         self.baseline = build_mlp(self.observation_placeholder_explore,1,scope = "baseline",n_layers=1, size = 16,output_activation=None)
#         self.baseline_loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,self.baseline,scope = "baseline")
#         baseline_adam_optimizer =  tf.train.AdamOptimizer(learning_rate = self.lr)
#         self.update_baseline_op = baseline_adam_optimizer.minimize(self.baseline_loss)
        
#         #Encoder LSTM
        self.LSTMEncoder()
        
        
        #decoder weights
        #self.d_W1 = self.Decoder(scope = "W1", decoder_out_dim = self.observation_dim*self.decoder_len)
        self.d_W2 = self.Decoder(scope = "W2", decoder_out_dim = self.observation_dim*self.decoder_len)
        #self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*action_dim)
        self.d_W3 = self.Decoder(scope = "W3", decoder_out_dim = self.decoder_len*self.action_dim)
        
        #self.d_W1 = ((self.d_W1 - (tf.reduce_max(self.d_W1) + tf.reduce_min(self.d_W1))/2)/(tf.reduce_max(self.d_W1) - tf.reduce_min(self.d_W1)))*2 
        #self.d_W2 = ((self.d_W2 - (tf.reduce_max(self.d_W2) + tf.reduce_min(self.d_W2))/2)/(tf.reduce_max(self.d_W2) - tf.reduce_min(self.d_W2)))*2 
        #self.d_W3 = ((self.d_W3 - (tf.reduce_max(self.d_W3) + tf.reduce_min(self.d_W3))/2)/(tf.reduce_max(self.d_W3) - tf.reduce_min(self.d_W3)))*2 
        
        #self.d_W1 = tf.reshape(self.d_W1, [self.observation_dim, self.decoder_len])
        self.d_W2 = tf.reshape(self.d_W2, [self.observation_dim, self.decoder_len])
        #self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        self.d_W3 = tf.reshape(self.d_W3, [self.decoder_len, self.action_dim])
        
        # decoder output bias
        #self.d_B1 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B1"), [self.decoder_len])
        self.d_B2 = tf.reshape(self.Decoder(decoder_out_dim = self.decoder_len, scope = "B2"), [self.decoder_len])
        self.d_B3 = tf.reshape(self.Decoder(decoder_out_dim = self.action_dim, scope = "B3"), [self.action_dim])
        #self.d_B1 = ((self.d_B1 - (tf.reduce_max(self.d_B1) + tf.reduce_min(self.d_B1))/2)/(tf.reduce_max(self.d_B1) - tf.reduce_min(self.d_B1)))*2 
        #self.d_B2 = ((self.d_B2 - (tf.reduce_max(self.d_B2) + tf.reduce_min(self.d_B2))/2)/(tf.reduce_max(self.d_B2) - tf.reduce_min(self.d_B2)))*2 
        
        
        # exploit policy
        self.build_policy_exploit()
        #self.d = [self.d_W1, self.d_B1, self.d_W2, self.d_B2, self.d_W3, self.d_B3]
        self.exploit_policy_loss = -tf.reduce_sum(self.exploit_logprob * self.advantage_placeholder_exploit)
        self.d = [self.d_W3, self.d_B3]
        self.gradients_exploit = tf.gradients(self.exploit_policy_loss,self.d)
        
        # train encoder and decoder
        adam_optimizer_exploit =  tf.train.AdamOptimizer(self.lr*0.1)
        self.output_train_op = adam_optimizer_exploit.minimize(self.exploit_policy_loss)
        # train original network
#         adam_optimizer_explore = tf.train.AdamOptimizer(self.lr)
#         self.input_train_op = adam_optimizer_explore.minimize(self.explore_policy_loss)
        init = tf.global_variables_initializer()
        self.sess.run(init)
    
    def initialize(self):
        self.ConstructGraph()
#         # create tf session
#         self.sess = tf.Session()
        
#         # initiliaze all variables
#         init = tf.global_variables_initializer()
#         self.sess.run(init)
    
    def train_step(self): 
        # sample num_traj*num_MDPs
        for i in range(self.num_mdps):
            explore_paths, explore_rewards = self.sample_paths_explore(self.env,i)
            observations_explore = np.concatenate([path["observation"] for path in explore_paths])
            new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
            new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
            actions_explore = np.concatenate([path["action"] for path in explore_paths])
            rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
            
            vtargs, vpreds, advantages_explore = [], [], []
            for path in explore_paths:
                rew_t = path["new_rewards"]
                return_t = utils.discount(rew_t, self.gamma)
                vpred_t = self.vf.predict(path["new_obs"])
                adv_t = return_t - vpred_t
                advantages_explore.append(adv_t)
                vtargs.append(return_t)
                vpreds.append(vpred_t)
                
                
            #returns_explore = self.get_returns(explore_paths, explore = True)
            advantages_explore = np.concatenate(advantages_explore)
            advantages_explore = (advantages_explore - advantages_explore.mean()) / (advantages_explore.std() + 1e-8)
            vtarg_n = np.concatenate(vtargs)
            vpred_n = np.concatenate(vpreds)
            #update baseline
            #print(advantages_explore.shape,new_observations_explore.shape)
            self.vf.fit(new_observations_explore, vtarg_n)
            
            surr_loss, oldmean_na, oldlogstd_a = self.policyfn.update_policy(
                    new_observations_explore, new_actions_explore, advantages_explore, self.lr)
            
            print("average reward explore:", np.mean(explore_rewards))
            
            #print(returns_explore)
#             print("average reward explore: MDP", i, np.sum(explore_rewards)/num_traj, len(explore_rewards))

#             baseline_explore = self.sess.run(self.baseline, {self.observation_placeholder_explore:new_observations_explore})
#             adv = returns_explore - np.squeeze(baseline_explore)
#             advantages_explore = (adv - np.mean(adv))/(np.std(adv) + 1e-12)


#             # update the baseline

#             self.sess.run(self.update_baseline_op, {self.observation_placeholder_explore:new_observations_explore, 
#                                            self.baseline_target_placeholder : returns_explore})

            # calculate explore gradients
    #         grads_explore = self.sess.run(self.gradients_explore, feed_dict={
    #                     self.observation_placeholder_explore : observations_explore,
    #                     self.action_placeholder_explore : actions_explore,
    #                     self.advantage_placeholder_explore : returns_explore})
            #print("explore",grads_explore )
            # form trajectory matrix

            M = np.array(self.stack_trajectories(explore_paths))


            #encoder LSTM
            Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
            Z = np.reshape(Z,[1,len(Z)])
#             #print("Z",Z)
#             #print(self.sess.run(self.d, feed_dict = {self.decoder_input_placeholder: Z}))
#             #print("d",self.d)
#             # sample paths
#             tvars = tf.trainable_variables()
#             tvars_vals = self.sess.run(tvars[-5:-1])
#             #print(tvars_vals)
            for j in range(10):
                exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,i)
                # get observations, actions and rewards
                observations_exploit = np.concatenate([path["observation"] for path in exploit_paths])
                actions_exploit = np.concatenate([path["action"] for path in exploit_paths])
                rewards_exploit = np.concatenate([path["reward"] for path in exploit_paths])
                returns_exploit = self.get_returns(exploit_paths)
                print("average reward exploit: MDP", i, np.sum(exploit_rewards) / num_traj, len(exploit_rewards))


#             # exploit grads
#     #         grads_exploit = self.sess.run(self.gradients_exploit,feed_dict={
#     #                     self.observation_placeholder_exploit : observations_exploit,
#     #                     self.action_placeholder_exploit : actions_exploit,
#     #                     self.advantage_placeholder_exploit : returns_exploit,
#     #                     self.decoder_input_placeholder: Z})

            #train encoder and decoder network
                self.sess.run(self.output_train_op, feed_dict={
                                self.observation_placeholder_exploit : observations_exploit,
                                self.action_placeholder_exploit : actions_exploit,
                                self.advantage_placeholder_exploit : returns_exploit,
                                self.decoder_input_placeholder: Z})

            # find advantage for input network
    #         advantage_explore = 0
    #         for i in range(len(grads_exploit)):
    #             l1 = grads_exploit[i]
    #             l2 = grads_explore[i]
    #             advantage_explore = advantage_explore + np.matmul(l1.flatten(), l2.flatten())

            # train input policy
#             self.sess.run(self.input_train_op, feed_dict={
#                             self.observation_placeholder_explore : new_observations_explore,
#                             self.action_placeholder_explore : new_actions_explore,
#                             self.advantage_placeholder_explore : advantages_explore})
    
    def test(self):
        explore_paths, explore_rewards = self.sample_paths_explore(self.env,20*np.random.rand())
        print("gravity: ",self.sim.model.opt.gravity )
        M = self.stack_trajectories(explore_paths)
        Z = self.sess.run(self.Z, feed_dict = {self.encoder_input_placeholder: M,self.sequence_length_placeholder: self.length })
        Z = np.reshape(Z,[1,len(Z)])
        # sample paths
        exploit_paths, exploit_rewards = self.sample_paths_exploit(self.env,Z,20*np.random.rand())
        print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
        
    def train(self):
        self.initialize()
        num_epochs = 200
        for epoch in range(num_epochs):
            self.num_traj = 20
            print("epoch number: ", epoch)
            self.train_step()
            print("evaluating on new MDPs")
            self.num_traj = 400
            self.test()
            

In [6]:
a = MetaLearner(env, max_ep_len, num_traj, latent_size)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [7]:
a.train()

epoch number:  0
average reward explore: 87.89810504610844
average reward exploit: MDP 0 70.67102469406363 20
average reward exploit: MDP 0 56.9229227137672 20
average reward exploit: MDP 0 58.806632485294294 20
average reward exploit: MDP 0 59.07209559249964 20
average reward exploit: MDP 0 57.0035024271645 20
average reward exploit: MDP 0 59.071488782162845 20
average reward exploit: MDP 0 55.15726973189076 20
average reward exploit: MDP 0 58.31024338805231 20
average reward exploit: MDP 0 57.0984469601896 20
average reward exploit: MDP 0 57.79130592306911 20
average reward explore: 84.0697328782916
average reward exploit: MDP 1 49.68789335136884 20
average reward exploit: MDP 1 51.28713936863396 20
average reward exploit: MDP 1 54.95621693150508 20
average reward exploit: MDP 1 63.23139727293232 20
average reward exploit: MDP 1 76.55806374232762 20
average reward exploit: MDP 1 95.5536503244824 20
average reward exploit: MDP 1 106.09777742265584 20
average reward exploit: MDP 1 102.

average reward exploit: MDP 4 49.12162240112514 20
average reward exploit: MDP 4 48.01776439952097 20
average reward exploit: MDP 4 45.18155250799414 20
average reward exploit: MDP 4 46.272394239307445 20
average reward exploit: MDP 4 46.37933500190595 20
average reward explore: 70.06068097015786
average reward exploit: MDP 5 43.783824143093995 20
average reward exploit: MDP 5 45.66788002252704 20
average reward exploit: MDP 5 47.76322651465385 20
average reward exploit: MDP 5 48.702136445419164 20
average reward exploit: MDP 5 51.688346085902616 20
average reward exploit: MDP 5 55.024022292768194 20
average reward exploit: MDP 5 54.61702685994758 20
average reward exploit: MDP 5 56.59535561061875 20
average reward exploit: MDP 5 54.86336677396745 20
average reward exploit: MDP 5 57.2467184156978 20
average reward explore: 67.12222934957988
average reward exploit: MDP 6 54.150360664473574 20
average reward exploit: MDP 6 56.2440483361329 20
average reward exploit: MDP 6 54.615375225716

average reward exploit: MDP 9 36.653786253302755 20
average reward exploit: MDP 9 36.581127786715015 20
average reward exploit: MDP 9 36.88643455636521 20
average reward exploit: MDP 9 39.32744146544621 20
average reward exploit: MDP 9 39.494137283349325 20
average reward exploit: MDP 9 39.44467079241338 20
average reward exploit: MDP 9 39.37477077844574 20
average reward exploit: MDP 9 39.73196418430064 20
average reward exploit: MDP 9 39.54884729083745 20
evaluating on new MDPs
gravity:  [  0.           0.         -14.64366015]
average reward exploit 818.2004798669532 400
epoch number:  3
average reward explore: 86.50264549943772
average reward exploit: MDP 0 43.260424424410466 20
average reward exploit: MDP 0 43.56992190158436 20
average reward exploit: MDP 0 44.0065618950582 20
average reward exploit: MDP 0 43.91812720911541 20
average reward exploit: MDP 0 44.025388800736394 20
average reward exploit: MDP 0 43.72605237107578 20
average reward exploit: MDP 0 43.117647852729746 20
a

average reward exploit: MDP 3 40.48710428269362 20
average reward exploit: MDP 3 40.0865645800085 20
average reward exploit: MDP 3 40.23775879211912 20
average reward exploit: MDP 3 40.78991762818598 20
average reward exploit: MDP 3 41.100968090075575 20
average reward explore: 64.8034223785269
average reward exploit: MDP 4 37.731216300497145 20
average reward exploit: MDP 4 38.062828355351584 20
average reward exploit: MDP 4 37.93412419596034 20
average reward exploit: MDP 4 37.45595065653505 20
average reward exploit: MDP 4 36.99808828802312 20
average reward exploit: MDP 4 38.06206914663649 20
average reward exploit: MDP 4 37.11367833966119 20
average reward exploit: MDP 4 38.12537742934856 20
average reward exploit: MDP 4 37.69928772000527 20
average reward exploit: MDP 4 37.67222607082049 20
average reward explore: 60.34076216368021
average reward exploit: MDP 5 35.923525072083756 20
average reward exploit: MDP 5 35.93127560469137 20
average reward exploit: MDP 5 35.93729977008493

average reward exploit: MDP 8 59.283079025649315 20
average reward exploit: MDP 8 58.06182886367091 20
average reward exploit: MDP 8 58.44810590833074 20
average reward exploit: MDP 8 57.392360301978286 20
average reward exploit: MDP 8 58.652918450764524 20
average reward exploit: MDP 8 58.816836474673096 20
average reward exploit: MDP 8 58.23638416424392 20
average reward exploit: MDP 8 58.25571964971186 20
average reward exploit: MDP 8 58.24669689540618 20
average reward explore: 55.62829038892537
average reward exploit: MDP 9 54.19761823343362 20
average reward exploit: MDP 9 54.54742059814707 20
average reward exploit: MDP 9 54.49503756017104 20
average reward exploit: MDP 9 55.091478152565784 20
average reward exploit: MDP 9 54.395210300886816 20
average reward exploit: MDP 9 54.75005058319225 20
average reward exploit: MDP 9 54.25257169256479 20
average reward exploit: MDP 9 54.21095378174455 20
average reward exploit: MDP 9 54.19045113648748 20
average reward exploit: MDP 9 53.7

average reward exploit: MDP 2 42.352370361055904 20
average reward exploit: MDP 2 42.308384925779144 20
average reward exploit: MDP 2 42.13337466369767 20
average reward exploit: MDP 2 42.334588334657774 20
average reward exploit: MDP 2 42.325714031977896 20
average reward explore: 70.80389091375756
average reward exploit: MDP 3 64.12567246635498 20
average reward exploit: MDP 3 54.834265279581935 20
average reward exploit: MDP 3 60.884528605089784 20
average reward exploit: MDP 3 60.19949288409129 20
average reward exploit: MDP 3 70.25390652117467 20
average reward exploit: MDP 3 53.89997826750631 20
average reward exploit: MDP 3 45.35588408997282 20
average reward exploit: MDP 3 51.86928448000386 20
average reward exploit: MDP 3 68.09607725130515 20
average reward exploit: MDP 3 70.17380901805535 20
average reward explore: 67.05097627762137
average reward exploit: MDP 4 58.55590499394202 20
average reward exploit: MDP 4 58.317084754573834 20
average reward exploit: MDP 4 56.870638907

average reward exploit: MDP 7 33.14379985904803 20
average reward exploit: MDP 7 32.83983536854596 20
average reward exploit: MDP 7 33.29434569072367 20
average reward exploit: MDP 7 31.064730187626207 20
average reward exploit: MDP 7 31.12183038286298 20
average reward exploit: MDP 7 30.711513731647365 20
average reward exploit: MDP 7 30.250933395921667 20
average reward exploit: MDP 7 32.04818621879987 20
average reward exploit: MDP 7 31.813000697667697 20
average reward exploit: MDP 7 31.818745649737416 20
average reward explore: 53.50639936706095
average reward exploit: MDP 8 30.83214753569807 20
average reward exploit: MDP 8 30.89031851024956 20
average reward exploit: MDP 8 31.076214820903328 20
average reward exploit: MDP 8 30.808986668201644 20
average reward exploit: MDP 8 31.254451905474607 20
average reward exploit: MDP 8 30.38056059767009 20
average reward exploit: MDP 8 30.203783731846375 20
average reward exploit: MDP 8 30.957811131407833 20
average reward exploit: MDP 8 

KeyboardInterrupt: 

In [16]:
for epoch in range(200):
        print("epoch number: ", epoch)
        a.train_step()

epoch number:  0
average reward explore: 82.96251755642649
average reward exploit: MDP 0 73.82496456351859 30
average reward exploit: MDP 0 72.63798571588255 30
average reward exploit: MDP 0 72.9195911751872 30
average reward exploit: MDP 0 73.57823085200245 30
average reward exploit: MDP 0 73.94708967913637 30
average reward exploit: MDP 0 74.49656268863008 30
average reward exploit: MDP 0 73.65975738682121 30
average reward exploit: MDP 0 75.16580731982253 30
average reward exploit: MDP 0 74.54466412274041 30
average reward exploit: MDP 0 73.5883272924395 30
average reward explore: 78.37416715003496
average reward exploit: MDP 1 67.51819471570535 30
average reward exploit: MDP 1 67.97158607986732 30
average reward exploit: MDP 1 66.72792142490734 30
average reward exploit: MDP 1 74.5388548880442 30
average reward exploit: MDP 1 84.03547942829246 30
average reward exploit: MDP 1 84.68315754125462 30
average reward exploit: MDP 1 84.1231174736365 30
average reward exploit: MDP 1 80.079

average reward exploit: MDP 4 103.82656167706229 30
average reward exploit: MDP 4 106.74874124081475 30
average reward exploit: MDP 4 107.61798199112913 30
average reward explore: 55.44522103852402
average reward exploit: MDP 5 97.57707892030976 30
average reward exploit: MDP 5 97.35794309574439 30
average reward exploit: MDP 5 97.14304035226478 30
average reward exploit: MDP 5 96.75155723414983 30
average reward exploit: MDP 5 97.95902305871641 30
average reward exploit: MDP 5 98.16005921381894 30
average reward exploit: MDP 5 98.43185242857135 30
average reward exploit: MDP 5 97.15975616034534 30
average reward exploit: MDP 5 96.42401201018356 30
average reward exploit: MDP 5 98.07924893318658 30
average reward explore: 52.21646807267904
average reward exploit: MDP 6 98.82165005722794 30
average reward exploit: MDP 6 95.56435337720399 30
average reward exploit: MDP 6 95.24608281692635 30
average reward exploit: MDP 6 95.8071353368924 30
average reward exploit: MDP 6 95.54619788171735

average reward exploit: MDP 9 48.998675166760876 30
average reward exploit: MDP 9 48.97631813856061 30
average reward exploit: MDP 9 48.98356888138383 30
average reward exploit: MDP 9 49.04473449566569 30
average reward exploit: MDP 9 48.98695784681199 30
epoch number:  3
average reward explore: 81.89384749909992
average reward exploit: MDP 0 71.2346432749161 30
average reward exploit: MDP 0 70.28655539223102 30
average reward exploit: MDP 0 70.77535255736056 30
average reward exploit: MDP 0 70.5225059208699 30
average reward exploit: MDP 0 70.23771475628105 30
average reward exploit: MDP 0 68.8419965769967 30
average reward exploit: MDP 0 70.20093058808654 30
average reward exploit: MDP 0 71.57894717266548 30
average reward exploit: MDP 0 70.3250963406393 30
average reward exploit: MDP 0 71.42248928948986 30
average reward explore: 76.49678503106965
average reward exploit: MDP 1 64.30695015566421 30
average reward exploit: MDP 1 65.19956960400094 30
average reward exploit: MDP 1 65.66

average reward exploit: MDP 4 52.94277566161709 30
average reward exploit: MDP 4 52.89045764120508 30
average reward exploit: MDP 4 52.83260943217138 30
average reward exploit: MDP 4 52.451106552446625 30
average reward exploit: MDP 4 52.42036412047077 30
average reward exploit: MDP 4 51.875005268701734 30
average reward exploit: MDP 4 52.82398235024883 30
average reward explore: 63.933107610325784
average reward exploit: MDP 5 51.11134724537389 30
average reward exploit: MDP 5 51.378204326913504 30
average reward exploit: MDP 5 50.894521301115915 30
average reward exploit: MDP 5 51.55437040849117 30
average reward exploit: MDP 5 50.65754005266491 30
average reward exploit: MDP 5 50.19004309521834 30
average reward exploit: MDP 5 51.2355832265118 30
average reward exploit: MDP 5 51.22204190752637 30
average reward exploit: MDP 5 51.7334290097888 30
average reward exploit: MDP 5 50.29785398781415 30
average reward explore: 62.1257571773232
average reward exploit: MDP 6 48.06396829335037

average reward exploit: MDP 9 44.32430036725562 30
average reward exploit: MDP 9 45.261324468493164 30
average reward exploit: MDP 9 43.87523388186061 30
average reward exploit: MDP 9 44.85740674424618 30
average reward exploit: MDP 9 45.05557786045133 30
average reward exploit: MDP 9 44.67317379482956 30
average reward exploit: MDP 9 45.64198274423094 30
average reward exploit: MDP 9 44.64079683065837 30
average reward exploit: MDP 9 44.89994515779729 30
epoch number:  6
average reward explore: 83.07040747490943
average reward exploit: MDP 0 92.88432619783582 30
average reward exploit: MDP 0 93.38040797132786 30
average reward exploit: MDP 0 93.41461219111642 30
average reward exploit: MDP 0 93.99440564312984 30
average reward exploit: MDP 0 93.95803026987466 30
average reward exploit: MDP 0 87.76285248932255 30
average reward exploit: MDP 0 83.05423236633432 30
average reward exploit: MDP 0 83.81528665441506 30
average reward exploit: MDP 0 83.83455980408257 30
average reward exploit

average reward explore: 68.76127394420395
average reward exploit: MDP 4 58.18784440215668 30
average reward exploit: MDP 4 58.002565720865036 30
average reward exploit: MDP 4 53.715575727645486 30
average reward exploit: MDP 4 56.09300642818838 30
average reward exploit: MDP 4 54.866451681682555 30
average reward exploit: MDP 4 58.016452690740465 30
average reward exploit: MDP 4 57.91725460699533 30
average reward exploit: MDP 4 57.87350790515076 30
average reward exploit: MDP 4 58.13792982011311 30
average reward exploit: MDP 4 58.58337608492526 30
average reward explore: 65.6527473892298
average reward exploit: MDP 5 53.88986251226899 30
average reward exploit: MDP 5 53.901609550300286 30
average reward exploit: MDP 5 53.87624203807219 30
average reward exploit: MDP 5 53.897635446921846 30
average reward exploit: MDP 5 53.86422104763445 30
average reward exploit: MDP 5 53.87269160136109 30
average reward exploit: MDP 5 53.906427515951655 30
average reward exploit: MDP 5 53.8741337561

average reward exploit: MDP 8 79.9923761725203 30
average reward exploit: MDP 8 80.44646521253686 30
average reward explore: 57.04974233418146
average reward exploit: MDP 9 74.4609610106847 30
average reward exploit: MDP 9 74.48455813323618 30
average reward exploit: MDP 9 74.55025635049354 30
average reward exploit: MDP 9 74.8083747325085 30
average reward exploit: MDP 9 74.84279831022576 30
average reward exploit: MDP 9 74.8674676230805 30
average reward exploit: MDP 9 74.45832788305371 30
average reward exploit: MDP 9 74.90865706793699 30
average reward exploit: MDP 9 74.79438860931302 30
average reward exploit: MDP 9 74.60881699463175 30
epoch number:  9
average reward explore: 83.12274117336963
average reward exploit: MDP 0 109.04418372016445 30
average reward exploit: MDP 0 96.92640703776893 30
average reward exploit: MDP 0 77.38421828043516 30
average reward exploit: MDP 0 79.90646846126515 30
average reward exploit: MDP 0 81.29132003282372 30
average reward exploit: MDP 0 87.47

average reward exploit: MDP 3 110.88460702822768 30
average reward exploit: MDP 3 111.23813972094972 30
average reward exploit: MDP 3 112.38868972110268 30
average reward exploit: MDP 3 111.42249153952524 30
average reward explore: 70.07766017761331
average reward exploit: MDP 4 93.03606128025788 30
average reward exploit: MDP 4 93.35959260610201 30
average reward exploit: MDP 4 94.14526158163487 30
average reward exploit: MDP 4 91.89655352077615 30
average reward exploit: MDP 4 96.46346600437843 30
average reward exploit: MDP 4 99.58752441401022 30
average reward exploit: MDP 4 98.36000187259933 30
average reward exploit: MDP 4 93.37671866661209 30
average reward exploit: MDP 4 99.22013094226142 30
average reward exploit: MDP 4 94.40438649641214 30
average reward explore: 67.30632710620385
average reward exploit: MDP 5 101.57040684769959 30
average reward exploit: MDP 5 101.78410828481839 30
average reward exploit: MDP 5 101.01284706878917 30
average reward exploit: MDP 5 102.40057396

average reward exploit: MDP 8 102.00643244653125 30
average reward exploit: MDP 8 94.7711465590042 30
average reward exploit: MDP 8 102.84346441270824 30
average reward exploit: MDP 8 115.82209927666557 30
average reward exploit: MDP 8 114.06191432366394 30
average reward exploit: MDP 8 119.5426029945118 30
average reward exploit: MDP 8 110.49556022797181 30
average reward explore: 61.406611890299835
average reward exploit: MDP 9 133.47149899373866 30
average reward exploit: MDP 9 105.86027756952204 30
average reward exploit: MDP 9 119.11747296646833 30
average reward exploit: MDP 9 97.97857849216821 30
average reward exploit: MDP 9 87.17940023932562 30
average reward exploit: MDP 9 83.5705942035279 30
average reward exploit: MDP 9 65.72348597376585 30
average reward exploit: MDP 9 65.37260561525217 30
average reward exploit: MDP 9 68.30113686217592 30
average reward exploit: MDP 9 67.99039133125554 30
epoch number:  12
average reward explore: 89.70477021503957
average reward exploit: 

average reward exploit: MDP 3 73.66396230913868 30
average reward exploit: MDP 3 73.30953653696125 30
average reward exploit: MDP 3 73.14882714575272 30
average reward exploit: MDP 3 72.95377623390301 30
average reward exploit: MDP 3 73.03756046129722 30
average reward exploit: MDP 3 72.06387383344216 30
average reward exploit: MDP 3 72.28895011925653 30
average reward exploit: MDP 3 73.22678338181306 30
average reward exploit: MDP 3 72.55888798687879 30
average reward exploit: MDP 3 72.03811957401629 30
average reward explore: 70.65676707027454
average reward exploit: MDP 4 70.18883322305179 30
average reward exploit: MDP 4 70.90321373010005 30
average reward exploit: MDP 4 70.12444772102616 30
average reward exploit: MDP 4 70.88325257044411 30
average reward exploit: MDP 4 74.64581928980054 30
average reward exploit: MDP 4 74.19364481964469 30
average reward exploit: MDP 4 74.48024267854647 30
average reward exploit: MDP 4 74.58389070156899 30
average reward exploit: MDP 4 76.9849560

average reward exploit: MDP 7 53.88052867458833 30
average reward explore: 56.27110484548292
average reward exploit: MDP 8 50.87058116957672 30
average reward exploit: MDP 8 51.05371862109396 30
average reward exploit: MDP 8 50.44937627091124 30
average reward exploit: MDP 8 50.867504436888694 30
average reward exploit: MDP 8 51.21069106334114 30
average reward exploit: MDP 8 50.59545870533239 30
average reward exploit: MDP 8 50.879659185820046 30
average reward exploit: MDP 8 51.55610752513851 30
average reward exploit: MDP 8 50.7468752120424 30
average reward exploit: MDP 8 51.34582981500308 30
average reward explore: 52.86434150177158
average reward exploit: MDP 9 49.14039657836849 30
average reward exploit: MDP 9 49.19819327590791 30
average reward exploit: MDP 9 49.16593254806002 30
average reward exploit: MDP 9 49.17811900218943 30
average reward exploit: MDP 9 49.1592703953447 30
average reward exploit: MDP 9 49.133117510295634 30
average reward exploit: MDP 9 49.144930024741846

average reward exploit: MDP 2 89.71530347018349 30
average reward exploit: MDP 2 90.85450913862536 30
average reward exploit: MDP 2 91.82760248669274 30
average reward explore: 74.6260317230936
average reward exploit: MDP 3 84.08073235125183 30
average reward exploit: MDP 3 84.41646457151016 30
average reward exploit: MDP 3 83.85288894099797 30
average reward exploit: MDP 3 84.02518714202834 30
average reward exploit: MDP 3 84.60599484598653 30
average reward exploit: MDP 3 85.39305968704406 30
average reward exploit: MDP 3 84.77511952302338 30
average reward exploit: MDP 3 85.42906010066667 30
average reward exploit: MDP 3 84.7898876290288 30
average reward exploit: MDP 3 83.67023199366444 30
average reward explore: 70.47311986154783
average reward exploit: MDP 4 84.42567016899132 30
average reward exploit: MDP 4 84.05315726566879 30
average reward exploit: MDP 4 83.45617357456153 30
average reward exploit: MDP 4 79.62156164922752 30
average reward exploit: MDP 4 77.0978739958015 30
a

average reward exploit: MDP 7 82.7446047147944 30
average reward exploit: MDP 7 76.98659537970346 30
average reward exploit: MDP 7 76.04450088801997 30
average reward exploit: MDP 7 74.40523329777747 30
average reward explore: 60.99100842286589
average reward exploit: MDP 8 94.50673471877924 30
average reward exploit: MDP 8 90.3209626239925 30
average reward exploit: MDP 8 91.61082171537429 30
average reward exploit: MDP 8 102.31942149413625 30
average reward exploit: MDP 8 107.64240685285937 30
average reward exploit: MDP 8 107.47168759128 30
average reward exploit: MDP 8 111.3031446145456 30
average reward exploit: MDP 8 97.22781681036318 30
average reward exploit: MDP 8 84.7164114929505 30
average reward exploit: MDP 8 99.329485452568 30
average reward explore: 59.468074768761745
average reward exploit: MDP 9 148.48550443732333 30
average reward exploit: MDP 9 126.25550689037541 30
average reward exploit: MDP 9 105.7800484389031 30
average reward exploit: MDP 9 105.65178549249782 30

average reward exploit: MDP 2 81.28865161710205 30
average reward exploit: MDP 2 80.68002388339278 30
average reward exploit: MDP 2 80.13615637370641 30
average reward exploit: MDP 2 81.41260234956256 30
average reward exploit: MDP 2 80.89950066669017 30
average reward exploit: MDP 2 81.46089971451178 30
average reward explore: 77.20808910361391
average reward exploit: MDP 3 76.72371672549679 30
average reward exploit: MDP 3 76.55717100571464 30
average reward exploit: MDP 3 73.46703391665125 30
average reward exploit: MDP 3 71.45757258820062 30
average reward exploit: MDP 3 71.61693053567022 30
average reward exploit: MDP 3 70.67357536435549 30
average reward exploit: MDP 3 71.5869852923718 30
average reward exploit: MDP 3 70.16302724488737 30
average reward exploit: MDP 3 70.3027289643427 30
average reward exploit: MDP 3 70.27087400654541 30
average reward explore: 72.59432496543754
average reward exploit: MDP 4 68.8532307166961 30
average reward exploit: MDP 4 68.83562968560817 30
a

average reward exploit: MDP 7 59.26515419673261 30
average reward exploit: MDP 7 59.40991560076574 30
average reward exploit: MDP 7 59.389809674830325 30
average reward exploit: MDP 7 59.40469412986836 30
average reward exploit: MDP 7 59.3888379574058 30
average reward exploit: MDP 7 59.40198961960025 30
average reward exploit: MDP 7 59.38415840604386 30
average reward exploit: MDP 7 59.38717202864663 30
average reward explore: 58.96482357181379
average reward exploit: MDP 8 57.702011029155095 30
average reward exploit: MDP 8 57.39188152775655 30
average reward exploit: MDP 8 56.88014455648925 30
average reward exploit: MDP 8 58.42558336704349 30
average reward exploit: MDP 8 58.75055754071802 30
average reward exploit: MDP 8 58.86312543772374 30
average reward exploit: MDP 8 58.83395937509723 30
average reward exploit: MDP 8 58.66722017472334 30
average reward exploit: MDP 8 58.46061671991484 30
average reward exploit: MDP 8 58.26179826512871 30
average reward explore: 57.891825947303

average reward exploit: MDP 2 118.03173447018736 30
average reward exploit: MDP 2 112.3690563259983 30
average reward exploit: MDP 2 113.81684574914901 30
average reward exploit: MDP 2 116.1847873549892 30
average reward exploit: MDP 2 118.28527732137792 30
average reward exploit: MDP 2 116.27624464599984 30
average reward exploit: MDP 2 114.83994334030353 30
average reward exploit: MDP 2 117.40046119634547 30
average reward exploit: MDP 2 117.22933578167827 30
average reward exploit: MDP 2 118.12247549095999 30
average reward explore: 71.68914896206338
average reward exploit: MDP 3 100.90913120674651 30
average reward exploit: MDP 3 98.33716671023818 30
average reward exploit: MDP 3 98.30095912413391 30
average reward exploit: MDP 3 98.45766350633163 30
average reward exploit: MDP 3 98.70579795902017 30
average reward exploit: MDP 3 98.42096324281265 30
average reward exploit: MDP 3 95.60431222060247 30
average reward exploit: MDP 3 96.85784118363162 30
average reward exploit: MDP 3 9

average reward exploit: MDP 6 86.3275352976516 30
average reward exploit: MDP 6 86.43928572718086 30
average reward explore: 60.527319459275944
average reward exploit: MDP 7 79.47086707034983 30
average reward exploit: MDP 7 79.35196847883851 30
average reward exploit: MDP 7 79.27118997932554 30
average reward exploit: MDP 7 79.62286133630565 30
average reward exploit: MDP 7 79.41076701679562 30
average reward exploit: MDP 7 79.36546228633 30
average reward exploit: MDP 7 79.24088261434356 30
average reward exploit: MDP 7 79.10554086646022 30
average reward exploit: MDP 7 79.06490151053305 30
average reward exploit: MDP 7 79.32653568597065 30
average reward explore: 58.55965543098024
average reward exploit: MDP 8 76.63227258139051 30
average reward exploit: MDP 8 77.61335072437194 30
average reward exploit: MDP 8 77.449179878232 30
average reward exploit: MDP 8 76.04569260656962 30
average reward exploit: MDP 8 77.19427905235054 30
average reward exploit: MDP 8 76.66045231339261 30
ave

average reward exploit: MDP 1 75.03081104841937 30
average reward exploit: MDP 1 73.47663103050385 30
average reward exploit: MDP 1 73.14562961161108 30
average reward exploit: MDP 1 72.80865524629752 30
average reward explore: 74.27820489095926
average reward exploit: MDP 2 81.5833355858208 30
average reward exploit: MDP 2 81.889469080142 30
average reward exploit: MDP 2 81.8706427161545 30
average reward exploit: MDP 2 81.21752245702677 30
average reward exploit: MDP 2 82.98595614817957 30
average reward exploit: MDP 2 81.37495061250374 30
average reward exploit: MDP 2 82.19459188897234 30
average reward exploit: MDP 2 81.22853398390633 30
average reward exploit: MDP 2 81.55100775905804 30
average reward exploit: MDP 2 81.26036607279843 30
average reward explore: 71.03826536527994
average reward exploit: MDP 3 76.8487878889785 30
average reward exploit: MDP 3 76.79588840395019 30
average reward exploit: MDP 3 76.59321594930708 30
average reward exploit: MDP 3 77.27961376037999 30
ave

average reward exploit: MDP 6 55.83200522769571 30
average reward exploit: MDP 6 55.29091290666716 30
average reward exploit: MDP 6 55.76225638058815 30
average reward exploit: MDP 6 55.78269398097506 30
average reward exploit: MDP 6 55.62097702687228 30
average reward exploit: MDP 6 56.082226893567814 30
average reward explore: 62.287502785268344
average reward exploit: MDP 7 52.84672498951146 30
average reward exploit: MDP 7 52.88641593496219 30
average reward exploit: MDP 7 52.88218999878908 30
average reward exploit: MDP 7 52.90525903278773 30
average reward exploit: MDP 7 52.66584531514378 30
average reward exploit: MDP 7 52.910364137446685 30
average reward exploit: MDP 7 52.8440120576519 30
average reward exploit: MDP 7 53.0273674137784 30
average reward exploit: MDP 7 52.8630359395874 30
average reward exploit: MDP 7 52.861465367298884 30
average reward explore: 61.65305472792582
average reward exploit: MDP 8 48.8453589970785 30
average reward exploit: MDP 8 48.76667645997182 3

average reward exploit: MDP 1 65.8756659234459 30
average reward exploit: MDP 1 65.53346563589248 30
average reward exploit: MDP 1 65.57684210911366 30
average reward exploit: MDP 1 66.32499294202765 30
average reward exploit: MDP 1 65.75984436767605 30
average reward exploit: MDP 1 66.00483249817104 30
average reward exploit: MDP 1 66.02286849959422 30
average reward exploit: MDP 1 65.59761115922183 30
average reward explore: 76.25531602933805
average reward exploit: MDP 2 61.28184135887662 30
average reward exploit: MDP 2 61.614121205092125 30
average reward exploit: MDP 2 61.74858226914988 30
average reward exploit: MDP 2 62.69252874141947 30
average reward exploit: MDP 2 61.452832263023105 30
average reward exploit: MDP 2 61.877272110178616 30
average reward exploit: MDP 2 62.18170638661078 30
average reward exploit: MDP 2 62.43946765984698 30
average reward exploit: MDP 2 61.90240929066512 30
average reward exploit: MDP 2 61.944524035599265 30
average reward explore: 72.4734947023

average reward exploit: MDP 6 53.874440656604655 30
average reward exploit: MDP 6 53.919621697767525 30
average reward exploit: MDP 6 53.897133325422836 30
average reward exploit: MDP 6 53.914501595776734 30
average reward exploit: MDP 6 53.89329582579921 30
average reward exploit: MDP 6 53.88654704170142 30
average reward exploit: MDP 6 53.91170498717155 30
average reward exploit: MDP 6 53.88227171305362 30
average reward exploit: MDP 6 53.89438255769928 30
average reward exploit: MDP 6 53.88443976493643 30
average reward explore: 65.26182776427241
average reward exploit: MDP 7 51.842765712406816 30
average reward exploit: MDP 7 51.22108250472043 30
average reward exploit: MDP 7 51.52018857136415 30
average reward exploit: MDP 7 52.473961489120924 30
average reward exploit: MDP 7 53.25836156309522 30
average reward exploit: MDP 7 53.594050009746795 30
average reward exploit: MDP 7 53.90707573486154 30
average reward exploit: MDP 7 53.8962102490761 30
average reward exploit: MDP 7 53.5

average reward exploit: MDP 0 386.25987770998086 30
average reward exploit: MDP 0 359.7420281425873 30
average reward explore: 83.04218112748009
average reward exploit: MDP 1 232.50758761272363 30
average reward exploit: MDP 1 236.50507044151658 30
average reward exploit: MDP 1 233.78978858526813 30
average reward exploit: MDP 1 249.7949313700966 30
average reward exploit: MDP 1 251.90128509305447 30
average reward exploit: MDP 1 250.17621193773314 30
average reward exploit: MDP 1 242.53987231909403 30
average reward exploit: MDP 1 120.91870625326125 30
average reward exploit: MDP 1 111.79232059638804 30
average reward exploit: MDP 1 104.31697752140985 30
average reward explore: 77.1132100241462
average reward exploit: MDP 2 226.09790190280185 30
average reward exploit: MDP 2 227.21858270662287 30
average reward exploit: MDP 2 224.00135220499433 30
average reward exploit: MDP 2 226.1317412422479 30
average reward exploit: MDP 2 228.31382958895657 30
average reward exploit: MDP 2 224.37

average reward exploit: MDP 5 211.83109813091085 30
average reward exploit: MDP 5 213.48518091381345 30
average reward exploit: MDP 5 212.60035457402518 30
average reward exploit: MDP 5 212.75144232899643 30
average reward exploit: MDP 5 210.47092959994208 30
average reward explore: 66.48212175700269
average reward exploit: MDP 6 201.72769906818436 30
average reward exploit: MDP 6 202.5571509882361 30
average reward exploit: MDP 6 197.59813686591633 30
average reward exploit: MDP 6 197.09128753013184 30
average reward exploit: MDP 6 199.94787948405786 30
average reward exploit: MDP 6 205.06391195988812 30
average reward exploit: MDP 6 210.68842855459488 30
average reward exploit: MDP 6 217.56718911138998 30
average reward exploit: MDP 6 209.8288788529774 30
average reward exploit: MDP 6 211.45935379978042 30
average reward explore: 62.15721439169188
average reward exploit: MDP 7 185.6824125421094 30
average reward exploit: MDP 7 186.09924852234192 30
average reward exploit: MDP 7 194.2

average reward exploit: MDP 0 324.89531232266563 30
average reward exploit: MDP 0 324.61271387071105 30
average reward exploit: MDP 0 326.03537024261755 30
average reward exploit: MDP 0 328.1600659231347 30
average reward exploit: MDP 0 322.96524762237476 30
average reward exploit: MDP 0 322.79539247215934 30
average reward exploit: MDP 0 326.31125940068466 30
average reward exploit: MDP 0 325.44659202980324 30
average reward exploit: MDP 0 325.6705906248579 30
average reward explore: 82.34775765026988
average reward exploit: MDP 1 299.1274040182278 30
average reward exploit: MDP 1 297.129265821815 30
average reward exploit: MDP 1 299.3302019061337 30
average reward exploit: MDP 1 298.07664577983525 30
average reward exploit: MDP 1 299.2989319704341 30
average reward exploit: MDP 1 299.0005136454756 30
average reward exploit: MDP 1 300.5090357673147 30
average reward exploit: MDP 1 296.0255976027523 30
average reward exploit: MDP 1 298.99310504215987 30
average reward exploit: MDP 1 29

average reward exploit: MDP 4 250.81277901254956 30
average reward explore: 65.87326240598759
average reward exploit: MDP 5 240.91819928218965 30
average reward exploit: MDP 5 240.91133923671563 30
average reward exploit: MDP 5 243.1512061181913 30
average reward exploit: MDP 5 240.10023981754787 30
average reward exploit: MDP 5 241.9407418297889 30
average reward exploit: MDP 5 239.83886935958574 30
average reward exploit: MDP 5 241.36375262546588 30
average reward exploit: MDP 5 242.33843480816992 30
average reward exploit: MDP 5 244.7881755927616 30
average reward exploit: MDP 5 241.30346851852156 30
average reward explore: 62.27738525249196
average reward exploit: MDP 6 232.8654343726406 30
average reward exploit: MDP 6 231.94821379976204 30
average reward exploit: MDP 6 232.07915392612932 30
average reward exploit: MDP 6 233.1526532588035 30
average reward exploit: MDP 6 233.27985524015614 30
average reward exploit: MDP 6 232.6079056935857 30
average reward exploit: MDP 6 233.6931

average reward exploit: MDP 9 134.804335670095 30
average reward exploit: MDP 9 126.15651126291077 30
average reward exploit: MDP 9 148.4071786057567 30
average reward exploit: MDP 9 69.22639463665368 30
epoch number:  37
average reward explore: 89.81528288664813
average reward exploit: MDP 0 107.04506643308044 30
average reward exploit: MDP 0 100.2774045625845 30
average reward exploit: MDP 0 104.69164361354885 30
average reward exploit: MDP 0 102.17593873864611 30
average reward exploit: MDP 0 101.05867495219348 30
average reward exploit: MDP 0 98.19419210749616 30
average reward exploit: MDP 0 96.56243143498891 30
average reward exploit: MDP 0 95.35791169637858 30
average reward exploit: MDP 0 96.24695580219478 30
average reward exploit: MDP 0 104.14011933892478 30
average reward explore: 81.99401198855557
average reward exploit: MDP 1 157.41028876858132 30
average reward exploit: MDP 1 93.68072906442696 30
average reward exploit: MDP 1 94.45227255890985 30
average reward exploit: M

average reward exploit: MDP 4 69.56955944081506 30
average reward exploit: MDP 4 70.9308554571985 30
average reward exploit: MDP 4 68.88072227106844 30
average reward exploit: MDP 4 67.87368819978653 30
average reward exploit: MDP 4 68.23867821715595 30
average reward exploit: MDP 4 68.2392080749242 30
average reward explore: 67.76045338779444
average reward exploit: MDP 5 67.93223731737821 30
average reward exploit: MDP 5 67.99345496869594 30
average reward exploit: MDP 5 67.91921415008657 30
average reward exploit: MDP 5 67.81098786520582 30
average reward exploit: MDP 5 67.71629332229288 30
average reward exploit: MDP 5 67.4842175549092 30
average reward exploit: MDP 5 67.31379652035876 30
average reward exploit: MDP 5 66.81130078258788 30
average reward exploit: MDP 5 66.75520525004805 30
average reward exploit: MDP 5 65.84478500710571 30
average reward explore: 64.71460430398285
average reward exploit: MDP 6 67.20774724753926 30
average reward exploit: MDP 6 67.89898158141605 30
a

average reward exploit: MDP 9 59.989231137447774 30
average reward exploit: MDP 9 60.43186638509222 30
average reward exploit: MDP 9 61.04650908811573 30
average reward exploit: MDP 9 61.34070805838742 30
average reward exploit: MDP 9 59.16339893798556 30
average reward exploit: MDP 9 59.967035803611545 30
average reward exploit: MDP 9 59.95692667328846 30
average reward exploit: MDP 9 59.68302420599309 30
epoch number:  40
average reward explore: 91.95693811396153
average reward exploit: MDP 0 80.69983715986841 30
average reward exploit: MDP 0 80.61669680160247 30
average reward exploit: MDP 0 80.79898429329559 30
average reward exploit: MDP 0 81.45001958492816 30
average reward exploit: MDP 0 80.31875422787816 30
average reward exploit: MDP 0 82.26433397070754 30
average reward exploit: MDP 0 86.33650789502903 30
average reward exploit: MDP 0 94.07639710195467 30
average reward exploit: MDP 0 93.19696708840702 30
average reward exploit: MDP 0 94.02276051915985 30
average reward explo

average reward exploit: MDP 4 56.06344894071338 30
average reward exploit: MDP 4 56.19051664899383 30
average reward exploit: MDP 4 56.45075960713955 30
average reward exploit: MDP 4 56.05543640981214 30
average reward exploit: MDP 4 55.61173170889588 30
average reward exploit: MDP 4 56.73746143804331 30
average reward exploit: MDP 4 56.35175122979834 30
average reward exploit: MDP 4 56.6376030163939 30
average reward exploit: MDP 4 55.92754726587843 30
average reward exploit: MDP 4 55.71674538108245 30
average reward explore: 67.66638598873574
average reward exploit: MDP 5 53.37658942778737 30
average reward exploit: MDP 5 49.09066824917632 30
average reward exploit: MDP 5 52.048512154508366 30
average reward exploit: MDP 5 51.63884047035679 30
average reward exploit: MDP 5 48.59312698457881 30
average reward exploit: MDP 5 51.47362930199293 30
average reward exploit: MDP 5 53.719900122299016 30
average reward exploit: MDP 5 55.746156193459804 30
average reward exploit: MDP 5 56.32419

average reward exploit: MDP 8 57.317798015921085 30
average reward explore: 60.19298034583309
average reward exploit: MDP 9 52.27765614622422 30
average reward exploit: MDP 9 51.454715211743284 30
average reward exploit: MDP 9 52.423697766944045 30
average reward exploit: MDP 9 51.94269868706298 30
average reward exploit: MDP 9 52.086381784526644 30
average reward exploit: MDP 9 51.64594721414838 30
average reward exploit: MDP 9 52.09091807480449 30
average reward exploit: MDP 9 52.866305422164686 30
average reward exploit: MDP 9 52.42619434409398 30
average reward exploit: MDP 9 52.281820827483834 30
epoch number:  43
average reward explore: 90.06158781570245
average reward exploit: MDP 0 88.229894747884 30
average reward exploit: MDP 0 88.23330750535479 30
average reward exploit: MDP 0 88.29156024386381 30
average reward exploit: MDP 0 87.4855523211877 30
average reward exploit: MDP 0 87.55931674584139 30
average reward exploit: MDP 0 87.5473676307194 30
average reward exploit: MDP 0

average reward exploit: MDP 3 57.98644997204951 30
average reward exploit: MDP 3 57.97823327639739 30
average reward exploit: MDP 3 58.008074864991684 30
average reward explore: 70.69197039419151
average reward exploit: MDP 4 56.1790772414721 30
average reward exploit: MDP 4 55.60787230135879 30
average reward exploit: MDP 4 56.17501424873759 30
average reward exploit: MDP 4 56.060732615404085 30
average reward exploit: MDP 4 56.04778652024869 30
average reward exploit: MDP 4 55.15125203312701 30
average reward exploit: MDP 4 55.887963095264844 30
average reward exploit: MDP 4 55.72002432798215 30
average reward exploit: MDP 4 55.733635948955815 30
average reward exploit: MDP 4 56.46091290399742 30
average reward explore: 67.51049167922612
average reward exploit: MDP 5 63.25830382008182 30
average reward exploit: MDP 5 62.81896168074113 30
average reward exploit: MDP 5 62.83542817923622 30
average reward exploit: MDP 5 62.276291154385596 30
average reward exploit: MDP 5 62.742221497647

average reward exploit: MDP 8 59.71298064614235 30
average reward exploit: MDP 8 60.23787681902227 30
average reward exploit: MDP 8 60.281448732017104 30
average reward exploit: MDP 8 59.846064133224004 30
average reward exploit: MDP 8 60.18596464944246 30
average reward explore: 59.19713570948682
average reward exploit: MDP 9 56.10100868172747 30
average reward exploit: MDP 9 56.06204149521787 30
average reward exploit: MDP 9 57.368488509040255 30
average reward exploit: MDP 9 57.61415708539 30
average reward exploit: MDP 9 59.726486323773116 30
average reward exploit: MDP 9 60.37474782111381 30
average reward exploit: MDP 9 62.14468837581746 30
average reward exploit: MDP 9 62.266919996512726 30
average reward exploit: MDP 9 62.46233496944285 30
average reward exploit: MDP 9 60.97719176909696 30
epoch number:  46
average reward explore: 89.77779467316343
average reward exploit: MDP 0 93.56557292898395 30
average reward exploit: MDP 0 93.86984047666179 30
average reward exploit: MDP 0

average reward exploit: MDP 3 56.39077512411803 30
average reward exploit: MDP 3 57.51383266180236 30
average reward exploit: MDP 3 56.8375162462488 30
average reward exploit: MDP 3 56.307961944889854 30
average reward exploit: MDP 3 57.66356398557566 30
average reward exploit: MDP 3 58.81888763204691 30
average reward exploit: MDP 3 60.92613975223269 30
average reward explore: 71.07453332155994
average reward exploit: MDP 4 66.5845825293182 30
average reward exploit: MDP 4 69.87106299598082 30
average reward exploit: MDP 4 69.21407495590633 30
average reward exploit: MDP 4 70.60943830081582 30
average reward exploit: MDP 4 70.00574822017775 30
average reward exploit: MDP 4 71.01176790909173 30
average reward exploit: MDP 4 70.0828106273839 30
average reward exploit: MDP 4 69.02474534338269 30
average reward exploit: MDP 4 69.32049346036032 30
average reward exploit: MDP 4 69.47660845780264 30
average reward explore: 67.39203585208683
average reward exploit: MDP 5 64.30115245963735 30


average reward exploit: MDP 8 58.00681632854882 30
average reward exploit: MDP 8 57.997265266975475 30
average reward exploit: MDP 8 57.97948682527747 30
average reward exploit: MDP 8 57.97153843242542 30
average reward exploit: MDP 8 57.79621006487411 30
average reward exploit: MDP 8 57.97343643973792 30
average reward exploit: MDP 8 57.64435747716613 30
average reward exploit: MDP 8 57.22033864619572 30
average reward exploit: MDP 8 56.97323014177053 30
average reward explore: 57.824063926482026
average reward exploit: MDP 9 54.70115892119417 30
average reward exploit: MDP 9 54.375291917046766 30
average reward exploit: MDP 9 53.70243308442966 30
average reward exploit: MDP 9 53.786724968663286 30
average reward exploit: MDP 9 54.15913451933008 30
average reward exploit: MDP 9 53.612257900137635 30
average reward exploit: MDP 9 53.32422014123342 30
average reward exploit: MDP 9 53.23506806609903 30
average reward exploit: MDP 9 53.640122484531936 30
average reward exploit: MDP 9 53.5

average reward explore: 74.59951210143545
average reward exploit: MDP 3 72.38574414880624 30
average reward exploit: MDP 3 71.75883829006355 30
average reward exploit: MDP 3 71.38575945096765 30
average reward exploit: MDP 3 72.11449582204554 30
average reward exploit: MDP 3 71.45251781228477 30
average reward exploit: MDP 3 71.23409761627777 30
average reward exploit: MDP 3 71.18697348615136 30
average reward exploit: MDP 3 71.63623769799842 30
average reward exploit: MDP 3 71.71213278753034 30
average reward exploit: MDP 3 71.6560771285288 30
average reward explore: 70.69989895246454
average reward exploit: MDP 4 71.71053607227327 30
average reward exploit: MDP 4 71.82784757395368 30
average reward exploit: MDP 4 71.88122238998425 30
average reward exploit: MDP 4 71.35751172064327 30
average reward exploit: MDP 4 71.49319780669936 30
average reward exploit: MDP 4 71.24643677083972 30
average reward exploit: MDP 4 71.68163918570853 30
average reward exploit: MDP 4 70.3472751927813 30


average reward exploit: MDP 7 66.20195733867772 30
average reward explore: 59.51980342601027
average reward exploit: MDP 8 65.07882037802648 30
average reward exploit: MDP 8 64.62968948798024 30
average reward exploit: MDP 8 64.59973697997653 30
average reward exploit: MDP 8 64.63026829552844 30
average reward exploit: MDP 8 64.45507885338142 30
average reward exploit: MDP 8 63.34421535714362 30
average reward exploit: MDP 8 64.27742950392499 30
average reward exploit: MDP 8 64.65593359066109 30
average reward exploit: MDP 8 65.27762983799414 30
average reward exploit: MDP 8 65.65769049540546 30
average reward explore: 56.51375401501484
average reward exploit: MDP 9 70.36481081769179 30
average reward exploit: MDP 9 73.33706859513993 30
average reward exploit: MDP 9 76.86763652719888 30
average reward exploit: MDP 9 78.51795331138099 30
average reward exploit: MDP 9 79.60394253128315 30
average reward exploit: MDP 9 78.21862880776175 30
average reward exploit: MDP 9 77.70062535934606 3

average reward exploit: MDP 2 78.18254786840447 30
average reward exploit: MDP 2 78.65894163879378 30
average reward exploit: MDP 2 78.05533998608075 30
average reward explore: 75.2736400288379
average reward exploit: MDP 3 75.36889403712107 30
average reward exploit: MDP 3 74.93882068202664 30
average reward exploit: MDP 3 75.93602213054797 30
average reward exploit: MDP 3 75.68283653312524 30
average reward exploit: MDP 3 74.32703113106922 30
average reward exploit: MDP 3 74.81483608816346 30
average reward exploit: MDP 3 74.04054425974462 30
average reward exploit: MDP 3 74.3330122409225 30
average reward exploit: MDP 3 73.85706469965471 30
average reward exploit: MDP 3 73.32630610751039 30
average reward explore: 70.57174493761985
average reward exploit: MDP 4 72.28900758423622 30
average reward exploit: MDP 4 72.20934911005502 30
average reward exploit: MDP 4 71.94986022065625 30
average reward exploit: MDP 4 68.5831253802892 30
average reward exploit: MDP 4 68.62157942081032 30
a

average reward exploit: MDP 7 72.83711496013647 30
average reward exploit: MDP 7 72.5890431143443 30
average reward exploit: MDP 7 73.23446624881895 30
average reward exploit: MDP 7 72.7242897535378 30
average reward explore: 59.10840255264311
average reward exploit: MDP 8 67.75243847771408 30
average reward exploit: MDP 8 68.61415752607925 30
average reward exploit: MDP 8 69.07390016007595 30
average reward exploit: MDP 8 68.88860282701911 30
average reward exploit: MDP 8 68.90388343939752 30
average reward exploit: MDP 8 68.88628250136087 30
average reward exploit: MDP 8 68.6509923664887 30
average reward exploit: MDP 8 68.65926945990007 30
average reward exploit: MDP 8 68.87234934927008 30
average reward exploit: MDP 8 68.59401820942104 30
average reward explore: 56.58450256743939
average reward exploit: MDP 9 64.95215005087422 30
average reward exploit: MDP 9 65.08723006715284 30
average reward exploit: MDP 9 65.09371031401027 30
average reward exploit: MDP 9 65.83977021246355 30
a

average reward exploit: MDP 2 77.73088374730352 30
average reward exploit: MDP 2 77.2498675950663 30
average reward exploit: MDP 2 77.3545239597115 30
average reward exploit: MDP 2 77.68064907050095 30
average reward exploit: MDP 2 77.7387334901444 30
average reward exploit: MDP 2 77.89562562809407 30
average reward explore: 75.13049911686699
average reward exploit: MDP 3 73.78786659553175 30
average reward exploit: MDP 3 74.00385330881554 30
average reward exploit: MDP 3 73.74113775534731 30
average reward exploit: MDP 3 73.46205156146243 30
average reward exploit: MDP 3 73.3228273795912 30
average reward exploit: MDP 3 74.12035077614516 30
average reward exploit: MDP 3 73.65480043328134 30
average reward exploit: MDP 3 73.63421260457224 30
average reward exploit: MDP 3 73.7244899680084 30
average reward exploit: MDP 3 73.68915101673056 30
average reward explore: 70.53537960461136
average reward exploit: MDP 4 63.83226848405387 30
average reward exploit: MDP 4 64.5528351919259 30
aver

average reward exploit: MDP 7 67.59095916944301 30
average reward exploit: MDP 7 68.24734041882559 30
average reward exploit: MDP 7 68.79425894609011 30
average reward exploit: MDP 7 68.95856910412287 30
average reward exploit: MDP 7 68.7866202919793 30
average reward exploit: MDP 7 68.86543967995615 30
average reward exploit: MDP 7 68.05371171548127 30
average reward explore: 58.987568717341674
average reward exploit: MDP 8 59.10633099229138 30
average reward exploit: MDP 8 58.78245735851574 30
average reward exploit: MDP 8 58.98657285497147 30
average reward exploit: MDP 8 60.47499671830024 30
average reward exploit: MDP 8 60.04198496919894 30
average reward exploit: MDP 8 60.19341924624779 30
average reward exploit: MDP 8 60.16555024130605 30
average reward exploit: MDP 8 60.052242253025355 30
average reward exploit: MDP 8 59.833093541218766 30
average reward exploit: MDP 8 55.64795009357461 30
average reward explore: 56.46998262239832
average reward exploit: MDP 9 52.6685967121956 

average reward exploit: MDP 2 61.09999906316662 30
average reward exploit: MDP 2 59.636042954812226 30
average reward exploit: MDP 2 62.47546182283561 30
average reward exploit: MDP 2 61.916368491606754 30
average reward exploit: MDP 2 63.337393768611214 30
average reward exploit: MDP 2 61.613414367328474 30
average reward exploit: MDP 2 61.57201974337389 30
average reward exploit: MDP 2 61.57006050964437 30
average reward exploit: MDP 2 60.29760019322989 30
average reward explore: 74.92308242492506
average reward exploit: MDP 3 56.02287052110522 30
average reward exploit: MDP 3 55.7232732202103 30
average reward exploit: MDP 3 55.5436895967528 30
average reward exploit: MDP 3 55.430348495143804 30
average reward exploit: MDP 3 56.14616067791637 30
average reward exploit: MDP 3 55.25966897120165 30
average reward exploit: MDP 3 55.65887328638277 30
average reward exploit: MDP 3 55.87784870931191 30
average reward exploit: MDP 3 55.73531726894847 30
average reward exploit: MDP 3 55.3856

average reward explore: 62.94411816539762
average reward exploit: MDP 7 78.7982995844574 30
average reward exploit: MDP 7 78.5898280650522 30
average reward exploit: MDP 7 77.36870715411736 30
average reward exploit: MDP 7 78.09366706897738 30
average reward exploit: MDP 7 78.99972873587451 30
average reward exploit: MDP 7 77.96604870171225 30
average reward exploit: MDP 7 79.0201358638081 30
average reward exploit: MDP 7 77.2510814618585 30
average reward exploit: MDP 7 78.13479851575603 30
average reward exploit: MDP 7 77.63094628363032 30
average reward explore: 61.00364939015942
average reward exploit: MDP 8 64.37496712215996 30
average reward exploit: MDP 8 64.38258407050829 30
average reward exploit: MDP 8 64.3659353547691 30
average reward exploit: MDP 8 64.40411076608612 30
average reward exploit: MDP 8 64.42135641426596 30
average reward exploit: MDP 8 64.40955621242986 30
average reward exploit: MDP 8 64.43731248213018 30
average reward exploit: MDP 8 64.55953486628732 30
ave

average reward exploit: MDP 1 120.54081386999982 30
average reward exploit: MDP 1 121.21320852211207 30
average reward explore: 77.77017089938468
average reward exploit: MDP 2 117.24530913831214 30
average reward exploit: MDP 2 112.43011289369619 30
average reward exploit: MDP 2 114.32852042369096 30
average reward exploit: MDP 2 114.88899022492225 30
average reward exploit: MDP 2 105.56840524707708 30
average reward exploit: MDP 2 109.54634006833857 30
average reward exploit: MDP 2 107.7688998695263 30
average reward exploit: MDP 2 106.42220013866617 30
average reward exploit: MDP 2 106.4382220391155 30
average reward exploit: MDP 2 105.2890188103194 30
average reward explore: 75.24671296693002
average reward exploit: MDP 3 101.00293648048327 30
average reward exploit: MDP 3 100.54980711181716 30
average reward exploit: MDP 3 101.93193484575562 30
average reward exploit: MDP 3 99.64337333992106 30
average reward exploit: MDP 3 100.84986620011493 30
average reward exploit: MDP 3 101.66

average reward exploit: MDP 6 85.62372010835027 30
average reward exploit: MDP 6 85.14377166776963 30
average reward exploit: MDP 6 85.83707174925856 30
average reward exploit: MDP 6 86.5931537007163 30
average reward explore: 59.49739082781821
average reward exploit: MDP 7 85.36572603317966 30
average reward exploit: MDP 7 83.92767013901646 30
average reward exploit: MDP 7 86.10622912223947 30
average reward exploit: MDP 7 84.81740354153577 30
average reward exploit: MDP 7 84.64165644164369 30
average reward exploit: MDP 7 86.9244018537551 30
average reward exploit: MDP 7 83.60052592377494 30
average reward exploit: MDP 7 84.0953568071966 30
average reward exploit: MDP 7 82.64462178536206 30
average reward exploit: MDP 7 85.96414686020232 30
average reward explore: 58.167422948021844
average reward exploit: MDP 8 78.91256212406857 30
average reward exploit: MDP 8 79.12122361481937 30
average reward exploit: MDP 8 79.10871892125033 30
average reward exploit: MDP 8 79.03264432281112 30


average reward exploit: MDP 1 62.08411343524146 30
average reward exploit: MDP 1 62.29290922622751 30
average reward exploit: MDP 1 61.86820090660463 30
average reward exploit: MDP 1 61.87613128600508 30
average reward exploit: MDP 1 62.268469673370895 30
average reward exploit: MDP 1 61.532323883985654 30
average reward explore: 71.07771715128872
average reward exploit: MDP 2 58.62199262702016 30
average reward exploit: MDP 2 58.31147867292318 30
average reward exploit: MDP 2 58.46890430594595 30
average reward exploit: MDP 2 57.964348938992984 30
average reward exploit: MDP 2 58.104745055567314 30
average reward exploit: MDP 2 58.22715537330105 30
average reward exploit: MDP 2 58.014126570412216 30
average reward exploit: MDP 2 57.97872123232902 30
average reward exploit: MDP 2 58.19853436761773 30
average reward exploit: MDP 2 57.44413235752514 30
average reward explore: 66.816069698353
average reward exploit: MDP 3 55.41223098774645 30
average reward exploit: MDP 3 56.1519651378354

average reward exploit: MDP 6 47.346440463709605 30
average reward exploit: MDP 6 47.76125136616541 30
average reward exploit: MDP 6 48.08035879985978 30
average reward exploit: MDP 6 47.33223608117427 30
average reward exploit: MDP 6 47.905556913260426 30
average reward exploit: MDP 6 47.850672706189 30
average reward exploit: MDP 6 48.11960800297593 30
average reward exploit: MDP 6 47.333912037249576 30
average reward explore: 60.92834572233249
average reward exploit: MDP 7 47.56919965826273 30
average reward exploit: MDP 7 47.94409545298579 30
average reward exploit: MDP 7 47.319157341313684 30
average reward exploit: MDP 7 47.910641648734796 30
average reward exploit: MDP 7 47.92820241449985 30
average reward exploit: MDP 7 47.50595840800326 30
average reward exploit: MDP 7 47.35483951058872 30
average reward exploit: MDP 7 47.82555670804374 30
average reward exploit: MDP 7 47.732051363124455 30
average reward exploit: MDP 7 47.68997816130862 30
average reward explore: 60.911659945

average reward explore: 78.99959486054655
average reward exploit: MDP 1 61.87187368739107 30
average reward exploit: MDP 1 61.133369608964834 30
average reward exploit: MDP 1 61.70701684425297 30
average reward exploit: MDP 1 61.82774867956149 30
average reward exploit: MDP 1 62.29379024419313 30
average reward exploit: MDP 1 62.27957634433257 30
average reward exploit: MDP 1 61.6445118874888 30
average reward exploit: MDP 1 62.072719922164474 30
average reward exploit: MDP 1 62.07583352056803 30
average reward exploit: MDP 1 62.04401172000916 30
average reward explore: 74.45102050501376
average reward exploit: MDP 2 57.7815570998408 30
average reward exploit: MDP 2 58.02822003963537 30
average reward exploit: MDP 2 57.80713054871792 30
average reward exploit: MDP 2 58.445670658118196 30
average reward exploit: MDP 2 58.27514759705472 30
average reward exploit: MDP 2 57.7393751009629 30
average reward exploit: MDP 2 57.77270949915604 30
average reward exploit: MDP 2 59.17821648923018 3

average reward exploit: MDP 5 52.12448638325881 30
average reward exploit: MDP 5 50.536303032992265 30
average reward explore: 60.21456586059933
average reward exploit: MDP 6 47.547523175155995 30
average reward exploit: MDP 6 47.939882387046424 30
average reward exploit: MDP 6 47.69035887463352 30
average reward exploit: MDP 6 47.65694599681303 30
average reward exploit: MDP 6 47.878251754078974 30
average reward exploit: MDP 6 46.756776948738285 30
average reward exploit: MDP 6 47.55297231531516 30
average reward exploit: MDP 6 48.254542625401925 30
average reward exploit: MDP 6 47.58925832551335 30
average reward exploit: MDP 6 48.06892563464263 30
average reward explore: 59.71702514084509
average reward exploit: MDP 7 48.174059558462275 30
average reward exploit: MDP 7 47.906938254618055 30
average reward exploit: MDP 7 47.88296113159962 30
average reward exploit: MDP 7 47.33371675839291 30
average reward exploit: MDP 7 47.743445477304824 30
average reward exploit: MDP 7 47.6235876

average reward exploit: MDP 0 65.87043779402656 30
average reward exploit: MDP 0 66.64141420663825 30
average reward exploit: MDP 0 66.00694442847238 30
average reward exploit: MDP 0 66.30875665285907 30
average reward explore: 81.2928977646722
average reward exploit: MDP 1 62.87591903499091 30
average reward exploit: MDP 1 62.43039806346481 30
average reward exploit: MDP 1 60.858930691265144 30
average reward exploit: MDP 1 59.219231310003174 30
average reward exploit: MDP 1 61.7180959032105 30
average reward exploit: MDP 1 63.86341972585133 30
average reward exploit: MDP 1 64.6695068479068 30
average reward exploit: MDP 1 64.54071376049197 30
average reward exploit: MDP 1 64.80209327449467 30
average reward exploit: MDP 1 65.33960404656543 30
average reward explore: 75.13980208740766
average reward exploit: MDP 2 62.30129627840702 30
average reward exploit: MDP 2 61.47352682166263 30
average reward exploit: MDP 2 61.175841083372326 30
average reward exploit: MDP 2 61.56922594309792 3

average reward exploit: MDP 5 50.512033373372724 30
average reward exploit: MDP 5 50.932746410859586 30
average reward exploit: MDP 5 50.23927751860968 30
average reward exploit: MDP 5 50.06593520560337 30
average reward exploit: MDP 5 50.08097408454937 30
average reward exploit: MDP 5 50.376727153521195 30
average reward explore: 62.64590137488541
average reward exploit: MDP 6 47.943917991634244 30
average reward exploit: MDP 6 47.65893944999301 30
average reward exploit: MDP 6 48.13044110549725 30
average reward exploit: MDP 6 47.727807936411864 30
average reward exploit: MDP 6 48.083884463425015 30
average reward exploit: MDP 6 47.96797264853305 30
average reward exploit: MDP 6 47.272246523856204 30
average reward exploit: MDP 6 48.04024432346107 30
average reward exploit: MDP 6 47.937248995506586 30
average reward exploit: MDP 6 48.300828177275015 30
average reward explore: 60.15429443094069
average reward exploit: MDP 7 41.556494008380085 30
average reward exploit: MDP 7 41.421193

average reward exploit: MDP 0 130.43990331468385 30
average reward exploit: MDP 0 129.61571832115277 30
average reward exploit: MDP 0 129.1776712873749 30
average reward exploit: MDP 0 131.74979503358242 30
average reward exploit: MDP 0 130.4124550183155 30
average reward exploit: MDP 0 132.28383291875278 30
average reward exploit: MDP 0 130.55715265317184 30
average reward exploit: MDP 0 127.61132782080396 30
average reward explore: 78.16771649226332
average reward exploit: MDP 1 115.0542307935277 30
average reward exploit: MDP 1 115.70034562044029 30
average reward exploit: MDP 1 114.17301874678577 30
average reward exploit: MDP 1 114.45129813397125 30
average reward exploit: MDP 1 115.79640871904786 30
average reward exploit: MDP 1 114.63797500776774 30
average reward exploit: MDP 1 114.36417134014667 30
average reward exploit: MDP 1 115.62863473869534 30
average reward exploit: MDP 1 114.6945745801505 30
average reward exploit: MDP 1 114.11274000924593 30
average reward explore: 72

average reward explore: 63.945032259798246
average reward exploit: MDP 5 94.64272875527604 30
average reward exploit: MDP 5 96.33352593142556 30
average reward exploit: MDP 5 96.72095034548283 30
average reward exploit: MDP 5 97.03337961782809 30
average reward exploit: MDP 5 96.52096720768833 30
average reward exploit: MDP 5 97.07563385244183 30
average reward exploit: MDP 5 95.55922049729772 30
average reward exploit: MDP 5 95.15420775829092 30
average reward exploit: MDP 5 95.9980387581912 30
average reward exploit: MDP 5 95.05072764794059 30
average reward explore: 60.68592990447158
average reward exploit: MDP 6 86.4614851191499 30
average reward exploit: MDP 6 86.46315818899782 30
average reward exploit: MDP 6 86.46814403566385 30
average reward exploit: MDP 6 86.37480421565697 30
average reward exploit: MDP 6 86.30326746979038 30
average reward exploit: MDP 6 86.29429615294022 30
average reward exploit: MDP 6 86.42282073674484 30
average reward exploit: MDP 6 86.22740194297913 30

average reward exploit: MDP 9 81.14531355183765 30
average reward exploit: MDP 9 80.90232487082956 30
epoch number:  77
average reward explore: 66.87548938085406
average reward exploit: MDP 0 132.69941300395246 30
average reward exploit: MDP 0 131.49053427751264 30
average reward exploit: MDP 0 131.07046780922772 30
average reward exploit: MDP 0 132.24175563014748 30
average reward exploit: MDP 0 134.9277136108337 30
average reward exploit: MDP 0 133.69818004453475 30
average reward exploit: MDP 0 136.66180468779064 30
average reward exploit: MDP 0 132.91636413739872 30
average reward exploit: MDP 0 133.92095934773076 30
average reward exploit: MDP 0 133.81942676233294 30
average reward explore: 61.80265431646444
average reward exploit: MDP 1 119.01450018978413 30
average reward exploit: MDP 1 117.95125794546666 30
average reward exploit: MDP 1 118.60188794884601 30
average reward exploit: MDP 1 119.14121216711993 30
average reward exploit: MDP 1 118.57988059599357 30
average reward ex

average reward exploit: MDP 4 109.00558902592189 30
average reward exploit: MDP 4 108.7645371155591 30
average reward exploit: MDP 4 108.94676271745102 30
average reward exploit: MDP 4 107.65917016852947 30
average reward exploit: MDP 4 108.54608071476294 30
average reward explore: 47.77770742377751
average reward exploit: MDP 5 95.43258485758483 30
average reward exploit: MDP 5 96.71833949422407 30
average reward exploit: MDP 5 95.7173848984828 30
average reward exploit: MDP 5 94.59256220079223 30
average reward exploit: MDP 5 94.4431589606854 30
average reward exploit: MDP 5 95.02182398276528 30
average reward exploit: MDP 5 94.18232198038791 30
average reward exploit: MDP 5 95.80413487745123 30
average reward exploit: MDP 5 95.53878557349783 30
average reward exploit: MDP 5 94.47859014444275 30
average reward explore: 46.65025691027067
average reward exploit: MDP 6 88.34605652457316 30
average reward exploit: MDP 6 88.15581735140437 30
average reward exploit: MDP 6 88.6060310996755 

average reward exploit: MDP 9 81.88962610668285 30
average reward exploit: MDP 9 82.15083946879001 30
average reward exploit: MDP 9 80.8521228042111 30
average reward exploit: MDP 9 82.05385953452199 30
average reward exploit: MDP 9 82.3030824085997 30
average reward exploit: MDP 9 82.19135944150132 30
average reward exploit: MDP 9 83.08272184340873 30
epoch number:  80
average reward explore: 69.90353833324069
average reward exploit: MDP 0 141.22104185251425 30
average reward exploit: MDP 0 142.16253630814288 30
average reward exploit: MDP 0 141.53215855594735 30
average reward exploit: MDP 0 141.71519908264176 30
average reward exploit: MDP 0 141.57358910274127 30
average reward exploit: MDP 0 142.57077306503624 30
average reward exploit: MDP 0 141.74531106259565 30
average reward exploit: MDP 0 141.84077667622353 30
average reward exploit: MDP 0 143.68566106225612 30
average reward exploit: MDP 0 142.4420102275666 30
average reward explore: 66.32293115470401
average reward exploit: 

average reward exploit: MDP 4 107.82740975733503 30
average reward exploit: MDP 4 109.0281909515916 30
average reward exploit: MDP 4 108.69910475982094 30
average reward exploit: MDP 4 109.87805126818168 30
average reward exploit: MDP 4 108.92411752739874 30
average reward exploit: MDP 4 108.95800968453055 30
average reward exploit: MDP 4 108.81716166968243 30
average reward exploit: MDP 4 110.09996188823217 30
average reward exploit: MDP 4 110.05763192757404 30
average reward exploit: MDP 4 108.7206227033882 30
average reward explore: 55.21418497690889
average reward exploit: MDP 5 94.2452627322374 30
average reward exploit: MDP 5 97.25638819409713 30
average reward exploit: MDP 5 97.95783801561333 30
average reward exploit: MDP 5 96.14825377417978 30
average reward exploit: MDP 5 95.8112240100562 30
average reward exploit: MDP 5 93.87329278098902 30
average reward exploit: MDP 5 96.02019876701578 30
average reward exploit: MDP 5 95.77732317647379 30
average reward exploit: MDP 5 98.1

average reward exploit: MDP 8 87.95817309237938 30
average reward exploit: MDP 8 88.77910401650645 30
average reward explore: 60.35230300415294
average reward exploit: MDP 9 82.33793310189486 30
average reward exploit: MDP 9 81.58551901150837 30
average reward exploit: MDP 9 82.61414787921811 30
average reward exploit: MDP 9 81.40149187421417 30
average reward exploit: MDP 9 81.94177320679792 30
average reward exploit: MDP 9 80.17718343650857 30
average reward exploit: MDP 9 80.39264405285834 30
average reward exploit: MDP 9 81.5058746634987 30
average reward exploit: MDP 9 82.38096581392999 30
average reward exploit: MDP 9 82.09007647950608 30
epoch number:  83
average reward explore: 85.1710413826327
average reward exploit: MDP 0 142.04387516852634 30
average reward exploit: MDP 0 142.68912600615215 30
average reward exploit: MDP 0 141.25152980996307 30
average reward exploit: MDP 0 141.89108880555005 30
average reward exploit: MDP 0 141.66337272726724 30
average reward exploit: MDP 

average reward exploit: MDP 3 105.56609022045501 30
average reward exploit: MDP 3 105.35611404791116 30
average reward exploit: MDP 3 103.80144949619297 30
average reward exploit: MDP 3 105.49061433516131 30
average reward exploit: MDP 3 103.62606184568486 30
average reward explore: 67.75091260682258
average reward exploit: MDP 4 108.89597172445552 30
average reward exploit: MDP 4 107.3944720704076 30
average reward exploit: MDP 4 106.51876662935096 30
average reward exploit: MDP 4 108.39276218049551 30
average reward exploit: MDP 4 108.61283232830806 30
average reward exploit: MDP 4 107.51891437366943 30
average reward exploit: MDP 4 106.45256631965523 30
average reward exploit: MDP 4 107.74470852127129 30
average reward exploit: MDP 4 109.03414650242225 30
average reward exploit: MDP 4 107.34876471772246 30
average reward explore: 64.63569460401727
average reward exploit: MDP 5 93.01528025486134 30
average reward exploit: MDP 5 93.74954144618872 30
average reward exploit: MDP 5 96.37

average reward exploit: MDP 8 86.93818390414857 30
average reward exploit: MDP 8 88.05108616968924 30
average reward exploit: MDP 8 87.97745367536439 30
average reward exploit: MDP 8 88.01452919175003 30
average reward exploit: MDP 8 86.82059285982304 30
average reward exploit: MDP 8 89.39075092838635 30
average reward exploit: MDP 8 88.17252068760605 30
average reward explore: 59.33235425938771
average reward exploit: MDP 9 81.90021418653161 30
average reward exploit: MDP 9 81.36964216258475 30
average reward exploit: MDP 9 82.03146878409358 30
average reward exploit: MDP 9 81.43305711463687 30
average reward exploit: MDP 9 82.41700113751236 30
average reward exploit: MDP 9 82.36504770319357 30
average reward exploit: MDP 9 81.00953676574528 30
average reward exploit: MDP 9 81.58140755578911 30
average reward exploit: MDP 9 83.07630762160251 30
average reward exploit: MDP 9 81.16965022948375 30
epoch number:  86
average reward explore: 83.49343475053138
average reward exploit: MDP 0 1

average reward exploit: MDP 3 115.23849787384206 30
average reward exploit: MDP 3 115.2274955108842 30
average reward exploit: MDP 3 115.57905010399644 30
average reward exploit: MDP 3 115.51140086150366 30
average reward exploit: MDP 3 115.32698300540432 30
average reward exploit: MDP 3 115.17097254170477 30
average reward exploit: MDP 3 115.41313233724507 30
average reward exploit: MDP 3 115.77279394937106 30
average reward exploit: MDP 3 115.64917036732064 30
average reward exploit: MDP 3 115.38655666653226 30
average reward explore: 68.45918134429066
average reward exploit: MDP 4 109.44059858220369 30
average reward exploit: MDP 4 109.53593244792827 30
average reward exploit: MDP 4 109.83914348989985 30
average reward exploit: MDP 4 109.60922389890604 30
average reward exploit: MDP 4 109.33828911244535 30
average reward exploit: MDP 4 110.11738985152549 30
average reward exploit: MDP 4 109.767826015453 30
average reward exploit: MDP 4 109.61335228985133 30
average reward exploit: M

average reward exploit: MDP 7 90.82823307833186 30
average reward exploit: MDP 7 92.70052974728695 30
average reward explore: 57.07092600180383
average reward exploit: MDP 8 89.24466791345584 30
average reward exploit: MDP 8 89.57953335160184 30
average reward exploit: MDP 8 89.61743656864665 30
average reward exploit: MDP 8 89.58931722388 30
average reward exploit: MDP 8 89.79394163497012 30
average reward exploit: MDP 8 89.04803515753643 30
average reward exploit: MDP 8 89.8228381925346 30
average reward exploit: MDP 8 89.70737590479308 30
average reward exploit: MDP 8 89.69112747072654 30
average reward exploit: MDP 8 89.47367597868666 30
average reward explore: 52.767824328634774
average reward exploit: MDP 9 81.08035833774876 30
average reward exploit: MDP 9 82.93471843282423 30
average reward exploit: MDP 9 81.71047410226541 30
average reward exploit: MDP 9 81.1347159707933 30
average reward exploit: MDP 9 81.31990609542927 30
average reward exploit: MDP 9 81.01792482476357 30
av

average reward exploit: MDP 2 116.68106722352213 30
average reward exploit: MDP 2 118.71071230094165 30
average reward exploit: MDP 2 119.41825439126121 30
average reward exploit: MDP 2 120.3432338750882 30
average reward exploit: MDP 2 115.98500767349248 30
average reward explore: 72.87994923546489
average reward exploit: MDP 3 115.73916559131891 30
average reward exploit: MDP 3 114.91700558818057 30
average reward exploit: MDP 3 115.06162189481573 30
average reward exploit: MDP 3 115.32457249178789 30
average reward exploit: MDP 3 115.31285432075019 30
average reward exploit: MDP 3 115.03193160774359 30
average reward exploit: MDP 3 115.91717698166735 30
average reward exploit: MDP 3 115.51986391920049 30
average reward exploit: MDP 3 115.3148760527939 30
average reward exploit: MDP 3 115.34411456078043 30
average reward explore: 68.18599868572053
average reward exploit: MDP 4 108.80025207723472 30
average reward exploit: MDP 4 109.40874258493206 30
average reward exploit: MDP 4 108.

average reward exploit: MDP 7 96.15116982906562 30
average reward exploit: MDP 7 96.29437706365766 30
average reward exploit: MDP 7 96.47852163151127 30
average reward exploit: MDP 7 96.35231904343917 30
average reward exploit: MDP 7 96.58820432739601 30
average reward exploit: MDP 7 96.4941421768414 30
average reward exploit: MDP 7 96.34537272500985 30
average reward exploit: MDP 7 96.27166719594679 30
average reward explore: 55.59577558636607
average reward exploit: MDP 8 89.55999291120376 30
average reward exploit: MDP 8 89.86491518816057 30
average reward exploit: MDP 8 89.62845681475184 30
average reward exploit: MDP 8 89.97742496763769 30
average reward exploit: MDP 8 89.67484277062802 30
average reward exploit: MDP 8 89.77641412943295 30
average reward exploit: MDP 8 90.09556079953497 30
average reward exploit: MDP 8 89.97782681455267 30
average reward exploit: MDP 8 89.6409489275325 30
average reward exploit: MDP 8 89.70057796632342 30
average reward explore: 51.81565529591709


average reward explore: 77.49069413345781
average reward exploit: MDP 2 122.56594337436313 30
average reward exploit: MDP 2 121.9409616989155 30
average reward exploit: MDP 2 122.97896043718696 30
average reward exploit: MDP 2 122.86980706447737 30
average reward exploit: MDP 2 122.37087767288527 30
average reward exploit: MDP 2 121.53482840332892 30
average reward exploit: MDP 2 122.45280111132256 30
average reward exploit: MDP 2 122.14919205998648 30
average reward exploit: MDP 2 121.63825926454881 30
average reward exploit: MDP 2 122.10961843775928 30
average reward explore: 73.05627700932206
average reward exploit: MDP 3 115.32432094470921 30
average reward exploit: MDP 3 115.47103569116182 30
average reward exploit: MDP 3 115.29714646090908 30
average reward exploit: MDP 3 115.6767154517528 30
average reward exploit: MDP 3 115.26657891156265 30
average reward exploit: MDP 3 115.24679009219635 30
average reward exploit: MDP 3 115.06032921158224 30
average reward exploit: MDP 3 115.

average reward exploit: MDP 6 61.08802630358381 30
average reward exploit: MDP 6 57.30970160162106 30
average reward exploit: MDP 6 58.14971576325573 30
average reward explore: 60.37608424668195
average reward exploit: MDP 7 62.785780512666726 30
average reward exploit: MDP 7 57.407197668717714 30
average reward exploit: MDP 7 56.454362590091556 30
average reward exploit: MDP 7 56.47181363310504 30
average reward exploit: MDP 7 56.04102642113332 30
average reward exploit: MDP 7 56.185145319836735 30
average reward exploit: MDP 7 57.01473545659768 30
average reward exploit: MDP 7 57.644523396413575 30
average reward exploit: MDP 7 57.56476027982718 30
average reward exploit: MDP 7 57.681634123782416 30
average reward explore: 56.84469412435507
average reward exploit: MDP 8 52.83910057868082 30
average reward exploit: MDP 8 52.82355008819804 30
average reward exploit: MDP 8 52.684721926079796 30
average reward exploit: MDP 8 52.34714013025984 30
average reward exploit: MDP 8 52.422279029

average reward exploit: MDP 1 63.82292166490353 30
average reward exploit: MDP 1 63.39178953869094 30
average reward exploit: MDP 1 63.62152435117166 30
average reward exploit: MDP 1 64.60194108387189 30
average reward exploit: MDP 1 63.12622650744158 30
average reward explore: 73.05985161209368
average reward exploit: MDP 2 59.269229847565065 30
average reward exploit: MDP 2 59.7690413932459 30
average reward exploit: MDP 2 59.84914416556389 30
average reward exploit: MDP 2 59.48676276766591 30
average reward exploit: MDP 2 60.36573889422921 30
average reward exploit: MDP 2 59.8499222813022 30
average reward exploit: MDP 2 59.58392982711575 30
average reward exploit: MDP 2 59.82656053484212 30
average reward exploit: MDP 2 59.72997209991333 30
average reward exploit: MDP 2 59.95524259239202 30
average reward explore: 68.52323959236197
average reward exploit: MDP 3 57.29866182944325 30
average reward exploit: MDP 3 56.038642700618794 30
average reward exploit: MDP 3 56.880713241655705 

average reward exploit: MDP 6 47.56212556410446 30
average reward exploit: MDP 6 48.101520651086965 30
average reward exploit: MDP 6 47.887230708300564 30
average reward exploit: MDP 6 47.94842050912588 30
average reward exploit: MDP 6 48.14467249061746 30
average reward exploit: MDP 6 47.2813258894133 30
average reward exploit: MDP 6 48.36491125046429 30
average reward explore: 56.56751792593185
average reward exploit: MDP 7 47.749799019501545 30
average reward exploit: MDP 7 47.93017266872015 30
average reward exploit: MDP 7 48.11947497202484 30
average reward exploit: MDP 7 47.48222073506955 30
average reward exploit: MDP 7 47.57315156074815 30
average reward exploit: MDP 7 47.64803030733229 30
average reward exploit: MDP 7 48.08303736837832 30
average reward exploit: MDP 7 47.85575635809121 30
average reward exploit: MDP 7 47.9088177570269 30
average reward exploit: MDP 7 47.5501811928045 30
average reward explore: 54.98907851274078
average reward exploit: MDP 8 49.98900187290972 3

average reward exploit: MDP 1 61.66942744316604 30
average reward exploit: MDP 1 62.056322079936535 30
average reward exploit: MDP 1 61.86665012091007 30
average reward exploit: MDP 1 62.11532182309233 30
average reward exploit: MDP 1 62.56791550389822 30
average reward exploit: MDP 1 62.04703321782214 30
average reward exploit: MDP 1 62.4773588374896 30
average reward exploit: MDP 1 61.20257777971717 30
average reward exploit: MDP 1 61.84419359525321 30
average reward explore: 73.58601721271364
average reward exploit: MDP 2 58.626918573590736 30
average reward exploit: MDP 2 58.762742867661835 30
average reward exploit: MDP 2 58.2080174507557 30
average reward exploit: MDP 2 58.0015640925913 30
average reward exploit: MDP 2 58.46300875031986 30
average reward exploit: MDP 2 58.82601893404673 30
average reward exploit: MDP 2 57.64030984521789 30
average reward exploit: MDP 2 57.78312275054171 30
average reward exploit: MDP 2 58.67547831019082 30
average reward exploit: MDP 2 57.3407221

average reward explore: 60.86746554229661
average reward exploit: MDP 6 47.7653127707329 30
average reward exploit: MDP 6 47.953639395416694 30
average reward exploit: MDP 6 47.078908633816695 30
average reward exploit: MDP 6 47.773816292042596 30
average reward exploit: MDP 6 47.72529888898869 30
average reward exploit: MDP 6 48.0321777543464 30
average reward exploit: MDP 6 47.74711790294896 30
average reward exploit: MDP 6 47.99147958028384 30
average reward exploit: MDP 6 47.571437669712644 30
average reward exploit: MDP 6 47.34192448713768 30
average reward explore: 56.22344070295207
average reward exploit: MDP 7 47.685469071623515 30
average reward exploit: MDP 7 47.52994118540114 30
average reward exploit: MDP 7 47.75154288022937 30
average reward exploit: MDP 7 47.717288736613014 30
average reward exploit: MDP 7 47.870666696827314 30
average reward exploit: MDP 7 47.727980460729825 30
average reward exploit: MDP 7 47.87809668861336 30
average reward exploit: MDP 7 47.3423649768

average reward exploit: MDP 0 66.26854224749883 30
average reward exploit: MDP 0 66.78379006850534 30
average reward exploit: MDP 0 66.58243274349572 30
average reward explore: 79.28361858292779
average reward exploit: MDP 1 61.905244955447834 30
average reward exploit: MDP 1 61.926332345270595 30
average reward exploit: MDP 1 62.40868017069968 30
average reward exploit: MDP 1 61.37317869873209 30
average reward exploit: MDP 1 61.91120057309608 30
average reward exploit: MDP 1 62.0978825469631 30
average reward exploit: MDP 1 61.712779098754595 30
average reward exploit: MDP 1 61.49238584630965 30
average reward exploit: MDP 1 62.04227969730902 30
average reward exploit: MDP 1 61.52088420206679 30
average reward explore: 75.00035552549363
average reward exploit: MDP 2 58.20297388693248 30
average reward exploit: MDP 2 58.020938743947504 30
average reward exploit: MDP 2 58.645299072709136 30
average reward exploit: MDP 2 57.63985730600584 30
average reward exploit: MDP 2 58.355635899055

average reward exploit: MDP 5 50.39115470708177 30
average reward exploit: MDP 5 51.167960600525056 30
average reward exploit: MDP 5 50.127158043339556 30
average reward exploit: MDP 5 51.37847767096278 30
average reward exploit: MDP 5 50.687902090625236 30
average reward explore: 61.74653036162748
average reward exploit: MDP 6 47.89038014312757 30
average reward exploit: MDP 6 47.931934438135364 30
average reward exploit: MDP 6 47.94582078247784 30
average reward exploit: MDP 6 47.5808041890067 30
average reward exploit: MDP 6 47.53817757747088 30
average reward exploit: MDP 6 47.33762911177307 30
average reward exploit: MDP 6 47.33999312910659 30
average reward exploit: MDP 6 48.30711181358985 30
average reward exploit: MDP 6 47.91574445592557 30
average reward exploit: MDP 6 47.37888728738748 30
average reward explore: 59.57042102498379
average reward exploit: MDP 7 47.93532850622227 30
average reward exploit: MDP 7 47.94278047376084 30
average reward exploit: MDP 7 48.1100043626304

average reward exploit: MDP 0 66.37576254141634 30
average reward exploit: MDP 0 66.16767324970034 30
average reward exploit: MDP 0 66.82563354443185 30
average reward exploit: MDP 0 66.05096314248296 30
average reward exploit: MDP 0 66.8169204029201 30
average reward exploit: MDP 0 66.09102599812611 30
average reward exploit: MDP 0 66.77460737449395 30
average reward exploit: MDP 0 67.01380948028107 30
average reward explore: 78.99167100561515
average reward exploit: MDP 1 61.68635789712924 30
average reward exploit: MDP 1 58.233236830924696 30
average reward exploit: MDP 1 58.43859317368398 30
average reward exploit: MDP 1 58.339304681292816 30
average reward exploit: MDP 1 57.29376763677577 30
average reward exploit: MDP 1 58.80375619290768 30
average reward exploit: MDP 1 58.195843704384835 30
average reward exploit: MDP 1 59.38397384370267 30
average reward exploit: MDP 1 58.14060346817327 30
average reward exploit: MDP 1 57.282736163884024 30
average reward explore: 74.6946854249

average reward exploit: MDP 5 51.266556093936444 30
average reward exploit: MDP 5 50.85726803394099 30
average reward exploit: MDP 5 50.64465906887207 30
average reward exploit: MDP 5 51.02974211016208 30
average reward exploit: MDP 5 50.787477975282215 30
average reward exploit: MDP 5 50.24056071067592 30
average reward exploit: MDP 5 50.58715552693244 30
average reward exploit: MDP 5 49.768542182506806 30
average reward exploit: MDP 5 50.11424349606749 30
average reward exploit: MDP 5 50.39185503020917 30
average reward explore: 62.12457670287013
average reward exploit: MDP 6 47.691454871747844 30
average reward exploit: MDP 6 46.951855801855366 30
average reward exploit: MDP 6 47.71431037120131 30
average reward exploit: MDP 6 48.29267149774201 30
average reward exploit: MDP 6 48.167250753680676 30
average reward exploit: MDP 6 47.94617648707314 30
average reward exploit: MDP 6 47.860883522802325 30
average reward exploit: MDP 6 47.91209590477554 30
average reward exploit: MDP 6 48.

average reward exploit: MDP 9 44.4305935292362 30
average reward exploit: MDP 9 44.26325324887475 30
epoch number:  108
average reward explore: 83.98400566156073
average reward exploit: MDP 0 65.95553938602745 30
average reward exploit: MDP 0 66.0594569635735 30
average reward exploit: MDP 0 66.80431737599294 30
average reward exploit: MDP 0 66.59730998234497 30
average reward exploit: MDP 0 66.25172459477564 30
average reward exploit: MDP 0 66.56866486673815 30
average reward exploit: MDP 0 66.24756574033115 30
average reward exploit: MDP 0 66.64675703553243 30
average reward exploit: MDP 0 66.51209468468323 30
average reward exploit: MDP 0 66.63484833196883 30
average reward explore: 79.3427696277897
average reward exploit: MDP 1 61.87825655271397 30
average reward exploit: MDP 1 61.917584042611296 30
average reward exploit: MDP 1 62.25480820409134 30
average reward exploit: MDP 1 61.69062306878353 30
average reward exploit: MDP 1 62.22777113777121 30
average reward exploit: MDP 1 62

average reward exploit: MDP 4 59.85499193329756 30
average reward exploit: MDP 4 61.44472218863134 30
average reward exploit: MDP 4 62.30949614568999 30
average reward exploit: MDP 4 60.16487696175417 30
average reward explore: 64.55805039713194
average reward exploit: MDP 5 58.28235031970631 30
average reward exploit: MDP 5 60.715839794728886 30
average reward exploit: MDP 5 63.02547725211083 30
average reward exploit: MDP 5 61.7687555221571 30
average reward exploit: MDP 5 61.280953816820066 30
average reward exploit: MDP 5 60.29038514045181 30
average reward exploit: MDP 5 56.99016529165472 30
average reward exploit: MDP 5 56.97489270208747 30
average reward exploit: MDP 5 52.519718894763606 30
average reward exploit: MDP 5 52.19092541575329 30
average reward explore: 61.68920128129817
average reward exploit: MDP 6 53.83723650467502 30
average reward exploit: MDP 6 53.3830413665772 30
average reward exploit: MDP 6 54.23865818485956 30
average reward exploit: MDP 6 55.57026792212342 

average reward exploit: MDP 9 50.83025451539184 30
average reward exploit: MDP 9 50.792747151014446 30
average reward exploit: MDP 9 51.10988711719154 30
average reward exploit: MDP 9 51.18857318071518 30
average reward exploit: MDP 9 48.116563052503004 30
average reward exploit: MDP 9 46.743416017488364 30
epoch number:  111
average reward explore: 83.93526025817796
average reward exploit: MDP 0 81.96966051881735 30
average reward exploit: MDP 0 82.48581105344812 30
average reward exploit: MDP 0 82.83689924905005 30
average reward exploit: MDP 0 82.35744665966048 30
average reward exploit: MDP 0 84.24033725457073 30
average reward exploit: MDP 0 84.53039434156261 30
average reward exploit: MDP 0 84.41414698238414 30
average reward exploit: MDP 0 64.72706675647436 30
average reward exploit: MDP 0 61.526612008003504 30
average reward exploit: MDP 0 61.89936517296418 30
average reward explore: 79.13182204676332
average reward exploit: MDP 1 57.31092185805918 30
average reward exploit: MD

average reward exploit: MDP 4 52.441537796687804 30
average reward exploit: MDP 4 52.629815923778175 30
average reward exploit: MDP 4 52.883588019634225 30
average reward exploit: MDP 4 52.43462968908457 30
average reward exploit: MDP 4 52.47001840630505 30
average reward exploit: MDP 4 52.062157939886035 30
average reward exploit: MDP 4 52.821114265828236 30
average reward exploit: MDP 4 52.60963175865059 30
average reward exploit: MDP 4 52.271763605300194 30
average reward explore: 63.563573993804994
average reward exploit: MDP 5 51.227709568542984 30
average reward exploit: MDP 5 50.70649463577016 30
average reward exploit: MDP 5 50.338504749033426 30
average reward exploit: MDP 5 51.18039183096946 30
average reward exploit: MDP 5 51.17829397264211 30
average reward exploit: MDP 5 50.69735205538326 30
average reward exploit: MDP 5 50.654876277295 30
average reward exploit: MDP 5 50.73364250944564 30
average reward exploit: MDP 5 50.71692031577867 30
average reward exploit: MDP 5 50.

average reward exploit: MDP 8 46.421307086616096 30
average reward explore: 53.28338185300528
average reward exploit: MDP 9 44.26926619440491 30
average reward exploit: MDP 9 44.320024599608246 30
average reward exploit: MDP 9 44.28665685759818 30
average reward exploit: MDP 9 44.3101954782243 30
average reward exploit: MDP 9 44.27840097886476 30
average reward exploit: MDP 9 44.28461768117388 30
average reward exploit: MDP 9 44.43050102407714 30
average reward exploit: MDP 9 44.292964341804115 30
average reward exploit: MDP 9 44.240778621590366 30
average reward exploit: MDP 9 44.59584127108119 30
epoch number:  114
average reward explore: 84.22752354244103
average reward exploit: MDP 0 66.6310629548951 30
average reward exploit: MDP 0 66.81432175066222 30
average reward exploit: MDP 0 66.86347426871251 30
average reward exploit: MDP 0 66.3511653685432 30
average reward exploit: MDP 0 66.4363549254016 30
average reward exploit: MDP 0 67.06047171397165 30
average reward exploit: MDP 0 

average reward exploit: MDP 3 55.7482783352911 30
average reward exploit: MDP 3 55.91960630234669 30
average reward exploit: MDP 3 55.66433848776781 30
average reward exploit: MDP 3 55.81108231839065 30
average reward explore: 68.27050517222406
average reward exploit: MDP 4 52.09681350607045 30
average reward exploit: MDP 4 52.60154106751179 30
average reward exploit: MDP 4 52.44096499597973 30
average reward exploit: MDP 4 52.628063955139126 30
average reward exploit: MDP 4 52.39301092367213 30
average reward exploit: MDP 4 52.44785424985797 30
average reward exploit: MDP 4 52.24950509599289 30
average reward exploit: MDP 4 52.21858246530088 30
average reward exploit: MDP 4 52.23578151092014 30
average reward exploit: MDP 4 52.13295835864948 30
average reward explore: 63.54051548710602
average reward exploit: MDP 5 49.82422944329082 30
average reward exploit: MDP 5 49.83343768455618 30
average reward exploit: MDP 5 50.327218265102445 30
average reward exploit: MDP 5 50.28274949461888 

average reward exploit: MDP 8 46.01720497562425 30
average reward exploit: MDP 8 46.75724394945174 30
average reward exploit: MDP 8 46.031586526023915 30
average reward exploit: MDP 8 46.18907858970978 30
average reward exploit: MDP 8 46.383778276959646 30
average reward exploit: MDP 8 46.49882280765478 30
average reward explore: 52.92903563181949
average reward exploit: MDP 9 44.6985361777297 30
average reward exploit: MDP 9 44.44775356911921 30
average reward exploit: MDP 9 44.43560973227696 30
average reward exploit: MDP 9 44.44937596508675 30
average reward exploit: MDP 9 44.29162782077609 30
average reward exploit: MDP 9 44.31417995230397 30
average reward exploit: MDP 9 44.30066640578918 30
average reward exploit: MDP 9 44.30494278261845 30
average reward exploit: MDP 9 44.50690540011967 30
average reward exploit: MDP 9 44.42011725588157 30
epoch number:  117
average reward explore: 83.82867968504182
average reward exploit: MDP 0 66.18928654647424 30
average reward exploit: MDP 0

average reward exploit: MDP 3 54.75991347134764 30
average reward exploit: MDP 3 54.92706631624539 30
average reward exploit: MDP 3 55.98717045549868 30
average reward exploit: MDP 3 56.01596917132635 30
average reward exploit: MDP 3 55.045843325226514 30
average reward exploit: MDP 3 54.85234555915882 30
average reward exploit: MDP 3 56.074628444424675 30
average reward exploit: MDP 3 55.07881911515885 30
average reward explore: 68.45355428704276
average reward exploit: MDP 4 52.26128504126664 30
average reward exploit: MDP 4 52.598955401698966 30
average reward exploit: MDP 4 52.667748347552646 30
average reward exploit: MDP 4 52.636807112745345 30
average reward exploit: MDP 4 52.841122580496126 30
average reward exploit: MDP 4 52.809804701806755 30
average reward exploit: MDP 4 52.85278446576087 30
average reward exploit: MDP 4 52.44993629680049 30
average reward exploit: MDP 4 52.645897248346046 30
average reward exploit: MDP 4 52.20958110791173 30
average reward explore: 63.78957

average reward explore: 56.245745183596014
average reward exploit: MDP 8 46.403406118555274 30
average reward exploit: MDP 8 46.18327801662882 30
average reward exploit: MDP 8 46.24661385923943 30
average reward exploit: MDP 8 46.37107071132392 30
average reward exploit: MDP 8 46.3313074145206 30
average reward exploit: MDP 8 46.809636235739426 30
average reward exploit: MDP 8 45.92233576587843 30
average reward exploit: MDP 8 46.69900130726915 30
average reward exploit: MDP 8 46.856720257321086 30
average reward exploit: MDP 8 46.40723408659583 30
average reward explore: 52.22802082878418
average reward exploit: MDP 9 44.32268873771411 30
average reward exploit: MDP 9 44.30855651544526 30
average reward exploit: MDP 9 44.43034241615884 30
average reward exploit: MDP 9 44.27472018500053 30
average reward exploit: MDP 9 44.438388612277755 30
average reward exploit: MDP 9 44.26989564417239 30
average reward exploit: MDP 9 44.254245980776965 30
average reward exploit: MDP 9 44.30859796148

average reward exploit: MDP 2 57.80113784237579 30
average reward exploit: MDP 2 58.05350040166156 30
average reward exploit: MDP 2 57.974227594167296 30
average reward explore: 71.56971967663829
average reward exploit: MDP 3 56.47437400652412 30
average reward exploit: MDP 3 55.14841698830255 30
average reward exploit: MDP 3 56.23596812698319 30
average reward exploit: MDP 3 56.30294415108233 30
average reward exploit: MDP 3 55.83881366541346 30
average reward exploit: MDP 3 56.00853539054124 30
average reward exploit: MDP 3 55.398343120071814 30
average reward exploit: MDP 3 55.94154591644341 30
average reward exploit: MDP 3 55.7305463870487 30
average reward exploit: MDP 3 55.81725338951106 30
average reward explore: 68.21648418868799
average reward exploit: MDP 4 52.04752205558463 30
average reward exploit: MDP 4 52.10022958876536 30
average reward exploit: MDP 4 52.841007214153976 30
average reward exploit: MDP 4 52.2231573057345 30
average reward exploit: MDP 4 52.45880382900418 

average reward exploit: MDP 7 47.746790125245404 30
average reward exploit: MDP 7 48.163868762941874 30
average reward exploit: MDP 7 47.515392224607254 30
average reward exploit: MDP 7 48.08711905106544 30
average reward exploit: MDP 7 47.491857452506615 30
average reward explore: 55.685861506669674
average reward exploit: MDP 8 46.93407740592468 30
average reward exploit: MDP 8 46.72630729450263 30
average reward exploit: MDP 8 46.84917419396889 30
average reward exploit: MDP 8 45.95749524809904 30
average reward exploit: MDP 8 46.191410945234075 30
average reward exploit: MDP 8 46.29443961305071 30
average reward exploit: MDP 8 46.03828404797277 30
average reward exploit: MDP 8 46.679696299675136 30
average reward exploit: MDP 8 46.96956050696766 30
average reward exploit: MDP 8 46.44248845874701 30
average reward explore: 51.91549708985247
average reward exploit: MDP 9 44.59045698664201 30
average reward exploit: MDP 9 44.55591316192038 30
average reward exploit: MDP 9 44.433895192

average reward exploit: MDP 2 58.4560938844122 30
average reward exploit: MDP 2 58.17460691613465 30
average reward exploit: MDP 2 57.27143935465434 30
average reward exploit: MDP 2 58.600608258448226 30
average reward exploit: MDP 2 57.7344154213018 30
average reward exploit: MDP 2 57.96240192449144 30
average reward exploit: MDP 2 57.99077058778628 30
average reward exploit: MDP 2 58.94737150133912 30
average reward explore: 71.221431613828
average reward exploit: MDP 3 55.591297070878376 30
average reward exploit: MDP 3 55.57537985868357 30
average reward exploit: MDP 3 56.11740456634617 30
average reward exploit: MDP 3 55.6864525615594 30
average reward exploit: MDP 3 54.87907771020147 30
average reward exploit: MDP 3 55.29134765407177 30
average reward exploit: MDP 3 55.28736431353915 30
average reward exploit: MDP 3 55.847825160056125 30
average reward exploit: MDP 3 56.198963160639686 30
average reward exploit: MDP 3 56.50987654584103 30
average reward explore: 68.06648381714945

average reward exploit: MDP 7 48.11664921431064 30
average reward exploit: MDP 7 47.278493914014355 30
average reward exploit: MDP 7 47.332790224071815 30
average reward exploit: MDP 7 47.55764830947769 30
average reward exploit: MDP 7 48.32761177244926 30
average reward exploit: MDP 7 48.156025643297546 30
average reward exploit: MDP 7 47.94575208691681 30
average reward exploit: MDP 7 47.72544600767674 30
average reward exploit: MDP 7 47.08561831330262 30
average reward exploit: MDP 7 47.30326397828561 30
average reward explore: 53.5301930056153
average reward exploit: MDP 8 46.37384094051563 30
average reward exploit: MDP 8 47.24178650475523 30
average reward exploit: MDP 8 45.992659088898996 30
average reward exploit: MDP 8 46.64083134584303 30
average reward exploit: MDP 8 46.56116188649983 30
average reward exploit: MDP 8 46.394333880433024 30
average reward exploit: MDP 8 46.28500760102272 30
average reward exploit: MDP 8 46.50406191890789 30
average reward exploit: MDP 8 46.048

average reward exploit: MDP 1 61.752061725029314 30
average reward exploit: MDP 1 61.91440335551318 30
average reward explore: 71.94180781784578
average reward exploit: MDP 2 58.95115464719856 30
average reward exploit: MDP 2 58.150688046463024 30
average reward exploit: MDP 2 57.64241344343673 30
average reward exploit: MDP 2 57.95756623282184 30
average reward exploit: MDP 2 58.174043238110066 30
average reward exploit: MDP 2 58.38036272734152 30
average reward exploit: MDP 2 58.04803224844655 30
average reward exploit: MDP 2 58.08009751208097 30
average reward exploit: MDP 2 58.583233362578405 30
average reward exploit: MDP 2 57.72670439913319 30
average reward explore: 67.89267774670174
average reward exploit: MDP 3 55.43263682386244 30
average reward exploit: MDP 3 55.49112172992168 30
average reward exploit: MDP 3 56.05942037064694 30
average reward exploit: MDP 3 55.318709626203606 30
average reward exploit: MDP 3 55.0529157112283 30
average reward exploit: MDP 3 55.791368323930

average reward exploit: MDP 6 47.50538918397155 30
average reward exploit: MDP 6 47.71464958410913 30
average reward exploit: MDP 6 47.731644404866316 30
average reward exploit: MDP 6 47.52693546515837 30
average reward explore: 59.36422038767658
average reward exploit: MDP 7 47.342630145041475 30
average reward exploit: MDP 7 47.559971778815225 30
average reward exploit: MDP 7 47.5081988239473 30
average reward exploit: MDP 7 47.78040406139931 30
average reward exploit: MDP 7 47.62988111077099 30
average reward exploit: MDP 7 47.84933620582976 30
average reward exploit: MDP 7 48.15169416012178 30
average reward exploit: MDP 7 47.81475965570847 30
average reward exploit: MDP 7 47.566455482118904 30
average reward exploit: MDP 7 47.938273554123825 30
average reward explore: 55.08851464434099
average reward exploit: MDP 8 45.88905084153121 30
average reward exploit: MDP 8 45.972428960119515 30
average reward exploit: MDP 8 46.20461775315918 30
average reward exploit: MDP 8 46.24490719113

average reward exploit: MDP 1 61.87160117469291 30
average reward exploit: MDP 1 62.29973136372124 30
average reward exploit: MDP 1 61.536087849582984 30
average reward exploit: MDP 1 61.87333358925056 30
average reward exploit: MDP 1 61.12103998051875 30
average reward exploit: MDP 1 61.92208250334143 30
average reward exploit: MDP 1 61.911486144391034 30
average reward explore: 74.5526937404409
average reward exploit: MDP 2 58.716497406664935 30
average reward exploit: MDP 2 57.78732306718397 30
average reward exploit: MDP 2 58.39574347022396 30
average reward exploit: MDP 2 57.298156675267954 30
average reward exploit: MDP 2 58.03942097492587 30
average reward exploit: MDP 2 58.2805736293825 30
average reward exploit: MDP 2 57.88540155630796 30
average reward exploit: MDP 2 58.699334511086924 30
average reward exploit: MDP 2 58.16085771372459 30
average reward exploit: MDP 2 58.19335485800614 30
average reward explore: 71.67667019211427
average reward exploit: MDP 3 55.6897410159949

average reward exploit: MDP 6 48.48859011396361 30
average reward exploit: MDP 6 48.08280873486549 30
average reward exploit: MDP 6 48.07569169725954 30
average reward exploit: MDP 6 47.96869534261749 30
average reward exploit: MDP 6 47.34462639991193 30
average reward exploit: MDP 6 47.74171220328671 30
average reward exploit: MDP 6 47.93323025651312 30
average reward exploit: MDP 6 47.73207932889442 30
average reward exploit: MDP 6 47.27775252341207 30
average reward explore: 54.24685085889587
average reward exploit: MDP 7 48.18017298370496 30
average reward exploit: MDP 7 48.14744522263784 30
average reward exploit: MDP 7 46.54761570121496 30
average reward exploit: MDP 7 47.542764236651436 30
average reward exploit: MDP 7 47.521565349081065 30
average reward exploit: MDP 7 47.745344787187896 30
average reward exploit: MDP 7 48.36010424830855 30
average reward exploit: MDP 7 47.9042708373631 30
average reward exploit: MDP 7 47.31413191851879 30
average reward exploit: MDP 7 47.55851

average reward exploit: MDP 0 66.5984905664278 30
average reward explore: 75.75224330735541
average reward exploit: MDP 1 61.26325576403881 30
average reward exploit: MDP 1 62.31235840457806 30
average reward exploit: MDP 1 62.1086580562779 30
average reward exploit: MDP 1 61.514051567459326 30
average reward exploit: MDP 1 62.0628336503114 30
average reward exploit: MDP 1 61.891696730241634 30
average reward exploit: MDP 1 62.32019851207445 30
average reward exploit: MDP 1 62.12611429583344 30
average reward exploit: MDP 1 61.725644147967266 30
average reward exploit: MDP 1 61.502724342337395 30
average reward explore: 71.64116537922433
average reward exploit: MDP 2 58.415751403859666 30
average reward exploit: MDP 2 58.104608572151626 30
average reward exploit: MDP 2 58.31431802975272 30
average reward exploit: MDP 2 57.76930079755274 30
average reward exploit: MDP 2 58.49103414016694 30
average reward exploit: MDP 2 58.79023942465149 30
average reward exploit: MDP 2 58.2404817556943

average reward exploit: MDP 5 50.70866369730977 30
average reward exploit: MDP 5 51.04399577916893 30
average reward exploit: MDP 5 50.2508741011484 30
average reward explore: 58.10330677686135
average reward exploit: MDP 6 47.90467261711373 30
average reward exploit: MDP 6 47.71877617334014 30
average reward exploit: MDP 6 47.31868823091659 30
average reward exploit: MDP 6 47.93695320922281 30
average reward exploit: MDP 6 47.67270887755078 30
average reward exploit: MDP 6 48.01540565402393 30
average reward exploit: MDP 6 46.50996841610834 30
average reward exploit: MDP 6 47.573493117862526 30
average reward exploit: MDP 6 46.72942541626807 30
average reward exploit: MDP 6 47.33832408223445 30
average reward explore: 54.470753642972475
average reward exploit: MDP 7 47.1710746741224 30
average reward exploit: MDP 7 47.529117120358116 30
average reward exploit: MDP 7 47.855796347408436 30
average reward exploit: MDP 7 47.93381886662501 30
average reward exploit: MDP 7 48.25637671985684

average reward exploit: MDP 0 67.02627181229421 30
average reward exploit: MDP 0 66.87949078661849 30
average reward exploit: MDP 0 66.47071882067202 30
average reward exploit: MDP 0 66.43899555472018 30
average reward exploit: MDP 0 67.01030395769165 30
average reward exploit: MDP 0 66.85135514770975 30
average reward explore: 75.8648285636943
average reward exploit: MDP 1 61.88388909192051 30
average reward exploit: MDP 1 61.51077460305839 30
average reward exploit: MDP 1 61.87844459425085 30
average reward exploit: MDP 1 61.70467233705945 30
average reward exploit: MDP 1 61.55156312862044 30
average reward exploit: MDP 1 62.12139859672638 30
average reward exploit: MDP 1 61.89314192093854 30
average reward exploit: MDP 1 62.1179003588579 30
average reward exploit: MDP 1 62.09056240144768 30
average reward exploit: MDP 1 61.23495937175737 30
average reward explore: 71.67213918177808
average reward exploit: MDP 2 58.58184126596484 30
average reward exploit: MDP 2 57.913298677047344 30

average reward exploit: MDP 5 50.18878288481541 30
average reward exploit: MDP 5 51.04707508830354 30
average reward exploit: MDP 5 50.66787733533349 30
average reward exploit: MDP 5 51.279219829544246 30
average reward exploit: MDP 5 49.85977608227622 30
average reward exploit: MDP 5 50.89112041151235 30
average reward exploit: MDP 5 50.88653672532339 30
average reward exploit: MDP 5 51.07743123018067 30
average reward explore: 59.09636658263759
average reward exploit: MDP 6 48.05882413697546 30
average reward exploit: MDP 6 48.10513203988777 30
average reward exploit: MDP 6 48.452265333305604 30
average reward exploit: MDP 6 48.11169701446296 30
average reward exploit: MDP 6 48.34782641022108 30
average reward exploit: MDP 6 47.563328319920316 30
average reward exploit: MDP 6 47.11165836239465 30
average reward exploit: MDP 6 47.581644001239674 30
average reward exploit: MDP 6 47.75988013566821 30
average reward exploit: MDP 6 47.749577455503 30
average reward explore: 55.22401742357

average reward explore: 80.32410591691745
average reward exploit: MDP 0 66.42264124853081 30
average reward exploit: MDP 0 66.38596682119238 30
average reward exploit: MDP 0 66.62398899320205 30
average reward exploit: MDP 0 66.38818513522031 30
average reward exploit: MDP 0 65.92680333610211 30
average reward exploit: MDP 0 66.31854515363445 30
average reward exploit: MDP 0 65.43226750621561 30
average reward exploit: MDP 0 66.5694384962159 30
average reward exploit: MDP 0 66.42838376155967 30
average reward exploit: MDP 0 66.63695794986309 30
average reward explore: 75.7467633386503
average reward exploit: MDP 1 62.30711073901732 30
average reward exploit: MDP 1 61.89667309976216 30
average reward exploit: MDP 1 61.87298396279657 30
average reward exploit: MDP 1 62.31324813506863 30
average reward exploit: MDP 1 61.29578058281286 30
average reward exploit: MDP 1 61.71374757105432 30
average reward exploit: MDP 1 61.74319998336839 30
average reward exploit: MDP 1 61.91951779473462 30


average reward exploit: MDP 4 52.426644676134394 30
average reward exploit: MDP 4 52.81619017645779 30
average reward explore: 61.70423473853789
average reward exploit: MDP 5 51.08394475801613 30
average reward exploit: MDP 5 50.3530066189576 30
average reward exploit: MDP 5 50.88620791452083 30
average reward exploit: MDP 5 49.921942475274484 30
average reward exploit: MDP 5 50.77433559457109 30
average reward exploit: MDP 5 50.523924937521194 30
average reward exploit: MDP 5 50.39755773468543 30
average reward exploit: MDP 5 51.18145039739998 30
average reward exploit: MDP 5 50.31059027915981 30
average reward exploit: MDP 5 50.22490575629048 30
average reward explore: 58.09468056369211
average reward exploit: MDP 6 47.70183899904226 30
average reward exploit: MDP 6 47.92500302854463 30
average reward exploit: MDP 6 47.49650523052944 30
average reward exploit: MDP 6 47.11381014623179 30
average reward exploit: MDP 6 48.24208896178256 30
average reward exploit: MDP 6 47.53327248513710

average reward exploit: MDP 9 44.31752898879929 30
average reward exploit: MDP 9 44.41508641664247 30
average reward exploit: MDP 9 44.293230123178546 30
average reward exploit: MDP 9 44.48615024671838 30
epoch number:  142
average reward explore: 80.39140072511569
average reward exploit: MDP 0 66.26166453661385 30
average reward exploit: MDP 0 66.38891144779161 30
average reward exploit: MDP 0 65.90442330160047 30
average reward exploit: MDP 0 67.0486957178546 30
average reward exploit: MDP 0 67.28100290430089 30
average reward exploit: MDP 0 66.59091774832049 30
average reward exploit: MDP 0 65.88309333511711 30
average reward exploit: MDP 0 66.01132745979159 30
average reward exploit: MDP 0 66.89452076117729 30
average reward exploit: MDP 0 66.52451110232751 30
average reward explore: 75.72061071799368
average reward exploit: MDP 1 62.083411569366476 30
average reward exploit: MDP 1 61.49861462764764 30
average reward exploit: MDP 1 62.314218772912895 30
average reward exploit: MDP 

average reward exploit: MDP 4 69.75341598721171 30
average reward exploit: MDP 4 73.97343959489498 30
average reward exploit: MDP 4 71.38487815746916 30
average reward exploit: MDP 4 71.91913309647347 30
average reward exploit: MDP 4 73.19651804261254 30
average reward exploit: MDP 4 70.05977320981948 30
average reward explore: 61.90111732346893
average reward exploit: MDP 5 47.00000133236488 30
average reward exploit: MDP 5 46.64626872724278 30
average reward exploit: MDP 5 47.27693400674571 30
average reward exploit: MDP 5 46.57206619576091 30
average reward exploit: MDP 5 45.66126668267316 30
average reward exploit: MDP 5 46.92130059429367 30
average reward exploit: MDP 5 48.208953092168976 30
average reward exploit: MDP 5 47.228864521511696 30
average reward exploit: MDP 5 46.571997978684614 30
average reward exploit: MDP 5 46.535202631523774 30
average reward explore: 60.3127806822656
average reward exploit: MDP 6 47.88350236203855 30
average reward exploit: MDP 6 48.0612288837416

average reward exploit: MDP 9 44.28601048272097 30
average reward exploit: MDP 9 44.27766380253914 30
average reward exploit: MDP 9 44.29563203250174 30
average reward exploit: MDP 9 44.46978587484648 30
average reward exploit: MDP 9 44.31029339144144 30
average reward exploit: MDP 9 44.57623357419708 30
average reward exploit: MDP 9 44.544192716932926 30
average reward exploit: MDP 9 44.26520258180908 30
epoch number:  145
average reward explore: 81.72578317435128
average reward exploit: MDP 0 66.98798114669547 30
average reward exploit: MDP 0 66.31834844899144 30
average reward exploit: MDP 0 66.73651365503335 30
average reward exploit: MDP 0 66.43157325309228 30
average reward exploit: MDP 0 66.64691993867703 30
average reward exploit: MDP 0 66.62561119480218 30
average reward exploit: MDP 0 66.63210243699713 30
average reward exploit: MDP 0 66.07664766650001 30
average reward exploit: MDP 0 66.47960341895165 30
average reward exploit: MDP 0 67.04433699022209 30
average reward explo

average reward exploit: MDP 4 95.62687985952472 30
average reward exploit: MDP 4 95.24996407594581 30
average reward exploit: MDP 4 93.92144922018798 30
average reward exploit: MDP 4 94.63482588829767 30
average reward exploit: MDP 4 95.98820341300599 30
average reward exploit: MDP 4 95.9405489760389 30
average reward exploit: MDP 4 94.15195793862475 30
average reward exploit: MDP 4 95.9611579732573 30
average reward exploit: MDP 4 96.28398921548045 30
average reward exploit: MDP 4 96.9846694746026 30
average reward explore: 61.903515858442276
average reward exploit: MDP 5 76.98882892945717 30
average reward exploit: MDP 5 76.29957693160299 30
average reward exploit: MDP 5 74.65123223199988 30
average reward exploit: MDP 5 73.21589012203302 30
average reward exploit: MDP 5 72.38711207023727 30
average reward exploit: MDP 5 73.62619112204372 30
average reward exploit: MDP 5 73.12054493717561 30
average reward exploit: MDP 5 72.44250635536511 30
average reward exploit: MDP 5 71.340896153

average reward exploit: MDP 8 45.956200014082306 30
average reward explore: 50.10331106048844
average reward exploit: MDP 9 44.430259535235024 30
average reward exploit: MDP 9 44.31418160970054 30
average reward exploit: MDP 9 44.30954734518553 30
average reward exploit: MDP 9 44.42409655872187 30
average reward exploit: MDP 9 44.33506450727375 30
average reward exploit: MDP 9 44.31353346365711 30
average reward exploit: MDP 9 44.42248117254256 30
average reward exploit: MDP 9 44.53027764959314 30
average reward exploit: MDP 9 44.45538774458943 30
average reward exploit: MDP 9 44.28526171183728 30
epoch number:  148
average reward explore: 82.40578733091174
average reward exploit: MDP 0 66.63928698801507 30
average reward exploit: MDP 0 66.15250552812766 30
average reward exploit: MDP 0 66.86498586347828 30
average reward exploit: MDP 0 66.43038366321267 30
average reward exploit: MDP 0 66.25986238790213 30
average reward exploit: MDP 0 66.63292169928033 30
average reward exploit: MDP 

average reward exploit: MDP 3 55.22582065535933 30
average reward exploit: MDP 3 54.92436612083751 30
average reward exploit: MDP 3 56.105499541055174 30
average reward exploit: MDP 3 55.16919463103572 30
average reward explore: 62.90297943082128
average reward exploit: MDP 4 52.38303965799949 30
average reward exploit: MDP 4 52.57729170235424 30
average reward exploit: MDP 4 52.20318480981391 30
average reward exploit: MDP 4 52.042092710608934 30
average reward exploit: MDP 4 52.888987612075724 30
average reward exploit: MDP 4 51.88074477825937 30
average reward exploit: MDP 4 52.80402126570583 30
average reward exploit: MDP 4 52.43409709229724 30
average reward exploit: MDP 4 52.28033248948163 30
average reward exploit: MDP 4 52.986285347668975 30
average reward explore: 61.87548918884759
average reward exploit: MDP 5 51.383862757008814 30
average reward exploit: MDP 5 50.59712150521611 30
average reward exploit: MDP 5 50.6299188084196 30
average reward exploit: MDP 5 50.529869603952

average reward exploit: MDP 8 46.163172354709154 30
average reward exploit: MDP 8 45.66905413685449 30
average reward exploit: MDP 8 45.917855034259524 30
average reward exploit: MDP 8 46.136003104489205 30
average reward exploit: MDP 8 46.06592398658214 30
average reward exploit: MDP 8 46.095435961627366 30
average reward explore: 49.952371423798986
average reward exploit: MDP 9 44.307813561094385 30
average reward exploit: MDP 9 44.430862943343634 30
average reward exploit: MDP 9 44.45945959520726 30
average reward exploit: MDP 9 44.29997877568664 30
average reward exploit: MDP 9 44.29424921865674 30
average reward exploit: MDP 9 44.38190861775771 30
average reward exploit: MDP 9 44.27350329788928 30
average reward exploit: MDP 9 44.30205937203962 30
average reward exploit: MDP 9 44.30735149523021 30
average reward exploit: MDP 9 44.445498031852956 30
epoch number:  151
average reward explore: 82.51024158062661
average reward exploit: MDP 0 66.46953675312808 30
average reward exploit

average reward exploit: MDP 3 56.33641321869558 30
average reward exploit: MDP 3 55.397866505878696 30
average reward exploit: MDP 3 55.66137829433124 30
average reward exploit: MDP 3 56.010762747391524 30
average reward exploit: MDP 3 55.4345227236985 30
average reward exploit: MDP 3 55.40747801004021 30
average reward exploit: MDP 3 55.35992891905594 30
average reward exploit: MDP 3 55.68284342060364 30
average reward explore: 62.756765782861144
average reward exploit: MDP 4 52.44542285360193 30
average reward exploit: MDP 4 52.405482935909205 30
average reward exploit: MDP 4 52.371999577011515 30
average reward exploit: MDP 4 52.214461623308146 30
average reward exploit: MDP 4 52.63763074003969 30
average reward exploit: MDP 4 52.43029180237087 30
average reward exploit: MDP 4 53.06256451280289 30
average reward exploit: MDP 4 52.47307676879316 30
average reward exploit: MDP 4 52.22580553927266 30
average reward exploit: MDP 4 52.465701984125026 30
average reward explore: 61.7899138

average reward exploit: MDP 8 45.83953623968541 30
average reward exploit: MDP 8 46.18613292140989 30
average reward exploit: MDP 8 46.452531464777934 30
average reward exploit: MDP 8 46.66624133014509 30
average reward exploit: MDP 8 45.91618159446173 30
average reward exploit: MDP 8 45.7908544136388 30
average reward exploit: MDP 8 46.062726410643116 30
average reward exploit: MDP 8 45.688557061447945 30
average reward exploit: MDP 8 46.50654360078467 30
average reward exploit: MDP 8 46.44924923342996 30
average reward explore: 50.178046372970314
average reward exploit: MDP 9 44.253167755358405 30
average reward exploit: MDP 9 44.43351195345994 30
average reward exploit: MDP 9 44.541112007356205 30
average reward exploit: MDP 9 44.29382405352841 30
average reward exploit: MDP 9 44.39744552755637 30
average reward exploit: MDP 9 44.29761390297121 30
average reward exploit: MDP 9 44.42545354136187 30
average reward exploit: MDP 9 44.56623883456311 30
average reward exploit: MDP 9 44.57

average reward exploit: MDP 2 58.56501408856301 30
average reward exploit: MDP 2 58.551452203608754 30
average reward explore: 66.35109409178023
average reward exploit: MDP 3 55.08661679252328 30
average reward exploit: MDP 3 54.92707922844443 30
average reward exploit: MDP 3 55.567400676048244 30
average reward exploit: MDP 3 55.87068063918479 30
average reward exploit: MDP 3 55.29636163545982 30
average reward exploit: MDP 3 56.09434868516515 30
average reward exploit: MDP 3 55.22962324154284 30
average reward exploit: MDP 3 55.27905079279301 30
average reward exploit: MDP 3 56.88183329794306 30
average reward exploit: MDP 3 55.60575903983848 30
average reward explore: 63.25382736655791
average reward exploit: MDP 4 53.019959105935136 30
average reward exploit: MDP 4 52.64270978573258 30
average reward exploit: MDP 4 52.47924983844898 30
average reward exploit: MDP 4 52.24643360704124 30
average reward exploit: MDP 4 52.68623661061352 30
average reward exploit: MDP 4 52.4726436775574

average reward exploit: MDP 7 47.42267015100513 30
average reward exploit: MDP 7 47.9773862030883 30
average reward exploit: MDP 7 48.3540938742176 30
average reward exploit: MDP 7 47.76465291383623 30
average reward explore: 53.653581308343846
average reward exploit: MDP 8 45.86655690824126 30
average reward exploit: MDP 8 45.81227346507124 30
average reward exploit: MDP 8 46.72696538135997 30
average reward exploit: MDP 8 46.13677972869219 30
average reward exploit: MDP 8 46.31344212491385 30
average reward exploit: MDP 8 45.92654548061214 30
average reward exploit: MDP 8 45.61344845665892 30
average reward exploit: MDP 8 45.96300842706196 30
average reward exploit: MDP 8 46.26040682843116 30
average reward exploit: MDP 8 45.70364944664338 30
average reward explore: 49.72871150120791
average reward exploit: MDP 9 44.31676508061271 30
average reward exploit: MDP 9 44.294080305037056 30
average reward exploit: MDP 9 44.437323235053064 30
average reward exploit: MDP 9 44.28158358609226 

average reward exploit: MDP 2 58.22493544723363 30
average reward exploit: MDP 2 57.939897230571226 30
average reward exploit: MDP 2 58.353424155501074 30
average reward exploit: MDP 2 58.49157277978515 30
average reward exploit: MDP 2 57.90002365464374 30
average reward exploit: MDP 2 58.33836033970501 30
average reward explore: 66.47285001052695
average reward exploit: MDP 3 54.94517497540797 30
average reward exploit: MDP 3 55.87657590609753 30
average reward exploit: MDP 3 55.190304955785834 30
average reward exploit: MDP 3 55.49594604731117 30
average reward exploit: MDP 3 55.67161624672324 30
average reward exploit: MDP 3 55.41439908371668 30
average reward exploit: MDP 3 55.53075151879178 30
average reward exploit: MDP 3 55.38585817191714 30
average reward exploit: MDP 3 56.001628919001135 30
average reward exploit: MDP 3 55.12090609190332 30
average reward explore: 63.225699256810685
average reward exploit: MDP 4 52.830891925554276 30
average reward exploit: MDP 4 52.6156540838

average reward exploit: MDP 7 47.71254620690361 30
average reward exploit: MDP 7 47.54736580907319 30
average reward exploit: MDP 7 47.58219405255917 30
average reward exploit: MDP 7 47.493295463040916 30
average reward exploit: MDP 7 47.923858904453795 30
average reward exploit: MDP 7 47.72863737507843 30
average reward exploit: MDP 7 47.37839350479365 30
average reward exploit: MDP 7 47.56533046009085 30
average reward explore: 54.20093431180435
average reward exploit: MDP 8 45.80505954811836 30
average reward exploit: MDP 8 45.798805295370826 30
average reward exploit: MDP 8 45.840944069695496 30
average reward exploit: MDP 8 46.904378468795635 30
average reward exploit: MDP 8 45.65879455929782 30
average reward exploit: MDP 8 46.395850973235305 30
average reward exploit: MDP 8 45.749405582651114 30
average reward exploit: MDP 8 46.357919082119004 30
average reward exploit: MDP 8 46.48099017963821 30
average reward exploit: MDP 8 46.11175709140914 30
average reward explore: 49.57705

average reward explore: 72.12848868604505
average reward exploit: MDP 2 57.8557225009291 30
average reward exploit: MDP 2 58.168213277035264 30
average reward exploit: MDP 2 58.699795380179495 30
average reward exploit: MDP 2 57.59431611186196 30
average reward exploit: MDP 2 58.29273233208605 30
average reward exploit: MDP 2 58.08576328215326 30
average reward exploit: MDP 2 58.4144862368937 30
average reward exploit: MDP 2 58.23640733676624 30
average reward exploit: MDP 2 57.73116923671284 30
average reward exploit: MDP 2 57.4704416797779 30
average reward explore: 69.14429439324914
average reward exploit: MDP 3 55.97390178509393 30
average reward exploit: MDP 3 56.16755377867386 30
average reward exploit: MDP 3 55.10886272734279 30
average reward exploit: MDP 3 55.36974067533957 30
average reward exploit: MDP 3 54.44174049760546 30
average reward exploit: MDP 3 55.452367481155584 30
average reward exploit: MDP 3 55.78177077866925 30
average reward exploit: MDP 3 55.84975954417098 3

average reward exploit: MDP 6 47.766605688212984 30
average reward exploit: MDP 6 47.70661328964088 30
average reward explore: 56.7098996626929
average reward exploit: MDP 7 47.92566079827486 30
average reward exploit: MDP 7 48.303167122944025 30
average reward exploit: MDP 7 47.5300972765317 30
average reward exploit: MDP 7 47.92014316765873 30
average reward exploit: MDP 7 47.77556148111727 30
average reward exploit: MDP 7 48.10969363032827 30
average reward exploit: MDP 7 47.696017626396944 30
average reward exploit: MDP 7 47.87135296836303 30
average reward exploit: MDP 7 47.4830039355118 30
average reward exploit: MDP 7 48.07495197438056 30
average reward explore: 54.496949047718
average reward exploit: MDP 8 46.42650517890923 30
average reward exploit: MDP 8 45.55709698433184 30
average reward exploit: MDP 8 46.21082508117841 30
average reward exploit: MDP 8 46.204703755727806 30
average reward exploit: MDP 8 46.159029091990426 30
average reward exploit: MDP 8 46.8693004890124 30

average reward exploit: MDP 1 61.47690401683501 30
average reward exploit: MDP 1 61.88556680032605 30
average reward exploit: MDP 1 61.23653538852441 30
average reward exploit: MDP 1 61.89316315361222 30
average reward explore: 73.28879057473075
average reward exploit: MDP 2 57.92350615705197 30
average reward exploit: MDP 2 58.93298400392375 30
average reward exploit: MDP 2 58.00550498432329 30
average reward exploit: MDP 2 57.46234857743338 30
average reward exploit: MDP 2 56.90803979943834 30
average reward exploit: MDP 2 57.19916557101858 30
average reward exploit: MDP 2 58.28454340807312 30
average reward exploit: MDP 2 58.75515455399373 30
average reward exploit: MDP 2 58.339453493934506 30
average reward exploit: MDP 2 58.15154060007228 30
average reward explore: 68.98454749352224
average reward exploit: MDP 3 56.12798502355614 30
average reward exploit: MDP 3 55.26970036685283 30
average reward exploit: MDP 3 55.890302971263516 30
average reward exploit: MDP 3 55.90105519531355

average reward exploit: MDP 6 47.69194114200079 30
average reward exploit: MDP 6 47.70023563915684 30
average reward exploit: MDP 6 48.15423198115476 30
average reward exploit: MDP 6 48.00544641317142 30
average reward exploit: MDP 6 48.084661276041196 30
average reward exploit: MDP 6 47.48403119368311 30
average reward explore: 57.328439562453
average reward exploit: MDP 7 47.75516919273296 30
average reward exploit: MDP 7 47.7721921582896 30
average reward exploit: MDP 7 48.28425489364049 30
average reward exploit: MDP 7 47.5324920886574 30
average reward exploit: MDP 7 47.44560533562572 30
average reward exploit: MDP 7 47.526036866800425 30
average reward exploit: MDP 7 47.50924336352533 30
average reward exploit: MDP 7 48.32994512517492 30
average reward exploit: MDP 7 48.09189124808389 30
average reward exploit: MDP 7 48.30599697544485 30
average reward explore: 54.74560203828034
average reward exploit: MDP 8 46.00513597655409 30
average reward exploit: MDP 8 46.17253920587186 30


average reward exploit: MDP 1 62.10142910283671 30
average reward exploit: MDP 1 61.880543308226294 30
average reward exploit: MDP 1 61.83269229077505 30
average reward exploit: MDP 1 62.282476386396674 30
average reward exploit: MDP 1 62.08828039631315 30
average reward exploit: MDP 1 61.54286826179512 30
average reward exploit: MDP 1 61.84055531124502 30
average reward exploit: MDP 1 61.688708207226746 30
average reward explore: 69.31850152121527
average reward exploit: MDP 2 58.8492102555643 30
average reward exploit: MDP 2 58.48196478678947 30
average reward exploit: MDP 2 57.86883677025005 30
average reward exploit: MDP 2 57.90129012803489 30
average reward exploit: MDP 2 58.14222419179866 30
average reward exploit: MDP 2 58.33747353534624 30
average reward exploit: MDP 2 57.159133546028755 30
average reward exploit: MDP 2 57.64805759787607 30
average reward exploit: MDP 2 58.149118206574265 30
average reward exploit: MDP 2 57.67255740547912 30
average reward explore: 67.463472424

average reward explore: 59.834480337055325
average reward exploit: MDP 6 48.29786897937813 30
average reward exploit: MDP 6 46.973688231269286 30
average reward exploit: MDP 6 47.70303261715071 30
average reward exploit: MDP 6 47.76498220729842 30
average reward exploit: MDP 6 47.5221545571428 30
average reward exploit: MDP 6 48.120238601413455 30
average reward exploit: MDP 6 47.54371765181772 30
average reward exploit: MDP 6 47.89183611989321 30
average reward exploit: MDP 6 47.76371636330488 30
average reward exploit: MDP 6 48.27299861783399 30
average reward explore: 56.764659902413605
average reward exploit: MDP 7 47.93147527995204 30
average reward exploit: MDP 7 47.75753986879142 30
average reward exploit: MDP 7 47.8894594777754 30
average reward exploit: MDP 7 47.35521117610945 30
average reward exploit: MDP 7 47.695237773026385 30
average reward exploit: MDP 7 48.3101326921367 30
average reward exploit: MDP 7 48.10237031831652 30
average reward exploit: MDP 7 47.56464260963663

average reward exploit: MDP 0 66.58830767235474 30
average reward exploit: MDP 0 66.57863333118812 30
average reward explore: 78.96965456985168
average reward exploit: MDP 1 62.32995444108314 30
average reward exploit: MDP 1 62.09272177063916 30
average reward exploit: MDP 1 61.8550666974116 30
average reward exploit: MDP 1 61.4709741263644 30
average reward exploit: MDP 1 62.16021487872578 30
average reward exploit: MDP 1 62.05746860964487 30
average reward exploit: MDP 1 61.704410653395385 30
average reward exploit: MDP 1 61.89841685158282 30
average reward exploit: MDP 1 61.70857846374344 30
average reward exploit: MDP 1 62.072075169857264 30
average reward explore: 74.45462479499078
average reward exploit: MDP 2 58.16435845418173 30
average reward exploit: MDP 2 58.3047720333425 30
average reward exploit: MDP 2 57.01410304717489 30
average reward exploit: MDP 2 57.15956798177671 30
average reward exploit: MDP 2 58.44808092600418 30
average reward exploit: MDP 2 57.77215041702179 30

average reward exploit: MDP 5 50.29123363853735 30
average reward exploit: MDP 5 51.09856795527911 30
average reward exploit: MDP 5 49.98648417107116 30
average reward exploit: MDP 5 50.79766148922183 30
average reward explore: 60.345814027359786
average reward exploit: MDP 6 47.56809074354705 30
average reward exploit: MDP 6 46.98079493533496 30
average reward exploit: MDP 6 47.33325701055261 30
average reward exploit: MDP 6 48.08716433744111 30
average reward exploit: MDP 6 47.789998696519255 30
average reward exploit: MDP 6 47.74593363439428 30
average reward exploit: MDP 6 47.872613232442895 30
average reward exploit: MDP 6 47.851966843143444 30
average reward exploit: MDP 6 47.548652159976264 30
average reward exploit: MDP 6 47.6825146006098 30
average reward explore: 57.79720183755195
average reward exploit: MDP 7 47.543870705203275 30
average reward exploit: MDP 7 47.074961088377066 30
average reward exploit: MDP 7 47.904398834185905 30
average reward exploit: MDP 7 46.987959436

average reward exploit: MDP 0 66.88488314696181 30
average reward exploit: MDP 0 66.80255035267338 30
average reward exploit: MDP 0 66.47455908343616 30
average reward exploit: MDP 0 66.97334635123774 30
average reward exploit: MDP 0 66.75985188982278 30
average reward exploit: MDP 0 66.19762462855925 30
average reward exploit: MDP 0 66.78303827790097 30
average reward explore: 80.23112705075567
average reward exploit: MDP 1 62.208556685482 30
average reward exploit: MDP 1 61.92819947657703 30
average reward exploit: MDP 1 62.05573132810825 30
average reward exploit: MDP 1 61.83251528059902 30
average reward exploit: MDP 1 61.84515903360259 30
average reward exploit: MDP 1 61.234008321126595 30
average reward exploit: MDP 1 61.84179772788588 30
average reward exploit: MDP 1 61.679013630523315 30
average reward exploit: MDP 1 62.031239349044284 30
average reward exploit: MDP 1 61.65658829229465 30
average reward explore: 75.8006421627135
average reward exploit: MDP 2 57.78916214070988 3

average reward exploit: MDP 5 50.74313200608427 30
average reward exploit: MDP 5 51.65773580736864 30
average reward exploit: MDP 5 50.188753097808885 30
average reward exploit: MDP 5 51.277430218008206 30
average reward exploit: MDP 5 50.8048436195721 30
average reward exploit: MDP 5 50.706008056938536 30
average reward exploit: MDP 5 51.39637815467784 30
average reward exploit: MDP 5 50.4953341524877 30
average reward exploit: MDP 5 50.98382058463141 30
average reward explore: 61.43853749451802
average reward exploit: MDP 6 48.1617996018845 30
average reward exploit: MDP 6 47.9360670040063 30
average reward exploit: MDP 6 47.72544602944164 30
average reward exploit: MDP 6 47.901500205966684 30
average reward exploit: MDP 6 47.752066757915394 30
average reward exploit: MDP 6 48.33215058137562 30
average reward exploit: MDP 6 47.73533758062692 30
average reward exploit: MDP 6 48.12576492355751 30
average reward exploit: MDP 6 47.72178487352082 30
average reward exploit: MDP 6 47.895534

average reward explore: 84.85546332957601
average reward exploit: MDP 0 66.06805441935032 30
average reward exploit: MDP 0 66.46610474021098 30
average reward exploit: MDP 0 66.27384782872184 30
average reward exploit: MDP 0 66.69394443077152 30
average reward exploit: MDP 0 66.30556780187581 30
average reward exploit: MDP 0 66.82270523107272 30
average reward exploit: MDP 0 67.04120279555852 30
average reward exploit: MDP 0 66.24269893785798 30
average reward exploit: MDP 0 67.10715131331463 30
average reward exploit: MDP 0 66.88951578664604 30
average reward explore: 79.7572464036351
average reward exploit: MDP 1 61.922582952301624 30
average reward exploit: MDP 1 61.87467403237806 30
average reward exploit: MDP 1 62.07760576965609 30
average reward exploit: MDP 1 61.31913211916655 30
average reward exploit: MDP 1 61.89554784620653 30
average reward exploit: MDP 1 61.72232918451129 30
average reward exploit: MDP 1 61.921643123887364 30
average reward exploit: MDP 1 62.104301042759666

average reward exploit: MDP 4 52.83893746272817 30
average reward exploit: MDP 4 52.44703326826049 30
average reward explore: 63.03409331754896
average reward exploit: MDP 5 50.846544354163264 30
average reward exploit: MDP 5 50.69275191112477 30
average reward exploit: MDP 5 50.587114315764175 30
average reward exploit: MDP 5 51.467568891494416 30
average reward exploit: MDP 5 51.311261835162945 30
average reward exploit: MDP 5 50.071044552928825 30
average reward exploit: MDP 5 50.599801250625646 30
average reward exploit: MDP 5 50.58167557529405 30
average reward exploit: MDP 5 50.51893515257959 30
average reward exploit: MDP 5 51.5840328427236 30
average reward explore: 59.55420359866796
average reward exploit: MDP 6 47.93969713370564 30
average reward exploit: MDP 6 47.9113788753911 30
average reward exploit: MDP 6 48.083054773907584 30
average reward exploit: MDP 6 47.86581455748213 30
average reward exploit: MDP 6 47.54183874845596 30
average reward exploit: MDP 6 47.31449271616

average reward exploit: MDP 9 44.42077984059691 30
average reward exploit: MDP 9 44.44536672091352 30
average reward exploit: MDP 9 44.420449572080514 30
average reward exploit: MDP 9 44.40556417697004 30
epoch number:  179
average reward explore: 86.4723727161026
average reward exploit: MDP 0 67.03220921011805 30
average reward exploit: MDP 0 66.48061553951814 30
average reward exploit: MDP 0 66.58035391828356 30
average reward exploit: MDP 0 66.86309029805673 30
average reward exploit: MDP 0 65.87889544568218 30
average reward exploit: MDP 0 66.9630051836044 30
average reward exploit: MDP 0 66.42062541651008 30
average reward exploit: MDP 0 66.4061793369646 30
average reward exploit: MDP 0 66.6663942271007 30
average reward exploit: MDP 0 66.92734478484734 30
average reward explore: 80.96648627870218
average reward exploit: MDP 1 62.088478924352366 30
average reward exploit: MDP 1 62.161472505508655 30
average reward exploit: MDP 1 62.05085417631796 30
average reward exploit: MDP 1 6

average reward exploit: MDP 4 52.019349980400875 30
average reward exploit: MDP 4 52.49591518774686 30
average reward exploit: MDP 4 53.037032473711655 30
average reward exploit: MDP 4 52.09247718847004 30
average reward exploit: MDP 4 52.67594781932515 30
average reward exploit: MDP 4 51.88723452366192 30
average reward explore: 65.68036784837128
average reward exploit: MDP 5 50.24474759980242 30
average reward exploit: MDP 5 51.113517380732866 30
average reward exploit: MDP 5 50.78429209476485 30
average reward exploit: MDP 5 51.58876538229607 30
average reward exploit: MDP 5 50.38966390264285 30
average reward exploit: MDP 5 50.15106351138815 30
average reward exploit: MDP 5 50.69699753752062 30
average reward exploit: MDP 5 51.04082986888463 30
average reward exploit: MDP 5 49.91988163718024 30
average reward exploit: MDP 5 50.414112519364174 30
average reward explore: 61.867595230556454
average reward exploit: MDP 6 48.14465361296854 30
average reward exploit: MDP 6 48.15483612598

average reward exploit: MDP 9 44.43451061908821 30
average reward exploit: MDP 9 44.32983416612543 30
average reward exploit: MDP 9 44.466324852986375 30
average reward exploit: MDP 9 44.375398590277634 30
average reward exploit: MDP 9 44.4524652690806 30
average reward exploit: MDP 9 44.41560578640443 30
average reward exploit: MDP 9 44.30707930055723 30
average reward exploit: MDP 9 44.72066370777368 30
epoch number:  182
average reward explore: 87.54791691239001
average reward exploit: MDP 0 65.89366641457814 30
average reward exploit: MDP 0 67.22235975627949 30
average reward exploit: MDP 0 65.88992854947558 30
average reward exploit: MDP 0 66.3991964633832 30
average reward exploit: MDP 0 66.07485178621299 30
average reward exploit: MDP 0 66.78672013718138 30
average reward exploit: MDP 0 66.08387121878458 30
average reward exploit: MDP 0 66.0552043479937 30
average reward exploit: MDP 0 67.11425863835574 30
average reward exploit: MDP 0 66.58111429875962 30
average reward explore

average reward explore: 67.43183147963742
average reward exploit: MDP 4 52.69746795453818 30
average reward exploit: MDP 4 52.61887929182056 30
average reward exploit: MDP 4 52.67169116106628 30
average reward exploit: MDP 4 52.7818986101656 30
average reward exploit: MDP 4 51.855673394193296 30
average reward exploit: MDP 4 53.06676290137087 30
average reward exploit: MDP 4 52.46315008650404 30
average reward exploit: MDP 4 52.569802058376574 30
average reward exploit: MDP 4 52.63024967828707 30
average reward exploit: MDP 4 52.422000743905464 30
average reward explore: 64.1545211936642
average reward exploit: MDP 5 51.372116601871014 30
average reward exploit: MDP 5 51.1374892190335 30
average reward exploit: MDP 5 50.73197314027279 30
average reward exploit: MDP 5 50.55968285324846 30
average reward exploit: MDP 5 49.78131624120834 30
average reward exploit: MDP 5 50.97956596619841 30
average reward exploit: MDP 5 50.884058724231856 30
average reward exploit: MDP 5 50.41747178269263

average reward exploit: MDP 8 46.09827962371739 30
average reward exploit: MDP 8 46.438724893718096 30
average reward explore: 53.30480193264021
average reward exploit: MDP 9 44.33033194835207 30
average reward exploit: MDP 9 44.42055991244569 30
average reward exploit: MDP 9 44.57179014895143 30
average reward exploit: MDP 9 44.577561918113936 30
average reward exploit: MDP 9 44.40051952686683 30
average reward exploit: MDP 9 44.31480865314587 30
average reward exploit: MDP 9 44.30501552589195 30
average reward exploit: MDP 9 44.322911001090574 30
average reward exploit: MDP 9 44.46293156339888 30
average reward exploit: MDP 9 44.303725481662994 30
epoch number:  185
average reward explore: 87.72935278964484
average reward exploit: MDP 0 66.5402371342515 30
average reward exploit: MDP 0 66.90457977490657 30
average reward exploit: MDP 0 66.33911911969045 30
average reward exploit: MDP 0 66.67186111740833 30
average reward exploit: MDP 0 66.6614670606155 30
average reward exploit: MDP 

average reward exploit: MDP 3 58.4475305995564 30
average reward exploit: MDP 3 58.17475210165042 30
average reward exploit: MDP 3 58.323515825942756 30
average reward exploit: MDP 3 58.47507447808502 30
average reward explore: 67.97243875855239
average reward exploit: MDP 4 65.73333589080123 30
average reward exploit: MDP 4 66.52771271337518 30
average reward exploit: MDP 4 66.16077406753074 30
average reward exploit: MDP 4 66.48117689640846 30
average reward exploit: MDP 4 66.25139476726538 30
average reward exploit: MDP 4 66.30827175153523 30
average reward exploit: MDP 4 66.21104493668146 30
average reward exploit: MDP 4 64.90166595114813 30
average reward exploit: MDP 4 59.83806240240933 30
average reward exploit: MDP 4 59.54573672631037 30
average reward explore: 63.95428211382404
average reward exploit: MDP 5 59.4379267808844 30
average reward exploit: MDP 5 59.732416553786635 30
average reward exploit: MDP 5 59.73383282746497 30
average reward exploit: MDP 5 59.79203537752371 3

average reward exploit: MDP 8 46.03997548703456 30
average reward exploit: MDP 8 46.33970119519147 30
average reward exploit: MDP 8 45.64439766459885 30
average reward exploit: MDP 8 46.45324919511269 30
average reward exploit: MDP 8 45.777962649428176 30
average reward exploit: MDP 8 46.817620914177304 30
average reward explore: 54.13831800170153
average reward exploit: MDP 9 44.307767816349376 30
average reward exploit: MDP 9 44.323217493565394 30
average reward exploit: MDP 9 44.44066489055297 30
average reward exploit: MDP 9 44.46422081369676 30
average reward exploit: MDP 9 44.296385391949755 30
average reward exploit: MDP 9 44.4428806156955 30
average reward exploit: MDP 9 44.439722178399784 30
average reward exploit: MDP 9 44.29739477413291 30
average reward exploit: MDP 9 44.247898632527566 30
average reward exploit: MDP 9 44.496094142086726 30
epoch number:  188
average reward explore: 88.2549938980209
average reward exploit: MDP 0 81.3710626782811 30
average reward exploit: M

average reward exploit: MDP 3 100.45840165347937 30
average reward exploit: MDP 3 100.28999172635143 30
average reward exploit: MDP 3 100.37958667673023 30
average reward exploit: MDP 3 100.09777213544167 30
average reward exploit: MDP 3 100.43527581685319 30
average reward exploit: MDP 3 100.2838994530301 30
average reward exploit: MDP 3 100.42980929351651 30
average reward exploit: MDP 3 100.39312396281267 30
average reward exploit: MDP 3 100.33960281092378 30
average reward explore: 68.12275555241953
average reward exploit: MDP 4 96.75919397642176 30
average reward exploit: MDP 4 95.6109967092067 30
average reward exploit: MDP 4 95.6039187972968 30
average reward exploit: MDP 4 95.72729350203637 30
average reward exploit: MDP 4 94.98753887660911 30
average reward exploit: MDP 4 96.18277859734549 30
average reward exploit: MDP 4 94.96083889202434 30
average reward exploit: MDP 4 95.57098889318817 30
average reward exploit: MDP 4 95.89584656910431 30
average reward exploit: MDP 4 96.0

average reward explore: 56.18327009946204
average reward exploit: MDP 8 46.15964203286937 30
average reward exploit: MDP 8 45.67031203859385 30
average reward exploit: MDP 8 46.341791781243096 30
average reward exploit: MDP 8 46.4878582186271 30
average reward exploit: MDP 8 46.67240522712972 30
average reward exploit: MDP 8 46.12677395428186 30
average reward exploit: MDP 8 46.83049886961794 30
average reward exploit: MDP 8 46.40858428701521 30
average reward exploit: MDP 8 46.078534728559376 30
average reward exploit: MDP 8 46.23423341163551 30
average reward explore: 54.02208484289805
average reward exploit: MDP 9 44.276606250959354 30
average reward exploit: MDP 9 44.82800257451463 30
average reward exploit: MDP 9 44.53615140892257 30
average reward exploit: MDP 9 44.43478635952494 30
average reward exploit: MDP 9 44.29844984051609 30
average reward exploit: MDP 9 44.43382832447985 30
average reward exploit: MDP 9 44.42449326745013 30
average reward exploit: MDP 9 44.43251319200049

average reward exploit: MDP 2 58.496394101162196 30
average reward exploit: MDP 2 57.665886235866324 30
average reward exploit: MDP 2 57.782896718897376 30
average reward explore: 72.10858870273745
average reward exploit: MDP 3 55.38764749647411 30
average reward exploit: MDP 3 54.838696337882126 30
average reward exploit: MDP 3 56.22202508033258 30
average reward exploit: MDP 3 56.09710410769801 30
average reward exploit: MDP 3 56.62850264501079 30
average reward exploit: MDP 3 55.92022768078187 30
average reward exploit: MDP 3 56.16149769312148 30
average reward exploit: MDP 3 56.015604596859234 30
average reward exploit: MDP 3 56.40168730580987 30
average reward exploit: MDP 3 55.940894424818154 30
average reward explore: 67.85426752898599
average reward exploit: MDP 4 51.41879808632011 30
average reward exploit: MDP 4 52.62703111513411 30
average reward exploit: MDP 4 52.410298447685854 30
average reward exploit: MDP 4 52.46834799882549 30
average reward exploit: MDP 4 52.624542767

average reward exploit: MDP 7 47.767914299419395 30
average reward exploit: MDP 7 47.97102641225295 30
average reward exploit: MDP 7 47.07688702058965 30
average reward exploit: MDP 7 47.951114741911525 30
average reward exploit: MDP 7 47.476636662686325 30
average reward explore: 55.00265414256167
average reward exploit: MDP 8 46.30578258542317 30
average reward exploit: MDP 8 46.85893515161296 30
average reward exploit: MDP 8 45.6452704022918 30
average reward exploit: MDP 8 47.24339839248315 30
average reward exploit: MDP 8 45.88529911542695 30
average reward exploit: MDP 8 46.51814140591761 30
average reward exploit: MDP 8 46.50982375119046 30
average reward exploit: MDP 8 46.458269334313066 30
average reward exploit: MDP 8 46.17151534423731 30
average reward exploit: MDP 8 45.84295466248245 30
average reward explore: 53.42533774087668
average reward exploit: MDP 9 44.28797485259867 30
average reward exploit: MDP 9 44.241849988746374 30
average reward exploit: MDP 9 44.263558530577

average reward exploit: MDP 2 58.03668365028167 30
average reward exploit: MDP 2 57.657014743007956 30
average reward exploit: MDP 2 58.235104610676906 30
average reward exploit: MDP 2 57.94128480839893 30
average reward exploit: MDP 2 58.16533881633333 30
average reward exploit: MDP 2 58.339335524071615 30
average reward exploit: MDP 2 58.538624913181636 30
average reward exploit: MDP 2 57.97664056854067 30
average reward explore: 70.14960539298642
average reward exploit: MDP 3 56.087535313469054 30
average reward exploit: MDP 3 54.87680716166672 30
average reward exploit: MDP 3 55.06822887586133 30
average reward exploit: MDP 3 55.93132982969053 30
average reward exploit: MDP 3 55.915955947514725 30
average reward exploit: MDP 3 55.56780893822291 30
average reward exploit: MDP 3 55.62526298102205 30
average reward exploit: MDP 3 55.84215947133806 30
average reward exploit: MDP 3 56.181353390343425 30
average reward exploit: MDP 3 56.02985343523476 30
average reward explore: 66.264852

average reward exploit: MDP 7 47.87769444091493 30
average reward exploit: MDP 7 47.743087383394204 30
average reward exploit: MDP 7 47.055073409840006 30
average reward exploit: MDP 7 48.32812757300708 30
average reward exploit: MDP 7 47.27551220675402 30
average reward exploit: MDP 7 47.528685919474455 30
average reward exploit: MDP 7 47.31793582970886 30
average reward exploit: MDP 7 47.161605420357155 30
average reward exploit: MDP 7 48.285436263529476 30
average reward exploit: MDP 7 46.994844583494796 30
average reward explore: 54.784925374336154
average reward exploit: MDP 8 46.38210059490739 30
average reward exploit: MDP 8 46.39870422558361 30
average reward exploit: MDP 8 46.71625379195284 30
average reward exploit: MDP 8 45.95005034922723 30
average reward exploit: MDP 8 46.268894105631425 30
average reward exploit: MDP 8 46.46989198236249 30
average reward exploit: MDP 8 46.616090527128414 30
average reward exploit: MDP 8 46.417588499634 30
average reward exploit: MDP 8 46.

average reward exploit: MDP 1 61.58503374082059 30
average reward exploit: MDP 1 62.68821345366691 30
average reward explore: 76.36374391574476
average reward exploit: MDP 2 58.38414406810781 30
average reward exploit: MDP 2 58.6863132705267 30
average reward exploit: MDP 2 57.935036434011096 30
average reward exploit: MDP 2 58.344031362358926 30
average reward exploit: MDP 2 58.10073635590925 30
average reward exploit: MDP 2 58.35883251341773 30
average reward exploit: MDP 2 58.55122478943885 30
average reward exploit: MDP 2 58.279555396056494 30
average reward exploit: MDP 2 58.91468284893606 30
average reward exploit: MDP 2 57.7662881146136 30
average reward explore: 72.74671283816407
average reward exploit: MDP 3 55.29519431738791 30
average reward exploit: MDP 3 56.25815903604395 30
average reward exploit: MDP 3 55.575140906932575 30
average reward exploit: MDP 3 56.2864204091456 30
average reward exploit: MDP 3 55.94624475523525 30
average reward exploit: MDP 3 56.2308275208326 3

average reward exploit: MDP 6 48.14546285732521 30
average reward exploit: MDP 6 47.40052686963064 30
average reward exploit: MDP 6 47.51361734495236 30
average reward exploit: MDP 6 47.755639739308954 30
average reward explore: 61.48038951850906
average reward exploit: MDP 7 47.72820867581065 30
average reward exploit: MDP 7 47.144622920149416 30
average reward exploit: MDP 7 47.321356901830356 30
average reward exploit: MDP 7 48.12192375480393 30
average reward exploit: MDP 7 47.73916254470809 30
average reward exploit: MDP 7 48.09216811439426 30
average reward exploit: MDP 7 47.77662340054132 30
average reward exploit: MDP 7 47.95521880585566 30
average reward exploit: MDP 7 47.690819225597146 30
average reward exploit: MDP 7 47.9010055369964 30
average reward explore: 58.706389927482164
average reward exploit: MDP 8 46.26242564336824 30
average reward exploit: MDP 8 45.49098012674761 30
average reward exploit: MDP 8 45.85799870462668 30
average reward exploit: MDP 8 46.226214028199

In [9]:
explore_paths, explore_rewards = a.sample_paths_explore(a.env,1)

In [10]:
print("average reward explore:", np.mean(explore_rewards))
M = a.stack_trajectories(explore_paths)

average reward explore: 78.12981287064275


In [11]:
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })

In [12]:
Z = np.reshape(Z,[1,len(Z)])

In [13]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)

In [14]:
print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

average reward exploit 4466.792213821265 2000


In [21]:
a.num_traj = 2000

In [None]:
for i in range(20):
    explore_paths, explore_rewards = a.sample_paths_explore(a.env,30*np.random.rand())
    print("gravity:", a.sim.model.opt.gravity)
    print("average reward explore:", np.mean(explore_rewards))
    M = a.stack_trajectories(explore_paths)
    Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
    Z = np.reshape(Z,[1,len(Z)])
    exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1)
    print("average reward exploit", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))
    

gravity: [  0.           0.         -14.23799705]
average reward explore: 55.876055596810076
average reward exploit 4131.248029732218 2000
gravity: [  0.           0.         -28.92469711]
average reward explore: 26.375929544684027
average reward exploit 4128.3822263169 2000
gravity: [  0.           0.         -13.78489803]
average reward explore: 56.98693803520648
average reward exploit 4131.243695054299 2000
gravity: [  0.           0.         -13.99769514]
average reward explore: 56.371840402102315
average reward exploit 4127.052310839154 2000
gravity: [  0.           0.         -24.81371372]
average reward explore: 35.30427806908966
average reward exploit 4130.655895877959 2000
gravity: [  0.          0.        -20.5110446]
average reward explore: 39.77185976141281
average reward exploit 4126.369389991184 2000
gravity: [  0.           0.         -24.32955676]
average reward explore: 36.2316832686866
average reward exploit 4124.35278186741 2000
gravity: [  0.           0.         -1

In [None]:
observations_explore = np.concatenate([path["observation"] for path in explore_paths])
new_observations_explore = np.concatenate([path["new_obs"] for path in explore_paths])
new_actions_explore = np.concatenate([path["new_acs"] for path in explore_paths])
actions_explore = np.concatenate([path["action"] for path in explore_paths])
rewards_explore = np.concatenate([path["reward"] for path in explore_paths])
returns_explore = a.get_returns(explore_paths, explore = True)

In [None]:
M = np.array(a.stack_trajectories(explore_paths))
Z = a.sess.run(a.Z, feed_dict = {a.encoder_input_placeholder: M,a.sequence_length_placeholder: a.length })
Z = np.reshape(Z,[1,len(Z)])

In [None]:
 print("average reward explore: MDP", np.sum(explore_rewards)/num_traj, len(explore_rewards))

In [None]:
exploit_paths, exploit_rewards = a.sample_paths_exploit(a.env,Z,1000)

In [None]:
print("average reward exploit: MDP", np.sum(exploit_rewards) / num_traj, len(exploit_rewards))

In [None]:
def randomize_pendulum(env):
    m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])
    l = config['pendulum_len_min'] + np.random.rand()*(config['pendulum_len_max'] - config['pendulum_len_min'])
    env.m = m
    env.l = l

In [None]:
def randomize_hopper(env):
    ts = config['torso_min'] + np.random.rand()*(config['torso_max'] - config['torso_min'])
    f = config['friction_min'] + np.random.rand()*(config['friction_max'] - config['friction_min'])
    
    env.friction = f
    env.torso_size = ts
    env.apply_env_modifications()

In [None]:
import yaml
cfg_filename = 'hopper-config.yml'
with open(cfg_filename,'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
env = Randomizer(gym.make('Hopper-v2'), randomize_hopper)
env.unwrapped.__dict__.keys()
env.reset()

In [None]:
m = config['pendulum_mass_min'] + np.random.rand()*(config['pendulum_mass_max'] - config['pendulum_mass_min'])

In [None]:
config

In [None]:
env_id = 'Hopper-v2'
e = gym.make(env_id)

In [None]:
e.unwrapped.__dict__.keys()

In [None]:
e.unwrapped.apply_env_modifications()

In [7]:
a.sim.model.opt

<mujoco_py.cymj.PyMjOption at 0x1c23ba8ac8>