# Run The Agent on Mountain Car

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
import gym

import pandas as pd

In [2]:
from vae_recurrent import VAE, create_decoder, create_encoder
from transition_gru import TransitionGRU
from recurrent_agent import DAIFAgentRecurrent
from prior_model import PriorModelBellman
from habitual_action_network import HabitualAction

In [3]:
from util import random_observation_sequence, transform_observations
from train_agent import train_single_agent

In [4]:
from identity_vae import IdentityVAE, identity_encoder, identity_decoder

What does the agent do?
- The agent plans using a policy then executes that policy for 12 simulation timesteps, the first two actions of the policy are executed for 6 steps each

What data does it accumulate?
- It accumulates 12 observation actions pairs

How is it trained?
- VAE is trained to reproduce observations using the latent states
- Transition is trained by taking previous hidden state and previous latent state and trying to predict the next latent state


Online learning For all tasks, we initialize all the agents with random weights and learn online only. Training an agent for 150 epochs takes about 3 minutes on a single CPU core (Intel I7-4870HQ). In contrast, previous approaches using active inference [Ueltzhöffer, 2018, Tschantz et al., 2019, 2020] and policy gradient methods (e.g., [Liu et al., 2017]) use (offline) policy replay and typically need hours of GPU-accelerated compute while achieving similar convergence. To our knowledge, this is the first model-based RL method to learn online using neural network representations. This is afforded by the high sample efficiency of the FEEF, which directs exploration towards states that are uncertain for both the encoder and transition models.


Why this is true?

In [5]:
# Hide GPU from visible devices
tf.config.set_visible_devices([], 'GPU')

## Test with no prior model FEEF

In [6]:
# class HabitualAction(keras.Model):
#
#     def __init__(self, latent_dim, action_dim, planning_horizon, dense_units, action_std_dev=0.05, **kwargs):
#         super(HabitualAction, self).__init__(**kwargs)
#
#         habit_action_inputs = layers.Input(latent_dim)
#         h = habit_action_inputs
#         for d in dense_units:
#             h = layers.Dense(d, activation="relu")(h)
#
#         a_mean = layers.Dense(action_dim, activation="tanh", name="z_mean")(h)
#         # a_log_sd = layers.Dense(action_dim, name="z_log_var")(h)
#         # a_stddev = tf.exp(a_log_sd)
#
#         # self.habit_action_model = keras.Model(habit_action_inputs, [a_mean, a_stddev], name="habit_action")
#         self.habit_action_model = keras.Model(habit_action_inputs, a_mean, name="habit_action")
#
#         # add the loss over all time steps
#         self.loss_tracker = keras.metrics.Sum(name="loss")
#
#         self.action_std_dev = action_std_dev
#
#     def call(self, inputs, training=None, mask=None):
#         return self.habit_action_model(inputs)
#
#
#     @property
#     def metrics(self):
#         return [self.loss_tracker]
#
#
#     def train_step(self, data):
#         # Unpack the data. Its structure depends on your model and
#         # on what you pass to `fit()`.
#
#         latent_states, outcomes = data
#         true_actions, cum_discounted_rewards = outcomes
#
#         # TODO what do I assume the
#         with tf.GradientTape() as tape:
#             # a_mean, a_stddev = self.habit_action_model(latent_states, training=True)  # Forward pass
#             a_mean = self.habit_action_model(latent_states, training=True)  # Forward pass
#
#             print(a_mean, true_actions)
#
#             log_loss = ll_gaussian(a_mean, true_actions, self.action_std_dev**2, use_consts=False)
#             weighted_log_loss = log_loss * cum_discounted_rewards
#
#             # need to multiply by negative one because neural net does gradient descent not ascent
#             neg_weighted_log_loss = -1 * weighted_log_loss
#
#         # Compute gradients
#         trainable_vars = self.trainable_variables
#         gradients = tape.gradient(neg_weighted_log_loss, trainable_vars)
#         # Update weights
#         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
#         # Update metrics (includes the metric that tracks the loss)
#         self.loss_tracker.update_state(neg_weighted_log_loss)
#         return {
#             "loss": self.loss_tracker.result()
#         }
#
#
# def compute_discounted_cumulative_reward(rewards, discount_factor):
#
#     gamma = np.ones_like(rewards) * discount_factor
#     # print(gamma.shape)
#     gamma_t = np.power(gamma, np.arange(rewards.shape[0]).reshape(rewards.shape[0], 1))  # discounted through time
#
#     # print(gamma_t)
#     # print(gamma_t.shape)
#
#     # discounted rewards starting from the start
#     discounted_rewards = np.multiply(rewards, gamma_t)
#
#     # print(rewards)
#     # print(discounted_rewards)
#
#     n = rewards.shape[0]
#
#     # upper trianglur matrix with row i equal to 1/(discount_factor**i)  indexing from 0
#     discount_factor_matrix = (np.tril(np.ones((n,n))) * (1/gamma_t)).T
#     discount_factor_matrix
#
#     # cumulative discounted_rewards
#     cumulative_discounted_rewards = np.matmul(discount_factor_matrix, discounted_rewards)
#     return cumulative_discounted_rewards
#
#
#
# def ll_gaussian(pred, target, variance, use_consts=True):
#
#     log_prob = -1 * ((pred - target)**2/(2*variance))
#
#     if use_consts:
#         const = 0.5*np.log(2*np.pi*variance)
#         log_prob += const
#
#     return tf.reduce_sum(log_prob, axis=1)

In [7]:
# import numpy as np
# # import gym
# import pandas as pd
# import tensorflow as tf
#
# from util import transform_observations
#
#
# def train_single_agent(mcc_env,
#                        agent,
#                        obs_max,
#                        obs_min,
#                        observation_noise_stddev,
#                        action_repeats,
#                        num_actions_to_execute,
#                        num_episodes=100,
#                        train_on_full_data=True,
#                        show_replay_training=False,
#                        replay_train_epochs=2,
#                        train_during_episode=True,
#                        train_vae=True,
#                        train_tran=True,
#                        train_prior=False,
#                        train_habit=True):
#
#     # Set up to store results in pandas frame
#     cols = ["episode", "success", "sim_steps", "VFE_post_run", "noise_stddev"]
#     rows = []
#
#     for n in range(num_episodes):
#
#         print("Episode", n+1)
#         # arrays to store observations, actions and rewards
#
#         all_pre_observations = []
#         all_post_observations = []
#         all_action = []
#         observation_sequence = []
#         actions_executed = []
#         reward_sequence = []
#
#         # get the first observation from the environment
#         first_observation = mcc_env.reset()
#         print(first_observation)
#         mcc_env.render()
#         # first_observation = np.array([first_observation, 0])
#
#         # apply noise to and scaling to first observation
#         first_observation_noisy = transform_observations(first_observation, obs_max, obs_min, observation_noise_stddev)
#
#         # find the first policy
#         policy_observation = first_observation_noisy
#         policy = agent.select_policy(policy_observation)
#
#         # loop until episode ends or the agent succeeds
#         t = 0
#         done = False
#         while not done:
#
#             # get the actions from the policy and reshape to desired form
#             actions = policy.mean()
#             actions = tf.reshape(actions, (actions.shape[0], agent.tran.action_dim))  # [num_actions, action_dim]
#             actions = actions.numpy()
#
#             # print(policy_observation, actions)
#
#             # get the actions that we will execute before changing policy
#             actions_to_execute = []
#             for action in actions[0:num_actions_to_execute]:
#                 actions_to_execute = actions_to_execute + [action]*action_repeats
#
#             # agent executes policy and gathers observations
#             for action in actions_to_execute:
#                 observation, reward, done, info = mcc_env.step(action)  # action should be array to satisfy gym requirements
#
#                 # view the environment
#                 mcc_env.render()
#
#                 actions_executed.append(action)
#                 observation_sequence.append(observation)
#                 reward_sequence.append(reward)
#
#                 t += 1
#                 if done:
#
#                     # did we succeed
#                     if t < 999:
#                         print(policy_observation)
#                         print(policy.mean())
#
#                         success = True
#
#                     else:
#                         success = False
#
#                     # get a full observations set
#                     all_post_observations = np.vstack(all_post_observations)
#                     all_pre_observations = np.vstack(all_pre_observations)
#                     all_action = np.vstack(all_action)
#
#                     # should we train on final full data run
#                     if train_on_full_data:
#                         # agent.model_vae.fit(all_post_observations, epochs=replay_train_epochs, verbose=show_replay_training)
#                         agent.reset_tran_hidden_state()
#                         agent.train(all_pre_observations, all_post_observations, all_action, rewards=None, train_vae=train_vae, train_tran=train_tran, train_prior=train_prior, train_habit=train_habit)
#
#
#                     # get the VFE of the model for the run
#                     VFE = float(tf.reduce_mean(agent.model_vae.compute_loss(all_post_observations)))
#
#                     # finally break free from the loop
#                     break
#
#             if not done:
#
#                 actions_executed = np.array(actions_executed).reshape((len(actions_executed), agent.tran.action_dim))
#
#                 # scale and add noise to the observation
#                 observation_sequence = transform_observations(observation_sequence, obs_max, obs_min, observation_noise_stddev)
#
#                 # get the noisy observations for pre and post actions
#                 pre_observation_sequence = np.vstack([policy_observation, observation_sequence[:-1]])
#                 post_action_observation_sequence = observation_sequence
#
#                 # print(post_action_observation_sequence)
#
#                 all_pre_observations.append(pre_observation_sequence)
#                 all_post_observations.append(post_action_observation_sequence)
#                 all_action.append(actions_executed)
#
#                 # print("pol", policy_observation)
#                 # print("obs", observation_sequence)
#                 # print("pre", pre_observation_sequence)
#                 # print("post", post_action_observation_sequence)
#
#                 # if time to train the agent
#                 if train_during_episode:
#                     agent.train(pre_observation_sequence, post_action_observation_sequence, actions_executed, reward_sequence, train_vae=train_vae, train_tran=train_tran, train_prior=train_prior, train_habit=train_habit)
#
#                 # the new observation we use to select a policy is the last observation in observation_sequences
#                 policy_observation = observation_sequence[-1]
#
#                 # select a new policy and clear everything
#                 policy = agent.select_policy(policy_observation)
#
#                 # clear the observations
#                 observation_sequence = []
#                 reward_sequence = []
#                 actions_executed = []
#
#         rows.append(dict(zip(cols, [n, success, t, VFE, observation_noise_stddev])))
#
#         if success:
#             print("Success in episode", n+1, "at time step", t)
#         else:
#             print("No Success")
#
#     results = pd.DataFrame(rows, columns=cols)
#
#     mcc_env.close()
#
#     return agent, results
#
#
#


In [8]:
# import tensorflow as tf
# import tensorflow_probability as tfp
# import numpy as np
#
# # from vae_recurrent import VAE
#
#
# class DAIFAgentRecurrent:
#
#     def __init__(self,
#                  prior_model,
#                  vae,
#                  tran,
#                  habitual_action_net,
#                  given_prior_mean,
#                  given_prior_stddev,
#                  agent_time_ratio=6,
#                  planning_horizon=15,
#                  n_policies=1500,
#                  n_cem_policy_iterations=2,
#                  n_policy_candidates=70,
#                  tran_train_epochs=1,
#                  vae_train_epochs=1,
#                  habit_train_epochs=1,
#                  train_vae=True,
#                  train_tran=True,
#                  train_prior_model=False,
#                  train_habit_net=False,
#                  use_kl_extrinsic=True,
#                  use_kl_intrinsic=True,
#                  use_FEEF=True,
#                  show_vae_training=False,
#                  show_tran_training=False,
#                  show_prior_training=False,
#                  show_habit_training=False):
#
#         super(DAIFAgentRecurrent, self).__init__()
#
#         self.planning_horizon = planning_horizon
#         self.n_policy_candidates = n_policy_candidates
#         self.n_policies = n_policies
#         self.n_cem_policy_iterations = n_cem_policy_iterations
#
#         self.vae_train_epochs = vae_train_epochs
#         self.tran_train_epochs = tran_train_epochs
#         self.train_habit_net = train_habit_net
#         self.train_vae = train_vae
#         self.train_tran = train_tran
#         self.habit_train_epochs = habit_train_epochs
#
#         # do we use the kl divergence for extrinsic vs intrinsic
#         self.use_kl_intrinsic = use_kl_intrinsic
#         self.use_kl_extrinsic = use_kl_extrinsic
#
#         # do we use the FEEF or EFE?
#         self.use_FEEF = use_FEEF
#
#         self.given_prior_mean = given_prior_mean
#         self.given_prior_stddev = given_prior_stddev
#
#         # full vae
#         self.model_vae = vae
#         self.model_vae.compile(optimizer=tf.keras.optimizers.Adam())
#         self.show_vae_training = show_vae_training
#
#         # transition
#         # takes action plus last state and outputs next latent state
#         self.tran = tran
#         self.tran.compile(optimizer=tf.keras.optimizers.Adam())
#         self.show_tran_training = show_tran_training
#         # track the hidden state of the transition gru model
#         self.hidden_state = None
#
#         # Prior model
#         self.prior_model = prior_model
#         self.train_prior = train_prior_model
#         self.show_prior_training = show_prior_training
#
#
#         # habitual action model
#         self.habit_action_model = habitual_action_net
#         self.habit_action_model.compile(optimizer=tf.keras.optimizers.Adam())
#         self.show_habit_training = show_habit_training
#
#         # how much is the agents planning time compressed compared to the simulation time
#         self.agent_time_ratio = agent_time_ratio
#
#
#     # We use this function to reset the hidden state of the transition model when we want to train on the full data set
#     def reset_tran_hidden_state(self):
#         self.hidden_state = None
#
#
#     # def perceive_and_act(self, observation, reward, done):
#
#
#
#
#     def train(self,
#               pre_observations_raw,
#               post_observations_raw,
#               actions_complete,
#               rewards,
#               train_vae=True,
#               train_tran=True,
#               train_prior=True,
#               train_habit=True):
#
#
#         # If we the episode terminates then pre observations and post obersvations raw could be any length but
#
#
#         # compress the observations based on the agents time compression factor
#         pre_observations = pre_observations_raw[::self.agent_time_ratio]  # for example take every 6th element
#         post_observations = post_observations_raw[self.agent_time_ratio - 1:][::self.agent_time_ratio]
#
#         # only look at the unique actions we took rather than the repeated actions
#         actions = actions_complete[::self.agent_time_ratio]
#
#         # there is a chance pre_observations is longer than post if we are training on the whole data set because the episode ended
#         if len(pre_observations) > len(post_observations):
#             print("yes")
#             print(pre_observations)
#             print(post_observations)
#             print(actions)
#             pre_observations = pre_observations[:-1]
#             actions = actions[:-1]
#
#         # print(pre_observations_raw)
#         # print(pre_observations)
#         # print(post_observations_raw)
#         # print(post_observations)
#
#         #### TRAIN THE TRANSITION MODEL ####
#         if train_tran:
#
#             num_observations = pre_observations.shape[0]
#             observation_dim = pre_observations.shape[1]
#             action_dim = actions.shape[1]
#             latent_dim = self.model_vae.latent_dim
#
#             # find the actual observed latent states using the vae
#             pre_latent_mean, pre_latent_stddev, pre_latent = self.model_vae.encoder(pre_observations)
#             post_latent_mean, post_latent_stddev, post_latent = self.model_vae.encoder(post_observations)
#
#             # set up the input training data that we use to train the transition model
#             z_train = np.concatenate([np.array(pre_latent_mean), np.array(actions)], axis=1)
#
#             # we use the sequence to find the right hidden states to use as input
#             z_train_seq = z_train.reshape((1, num_observations, latent_dim + action_dim))
#             z_train_singles = z_train.reshape(num_observations, 1, latent_dim + action_dim)
#
#             # the previous hidden state is the memory after observing some sequences but it might be None if we're just starting
#             if self.hidden_state is None:
#                 self.hidden_state = np.zeros((1, self.tran.hidden_units))
#
#             # find the hidden states at t=0, t=1, t=2, ..., t=num_observations - 1
#             _, _, _, h_states = self.tran((z_train_seq, self.hidden_state))
#
#             # squeeze so we make the shape [num_observations, hidden_units]
#             h_states = tf.squeeze(h_states)
#
#             # exclude the last state as this will become the hidden state later on. next hidden state will become our new memory
#             h_states_for_training = h_states[:-1]
#             # next_hidden_state = h_states[-1]
#
#             # add the current hidden state we saved to the start. This has h0, h1, h2, .. h=num_observations - 1
#             h_states_for_training = tf.concat([self.hidden_state, h_states_for_training], axis=0)
#
#             # use the hidden states with the pre and post observations to train transition model
#             self.tran.fit((z_train_singles, h_states_for_training), (post_latent_mean, post_latent_stddev), epochs=self.tran_train_epochs, verbose=self.show_tran_training, batch_size=z_train_singles.shape[0])
#
#             # now find the new predicted hidden state that we will use for finding the policy
#             # TODO not sure if I should pass the old hidden state or reset it to 0
#             _, _, final_hidden_state, _ = self.tran((z_train_seq, self.hidden_state))
#             # _, _, final_hidden_state, _ = self.tran((z_train_seq, None))
#
#             z_pred, _, _, _ = self.tran((z_train_singles, h_states_for_training))
#
#             # print("LOSS")
#             # print(tf.reduce_mean(self.tran.compute_loss((z_train_singles, h_states_for_training), (post_latent_mean, post_latent_stddev))))
#             #
#             # print("PRED VS REAL")
#             # for i in range(len(z_pred)):
#             #     print(z_pred[i], post_latent_mean[i])
#
#             self.hidden_state = final_hidden_state
#
#         #### TRAIN THE VAE ####
#         if train_vae:
#             # train the vae model on post_observations because these are all new
#             # self.model_vae.fit(pre_observations_raw, epochs=self.vae_train_epochs, verbose=self.show_vae_training)
#             self.model_vae.fit(pre_observations, epochs=self.vae_train_epochs, verbose=self.show_vae_training, batch_size=pre_observations.shape[0])
#
#             # print("true", pre_observations)
#             # print("pred", self.model_vae(pre_observations))
#
#         #### TRAIN THE PRIOR MODEL ####
#         if train_prior:
#             # self.prior_model.train(post_observations, rewards, verbose=self.show_prior_training)
#             self.prior_model.train(post_observations_raw, rewards, verbose=self.show_prior_training)
#
#         #### TRAIN THE HABIT ACTION NET ####
#         if train_habit:
#             self.habit_action_model.fit(pre_observations, actions, epochs=self.habit_train_epochs, verbose=self.show_habit_training, batch_size=pre_observations.shape[0])
#
#
#     def select_policy(self, observation):
#
#         # TODO do you take the mean or that latent here?
#         # get the latent state from this observation
#         _,  _, latent_state = self.model_vae.encoder(observation.reshape(1, observation.shape[0]))
#         # latent_state = latent_state.numpy().reshape((1, latent_state.shape[0]))
#
#         # print(latent_state)
#         # select the policy
#         policy_mean, policy_stddev = self.cem_policy_optimisation(latent_state)
#
#         # return a distribution that we can sample from
#         return tfp.distributions.MultivariateNormalDiag(loc=policy_mean, scale_diag=policy_stddev)
#
#     # TODO Fix this so we can use different action dimensions
#     def cem_policy_optimisation(self, z_t_minus_one):
#
#         # need to change these two if the policy dimension changes
#         mean_best_policies = tf.zeros(self.planning_horizon)
#         std_best_policies = tf.ones(self.planning_horizon)
#
#         for i in range(self.n_cem_policy_iterations):
#             policy_distr = tfp.distributions.MultivariateNormalDiag(loc=mean_best_policies, scale_diag=std_best_policies)
#             policies = policy_distr.sample([self.n_policies])
#             policies = tf.clip_by_value(policies, clip_value_min=-1, clip_value_max=1)
#
#             # project trajectory into the future using transition model and calculate FEEF for each policy
#             policy_results = self.forward_policies(policies.numpy(), z_t_minus_one)
#             FEEFs = self.evaluate_policy(*policy_results)
#
#             # print("POLICIES", policies)
#             # print("FEEFS", FEEFs)
#
#             FEEFs = tf.convert_to_tensor(FEEFs)
#
#             # sum over the timesteps to get the FEEF for each policy
#             FEEFs_sum = tf.reduce_sum(FEEFs, axis=0)
#
#             # multiply by one to find largest value which is euqivalent to smallest FEEF with top_k
#             neg_FEEF_sum = -1*FEEFs_sum
#
#             result = tf.math.top_k(neg_FEEF_sum, self.n_policy_candidates, sorted=False)
#             min_FEEF_indices = result.indices
#
#             # update the policy distributions
#             mean_best_policies = tf.reduce_mean(tf.gather(policies, min_FEEF_indices), axis=0)
#             std_best_policies = tf.math.reduce_std(tf.gather(policies, min_FEEF_indices), axis=0)
#
#
#         # TODO not sure why we need all of this is with the x means? I think it's for training but maybe not
#
#         # One last forward pass to gather the stats of the policy mean
#         #FEEFs, next_x_means, next_x_stds = self._forward_policies(mean_best_policies.unsqueeze(1))
#         # return mean_best_policies, std_best_policies, FEEFs.detach().squeeze(1), next_x_means.detach().squeeze(1), next_x_stds.detach().squeeze(1)
#
#         return mean_best_policies, std_best_policies
#
#
#     def forward_policies(self, policies, z_t_minus_one):
#         """
#         Forward propogate a policy and compute the FEEF of each policy
#         :param z_t_minus_one:
#         :return:
#         """
#
#         # stack up the new observation to have shape (self.n_policies, latent_dim) when z_t_minus is tensor with shape (1, latent_dim
#         prev_latent_mean = tf.squeeze(tf.stack([z_t_minus_one]*self.n_policies, axis=1))
#
#         policy_posteriors = []
#         policy_sds = []
#         likelihoods = []
#         z_means = []
#         z_sds = []
#
#         # get the starting hidden state that coressponds to the memory stored by the previous sequences. Should have shape (1, self.tran.num_hidden_units) for the observed sequence
#         # extend the current hidden state to the number of policies present
#         if self.hidden_state is None:
#             cur_hidden_state = np.zeros((self.n_policies, self.tran.hidden_units))
#         else:
#             cur_hidden_state = np.vstack([self.hidden_state]*self.n_policies)
#
#         # print(cur_hidden_state)
#
#         # find the predicted latent states from the transition model
#         for t in range(self.planning_horizon):
#
#             # print(prev_latent_mean)
#
#             ob_plus_action = np.concatenate([prev_latent_mean, policies[:, t].reshape(self.n_policies, 1)], axis=1)
#             tran_input = ob_plus_action.reshape((self.n_policies, 1, ob_plus_action.shape[1]))  # reshape to pass to GRU
#
#             # print(tran_input)
#
#             next_latent_mean, next_latent_sd, next_hidden_state, _ = self.tran((tran_input, cur_hidden_state))  # shape = [num policies, latent dim
#
#             # update the hidden state for use with the next policies
#             cur_hidden_state = next_hidden_state
#
#             policy_posteriors.append(next_latent_mean)
#             policy_sds.append(next_latent_sd)
#
#             next_likelihoods = self.model_vae.decoder(next_latent_mean)
#             likelihoods.append(next_likelihoods)
#
#             next_posterior_means, next_posteriors_sds, next_posteriors_z = self.model_vae.encoder(next_likelihoods)
#             z_means.append(next_posterior_means)
#             z_sds.append(next_posteriors_sds)
#
#             prev_latent_mean = next_latent_mean
#
#         return policy_posteriors, policy_sds, likelihoods, z_means, z_sds
#
#
#     def evaluate_policy(self, policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd):
#
#         if self.use_FEEF:
#             return self.FEEF(policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd)
#         else:
#             return self.EFE(policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd)
#
#
#     def FEEF(self, policy_posteriors_list, policy_sd_list, predicted_likelihood_list, predicted_posterior_list, predicted_posterior_sd_list):
#         """
#         Compute the FEEF for policy selection
#         :param policy_posteriors:
#         :param predicted_likelihood:
#         :param predicted_posterior:
#         :return:
#         """
#
#         FEEFs = []
#
#         for t in range(self.planning_horizon):
#
#             # extract the values for each time step
#             predicted_likelihood = predicted_likelihood_list[t]
#             policy_posteriors = policy_posteriors_list[t]
#             policy_sd = policy_sd_list[t]
#             predicted_posterior = predicted_posterior_list[t]
#             predicted_posterior_sd = predicted_posterior_sd_list[t]
#
#             # !!!! evaluate the EXTRINSIC KL divergence !!!!
#
#             # convert to normal distributions
#             # TODO Why is the stddev 1s here? I think because we assume it is on the true state of the world.
#
#             if self.use_kl_extrinsic:
#                 likelihood_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_likelihood, scale_diag=np.ones_like(predicted_likelihood))
#
#                 if self.prior_model is None:
#
#                     # TODO how exactly is the prior defined? After you apply transformations what is the prior
#                     # create the prior distribution
#                     prior_preferences_mean = tf.convert_to_tensor(np.stack([self.given_prior_mean]*self.n_policies), dtype="float32")
#                     prior_preferences_stddev = tf.convert_to_tensor(np.stack([self.given_prior_stddev]*self.n_policies), dtype="float32")
#
#                     prior_dist = tfp.distributions.MultivariateNormalDiag(loc=prior_preferences_mean, scale_diag=prior_preferences_stddev)
#
#                     kl_extrinsic = tfp.distributions.kl_divergence(likelihood_dist, prior_dist)
#
#                 # Compute the extrinisc approximation with the prior model
#                 else:
#                     kl_extrinsic = self.prior_model.extrinsic_kl(predicted_likelihood)
#                     kl_extrinsic = tf.reduce_sum(kl_extrinsic, axis=-1)
#
#             # if we don't use extrinsic set it to zero
#             else:
#                 kl_extrinsic = tf.zeros(self.n_policies, dtype="float")
#
#             # !!!! evaluate the KL INTRINSIC part !!!!
#             if self.use_kl_intrinsic:
#
#                 policy_posteriors_dist = tfp.distributions.MultivariateNormalDiag(loc=policy_posteriors, scale_diag=policy_sd)
#                 predicted_posterior_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_posterior, scale_diag=predicted_posterior_sd)
#
#                 kl_intrinsic = tfp.distributions.kl_divergence(predicted_posterior_dist, policy_posteriors_dist)
#
#             else:
#                 kl_intrinsic = tf.zeros(self.n_policies, dtype="float")
#
#             # print("Extrinsic", kl_extrinsic)
#             # print("Intrinsic", kl_intrinsic)
#
#             FEEF = kl_extrinsic - kl_intrinsic
#
#             FEEFs.append(FEEF)
#
#         return FEEFs
#
#
#     # TODO Find out how this works with the log probability extrinsic term
#     def EFE(self, policy_posteriors_list, policy_sd_list, predicted_likelihood_list, predicted_posterior_list, predicted_posterior_sd_list):
#         """
#         Compute the EFE for policy selection
#         :param policy_posteriors:
#         :param predicted_likelihood:
#         :param predicted_posterior:
#         :return:
#         """
#
#         EFEs = []
#
#         for t in range(self.planning_horizon):
#
#             # extract the values for each time step
#             predicted_likelihood = predicted_likelihood_list[t]
#             policy_posteriors = policy_posteriors_list[t]
#             policy_sd = policy_sd_list[t]
#             predicted_posterior = predicted_posterior_list[t]
#             predicted_posterior_sd = predicted_posterior_sd_list[t]
#
#             # !!!! evaluate the EXTRINSIC KL divergence !!!!
#
#             # convert to normal distributions
#             # TODO Why is the stddev 1s here? I think because we assume it is on the true state of the world.
#
#             if self.use_kl_extrinsic:
#                 likelihood_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_likelihood, scale_diag=np.ones_like(predicted_likelihood))
#
#                 if self.prior_model is None:
#
#                     # TODO how exactly is the prior defined? After you apply transformations what is the prior
#                     # create the prior distribution
#                     prior_preferences_mean = tf.convert_to_tensor(np.stack(self.given_prior_mean), dtype="float32")
#                     prior_preferences_stddev = tf.convert_to_tensor(np.stack(self.given_prior_stddev), dtype="float32")
#
#                     prior_dist = tfp.distributions.MultivariateNormalDiag(loc=prior_preferences_mean, scale_diag=prior_preferences_stddev)
#
#                     # compute extrinsic prior preferences term
#                     efe_extrinsic = -1 * tf.math.log(prior_dist.prob(predicted_likelihood))
#
#                 # TODO Can I use the learned prior model here?
#                 else:
#                     efe_extrinsic = self.prior_model.extrinsic_kl(predicted_likelihood)
#                     efe_extrinsic = tf.reduce_sum(efe_extrinsic, axis=-1)
#
#             # if we don't use extrinsic set it to zero
#             else:
#                 efe_extrinsic = tf.zeros(self.n_policies, dtype="float")
#
#             # !!!! evaluate the KL INTRINSIC part !!!!
#             if self.use_kl_intrinsic:
#
#                 policy_posteriors_dist = tfp.distributions.MultivariateNormalDiag(loc=policy_posteriors, scale_diag=policy_sd)
#                 predicted_posterior_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_posterior, scale_diag=predicted_posterior_sd)
#
#                 kl_intrinsic = tfp.distributions.kl_divergence(predicted_posterior_dist, policy_posteriors_dist)
#
#             else:
#                 kl_intrinsic = tf.zeros(self.n_policies, dtype="float")
#
#             # print("EX", efe_extrinsic)
#             # print("IN", kl_intrinsic)
#
#             EFE = efe_extrinsic - kl_intrinsic
#
#             EFEs.append(EFE)
#
#         return EFEs

In [14]:
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
vae = VAE(enc, dec, 2,  [0, 0], [0.3, 0.3], llik_scaling=1)

pln_hrzn = 5
latent_dim = 2

tran = TransitionGRU(2, 1, 12, 2*pln_hrzn*latent_dim, 2)

habit_net = HabitualAction(latent_dim, 1, pln_hrzn, [20, 20])

# unscaled prior mean and prior stddev
prior_mean = [0.45, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

# observation_noise_stddev = [0, 0]
observation_noise_stddev = [0.05, 0.05]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None,
                           vae,
                           tran,
                           habit_net,
                           scaled_prior_mean,
                           prior_stddev,
                           planning_horizon=pln_hrzn,
                           use_kl_extrinsic=True,
                           use_kl_intrinsic=True,
                           use_FEEF=True,
                           vae_train_epochs=2,
                           tran_train_epochs=2,
                           habit_train_epochs=1,
                           show_vae_training=True,
                           show_tran_training=True,
                           show_habit_training=False)
                           # n_policies=50,
                           # n_policy_candidates=5)

scaled_prior_mean

array([0.83333333, 0.        ])

In [15]:
# train the agent on the env
env = gym.make('MountainCarContinuous-v0')
agent, results = train_single_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=100, action_repeats=6, num_actions_to_execute=2, train_on_full_data=True, show_replay_training=True, train_during_episode=True, train_vae=True, train_tran=True, train_prior=False, train_habit=False)

Episode 1
[-0.5536803  0.       ]
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 

In [40]:
num_seqs = 20
seq_length = 500
ob_dim = 2
ob_seqs = []
next_obs = []
observations = []
actions = []

pre_obs = []
post_obs = []

env = gym.make('MountainCarContinuous-v0')

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, 1000, epsilon=0.1)

    o = transform_observations(o, observation_max, observation_min, observation_noise_stddev)
    actions.append(a)

    pre_obs.append(o[:-1])
    post_obs.append(o[1:])


    observations.append(o)
    # ob_seqs.append(train)
    # next_obs.append(test)

observations = np.vstack(observations)
actions = np.vstack(actions)

pre_obs = np.vstack(pre_obs)
post_obs = np.vstack(post_obs)


print(pre_obs.shape, actions.shape)

(10364, 2) (10364, 1)


In [41]:
agent.habit_action_model(pre_obs)

[<tf.Tensor: shape=(10364, 1), dtype=float32, numpy=
 array([[0.9748195 ],
        [0.97457135],
        [0.9743734 ],
        ...,
        [0.9219717 ],
        [0.9209647 ],
        [0.920074  ]], dtype=float32)>,
 <tf.Tensor: shape=(10364, 1), dtype=float32, numpy=
 array([[1.1854709],
        [1.1811893],
        [1.1769786],
        ...,
        [1.2008821],
        [1.1915183],
        [1.1825242]], dtype=float32)>]

In [86]:
observations

array([[-0.14843825,  0.02739476],
       [-0.16762309, -0.02662175],
       [-0.15353353, -0.03600727],
       ...,
       [ 0.79431692,  0.14646526],
       [ 0.88328971,  0.15770078],
       [ 0.82112477,  0.15569911]])

In [87]:
res = agent.model_vae(observations)
res

<tf.Tensor: shape=(8458, 2), dtype=float32, numpy=
array([[-0.09615259, -0.03364024],
       [-0.24450868, -0.08118109],
       [-0.1103467 , -0.08155781],
       ...,
       [ 0.7594091 ,  0.16196477],
       [ 0.78782696,  0.1396032 ],
       [ 0.8218754 ,  0.13280863]], dtype=float32)>

In [60]:
m, s, z_pre = agent.model_vae.encoder(pre_obs)
m, s, z_post = agent.model_vae.encoder(post_obs)
z_pre

<tf.Tensor: shape=(12640, 2), dtype=float32, numpy=
array([[-1.0801549e-04, -1.3623969e-01],
       [-1.1207108e-01,  5.6329429e-02],
       [-6.4431690e-02, -2.6241505e-01],
       ...,
       [ 2.2340380e-01,  3.6851272e-02],
       [ 4.0759775e-01,  1.6808099e-01],
       [ 2.2341438e-01,  1.9690935e-01]], dtype=float32)>

In [63]:
z_plus_a = np.concatenate([z_pre, actions], axis=1)
z_plus_a

array([[-1.0801549e-04, -1.3623969e-01, -4.2869693e-01],
       [-1.1207108e-01,  5.6329429e-02, -4.2869693e-01],
       [-6.4431690e-02, -2.6241505e-01,  5.8256221e-01],
       ...,
       [ 2.2340380e-01,  3.6851272e-02,  7.8342098e-01],
       [ 4.0759775e-01,  1.6808099e-01,  7.8342098e-01],
       [ 2.2341438e-01,  1.9690935e-01,  7.8342098e-01]], dtype=float32)

In [38]:
agent.tran((z, np.zeros_like(z)))

ValueError: Exception encountered when calling layer "transition_gru_5" (type TransitionGRU).

Input 1 of layer "transition" is incompatible with the layer: expected shape=(None, 20), found shape=(10371, 2)

Call arguments received:
  • inputs=('tf.Tensor(shape=(10371, 2), dtype=float32)', 'tf.Tensor(shape=(10371, 2), dtype=float32)')
  • training=None
  • mask=None

## Test without the replay training

In [6]:
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
vae = VAE(enc, dec, 2,  [0, 0], [0.3, 0.3], llik_scaling=1)

tran = TransitionGRU(2, 1, 12, 20, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.45, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

# observation_noise_stddev = [0, 0]
observation_noise_stddev = [0.05, 0.05]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None,
                           vae,
                           tran,
                           scaled_prior_mean,
                           prior_stddev,
                           planning_horizon=5,
                           use_kl_extrinsic=True,
                           use_kl_intrinsic=True,
                           use_FEEF=True,
                           vae_train_epochs=1,
                           tran_train_epochs=1,
                           show_vae_training=False)

scaled_prior_mean

array([0.83333333, 0.        ])

In [7]:
# train the agent on the env
env = gym.make('MountainCarContinuous-v0')
agent, results = train_single_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=40, action_repeats=6, num_actions_to_execute=2, train_on_full_data=False, show_replay_training=False)

  deprecation(
  deprecation(


Episode 1
[-0.5452394  0.       ]
Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.


2022-08-16 14:32:26.876947: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[0.80860598 0.09458183]
tf.Tensor([ 0.9830013   0.7846042   0.3220308  -0.01687961  0.16555098], shape=(5,), dtype=float32)
Success in episode 1 at time step 369
Episode 2
[-0.57064813  0.        ]
[0.47878008 0.40288668]
tf.Tensor([ 0.957456    0.79769796  0.22657327 -0.27285808 -0.09558801], shape=(5,), dtype=float32)
Success in episode 2 at time step 106
Episode 3
[-0.4439096  0.       ]
[0.55124285 0.51222228]
tf.Tensor([ 0.9207449  -0.5341525  -0.88466763 -0.8933735  -0.15289223], shape=(5,), dtype=float32)
Success in episode 3 at time step 92
Episode 4
[-0.55050933  0.        ]
No Success
Episode 5
[-0.53231597  0.        ]
[0.7296093  0.15947908]
tf.Tensor([ 0.9820634   0.93507713  0.9540813   0.42379785 -0.84602374], shape=(5,), dtype=float32)
Success in episode 5 at time step 125
Episode 6
[-0.4112631  0.       ]
No Success
Episode 7
[-0.53678113  0.        ]
No Success
Episode 8
[-0.42946008  0.        ]
[0.74362838 0.34122923]
tf.Tensor([0.7967912  0.9772121  0.9674017  0.93

In [8]:
observations

NameError: name 'observations' is not defined

In [9]:
res = agent.model_vae(observations)
res

NameError: name 'observations' is not defined

## Test with the prior model FEEF

In [9]:
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
vae = VAE(enc, dec, 2, [0, 0], [0.3, 0.3], llik_scaling=1, recon_stddev=0.05)

pl_hoz = 5
latent_dim = 2

tran = TransitionGRU(2, 1, 12, 2*latent_dim*pl_hoz, 2)

# unscaled prior mean and prior stddev
prior_model = PriorModelBellman(2)

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

# observation_noise_stddev = [0, 0]
observation_noise_stddev = [0.05, 0.05]

daifa = DAIFAgentRecurrent(prior_model,
                           vae,
                           tran,
                           None,
                           None,
                           train_prior_model=True,
                           planning_horizon=pl_hoz,
                           use_kl_extrinsic=True,
                           use_kl_intrinsic=True,
                           use_FEEF=True,
                           vae_train_epochs=1,
                           tran_train_epochs=1)

In [10]:
# train the agent on the env
env = gym.make('MountainCarContinuous-v0')
agent, results = train_single_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=40, action_repeats=6, num_actions_to_execute=5, train_on_full_data=False, show_replay_training=True, train_during_episode=True, train_vae=True, train_tran=True, train_prior=True)

Episode 1
[-0.5058971  0.       ]
Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.


2022-08-19 13:15:13.314620: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[0.3039005  0.71092576]
tf.Tensor([ 0.19833167 -0.18556146 -0.01475587  0.9705029   0.9624671 ], shape=(5,), dtype=float32)
Success in episode 1 at time step 885
Episode 2
[-0.5088794  0.       ]
[-0.43504306  0.78424563]
tf.Tensor([ 0.9206759  -0.9428289   0.32532373  0.9761217   0.97713995], shape=(5,), dtype=float32)
Success in episode 2 at time step 779
Episode 3
[-0.5958802  0.       ]
[0.4419062  0.76586994]
tf.Tensor([0.94090074 0.9010319  0.84719336 0.8775117  0.84439796], shape=(5,), dtype=float32)
Success in episode 3 at time step 518
Episode 4
[-0.58932567  0.        ]
[-0.13395191  0.59637804]
tf.Tensor([0.96085453 0.9359395  0.93016607 0.8845991  0.6142043 ], shape=(5,), dtype=float32)
Success in episode 4 at time step 234
Episode 5
[-0.51195633  0.        ]
[0.29763652 0.46101472]
tf.Tensor([0.95064896 0.77370644 0.9154784  0.94035494 0.9007728 ], shape=(5,), dtype=float32)
Success in episode 5 at time step 135
Episode 6
[-0.5905234  0.       ]
No Success
Episode 7
[-0.45

## Test the models produced

In [11]:
num_seqs = 20
seq_length = 300
ob_dim = 2
ob_seqs = []
next_obs = []
observations = []

env = gym.make('MountainCarContinuous-v0')

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, 1000, epsilon=0.1)

    o = transform_observations(o, observation_max, observation_min, observation_noise_stddev)

    train = np.concatenate([o[:-1], a], axis=1)
    # train = o[:-1]
    test = o[-1]

    observations.append(o)
    ob_seqs.append(train)
    next_obs.append(test)

# ob_seqs = np.array(ob_seqs)
# next_obs = np.array(next_obs)

observations = np.vstack(observations)
# observations = observations.reshape((num_seqs*(seq_length+1), ob_dim))

# ob_seqs_stddev = np.ones_like(ob_seqs_flat)
# next_obs_stddev = np.ones_like(ob_seqs_flat)

observations.shape

(11795, 2)

In [12]:
observations

array([[-0.26057783, -0.04207603],
       [-0.28624836, -0.08627176],
       [-0.19631219, -0.02719388],
       ...,
       [-0.01224737, -0.17963201],
       [ 0.04159929, -0.13444244],
       [-0.01116673, -0.20939676]])

In [14]:
agent.model_vae(observations)

<tf.Tensor: shape=(11795, 2), dtype=float32, numpy=
array([[-0.28514823,  0.00535257],
       [-0.27190113,  0.04403386],
       [-0.3022769 , -0.11604911],
       ...,
       [-0.18510318,  0.17403544],
       [-0.44161722, -0.13208804],
       [-0.3475597 , -0.16978927]], dtype=float32)>

In [15]:
tf.reduce_mean(agent.model_vae.compute_loss(observations))

<tf.Tensor: shape=(), dtype=float32, numpy=54.17067>

In [19]:
observations

array([[ 0.03154742, -0.09820337],
       [-0.0969649 ,  0.07948929],
       [-0.1169571 , -0.01325517],
       ...,
       [ 0.78184811,  0.71964955],
       [ 0.82025732,  0.6818092 ],
       [ 0.78129383,  0.67271704]])

In [17]:
agent.prior_model(observations)

<tf.Tensor: shape=(12967, 2), dtype=float32, numpy=
array([[-0.17886269, -0.19079794],
       [-0.19636999, -0.21472369],
       [-0.21236189, -0.21023679],
       ...,
       [ 0.16945922, -0.18041734],
       [ 0.17687897, -0.17450874],
       [ 0.16238074, -0.17725156]], dtype=float32)>

In [18]:
agent.prior_model.extrinsic_kl(observations)

<tf.Tensor: shape=(12967, 2), dtype=float32, numpy=
array([[1.1788627 , 1.1907979 ],
       [1.19637   , 1.2147237 ],
       [1.2123619 , 1.2102368 ],
       ...,
       [0.8305408 , 1.1804173 ],
       [0.823121  , 1.1745087 ],
       [0.83761925, 1.1772516 ]], dtype=float32)>

## Test EFE

In [206]:
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
vae = VAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=1)

pl_hoz = 5
latent_dim = 2

tran = TransitionGRU(2, 1, 12, 2*pl_hoz*latent_dim, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.45, 0]
prior_stddev = [1, 1]

# unscaled prior mean and prior stddev
prior_model = PriorModelBellman(2)

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

# without prior model
daifa = DAIFAgentRecurrent(None,
                           vae,
                           tran,
                           scaled_prior_mean,
                           prior_stddev,
                           planning_horizon=pl_hoz,
                           use_kl_extrinsic=True,
                           use_kl_intrinsic=True,
                           use_FEEF=False,
                           vae_train_epochs=1,
                           tran_train_epochs=1,
                           show_vae_training=False)

# with prior model
daifa = DAIFAgentRecurrent(prior_model,
                           vae,
                           tran,
                           None,
                           None,
                           planning_horizon=pl_hoz,
                           use_kl_extrinsic=True,
                           use_kl_intrinsic=True,
                           use_FEEF=False,
                           vae_train_epochs=1,
                           tran_train_epochs=1,
                           show_vae_training=False)

scaled_prior_mean

array([0.83333333, 0.        ])

In [207]:
# train the agent on the env
env = gym.make('MountainCarContinuous-v0')
agent, results = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=20, action_repeats=10, num_actions_to_execute=2, train_on_full_data=True)

Episode 1
[0.32907545 0.81204165]
tf.Tensor([0.93988144 0.9176549  0.9576125  0.92440426 0.50135976], shape=(5,), dtype=float32)
Epoch 1/2
Epoch 2/2
Success in episode 1 at time step 389
Episode 2
Epoch 1/2
Epoch 2/2
No Success
Episode 3
[0.69670431 0.33783742]
tf.Tensor([ 0.9314696  -0.9435271   0.9645834   0.8415556   0.56516707], shape=(5,), dtype=float32)
Epoch 1/2
Epoch 2/2
Success in episode 3 at time step 306
Episode 4
Epoch 1/2
Epoch 2/2
No Success
Episode 5
[0.49734786 0.77757351]
tf.Tensor([ 0.98152953 -0.9801048   0.346932    0.32506007 -0.20486811], shape=(5,), dtype=float32)
Epoch 1/2
Epoch 2/2
Success in episode 5 at time step 426
Episode 6
[-0.08916565  0.69457532]
tf.Tensor([ 0.86979437  0.9163867   0.45106322 -0.9567182  -0.96376824], shape=(5,), dtype=float32)
Epoch 1/2
Epoch 2/2
Success in episode 6 at time step 560
Episode 7
[0.7740761  0.43105478]
tf.Tensor([-0.8406548   0.22958903 -0.86530244  0.16205291  0.9585254 ], shape=(5,), dtype=float32)
Epoch 1/2
Epoch 2/2

In [208]:
## Test the models produced
num_seqs = 20
seq_length = 300
ob_dim = 2
ob_seqs = []
next_obs = []
observations = []

env = gym.make('MountainCarContinuous-v0')

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, 1000, epsilon=0.1)

    o = transform_observations(o, observation_max, observation_min, observation_noise_stddev)

    train = np.concatenate([o[:-1], a], axis=1)
    # train = o[:-1]
    test = o[-1]

    observations.append(o)
    ob_seqs.append(train)
    next_obs.append(test)

# ob_seqs = np.array(ob_seqs)
# next_obs = np.array(next_obs)

observations = np.vstack(observations)
# observations = observations.reshape((num_seqs*(seq_length+1), ob_dim))

# ob_seqs_stddev = np.ones_like(ob_seqs_flat)
# next_obs_stddev = np.ones_like(ob_seqs_flat)

observations.shape

(12806, 2)

In [209]:
observations

array([[-0.27734923,  0.        ],
       [-0.27835019, -0.01286914],
       [-0.28034457, -0.02564205],
       ...,
       [ 0.78566315,  0.35808382],
       [ 0.81395161,  0.36370895],
       [ 0.8428795 ,  0.37193014]])

In [210]:
agent.model_vae(observations)
## Test EFE

<tf.Tensor: shape=(12806, 2), dtype=float32, numpy=
array([[-0.26162374, -0.03878228],
       [-0.27726045, -0.06711522],
       [-0.24541792, -0.04369228],
       ...,
       [ 0.7668527 ,  0.39962938],
       [ 0.93341327,  0.4722877 ],
       [ 0.8112131 ,  0.41610983]], dtype=float32)>

## Testing the Identity VAE

In [None]:
enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

hidden_size = 2*2*15  # 2*latent_dim * planning_size
tran = TransitionGRU(2, 1, 12, hidden_size, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.6, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev)

In [None]:
env = gym.make('MountainCarContinuous-v0')

agent, succeeded, time_to_success = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=5)

In [None]:
out = agent.tran((ob_seqs[0:1], None))
out

In [None]:
t = ob_seqs[0:1, -1].reshape(1,1,3)
h = out[3]
h = h[0, -2, :]
h = h.numpy().reshape(1,30)
h

In [None]:
agent.tran((t, h))

In [None]:
ob_seqs[0:1]

## Test to see how the agent trains on standard observation data

In [8]:
enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

hidden_size = 2*2*15  # 2*latent_dim * planning_size
tran = TransitionGRU(2, 1, 12, hidden_size, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.45, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev, vae_train_epochs=1, tran_train_epochs=2)

In [None]:
env = gym.make('MountainCarContinuous-v0')
success, agent, t, pre_obs, post_obs, acts = run_episode(env, daifa, observation_max, observation_min, observation_noise_stddev)

In [None]:
pre_np = np.array(pre_obs)
a = np.array(acts)
a.shape
pre_a = np.concatenate([pre_np, a], axis=2)

In [None]:
print(a.max(), a.min())

In [None]:
pre_a

In [None]:
post_obs_to_predict = np.array(post_obs)[:, 14, :]
post_obs_to_predict

In [None]:
agent.tran((pre_a, None))

In [None]:
post_obs_to_predict

## Examine training the model on the observation data

Does it eventually converge to a good model

In [None]:
num_train_runs = 1
for i in range(num_train_runs):

    for j in range(len(pre)):
        pre = pre_obs[j]
        post = post_obs[j]
        actions = acts[j]

        daifa.train(pre, post, actions, None, verbose=1)

In [None]:
daifa.cem_policy_optimisation(np.array([0.5, 0.1]))

In [None]:
daifa.cem_policy_optimisation

## Test the FEEF computations

In [8]:
# enc = create_encoder(2, 2, [20])
# dec = create_decoder(2, 2, [20])
# vae = VAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

tran = TransitionGRU(2, 1, 12, 60, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.6, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0, 0])  # no noise on prior

print(scaled_prior_mean)

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev, planning_horizon=15, n_policy_candidates=70, n_policies=1500, n_cem_policy_iterations=5)

[1.  0.5]


In [9]:
env = gym.make('MountainCarContinuous-v0')

agent, succeeded, time_to_success = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev,
                                                num_episodes=1)

Episode 1
Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.


2022-08-06 15:23:48.777636: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz

KeyboardInterrupt



In [None]:
def test_policy(agent, env, policy, action_repeats):

    observation = env.reset()
    obs = transform_observations(observation, observation_max, observation_min, [0, 0])

    z_t_minus_1 = obs
    p, s = agent.cem_policy_optimisation(z_t_minus_1)
    p
    print(obs)
    print(p)

    for action in p:
        for t in range(action_repeats):
            res = env.step(np.array([action]))
            print(res)

In [None]:
z_t_minus_1 = np.array([0, 0])
p, s = agent.cem_policy_optimisation(z_t_minus_1)
p

agent.forward_policies(p, z_t_minus_1)

In [None]:
env = gym.make('MountainCarContinuous-v0')

test_policy(agent, env, p.numpy(), 6)

In [None]:
z_t_minus_1 = np.array([-0.27691475,  0.01688306])
p, s = agent.cem_policy_optimisation(z_t_minus_1)
p

In [182]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

from vae_recurrent import VAE


class DAIFAgentRecurrent:

    def __init__(self,
                 prior_model,
                 vae,
                 tran,
                 given_prior_mean,
                 given_prior_stddev,
                 planning_horizon=15,
                 n_policies=1500,
                 n_cem_policy_iterations=2,
                 n_policy_candidates=70,
                 tran_train_epochs=1,
                 vae_train_epochs=1,
                 agent_time_ratio=6,
                 train_vae=True,
                 train_tran=True):

        super(DAIFAgentRecurrent, self).__init__()

        self.prior_model = prior_model
        self.planning_horizon = planning_horizon
        self.n_policy_candidates = n_policy_candidates
        self.n_policies = n_policies
        self.n_cem_policy_iterations = n_cem_policy_iterations

        self.vae_train_epochs = vae_train_epochs
        self.tran_train_epochs = tran_train_epochs
        self.train_vae = train_vae
        self.train_tran = train_tran

        self.given_prior_mean = given_prior_mean
        self.given_prior_stddev = given_prior_stddev

        # full vae
        self.model_vae = vae
        self.model_vae.compile(optimizer=tf.keras.optimizers.Adam())

        # transition
        # takes action plus last state and outputs next latent state
        self.tran = tran
        self.tran.compile(optimizer=tf.keras.optimizers.Adam())

        self.hidden_state = None

        # how much is the agents planning time compressed compared to the simulation time
        self.agent_time_ratio = agent_time_ratio


    def select_policy(self, observation):

        policy_mean, policy_stddev = self.cem_policy_optimisation(observation)

        # return a distribution that we can sample from
        return tfp.distributions.MultivariateNormalDiag(loc=policy_mean, scale_diag=policy_stddev)


    def train(self, pre_observations_raw, post_observations_raw, actions_complete, rewards, verbose=0):

        # compress the observations based on the agents time compression factor
        # pre_observations = pre_observations_raw[::self.agent_time_ratio]  # for example take every 6th element
        # post_observations = np.array([post_observations_raw[i] for i in range(len(post_observations_raw)) if i % self.agent_time_ratio == self.agent_time_ratio - 1])
        #
        # print(pre_observations_raw)
        # print(pre_observations)
        # print(post_observations_raw)
        # print(post_observations)

        pre_observations = pre_observations_raw
        post_observations = post_observations_raw

        # only look at the first n actions that we took
        actions = actions_complete[0: len(pre_observations)]

        num_observations = pre_observations.shape[0]
        observation_dim = pre_observations.shape[1]
        action_dim = actions.shape[1]
        # action_dim = 1  # TODO fix this to allow different actions

        # find the actual observed latent states using the vae
        pre_latent_mean, pre_latent_stddev, pre_latent = self.model_vae.encoder(pre_observations)
        post_latent_mean, post_latent_stddev, post_latent = self.model_vae.encoder(post_observations)

        # set up the input training data that we use to train the transition model
        z_train = np.concatenate([np.array(pre_latent_mean), np.array(actions)], axis=1)

        # we use the sequence to find the right hidden states to use as input
        z_train_seq = z_train.reshape((1, num_observations, observation_dim + action_dim))
        z_train_singles = z_train.reshape(num_observations, 1, observation_dim + action_dim)

        # the previous hidden state is the memory after observing some sequences but it might be None
        if self.hidden_state is None:
            self.hidden_state = np.zeros((1, self.tran.hidden_units))

        if self.train_tran:
            # find the hidden states at t=0, t=1, t=2, ..., t=num_observations - 1
            _, _, _, h_states = self.tran((z_train_seq, self.hidden_state))


            # squeeze so we make the shape [num_observations, hidden_units]
            h_states = tf.squeeze(h_states)

            # exclude the last state as this will become the hidden state later on. next hidden state will become our new memory
            h_states_for_training = h_states[:-1]
            # next_hidden_state = h_states[-1]

            # add the current hidden state we saved to the start. This has h0, h1, h2, .. h=num_observations - 1
            h_states_for_training = tf.concat([self.hidden_state, h_states_for_training], axis=0)


            # use the hidden states with the pre and post observations to train transition model
            self.tran.fit((z_train_singles, h_states_for_training), (post_latent_mean, post_latent_stddev), epochs=self.tran_train_epochs, verbose=verbose)

        # now find the new predicted hidden state that we will use for finding the policy
        # TODO not sure if I should pass the old hidden state or reset it to 0
        _, _, final_hidden_state, _ = self.tran((z_train_seq, self.hidden_state))
        # _, _, final_hidden_state, _ = self.tran((z_train_seq, None))

        self.hidden_state = final_hidden_state

        #### TRAIN THE VAE ####
        if self.train_vae:
            # train the vae model on post_observations because these are all new
            self.model_vae.fit(post_observations, epochs=self.vae_train_epochs, verbose=verbose)



    def cem_policy_optimisation(self, z_t_minus_one):

        # need to change these two if the policy dimension changes
        mean_best_policies = tf.zeros(self.planning_horizon)
        std_best_policies = tf.ones(self.planning_horizon)

        for i in range(self.n_cem_policy_iterations):
            policy_distr = tfp.distributions.MultivariateNormalDiag(loc=mean_best_policies, scale_diag=std_best_policies)
            policies = policy_distr.sample([self.n_policies])
            policies = tf.clip_by_value(policies, clip_value_min=-1, clip_value_max=1)

            # project trajectory into the future using transition model and calculate FEEF for each policy
            policy_results = self.forward_policies(policies.numpy(), z_t_minus_one)
            FEEFs = self.evaluate_policy(*policy_results)

            FEEFs = tf.convert_to_tensor(FEEFs)

            # sum over the timesteps to get the FEEF for each policy
            FEEFs_sum = tf.reduce_sum(FEEFs, axis=0)

            # multiply by one to find largest value which is euqivalent to smallest FEEF with top_k
            neg_FEEF_sum = -1*FEEFs_sum

            result = tf.math.top_k(neg_FEEF_sum, self.n_policy_candidates, sorted=False)
            min_FEEF_indices = result.indices

            # update the policy distributions
            mean_best_policies = tf.reduce_mean(tf.gather(policies, min_FEEF_indices), axis=0)
            std_best_policies = tf.math.reduce_std(tf.gather(policies, min_FEEF_indices), axis=0)


        # TODO not sure why we need all of this is with the x means? I think it's for training but maybe not

        # One last forward pass to gather the stats of the policy mean
        #FEEFs, next_x_means, next_x_stds = self._forward_policies(mean_best_policies.unsqueeze(1))
        # return mean_best_policies, std_best_policies, FEEFs.detach().squeeze(1), next_x_means.detach().squeeze(1), next_x_stds.detach().squeeze(1)

        print(z_t_minus_one)
        print(mean_best_policies)
        return mean_best_policies, std_best_policies


    def forward_policies(self, policies, z_t_minus_one):
        """
        Forward propogate a policy and compute the FEEF of each policy
        :param z_t_minus_one:
        :return:
        """

        # stack up the new observation to have shape [self.n_policies, len(z_t_minus_one)]
        prev_latent_mean = np.stack([z_t_minus_one]*self.n_policies)

        policy_posteriors = []
        policy_sds = []
        likelihoods = []
        z_means = []
        z_sds = []

        # get the starting hidden state that coressponds to the memory stored by the previous sequences. Should have shape (1, self.tran.num_hidden_units) for the observed sequence
        # extend the current hidden state to the number of policies present
        if self.hidden_state is None:
            cur_hidden_state = np.zeros((self.n_policies, self.tran.hidden_units))
        else:
            cur_hidden_state = np.vstack([self.hidden_state]*self.n_policies)

        # print(cur_hidden_state)

        # find the predicted latent states from the transition model
        for t in range(self.planning_horizon):

            ob_plus_action = np.concatenate([prev_latent_mean, policies[:, t].reshape(self.n_policies, 1)], axis=1)
            tran_input = ob_plus_action.reshape((self.n_policies, 1, ob_plus_action.shape[1]))  # reshape to pass to GRU

            next_latent_mean, next_latent_sd, next_hidden_state, _ = self.tran((tran_input, cur_hidden_state))  # shape = [num policies, latent dim

            # update the hidden state for use with the next policies
            cur_hidden_state = next_hidden_state

            policy_posteriors.append(next_latent_mean)
            policy_sds.append(next_latent_sd)

            next_likelihoods = self.model_vae.decoder(next_latent_mean)
            likelihoods.append(next_likelihoods)

            next_posterior_means, next_posteriors_sds, next_posteriors_z = self.model_vae.encoder(next_likelihoods)
            z_means.append(next_posterior_means)
            z_sds.append(next_posteriors_sds)

            prev_latent_mean = next_latent_mean

        return policy_posteriors, policy_sds, likelihoods, z_means, z_sds


    def evaluate_policy(self, policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd):

        return self.FEEF(policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd)


    def FEEF(self, policy_posteriors_list, policy_sd_list, predicted_likelihood_list, predicted_posterior_list, predicted_posterior_sd_list):
        """
        Compute the FEEF for policy selection
        :param policy_posteriors:
        :param predicted_likelihood:
        :param predicted_posterior:
        :return:
        """

        FEEFs = []

        for t in range(self.planning_horizon):

            # extract the values for each time step
            predicted_likelihood = predicted_likelihood_list[t]
            policy_posteriors = policy_posteriors_list[t]
            policy_sd = policy_sd_list[t]
            predicted_posterior = predicted_posterior_list[t]
            predicted_posterior_sd = predicted_posterior_sd_list[t]

            # !!!! evaluate the EXTRINSIC KL divergence !!!!

            # convert to normal distributions
            # TODO Why is the stddev 1s here? I think because we assume it is on the true state of the world.
            likelihood_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_likelihood, scale_diag=np.ones_like(predicted_likelihood))

            if self.prior_model is None:

                # TODO how exactly is the prior defined? After you apply transformations what is the prior
                # create the prior distribution
                prior_preferences_mean = tf.convert_to_tensor(np.stack([self.given_prior_mean]*self.n_policies), dtype="float32")
                prior_preferences_stddev = tf.convert_to_tensor(np.stack([self.given_prior_stddev]*self.n_policies), dtype="float32")

                prior_dist = tfp.distributions.MultivariateNormalDiag(loc=prior_preferences_mean, scale_diag=prior_preferences_stddev)

            # TODO Fix the learned prior model
            else:
                prior_dist = self.prior_model()

            kl_extrinsic = tfp.distributions.kl_divergence(likelihood_dist, prior_dist)

            # !!!! evaluate the KL INTRINSIC part !!!!
            policy_posteriors_dist = tfp.distributions.MultivariateNormalDiag(loc=policy_posteriors, scale_diag=policy_sd)
            predicted_posterior_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_posterior, scale_diag=predicted_posterior_sd)

            kl_intrinsic = tfp.distributions.kl_divergence(predicted_posterior_dist, policy_posteriors_dist)

            FEEF = kl_extrinsic - kl_intrinsic

            FEEFs.append(FEEF)

        return FEEFs


    def EFE(self, policy_posteriors, predicted_likelihood, predicted_posterior):
        """
        Compute the EFE for policy selection
        :param policy_posteriors:
        :param predicted_likelihood:
        :param predicted_posterior:
        :return:
        """
        pass

## Testing with a pretrained transition model

This works well! So the problem can't lie with the transition model.

In [183]:
# enc = create_encoder(2, 2, [20])
# dec = create_decoder(2, 2, [20])
# vae = VAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

tran = TransitionGRU(2, 1, 12, 60, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.6, 0.07]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0, 0])  # no noise on prior

print(scaled_prior_mean)

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev, train_vae=False)

[1. 1.]


In [184]:
num_seqs = 200
seq_length = 500
ob_seqs = []
next_obs = []
next_obs_stddev = []
actions = []

env = gym.make('MountainCarContinuous-v0')

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, seq_length, epsilon=0.2)

    o = transform_observations(o, observation_max, observation_min, [0, 0])

    # train = np.concatenate([o[:-1], a], axis=1)
    train = o[:-1]
    test = o[1:]

    actions.append(a)
    ob_seqs.append(train)
    next_obs.append(test)

    ob_seqs_stddev = np.ones_like(train)
    next_stddev = np.ones_like(test)

    next_obs_stddev.append(next_stddev)

In [185]:
for i in range(num_seqs):

    pre = ob_seqs[i]
    next = next_obs[i]
    acts = actions[i]

    next_sd = next_obs_stddev[i]

    daifa.train(pre, next, acts, None, verbose=1)



In [187]:
num_seqs = 20
seq_length = 150
ob_seqs = []
next_obs = []

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, seq_length, epsilon=0.1)

    o = transform_observations(o, observation_max, observation_min, [0, 0])

    train = np.concatenate([o[:-1], a], axis=1)
    test = o[-1]

    ob_seqs.append(train)
    next_obs.append(test)

ob_seqs = np.array(ob_seqs)[:, -5:, :]
next_obs = np.array(next_obs)
ob_seqs.shape

ob_seqs_stddev = np.ones_like(ob_seqs)
next_obs_stddev = np.ones_like(next_obs)

ob_seqs.shape

(20, 5, 3)

In [188]:
daifa.tran((ob_seqs, None))

[<tf.Tensor: shape=(20, 2), dtype=float32, numpy=
 array([[0.42757505, 0.6529479 ],
        [0.38936174, 0.5224733 ],
        [0.5177275 , 0.5719767 ],
        [0.32569912, 0.5357033 ],
        [0.5789795 , 0.37268484],
        [0.50462687, 0.16914466],
        [0.10576638, 0.2804633 ],
        [0.50895584, 0.26160803],
        [0.84278893, 0.7405859 ],
        [0.41768757, 0.8168086 ],
        [0.6252694 , 0.524398  ],
        [0.29302755, 0.6640839 ],
        [0.7982165 , 0.4708082 ],
        [0.0011582 , 0.43203047],
        [0.15381938, 0.3239351 ],
        [0.51816386, 0.7722647 ],
        [0.4246617 , 0.18837008],
        [0.62402654, 0.48184252],
        [0.6900874 , 0.6458197 ],
        [0.48874134, 0.24390756]], dtype=float32)>,
 <tf.Tensor: shape=(20, 2), dtype=float32, numpy=
 array([[1.0015444 , 1.0040803 ],
        [1.0029435 , 1.0013554 ],
        [1.00249   , 1.0037054 ],
        [1.0025867 , 1.0013175 ],
        [1.0037783 , 1.0024655 ],
        [1.0045325 , 0.9992334 ]

In [189]:
next_obs

array([[0.41424149, 0.64817536],
       [0.37549612, 0.51434882],
       [0.50395287, 0.56238058],
       [0.31282919, 0.53150581],
       [0.56830029, 0.36537232],
       [0.49751557, 0.15729226],
       [0.10200014, 0.28023718],
       [0.49930485, 0.25297206],
       [0.83264549, 0.76498072],
       [0.40183171, 0.8195579 ],
       [0.61205382, 0.51735449],
       [0.28334383, 0.66585968],
       [0.78955368, 0.48945753],
       [0.        , 0.5       ],
       [0.15014838, 0.32678897],
       [0.5064109 , 0.78005801],
       [0.41774712, 0.18253206],
       [0.61022634, 0.47224011],
       [0.67659513, 0.6480953 ],
       [0.48181082, 0.23722683]])

That looks fantastic!!! With enough data the transition model is training very well

In [190]:
daifa.hidden_state

<tf.Tensor: shape=(1, 60), dtype=float32, numpy=
array([[ 0.1668222 ,  0.05080901,  0.20015422,  0.38899958, -0.08546768,
        -0.04040675,  0.03512116,  0.00171472,  0.03852477,  0.20108685,
        -0.03330585,  0.01087453, -0.03944991, -0.23539615,  0.19884759,
         0.15129937,  0.08765514,  0.15757117, -0.16009761, -0.02254741,
        -0.17335685, -0.09706004,  0.05607434,  0.03711884, -0.0560054 ,
        -0.27313083,  0.02705026,  0.14458522, -0.25310335, -0.08086976,
        -0.10635097, -0.28293777,  0.00502296,  0.2793439 ,  0.07475004,
        -0.09199525, -0.23762226,  0.05454395, -0.07554322, -0.06423084,
         0.11491245,  0.03344171, -0.03258195,  0.04890673,  0.07888647,
         0.11464167,  0.31568897,  0.01460155, -0.23916677,  0.24096602,
         0.1589966 ,  0.0215495 , -0.38883814,  0.2073881 ,  0.17495394,
         0.30218056, -0.14856315, -0.09490789,  0.20044254,  0.12068783]],
      dtype=float32)>

In [191]:
z_t_minus_1 = np.array([0.4, 0.5])
daifa.hidden_state = None
p, s = daifa.cem_policy_optimisation(z_t_minus_1)
p

[0.4 0.5]
tf.Tensor(
[ 0.7089493   0.76423895  0.8047202   0.8124183   0.73879427  0.76978123
  0.61311316  0.5821078   0.4306626   0.40520254  0.17977683  0.27913105
 -0.00298302  0.03768509  0.0850338 ], shape=(15,), dtype=float32)


<tf.Tensor: shape=(15,), dtype=float32, numpy=
array([ 0.7089493 ,  0.76423895,  0.8047202 ,  0.8124183 ,  0.73879427,
        0.76978123,  0.61311316,  0.5821078 ,  0.4306626 ,  0.40520254,
        0.17977683,  0.27913105, -0.00298302,  0.03768509,  0.0850338 ],
      dtype=float32)>

In [192]:
daifa.tran((np.array([[[0.4, 0.5, 1]]]), None))

[<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.17485519, 0.28052184]], dtype=float32)>,
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.998175 , 0.9848511]], dtype=float32)>,
 <tf.Tensor: shape=(1, 60), dtype=float32, numpy=
 array([[ 0.06734794, -0.03086371,  0.04919352,  0.06776053,  0.05020184,
         -0.02855486,  0.05532712,  0.00621268,  0.03898622,  0.06404883,
         -0.06092996,  0.02080428, -0.03182369, -0.09051668,  0.02938318,
          0.02920253,  0.09898859,  0.04242412, -0.0937261 , -0.07689307,
         -0.04255384, -0.01035933,  0.0217123 ,  0.0560016 , -0.06113841,
         -0.08259731, -0.04008626,  0.02852438, -0.09232952, -0.09844272,
         -0.09578703, -0.0808928 ,  0.07631816,  0.10538951,  0.03266784,
         -0.06981869, -0.13518177, -0.00443301, -0.0766279 ,  0.03137437,
          0.10195947,  0.03933517,  0.01060682, -0.02867689,  0.05144734,
          0.02094595,  0.07122529,  0.09150924, -0.11927285,  0.11849919,
          0.06

In [193]:
env = gym.make('MountainCarContinuous-v0')

daifa.train_tran = False
daifa.train_vae = False

daifa.hidden_state = None

agent, succeeded, time_to_success = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev,
                                                num_episodes=1)

Episode 1
[0.3531864 0.5      ]
tf.Tensor(
[0.75776505 0.808785   0.823482   0.7384236  0.66807634 0.75550365
 0.65706307 0.57430935 0.44249344 0.50059706 0.30967107 0.3025037
 0.23872639 0.13167822 0.03006317], shape=(15,), dtype=float32)
[0.41072467 0.60575192]
tf.Tensor(
[0.80091256 0.831186   0.80336916 0.743643   0.67225486 0.7388898
 0.6710419  0.49252346 0.45496187 0.39570916 0.33172902 0.4691094
 0.18269321 0.02243686 0.13316137], shape=(15,), dtype=float32)
[0.52321638 0.6161014 ]
tf.Tensor(
[0.7582315  0.7428887  0.78753793 0.70738727 0.6904973  0.6947737
 0.7184091  0.63121426 0.51972836 0.44251725 0.4724567  0.35061508
 0.20868196 0.29411447 0.1608929 ], shape=(15,), dtype=float32)
[0.59332169 0.53282979]
tf.Tensor(
[ 0.8261601   0.7788568   0.7498009   0.68705684  0.71283436  0.7625619
  0.6251273   0.59106356  0.47288027  0.37912145  0.37760547  0.27614397
  0.17472365  0.0352636  -0.06415275], shape=(15,), dtype=float32)
[0.57653528 0.43853916]
tf.Tensor(
[ 0.84993833  0