# Run The Agent on Mountain Car

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
import gym

In [1]:
from vae import VAE, create_decoder, create_encoder
from transition_basic import TransitionModel
from basic_agent import DAIFAgent

In [2]:
from util import random_observation_sequence, transform_observations

What does the agent do?
- The agent plans a policy, then executes it for 12 simulation timesteps: the first two actions of the policy are each executed for 6 steps.

What data does it accumulate?
- It accumulates 12 observation–action pairs per executed policy.

How is it trained?
- VAE is trained to reproduce observations using the latent states
- The transition model is trained to predict the next latent state from the previous latent state and the action taken (in this notebook its input is the previous latent mean concatenated with the action)


> **Online learning.** For all tasks, we initialize all the agents with random weights and learn online only. Training an agent for 150 epochs takes about 3 minutes on a single CPU core (Intel I7-4870HQ). In contrast, previous approaches using active inference [Ueltzhöffer, 2018, Tschantz et al., 2019, 2020] and policy gradient methods (e.g., [Liu et al., 2017]) use (offline) policy replay and typically need hours of GPU-accelerated compute while achieving similar convergence. To our knowledge, this is the first model-based RL method to learn online using neural network representations. This is afforded by the high sample efficiency of the FEEF, which directs exploration towards states that are uncertain for both the encoder and transition models.


Why is this true?

In [3]:
def _reset_env(mcc_env):
    """Reset ``mcc_env`` and return only the initial observation as a float array.

    Supports both reset conventions: a bare observation, or an
    ``(observation, info)`` tuple.
    """
    reset_result = mcc_env.reset()
    if isinstance(reset_result, tuple):
        reset_result = reset_result[0]
    return np.asarray(reset_result, dtype=float)


def _step_env(mcc_env, action):
    """Step ``mcc_env`` and return ``(observation, done)``.

    Supports both step conventions: ``(obs, reward, done, info)`` and
    ``(obs, reward, terminated, truncated, info)``.
    """
    step_result = mcc_env.step(action)
    if len(step_result) == 5:
        observation, _reward, terminated, truncated, _info = step_result
        done = terminated or truncated
    else:
        observation, _reward, done, _info = step_result
    return observation, done


def run_episode(mcc_env, agent, observation_noise_stddev=[0.1, 0.05], episode_length=1000):
    """Run one episode, training the agent online after every executed policy.

    Each iteration asks the agent for a policy, executes the first two actions
    of that policy for 6 environment steps each (12 steps in total), then
    trains the VAE and the transition model on the 12 collected transitions.
    The last noisy observation seeds the next planning step.

    :param mcc_env: gym-style environment exposing ``reset()`` and ``step(action)``
    :param agent: object exposing ``cem_policy_optimisation``, ``train_vae`` and
        ``train_transition``
    :param observation_noise_stddev: forwarded unchanged to ``transform_observations``
    :param episode_length: an episode that finishes at a step ``t`` with
        ``t < episode_length - 1`` is reported as a success
    :return: ``(agent, success, t)`` where ``t`` is the number of steps taken

    The environment is deliberately not closed here. The previous trailing
    ``env.close()`` was unreachable and referred to a global ``env`` rather
    than ``mcc_env``.
    """
    steps_per_action = 6

    # first environment observation, with noise and scaling applied
    policy_observation = _reset_env(mcc_env)
    policy_observation_noisy = transform_observations(policy_observation, observation_noise_stddev)

    t = 0
    while True:

        # agent selects policy
        policy_mean, policy_stddev = agent.cem_policy_optimisation(policy_observation_noisy)

        # execute the first two actions of the policy for `steps_per_action` time steps each
        action_sequence = [policy_mean[0]] * steps_per_action + [policy_mean[1]] * steps_per_action

        observation_sequence = []
        for action in action_sequence:

            action_as_array = np.array([action])  # the environment expects an array-valued action
            observation, done = _step_env(mcc_env, action_as_array)
            observation_sequence.append(observation)

            t += 1
            if done:
                # the partially collected sequence is not used for training
                return agent, t < episode_length - 1, t

        # assemble the training data
        observation_sequence = np.array(observation_sequence)
        action_sequence = np.array(action_sequence).reshape(-1, 1)  # column vector so it can be concatenated when training

        observation_sequence_noisy = transform_observations(observation_sequence, observation_noise_stddev)  # OBS_1:t+1

        # prepend the noisy observation the policy was planned from
        observation_sequence_noisy = np.vstack([policy_observation_noisy, observation_sequence_noisy])

        pre_observation_sequence_noisy = observation_sequence_noisy[:-1]  # OBS_0:t
        post_observation_sequence_noisy = observation_sequence_noisy[1:]  # OBS_1:t+1

        # train perception
        agent.train_vae(observation_sequence_noisy)

        # train transition
        agent.train_transition(pre_observation_sequence_noisy, post_observation_sequence_noisy, action_sequence)

        # the next policy is planned from the most recent noisy observation
        policy_observation_noisy = post_observation_sequence_noisy[-1]


def train_agent(mcc_env, agent, num_episodes=100, observation_noise_stddev=[0.1, 0.05], episode_length=1000):
    """Train ``agent`` online for ``num_episodes`` episodes and return it.

    Every episode is run with ``run_episode``; the outcome of each episode is
    printed.

    :param mcc_env: environment passed through to ``run_episode``
    :param agent: agent to train; replaced by the agent ``run_episode`` returns
    :param num_episodes: number of episodes to run
    :param observation_noise_stddev: forwarded to ``run_episode`` (previously
        ignored in favour of a hard-coded ``[0.1, 0.05]``)
    :param episode_length: forwarded to ``run_episode`` (previously ignored in
        favour of a hard-coded ``1000``)
    :return: the trained agent
    """
    for n in range(num_episodes):
        print("Episode", n+1)
        agent, success, t = run_episode(
            mcc_env,
            agent,
            observation_noise_stddev=observation_noise_stddev,
            episode_length=episode_length,
        )

        if success:
            print("Success in episode", n+1, "at time step", t)
        else:
            print("No Success")

    return agent

In [4]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt

from vae import VAE


class DAIFAgent:
    """Deep active inference agent that plans with the cross-entropy method (CEM).

    The agent holds an encoder/decoder pair (wrapped in a ``VAE``), a transition
    model, and optionally a prior model. Policies (action sequences over the
    planning horizon) are sampled, rolled forward through the transition model,
    scored with the FEEF, and the sampling distribution is refit to the
    best-scoring candidates.
    """

    def __init__(self,
                 prior_model,
                 enc,
                 dec,
                 tran,
                 planning_horizon=15,
                 n_policies=1500,
                 n_cem_policy_iterations=2,
                 n_policy_candidates=70):
        """
        :param prior_model: callable returning the prior distribution used in the
            extrinsic term, or ``None`` to use the fixed built-in prior
        :param enc: encoder; called on observations, its result is unpacked as
            three values (used here as mean, stddev, sample)
        :param dec: decoder; called on latent means to predict observations
        :param tran: transition model; called on ``[latent, action]`` rows and
            unpacked as ``(next mean, next stddev)``
        :param planning_horizon: number of actions in a policy
        :param n_policies: number of policies sampled per CEM iteration
        :param n_cem_policy_iterations: number of CEM refinement iterations
        :param n_policy_candidates: number of best policies kept per iteration
        """

        super(DAIFAgent, self).__init__()

        self.prior_model = prior_model
        self.planning_horizon = planning_horizon
        self.n_policy_candidates = n_policy_candidates
        self.n_policies = n_policies
        self.n_cem_policy_iterations = n_cem_policy_iterations

        # encoder
        self.enc = enc

        # decoder
        # takes latent state and outputs observation
        self.dec = dec

        # full vae
        self.model_vae = VAE(enc, dec)
        self.model_vae.compile(optimizer=tf.keras.optimizers.Adam())

        # transition
        # takes action plus last state and outputs next latent state
        self.tran = tran
        self.tran.compile(optimizer=tf.keras.optimizers.Adam())


    def select_action(self, observation):
        """
        Optimise a policy from ``observation`` and return it as a distribution.

        :param observation: passed unchanged to ``cem_policy_optimisation``
        :return: ``MultivariateNormalDiag`` over the actions of the planning
            horizon that can be sampled from
        """

        policy_mean, policy_stddev = self.cem_policy_optimisation(observation)

        # return a distribution that we can sample from
        return tfp.distributions.MultivariateNormalDiag(loc=policy_mean, scale_diag=policy_stddev)


    def train_vae(self, observation, verbose=0):
        """
        Fit the VAE on a batch of observations.

        :param observation: observations passed to ``self.model_vae.fit``
        :param verbose: verbosity forwarded to ``fit``
        """
        self.model_vae.fit(observation, verbose=verbose)


    def train_transition(self, o_t_minus_one, o_t, action_t_minus_one, verbose=0):
        """
        Fit the transition model for one epoch on a batch of transitions.

        :param o_t_minus_one: observations before each action
        :param o_t: observations after each action
        :param action_t_minus_one: actions taken, one row per transition
        :param verbose: verbosity forwarded to ``fit``
        """

        # find the latent representations with the encoder
        z_t_minus_1_mean, _z_t_minus_1_stddev, _z_t_minus_1 = self.enc(o_t_minus_one)
        z_t_mean, z_t_stddev, _z_t = self.enc(o_t)

        # concatenate previous latent mean and action for input into transition
        z_train = np.concatenate([np.array(z_t_minus_1_mean), np.array(action_t_minus_one)], axis=1)

        # train the transition model to predict the next latent mean and stddev
        self.tran.fit(z_train, (z_t_mean, z_t_stddev), epochs=1, verbose=verbose)


    def cem_policy_optimisation(self, z_t_minus_one):
        """
        Optimise a policy with the cross-entropy method, minimising the summed FEEF.

        :param z_t_minus_one: starting point of the roll-out; it is used directly
            as the previous latent mean by ``forward_policies``.
            NOTE(review): ``select_action`` and ``run_episode`` pass an
            observation here rather than an encoded latent state - confirm
            whether the observation should be encoded first.
        :return: ``(mean, stddev)`` of the refitted policy distribution, each of
            length ``planning_horizon``
        """

        # need to change these two if the policy dimension changes
        mean_best_policies = tf.zeros(self.planning_horizon)
        std_best_policies = tf.ones(self.planning_horizon)

        for i in range(self.n_cem_policy_iterations):
            policy_distr = tfp.distributions.MultivariateNormalDiag(loc=mean_best_policies, scale_diag=std_best_policies)
            policies = policy_distr.sample([self.n_policies])
            policies = tf.clip_by_value(policies, clip_value_min=-1, clip_value_max=1)

            # project into the future and calculate FEEF
            policy_results = self.forward_policies(policies.numpy(), z_t_minus_one)
            FEEFs = self.evaluate_policy(*policy_results)

            FEEFs = tf.convert_to_tensor(FEEFs)

            # sum over the timesteps to get the FEEF for each policy
            FEEFs_sum = tf.reduce_sum(FEEFs, axis=0)

            # negate so that top_k, which picks the largest values, selects the smallest FEEFs
            neg_FEEF_sum = -1*FEEFs_sum

            result = tf.math.top_k(neg_FEEF_sum, self.n_policy_candidates, sorted=False)
            min_FEEF_indices = result.indices

            # refit the policy distribution to the best candidates
            best_policies = tf.gather(policies, min_FEEF_indices)
            mean_best_policies = tf.reduce_mean(best_policies, axis=0)
            std_best_policies = tf.math.reduce_std(best_policies, axis=0)

        # TODO decide whether a final forward pass of the mean policy is needed
        # to report its FEEF and predicted observation statistics.

        return mean_best_policies, std_best_policies


    def forward_policies(self, policies, z_t_minus_one):
        """
        Roll every policy forward through the transition model.

        The batch size is taken from ``policies`` itself, so any number of
        policies (not only ``self.n_policies``) can be evaluated.

        :param policies: array of shape [number of policies, planning horizon]
        :param z_t_minus_one: starting latent mean shared by all policies
        :return: five lists with one entry per planning step: transition means,
            transition stddevs, decoded predictions, re-encoded means and
            re-encoded stddevs
        """

        n_policies = policies.shape[0]

        # stack up the starting point to have shape [n_policies, len(z_t_minus_one)]
        prev_latent_mean = np.stack([z_t_minus_one]*n_policies)

        policy_posteriors = []
        policy_sds = []
        likelihoods = []
        z_means = []
        z_sds = []

        # find the predicted latent states from the transition model
        for t in range(self.planning_horizon):

            tran_input = np.concatenate([prev_latent_mean, policies[:, t].reshape(n_policies, 1)], axis=1)
            next_latent_mean, next_latent_sd = self.tran(tran_input)  # shape = [num policies, latent dim]

            policy_posteriors.append(next_latent_mean)
            policy_sds.append(next_latent_sd)

            # decode the predicted latent mean into a predicted observation
            next_likelihoods = self.dec(next_latent_mean)
            likelihoods.append(next_likelihoods)

            # re-encode the predicted observation
            next_posterior_means, next_posteriors_sds, _next_posteriors_z = self.enc(next_likelihoods)
            z_means.append(next_posterior_means)
            z_sds.append(next_posteriors_sds)

            prev_latent_mean = next_latent_mean

        return policy_posteriors, policy_sds, likelihoods, z_means, z_sds


    def evaluate_policy(self, policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd):
        """
        Score rolled-out policies; currently delegates to ``FEEF``.

        :return: list of per-time-step FEEF values as returned by ``FEEF``
        """

        return self.FEEF(policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd)


    def _fixed_prior_distribution(self, n_policies):
        """Build the hard-coded prior-preference distribution, one row per policy."""
        # TODO how exactly is the prior defined
        prior_preferences = tf.convert_to_tensor(np.stack([[0.5, 100]]*n_policies), dtype="float32")
        return tfp.distributions.MultivariateNormalDiag(loc=prior_preferences, scale_diag=np.ones_like(prior_preferences))


    def FEEF(self, policy_posteriors_list, policy_sd_list, predicted_likelihood_list, predicted_posterior_list, predicted_posterior_sd_list):
        """
        Compute the FEEF for policy selection

        For every planning step the value is ``kl_extrinsic - kl_intrinsic``:
        the KL from the predicted observation distribution to the prior, minus
        the KL from the re-encoded posterior to the transition prediction.

        :param policy_posteriors_list: transition means, one entry per step
        :param policy_sd_list: transition stddevs, one entry per step
        :param predicted_likelihood_list: decoded predictions, one entry per step
        :param predicted_posterior_list: re-encoded means, one entry per step
        :param predicted_posterior_sd_list: re-encoded stddevs, one entry per step
        :return: list of length ``planning_horizon`` with one FEEF value per policy
        """

        FEEFs = []

        # the fixed prior is identical at every time step, so it is built only once
        fixed_prior_dist = None

        for t in range(self.planning_horizon):

            # extract the values for each time step
            predicted_likelihood = predicted_likelihood_list[t]
            policy_posteriors = policy_posteriors_list[t]
            policy_sd = policy_sd_list[t]
            predicted_posterior = predicted_posterior_list[t]
            predicted_posterior_sd = predicted_posterior_sd_list[t]

            # !!!! evaluate the EXTRINSIC KL divergence !!!!

            # convert to normal distributions
            # TODO Why is the stddev 1s here? I think because we assume it is on the true state of the world.
            likelihood_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_likelihood, scale_diag=np.ones_like(predicted_likelihood))

            if self.prior_model is None:
                if fixed_prior_dist is None:
                    # batch size follows the policies actually being evaluated
                    fixed_prior_dist = self._fixed_prior_distribution(predicted_likelihood.shape[0])
                prior_dist = fixed_prior_dist

            # TODO Fix the learned prior model
            else:
                prior_dist = self.prior_model()

            kl_extrinsic = tfp.distributions.kl_divergence(likelihood_dist, prior_dist)

            # !!!! evaluate the KL INTRINSIC part !!!!
            policy_posteriors_dist = tfp.distributions.MultivariateNormalDiag(loc=policy_posteriors, scale_diag=policy_sd)
            predicted_posterior_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_posterior, scale_diag=predicted_posterior_sd)

            kl_intrinsic = tfp.distributions.kl_divergence(predicted_posterior_dist, policy_posteriors_dist)

            feef = kl_extrinsic - kl_intrinsic

            FEEFs.append(feef)

        return FEEFs


    def EFE(self, policy_posteriors, predicted_likelihood, predicted_posterior):
        """
        Compute the EFE for policy selection

        Not implemented yet: the body is a placeholder and returns ``None``.

        :param policy_posteriors:
        :param predicted_likelihood:
        :param predicted_posterior:
        :return:
        """
        pass


In [102]:
# Build fresh, untrained models for this run.
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# train online for 20 episodes; the returned agent is echoed as the cell output
train_agent(env, daifa, num_episodes=20)

Episode 1


2022-07-07 10:44:10.356569: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 10:44:10.802549: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
No Success
Episode 3
No Success
Episode 4
Success in episode 4 at time step 716
Episode 5
Success in episode 5 at time step 766
Episode 6
Success in episode 6 at time step 431
Episode 7
No Success
Episode 8
Success in episode 8 at time step 902
Episode 9
Success in episode 9 at time step 343
Episode 10
Success in episode 10 at time step 406
Episode 11
No Success
Episode 12
No Success
Episode 13
No Success
Episode 14
Success in episode 14 at time step 828
Episode 15
Success in episode 15 at time step 735
Episode 16
No Success
Episode 17
No Success
Episode 18
Success in episode 18 at time step 331
Episode 19
Success in episode 19 at time step 456
Episode 20
Success in episode 20 at time step 620


<__main__.DAIFAgent at 0x2f88c4c10>

In [94]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 06:46:30.583715: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 06:46:30.936236: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
No Success
Episode 3
No Success
Episode 4
Success in episode 4 at time step 753


<__main__.DAIFAgent at 0x2b4140910>

In [95]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 06:50:21.611683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 06:50:21.986528: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
No Success
Episode 3
No Success
Episode 4
No Success
Episode 5
Success in episode 5 at time step 953


<__main__.DAIFAgent at 0x2c63c9490>

In [96]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 06:55:21.234757: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 06:55:21.584529: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
No Success
Episode 3
No Success
Episode 4
No Success
Episode 5
No Success
Episode 6
No Success
Episode 7
Success in episode 7 at time step 683


<__main__.DAIFAgent at 0x2bdb494c0>

In [97]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 07:02:03.874260: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 07:02:04.264139: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
Success in episode 2 at time step 605


<__main__.DAIFAgent at 0x29c9b6580>

In [99]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 07:05:31.535007: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 07:05:31.940020: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
No Success
Episode 3
No Success
Episode 4
Success in episode 4 at time step 411


<__main__.DAIFAgent at 0x177ac2c40>

In [98]:
# Repeat run with fresh, untrained models (same configuration as the other runs).
# presumably the arguments are (observation dim, latent dim, hidden layer sizes) - confirm against vae.py
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
# presumably (latent dim, action dim) - confirm against transition_basic.py
tran = TransitionModel(2, 1)

# prior_model=None makes the agent use its fixed built-in prior in FEEF
daifa = DAIFAgent(None, enc, dec, tran)

env = gym.make('MountainCarContinuous-v0')

# NOTE(review): the saved output of this cell ends after the first successful
# episode, whereas train_agent as defined in this notebook always runs all
# num_episodes (default 100) - the output appears to come from an earlier
# version of train_agent; re-run to confirm.
train_agent(env, daifa)

Episode 1


2022-07-07 07:03:41.624143: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-07 07:03:42.026746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


No Success
Episode 2
Success in episode 2 at time step 813


<__main__.DAIFAgent at 0x2c635d880>