# Run The Agent on Mountain Car

In [166]:
import tensorflow as tf
import tensorflow_probability as tfp
import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
import gym

In [167]:
from vae_recurrent import VAE, create_decoder, create_encoder
from transition_gru import TransitionGRU
# from recurrent_agent import DAIFAgentRecurrent

In [168]:
from util import random_observation_sequence, transform_observations

What does the agent do?
- The agent plans using a policy then executes that policy for 12 simulation timesteps, the first two actions of the policy are executed for 6 steps each

What data does it accumulate?
- It accumulates 12 observation actions pairs

How is it trained?
- VAE is trained to reproduce observations using the latent states
- Transition is trained by taking previous hidden state and previous latent state and trying to predict the next latent state


Online learning For all tasks, we initialize all the agents with random weights and learn online only. Training an agent for 150 epochs takes about 3 minutes on a single CPU core (Intel I7-4870HQ). In contrast, previous approaches using active inference [Ueltzhöffer, 2018, Tschantz et al., 2019, 2020] and policy gradient methods (e.g., [Liu et al., 2017]) use (offline) policy replay and typically need hours of GPU-accelerated compute while achieving similar convergence. To our knowledge, this is the first model-based RL method to learn online using neural network representations. This is afforded by the high sample efficiency of the FEEF, which directs exploration towards states that are uncertain for both the encoder and transition models.


Why this is true?

In [169]:
# Hide GPU from visible devices
tf.config.set_visible_devices([], 'GPU')

In [212]:
def run_episode(mcc_env, agent, obs_max, obs_min, observation_noise_stddev=[0.05, 0.05], policy_repeats=1, episode_length=1000):

    # arrays to store observations, actions and rewards
    all_pre_observations = []
    all_post_observations = []
    all_action = []
    observation_sequence = []
    reward_sequence = []

    # get the first observation from the environment
    first_observation, info = mcc_env.reset()
    first_observation = np.array([first_observation, 0])

    # apply noise to and scaling to first observation
    first_observation_noisy = transform_observations(first_observation, obs_max, obs_min, observation_noise_stddev)

    # find the first policy
    policy_observation = first_observation_noisy
    policy = agent.select_policy(policy_observation)

    # loop until episode ends or the agent succeeds
    t = 0
    while True:

        if t % 10 == 0:
            print(t)

        # get the actions from the policy and reshape to desired form
        actions = policy.mean()
        actions = tf.reshape(actions, (actions.shape[0], agent.tran.action_dim))  # [num_actions, action_dim]
        actions = actions.numpy()

        # agent executes policy and gathers observations
        for action in actions:
            observation, reward, done, info = mcc_env.step(action)  # action should be array to satisfy gym requirements

            # all_observations.append(observation)
            # all_action.append(action)

            t += 1
            if done:
                if t < 999:
                    print(policy)
                return t < 999, agent, t, all_pre_observations, all_post_observations, all_action # the max for the environment

            observation_sequence.append(observation)
            reward_sequence.append(reward)

        # scale and add noise to the observation
        observation_sequence = transform_observations(observation_sequence, obs_max, obs_min, observation_noise_stddev)

        # get the noisy observations for pre and post actions
        pre_observation_sequence = np.vstack([policy_observation, observation_sequence[:-1]])
        post_action_observation_sequence = observation_sequence

        all_pre_observations.append(pre_observation_sequence)
        all_post_observations.append(post_action_observation_sequence)
        all_action.append(actions)

        # print("pol", policy_observation)
        # print("obs", observation_sequence)
        # print("pre", pre_observation_sequence)
        # print("post", post_action_observation_sequence)

        # if time to train the agent
        agent.train(pre_observation_sequence, post_action_observation_sequence, actions, reward_sequence)

        # the new observation we use to select a policy is the last observation in observation_sequences
        policy_observation = observation_sequence[-1]

        # select a new policy and clear everything
        policy = agent.select_policy(policy_observation)

        # clear the observations
        observation_sequence = []
        reward_sequence = []

    env.close()


def train_agent(mcc_env, agent, obs_max, obs_min, observation_noise_stddev, episode_length=1000, num_episodes=100):

    time_to_success = []
    did_succeed = []

    for n in range(num_episodes):
        print("Episode", n+1)
        success, agent, t, *rest = run_episode(mcc_env, agent, obs_max, obs_min, observation_noise_stddev, episode_length)

        did_succeed.append(success)
        time_to_success.append(t)

        if success:
            print("Success in episode", n+1, "at time step", t)
        else:
            print("No Success")

    return agent, did_succeed, time_to_success

In [213]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

from vae_recurrent import VAE


class DAIFAgentRecurrent:

    def __init__(self,
                 prior_model,
                 vae,
                 tran,
                 given_prior_mean,
                 given_prior_stddev,
                 planning_horizon=15,
                 n_policies=1500,
                 n_cem_policy_iterations=2,
                 n_policy_candidates=70,
                 tran_train_epochs=1,
                 vae_train_epochs=1):

        super(DAIFAgentRecurrent, self).__init__()

        self.prior_model = prior_model
        self.planning_horizon = planning_horizon
        self.n_policy_candidates = n_policy_candidates
        self.n_policies = n_policies
        self.n_cem_policy_iterations = n_cem_policy_iterations

        self.vae_train_epochs = vae_train_epochs
        self.tran_train_epochs = tran_train_epochs

        self.given_prior_mean = given_prior_mean
        self.given_prior_stddev = given_prior_stddev

        # full vae
        self.model_vae = vae
        self.model_vae.compile(optimizer=tf.keras.optimizers.Adam())

        # transition
        # takes action plus last state and outputs next latent state
        self.tran = tran
        self.tran.compile(optimizer=tf.keras.optimizers.Adam())

        self.hidden_state = None


    def select_policy(self, observation):

        policy_mean, policy_stddev = self.cem_policy_optimisation(observation)

        # return a distribution that we can sample from
        return tfp.distributions.MultivariateNormalDiag(loc=policy_mean, scale_diag=policy_stddev)


    def train(self, pre_observations, post_observations, actions, rewards, verbose=0):

        num_observations = pre_observations.shape[0]
        observation_dim = pre_observations.shape[1]
        action_dim = actions.shape[1]
        # action_dim = 1  # TODO fix this to allow different actions

        # find the actual observed latent states using the vae
        pre_latent_mean, pre_latent_stddev, pre_latent = self.model_vae.encoder(pre_observations)
        post_latent_mean, post_latent_stddev, post_latent = self.model_vae.encoder(post_observations)

        # set up the input training data that we use to train the transition model
        z_train = np.concatenate([np.array(pre_latent_mean), np.array(actions)], axis=1)

        # we use the sequence to find the right hidden states to use as input
        z_train_seq = z_train.reshape((1, num_observations, observation_dim + action_dim))
        z_train_singles = z_train.reshape(num_observations, 1, observation_dim + action_dim)

        # the previous hidden state is the memory after observing some sequences but it might be None
        if self.hidden_state is None:
            self.hidden_state = np.zeros((1, self.tran.hidden_units))

        # find the hidden states at t=0, t=1, t=2, ..., t=num_observations - 1
        _, _, _, h_states = self.tran((z_train_seq, self.hidden_state))

        # squeeze so we make the shape [num_observations, hidden_units]
        h_states = tf.squeeze(h_states)

        # exclude the last state as this will become the hidden state later on. next hidden state will become our new memory
        h_states_for_training = h_states[:-1]
        # next_hidden_state = h_states[-1]

        # add the current hidden state we saved to the start. This has h0, h1, h2, .. h=num_observations - 1
        h_states_for_training = tf.concat([self.hidden_state, h_states_for_training], axis=0)

        # use the hidden states with the pre and post observations to train transition model
        self.tran.fit((z_train_singles, h_states_for_training), (post_latent_mean, post_latent_stddev), epochs=self.tran_train_epochs, verbose=verbose)

        # train the vae model on post_observations because these are all new
        self.model_vae.fit(post_observations, epochs=self.vae_train_epochs, verbose=verbose)

        # now find the new predicted hidden state that we will use for finding the policy
        # _, _, final_hidden_state, _ = self.tran((z_train_seq, self.hidden_state))
        _, _, final_hidden_state, _ = self.tran((z_train_seq, None))

        self.hidden_state = final_hidden_state


    def cem_policy_optimisation(self, z_t_minus_one):

        # need to change these two if the policy dimension changes
        mean_best_policies = tf.zeros(self.planning_horizon)
        std_best_policies = tf.ones(self.planning_horizon)

        for i in range(self.n_cem_policy_iterations):
            policy_distr = tfp.distributions.MultivariateNormalDiag(loc=mean_best_policies, scale_diag=std_best_policies)
            policies = policy_distr.sample([self.n_policies])
            policies = tf.clip_by_value(policies, clip_value_min=-1, clip_value_max=1)

            # project trajectory into the future using transition model and calculate FEEF for each policy
            policy_results = self.forward_policies(policies.numpy(), z_t_minus_one)
            FEEFs = self.evaluate_policy(*policy_results)

            FEEFs = tf.convert_to_tensor(FEEFs)

            # sum over the timesteps to get the FEEF for each policy
            FEEFs_sum = tf.reduce_sum(FEEFs, axis=0)

            # multiply by one to find largest value which is euqivalent to smallest FEEF with top_k
            neg_FEEF_sum = -1*FEEFs_sum

            result = tf.math.top_k(neg_FEEF_sum, self.n_policy_candidates, sorted=False)
            min_FEEF_indices = result.indices

            # update the policy distributions
            mean_best_policies = tf.reduce_mean(tf.gather(policies, min_FEEF_indices), axis=0)
            std_best_policies = tf.math.reduce_std(tf.gather(policies, min_FEEF_indices), axis=0)


        # TODO not sure why we need all of this is with the x means? I think it's for training but maybe not

        # One last forward pass to gather the stats of the policy mean
        #FEEFs, next_x_means, next_x_stds = self._forward_policies(mean_best_policies.unsqueeze(1))
        # return mean_best_policies, std_best_policies, FEEFs.detach().squeeze(1), next_x_means.detach().squeeze(1), next_x_stds.detach().squeeze(1)

        return mean_best_policies, std_best_policies


    def forward_policies(self, policies, z_t_minus_one):
        """
        Forward propogate a policy and compute the FEEF of each policy
        :param z_t_minus_one:
        :return:
        """

        # stack up the new observation to have shape [self.n_policies, len(z_t_minus_one)]
        prev_latent_mean = np.stack([z_t_minus_one]*self.n_policies)

        policy_posteriors = []
        policy_sds = []
        likelihoods = []
        z_means = []
        z_sds = []

        # get the starting hidden state that coressponds to the memory stored by the previous sequences. Should have shape (1, self.tran.num_hidden_units) for the observed sequence
        # extend the current hidden state to the number of policies present
        if self.hidden_state is None:
            cur_hidden_state = np.zeros((self.n_policies, self.tran.hidden_units))
        else:
            cur_hidden_state = np.vstack([self.hidden_state]*self.n_policies)

        # find the predicted latent states from the transition model
        for t in range(self.planning_horizon):

            ob_plus_action = np.concatenate([prev_latent_mean, policies[:, t].reshape(self.n_policies, 1)], axis=1)
            tran_input = ob_plus_action.reshape((self.n_policies, 1, ob_plus_action.shape[1]))  # reshape to pass to GRU

            next_latent_mean, next_latent_sd, next_hidden_state, _ = self.tran((tran_input, cur_hidden_state))  # shape = [num policies, latent dim

            # update the hidden state for use with the next policies
            cur_hidden_state = next_hidden_state

            policy_posteriors.append(next_latent_mean)
            policy_sds.append(next_latent_sd)

            next_likelihoods = self.model_vae.decoder(next_latent_mean)
            likelihoods.append(next_likelihoods)

            next_posterior_means, next_posteriors_sds, next_posteriors_z = self.model_vae.encoder(next_likelihoods)
            z_means.append(next_posterior_means)
            z_sds.append(next_posteriors_sds)

            prev_latent_mean = next_latent_mean

        return policy_posteriors, policy_sds, likelihoods, z_means, z_sds


    def evaluate_policy(self, policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd):

        return self.FEEF(policy_posteriors, policy_sd, predicted_likelihood, predicted_posterior, predicted_posterior_sd)


    def FEEF(self, policy_posteriors_list, policy_sd_list, predicted_likelihood_list, predicted_posterior_list, predicted_posterior_sd_list):
        """
        Compute the FEEF for policy selection
        :param policy_posteriors:
        :param predicted_likelihood:
        :param predicted_posterior:
        :return:
        """

        FEEFs = []

        for t in range(self.planning_horizon):

            # extract the values for each time step
            predicted_likelihood = predicted_likelihood_list[t]
            policy_posteriors = policy_posteriors_list[t]
            policy_sd = policy_sd_list[t]
            predicted_posterior = predicted_posterior_list[t]
            predicted_posterior_sd = predicted_posterior_sd_list[t]

            # !!!! evaluate the EXTRINSIC KL divergence !!!!

            # convert to normal distributions
            # TODO Why is the stddev 1s here? I think because we assume it is on the true state of the world.
            likelihood_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_likelihood, scale_diag=np.ones_like(predicted_likelihood))

            if self.prior_model is None:

                # TODO how exactly is the prior defined? After you apply transformations what is the prior
                # create the prior distribution
                prior_preferences_mean = tf.convert_to_tensor(np.stack([self.given_prior_mean]*self.n_policies), dtype="float32")
                prior_preferences_stddev = tf.convert_to_tensor(np.stack([self.given_prior_stddev]*self.n_policies), dtype="float32")

                prior_dist = tfp.distributions.MultivariateNormalDiag(loc=prior_preferences_mean, scale_diag=prior_preferences_stddev)

            # TODO Fix the learned prior model
            else:
                prior_dist = self.prior_model()

            kl_extrinsic = tfp.distributions.kl_divergence(likelihood_dist, prior_dist)

            # !!!! evaluate the KL INTRINSIC part !!!!
            policy_posteriors_dist = tfp.distributions.MultivariateNormalDiag(loc=policy_posteriors, scale_diag=policy_sd)
            predicted_posterior_dist = tfp.distributions.MultivariateNormalDiag(loc=predicted_posterior, scale_diag=predicted_posterior_sd)

            kl_intrinsic = tfp.distributions.kl_divergence(predicted_posterior_dist, policy_posteriors_dist)

            FEEF = kl_extrinsic - kl_intrinsic

            FEEFs.append(FEEF)

        return FEEFs


    def EFE(self, policy_posteriors, predicted_likelihood, predicted_posterior):
        """
        Compute the EFE for policy selection
        :param policy_posteriors:
        :param predicted_likelihood:
        :param predicted_posterior:
        :return:
        """
        pass


In [214]:
enc = create_encoder(2, 2, [20])
dec = create_decoder(2, 2, [20])
vae = VAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

tran = TransitionGRU(2, 1, 12, 60, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.45, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None, vae, tran, scaled_prior_mean, prior_stddev)

In [215]:
env = gym.make('MountainCarContinuous-v0')

agent, succeeded, time_to_success = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=5)

Episode 1
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
Success in episode 1 at time step 685
Episode 2
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success
Episode 3
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
Success in episode 3 at time step 775
Episode 4
0
30
60
90
120
150
180
Success in episode 4 at time step 207
Episode 5
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success


In [None]:
plt.plot(np.arange(len(time_to_success), time_to_success))

## Test the models produced

In [43]:
num_seqs = 1200
seq_length = 12
ob_dim = 2
ob_seqs = []
next_obs = []

for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, seq_length)

    train = np.concatenate([o[:-1], a], axis=1)
    # train = o[:-1]
    test = o[-1]

    ob_seqs.append(train)
    next_obs.append(test)

ob_seqs = np.array(ob_seqs)
next_obs = np.array(next_obs)
ob_seqs_flat.shape

ob_seqs_stddev = np.ones_like(ob_seqs_flat)
next_obs_stddev = np.ones_like(ob_seqs_flat)

ob_seqs_flat.shape

# ob_seqs = transform_observations(ob_seqs, observation_max, observation_min, [0,0])

ob_seqs

array([[[-4.11200672e-01,  0.00000000e+00,  8.70453715e-02],
        [-4.11897212e-01, -6.96533592e-04,  9.12484288e-01],
        [-4.12047178e-01, -1.49977262e-04,  3.95269096e-02],
        ...,
        [-4.41741168e-01, -8.41731299e-03,  7.88793385e-01],
        [-4.49583083e-01, -7.84190279e-03,  7.74584651e-01],
        [-4.56813663e-01, -7.23059289e-03, -9.67206419e-01]],

       [[-4.42160368e-01,  0.00000000e+00,  7.79280484e-01],
        [-4.41596180e-01,  5.64190792e-04, -1.80532649e-01],
        [-4.41911638e-01, -3.15442914e-04,  4.04548228e-01],
        ...,
        [-4.54894215e-01, -3.60563770e-03, -4.09378260e-01],
        [-4.59625572e-01, -4.73134872e-03,  2.57950217e-01],
        [-4.64446843e-01, -4.82128235e-03,  7.45288432e-01]],

       [[-5.31784356e-01,  0.00000000e+00, -8.07618558e-01],
        [-5.32934427e-01, -1.15004205e-03, -9.58793938e-01],
        [-5.35452664e-01, -2.51822476e-03,  8.32027197e-01],
        ...,
        [-5.49402714e-01, -3.52771278e-03,

In [34]:
ob_seqs

array([[[0.40084078, 0.5       ],
        [0.40063332, 0.49733267],
        [0.400818  , 0.5023744 ],
        ...,
        [0.40380593, 0.497436  ],
        [0.40346972, 0.49567731],
        [0.40299592, 0.49390817]],

       [[0.35217812, 0.5       ],
        [0.35297981, 0.51030741],
        [0.35477814, 0.52312171],
        ...,
        [0.3731825 , 0.54759189],
        [0.37696869, 0.54867963],
        [0.38085253, 0.54993506]],

       [[0.41825164, 0.5       ],
        [0.41863805, 0.50496814],
        [0.41943579, 0.5102566 ],
        ...,
        [0.40065185, 0.44231838],
        [0.39516142, 0.4294088 ],
        [0.38937611, 0.42561743]],

       ...,

       [[0.3980581 , 0.5       ],
        [0.39853252, 0.50609965],
        [0.39801534, 0.49335054],
        ...,
        [0.3892667 , 0.48345105],
        [0.38794025, 0.48294551],
        [0.38585978, 0.47325106]],

       [[0.40699541, 0.5       ],
        [0.40668881, 0.49605815],
        [0.40552974, 0.48509772],
        .

In [28]:
agent.model_vae(ob_seqs)

<tf.Tensor: shape=(14400, 2), dtype=float32, numpy=
array([[0.38556084, 0.4905334 ],
       [0.39635327, 0.5072894 ],
       [0.39426306, 0.50200635],
       ...,
       [0.38257498, 0.49608925],
       [0.38280475, 0.4925485 ],
       [0.38699824, 0.493933  ]], dtype=float32)>

In [29]:
z = agent.model_vae.encoder(ob_seqs)
z

[<tf.Tensor: shape=(14400, 2), dtype=float32, numpy=
 array([[ 0.002228  , -0.00141016],
        [ 0.00227949, -0.00126522],
        [ 0.00219531, -0.00140826],
        ...,
        [ 0.00458311, -0.00133375],
        [ 0.00438524, -0.00022276],
        [ 0.00410312,  0.0010559 ]], dtype=float32)>,
 <tf.Tensor: shape=(14400, 2), dtype=float32, numpy=
 array([[0.3420336 , 0.370421  ],
        [0.34153882, 0.3700512 ],
        [0.34208104, 0.37044674],
        ...,
        [0.33828744, 0.36833283],
        [0.3355602 , 0.3661365 ],
        [0.33252472, 0.36367458]], dtype=float32)>,
 <tf.Tensor: shape=(14400, 2), dtype=float32, numpy=
 array([[ 0.34037507,  0.03871249],
        [-0.2799148 , -0.26680294],
        [ 0.11989297, -0.16823176],
        ...,
        [ 0.6025474 , -0.47514677],
        [ 0.12444279, -0.17091578],
        [-0.2816515 , -0.8058164 ]], dtype=float32)>]

In [17]:
agent.model_vae.decoder(z[2])

<tf.Tensor: shape=(14400, 2), dtype=float32, numpy=
array([[0.4300636 , 0.49906108],
       [0.43326598, 0.49975306],
       [0.43286473, 0.49547517],
       ...,
       [0.38715714, 0.5023156 ],
       [0.35460785, 0.48577   ],
       [0.42961854, 0.4994094 ]], dtype=float32)>

## Testing the Identity VAE

In [12]:
from identity_vae import IdentityVAE, identity_encoder, identity_decoder

In [172]:
def identity_encoder(inputs):

    return [inputs, np.ones_like(inputs), inputs]


def identity_decoder(inputs):
    return inputs


class IdentityVAE(keras.Model):
    """
    Implements the identity mapping with standard deviation as all 1s
    """
    def __init__(self, encoder, decoder, reg_mean, reg_stddev, llik_scaling=1, kl_scaling=1, **kwargs):
        super(IdentityVAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")

        self.reg_mean = reg_mean
        self.reg_stddev = reg_stddev

        self.llik_scaling = llik_scaling
        self.kl_scaling = kl_scaling

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
        ]

    def call(self, inputs, training=None, mask=None):
        return inputs

    def train_step(self, data):
        return {
            "total_loss": 0
        }

In [216]:
enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

hidden_size = 2*2*15  # 2*latent_dim * planning_size
tran = TransitionGRU(2, 1, 12, hidden_size, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.6, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev)

In [217]:
env = gym.make('MountainCarContinuous-v0')

agent, succeeded, time_to_success = train_agent(env, daifa, observation_max, observation_min, observation_noise_stddev, num_episodes=5)

Episode 1
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success
Episode 2
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success
Episode 3
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success
Episode 4
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success
Episode 5
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
No Success


In [71]:
out = agent.tran((ob_seqs[0:1], None))
out

[<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.11003135,  0.20155026]], dtype=float32)>,
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.0919429, 1.079277 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 30), dtype=float32, numpy=
 array([[ 0.13321629,  0.18998466, -0.04438917, -0.09943078,  0.00196455,
         -0.01206965, -0.06738807, -0.07661976, -0.01011051,  0.04110336,
          0.05621128,  0.1290381 ,  0.11428525, -0.09795906,  0.01192785,
         -0.07657307, -0.03584003,  0.03242199,  0.1649867 ,  0.13230848,
         -0.01292835,  0.10172065,  0.00063759,  0.0078836 ,  0.05896772,
          0.05756728, -0.04861826,  0.13609464,  0.02476548, -0.12826644]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 12, 30), dtype=float32, numpy=
 array([[[ 0.02993285,  0.0584089 , -0.06604871, -0.01244637,
           0.03883965, -0.00862625, -0.04777011, -0.02679116,
           0.02655492, -0.01026066, -0.00338584,  0.0400382 ,
           0.02580428, -0.02700949,  0.030

In [73]:
t = ob_seqs[0:1, -1].reshape(1,1,3)
h = out[3]
h = h[0, -2, :]
h = h.numpy().reshape(1,30)
h

array([[-0.04016694,  0.13788535, -0.20154014,  0.05386092,  0.15624952,
        -0.03871916, -0.22826102, -0.07615418,  0.05998917, -0.13080825,
        -0.05901396,  0.01611179,  0.02587488,  0.08561166,  0.12387015,
        -0.19843666,  0.04556194,  0.0584335 , -0.03487249,  0.02025423,
        -0.01289217, -0.08300564, -0.06449609, -0.01476793, -0.10265449,
         0.1606046 , -0.17604604,  0.1427406 ,  0.09185451,  0.04602269]],
      dtype=float32)

In [74]:
agent.tran((t, h))

[<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.11003135,  0.20155026]], dtype=float32)>,
 <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.0919429, 1.079277 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 30), dtype=float32, numpy=
 array([[ 0.13321629,  0.18998466, -0.04438917, -0.09943078,  0.00196455,
         -0.01206965, -0.06738807, -0.07661976, -0.01011051,  0.04110336,
          0.05621128,  0.1290381 ,  0.11428525, -0.09795906,  0.01192785,
         -0.07657307, -0.03584003,  0.03242199,  0.1649867 ,  0.13230848,
         -0.01292835,  0.10172065,  0.00063759,  0.0078836 ,  0.05896772,
          0.05756728, -0.04861826,  0.13609464,  0.02476548, -0.12826644]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 1, 30), dtype=float32, numpy=
 array([[[ 0.13321629,  0.18998466, -0.04438917, -0.09943078,
           0.00196455, -0.01206965, -0.06738807, -0.07661976,
          -0.01011051,  0.04110336,  0.05621128,  0.1290381 ,
           0.11428525, -0.09795906,  0.0119

In [8]:
ob_seqs[0:1]

NameError: name 'ob_seqs' is not defined

## Test to see how the agent trains on standard observation data

In [199]:
enc = identity_encoder
dec = identity_decoder
idvae = IdentityVAE(enc, dec, [0, 0], [0.3, 0.3], llik_scaling=100)

hidden_size = 2*2*15  # 2*latent_dim * planning_size
tran = TransitionGRU(2, 1, 12, hidden_size, 2)

# unscaled prior mean and prior stddev
prior_mean = [0.6, 0]
prior_stddev = [1, 1]

observation_max = np.array([0.6, 0.07])
observation_min = np.array([-1.2, -0.07])

observation_noise_stddev = [0, 0]

scaled_prior_mean = transform_observations(prior_mean, observation_max, observation_min, [0,0])  # no noise on prior

daifa = DAIFAgentRecurrent(None, idvae, tran, scaled_prior_mean, prior_stddev, vae_train_epochs=1, tran_train_epochs=2)

In [178]:
env = gym.make('MountainCarContinuous-v0')
success, agent, t, pre_obs, post_obs, acts = run_episode(env, daifa, observation_max, observation_min, observation_noise_stddev)

0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990


In [189]:
pre_np = np.array(pre_obs)
a = np.array(acts)
a.shape
pre_a = np.concatenate([pre_np, a], axis=2)

In [190]:
print(a.max(), a.min())

0.8098704 -0.06532676


In [191]:
pre_a

array([[[0.37008965, 0.5       , 0.64274138],
        [0.37066795, 0.50743499, 0.55658776],
        [0.37174838, 0.51389117, 0.65481091],
        ...,
        [0.41047474, 0.57226037, 0.52277917],
        [0.41627194, 0.57453541, 0.52470559],
        [0.4222051 , 0.57628357, 0.37393394]],

       [[0.42810532, 0.5758599 , 0.69643861],
        [0.43419865, 0.57834281, 0.68352282],
        [0.44043066, 0.58012587, 0.69041729],
        ...,
        [0.4988479 , 0.56140433, 0.59778053],
        [0.50326538, 0.55679618, 0.60766363],
        [0.50730686, 0.55196177, 0.29275563]],

       [[0.51068671, 0.54345512, 0.71056032],
        [0.51373403, 0.53917979, 0.64313263],
        [0.51637571, 0.53396438, 0.61633205],
        ...,
        [0.51848038, 0.47661969, 0.41549021],
        [0.51604071, 0.46863281, 0.45536974],
        [0.51302629, 0.46124309, 0.283014  ]],

       ...,

       [[0.51902043, 0.52036842, 0.76976782],
        [0.52027577, 0.51614007, 0.68564558],
        [0.52112542, 0

In [192]:
post_obs_to_predict = np.array(post_obs)[:, 14, :]
post_obs_to_predict

array([[0.42810532, 0.5758599 ],
       [0.51068671, 0.54345512],
       [0.50931002, 0.45221953],
       [0.42358571, 0.41151359],
       [0.36385255, 0.48845687],
       [0.41928679, 0.5793542 ],
       [0.51176256, 0.55046145],
       [0.51612226, 0.45125959],
       [0.42809368, 0.40711605],
       [0.36297415, 0.4807838 ],
       [0.41079962, 0.57215037],
       [0.50237944, 0.55340252],
       [0.51530115, 0.46191713],
       [0.43562587, 0.40977219],
       [0.36920734, 0.47506812],
       [0.40700312, 0.56339959],
       [0.4956689 , 0.55673263],
       [0.51504541, 0.46971338],
       [0.44361163, 0.41414712],
       [0.37392667, 0.47021407],
       [0.40472932, 0.56099515],
       [0.48988603, 0.55681477],
       [0.513942  , 0.47466574],
       [0.44597241, 0.41866915],
       [0.37727873, 0.46897353],
       [0.40491911, 0.55493856],
       [0.48499408, 0.55410988],
       [0.50919015, 0.47715691],
       [0.44699771, 0.42027778],
       [0.37685666, 0.4646408 ],
       [0.

In [207]:
agent.tran((pre_a, None))

[<tf.Tensor: shape=(66, 2), dtype=float32, numpy=
 array([[0.4335834 , 0.56521523],
        [0.5122515 , 0.5295362 ],
        [0.51478904, 0.44462413],
        [0.44347557, 0.41256368],
        [0.383164  , 0.48628917],
        [0.4260966 , 0.5680216 ],
        [0.5181996 , 0.5348844 ],
        [0.5264323 , 0.44104174],
        [0.4424122 , 0.40895247],
        [0.38420513, 0.47790435],
        [0.4277233 , 0.5596743 ],
        [0.5128076 , 0.538141  ],
        [0.52519715, 0.45137712],
        [0.45522428, 0.40997675],
        [0.3892485 , 0.4731649 ],
        [0.4223162 , 0.5517151 ],
        [0.50446856, 0.54205805],
        [0.522131  , 0.4598966 ],
        [0.45934907, 0.41366938],
        [0.39034382, 0.46974748],
        [0.415123  , 0.55189383],
        [0.49791732, 0.5430331 ],
        [0.52315855, 0.46357197],
        [0.45980898, 0.4188894 ],
        [0.39460406, 0.46837974],
        [0.42059064, 0.5451353 ],
        [0.4946693 , 0.54093736],
        [0.5175607 , 0.46689713]

In [208]:
post_obs_to_predict

array([[0.42810532, 0.5758599 ],
       [0.51068671, 0.54345512],
       [0.50931002, 0.45221953],
       [0.42358571, 0.41151359],
       [0.36385255, 0.48845687],
       [0.41928679, 0.5793542 ],
       [0.51176256, 0.55046145],
       [0.51612226, 0.45125959],
       [0.42809368, 0.40711605],
       [0.36297415, 0.4807838 ],
       [0.41079962, 0.57215037],
       [0.50237944, 0.55340252],
       [0.51530115, 0.46191713],
       [0.43562587, 0.40977219],
       [0.36920734, 0.47506812],
       [0.40700312, 0.56339959],
       [0.4956689 , 0.55673263],
       [0.51504541, 0.46971338],
       [0.44361163, 0.41414712],
       [0.37392667, 0.47021407],
       [0.40472932, 0.56099515],
       [0.48988603, 0.55681477],
       [0.513942  , 0.47466574],
       [0.44597241, 0.41866915],
       [0.37727873, 0.46897353],
       [0.40491911, 0.55493856],
       [0.48499408, 0.55410988],
       [0.50919015, 0.47715691],
       [0.44699771, 0.42027778],
       [0.37685666, 0.4646408 ],
       [0.

## Examine training the model on the observation data

Does it eventually converge to a good model

In [203]:
num_train_runs = 1
for i in range(num_train_runs):

    for j in range(len(pre)):
        pre = pre_obs[j]
        post = post_obs[j]
        actions = acts[j]

        daifa.train(pre, post, actions, None, verbose=1)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [210]:
daifa.cem_policy_optimisation(np.array([0.5, 0.1]))

(<tf.Tensor: shape=(15,), dtype=float32, numpy=
 array([ 0.8344159 ,  0.75534546,  0.6847398 ,  0.80220443,  0.6786175 ,
         0.8187225 ,  0.5921203 ,  0.64381874,  0.5857641 ,  0.4067325 ,
         0.4416599 ,  0.26479524,  0.4449845 ,  0.07807608, -0.05506114],
       dtype=float32)>,
 <tf.Tensor: shape=(15,), dtype=float32, numpy=
 array([0.20074226, 0.2792185 , 0.32190827, 0.28694943, 0.37440932,
        0.31017554, 0.39044052, 0.39928463, 0.4503296 , 0.4880389 ,
        0.40736437, 0.52849865, 0.46789622, 0.59326804, 0.56217706],
       dtype=float32)>)

In [211]:
daifa.cem_policy_optimisation

<bound method DAIFAgentRecurrent.cem_policy_optimisation of <__main__.DAIFAgentRecurrent object at 0x16cc659a0>>