# Test the prior model to see how well it works

In [3]:
import numpy as np
import tensorflow as tf

In [4]:
# from prior_model import PriorModelBellman

In [91]:
import tensorflow as tf
import keras
from keras import layers
import numpy as np


class PriorModelBellman(keras.Model):
    """Small MLP that maps an observation to a per-dimension "utility" in [-1, 1].

    The wrapped ``prior_model`` is a two-layer dense network (SiLU hidden layer of
    width ``20 * observation_dim``, tanh output of width ``observation_dim``)
    compiled with SGD and mean-squared-error loss. :meth:`train` fits it towards
    bootstrapped targets built from a reward sequence and the model's own
    predictions for the following observation.
    """

    def __init__(self, observation_dim, iterate_train=1, discount_factor=0.99):
        """
        :param observation_dim: number of features in one observation; also the output width
        :param iterate_train: how many times ``train`` rebuilds the targets and refits
        :param discount_factor: base of the discount schedule used in ``train``
        """
        super().__init__()
        self.observation_dim = observation_dim
        self.iterate_train = iterate_train
        self.discount_factor = discount_factor
        self.train_epochs = 1

        # make the model
        # Pass the shape as an explicit tuple: a bare int is not accepted as a
        # shape by every Keras version.
        transition_inputs = layers.Input(shape=(observation_dim,))
        h = layers.Dense(observation_dim * 20, activation="silu")(transition_inputs)
        h = layers.Dense(observation_dim, activation="tanh")(h)

        self.prior_model = keras.Model(transition_inputs, h, name="prior_model")
        # Use the optimizer/loss from the same `keras` package the model is built
        # with, so the objects cannot come from two different Keras installs.
        self.prior_model.compile(
            optimizer=keras.optimizers.SGD(),
            loss=keras.losses.MeanSquaredError(),
        )

        # Initialised empty; not read or written elsewhere in this class.
        self.observations = []
        self.rewards = []

    def call(self, inputs):
        """Forward pass: delegate to the wrapped ``prior_model``."""
        return self.prior_model(inputs)

    def extrinsic_kl(self, y):
        """Return ``1 - model(y)``, turning the tanh output into a cost in [0, 2]."""
        # keras.Model has no `forward` method; invoke the model itself.
        return 1.0 - self(y)  # map from [-1, 1] to [2, 0]

    def train(self, observations, rewards):
        """
        Fit the prior model towards ``r_t + d_t * V(o_{t+1})`` targets.

        :param observations: o_0, o_1, ... , o_n as an array-like of shape
            [num_observations, observation_dim]
        :param rewards: r_0, r_1, ... , r_n, one reward per observation; any
            array-like that flattens to length num_observations
        :return: None
        :raises ValueError: if the number of rewards differs from the number of observations
        """
        observations = np.asarray(observations)
        num_observations = len(observations)

        # Flatten so that (n,), (1, n) and (n, 1) inputs are all treated as one sequence.
        rewards = np.asarray(rewards).reshape(-1)
        if rewards.shape[0] != num_observations:
            raise ValueError(
                f"expected {num_observations} rewards (one per observation), "
                f"got {rewards.shape[0]}"
            )

        # expand rewards to [num_observations, observation_dim] so they line up
        # with the per-dimension model output
        rewards_stacked = np.tile(rewards.reshape(-1, 1), (1, self.observation_dim))

        # Discount schedule gamma^(n-1), ..., gamma^1, gamma^0 as a column vector.
        # It does not depend on the training iteration, so build it once.
        # NOTE(review): a standard TD(0) target would use a constant gamma here
        # rather than a per-timestep power — behaviour kept as-is, please confirm.
        discount_factors = np.flip(
            np.power(self.discount_factor, np.arange(num_observations)).reshape(num_observations, 1),
            axis=0,
        )

        for _ in range(self.iterate_train):
            # TODO Still seems a little strange that we add 0 to the end and discount the way we do but I think it makes sense. Check what predicted utilities are in practice
            utility_t = self.prior_model(observations)
            # Shift predictions one step forward in time; the final step has no
            # successor, so its bootstrap value is zero.
            utility_t_plus_one = tf.concat(
                [utility_t[1:], tf.zeros((1, self.observation_dim), dtype=utility_t.dtype)],
                axis=0,
            )

            expected_utility = rewards_stacked + discount_factors * utility_t_plus_one

            self.prior_model.fit(observations, expected_utility, epochs=self.train_epochs)


In [92]:
# Descending integers 9..0 (reversed view of arange), displayed as the cell result.
a = np.arange(10)[::-1]
a

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [93]:
from util import random_observation_sequence, transform_observations
import gym

In [94]:
# Prior model over 2-feature observations.
p = PriorModelBellman(observation_dim=2)

In [95]:
env = gym.make('MountainCarContinuous-v0')

num_seqs, seq_length = 1, 10
ob_seqs, next_obs, rewards = [], [], []

# Gather `num_seqs` random rollouts. For each one, keep the reward trace, all
# observations except the last, and the last observation on its own.
for i in range(num_seqs):
    o, a, r = random_observation_sequence(env, seq_length)

    train, test = o[:-1], o[-1]

    ob_seqs.append(train)
    next_obs.append(test)
    rewards.append(r)

# Convert the per-rollout lists to arrays with a leading sequence axis.
ob_seqs = np.array(ob_seqs)
next_obs = np.array(next_obs)
rewards = np.array(rewards)

# Unit standard deviations with the same shapes as the observation arrays.
ob_seqs_stddev = np.ones_like(ob_seqs)
next_obs_stddev = np.ones_like(next_obs)

# Inspection expressions; only the last one is shown as the cell output.
ob_seqs.shape
next_obs
rewards

array([[-0.05975893, -0.04120084, -0.04120084, -0.01470336, -0.01470336,
        -0.01092644, -0.01165737, -0.01165737, -0.01165737, -0.01165737]])

In [99]:
# train() takes one rollout's rewards (r_0 ... r_n). `rewards` holds one row per
# sequence, so pass the row that belongs to ob_seqs[0] rather than the whole
# 2-D array.
p.train(ob_seqs[0], rewards[0])



In [115]:
ob_seqs[0]

array([[-5.53517997e-01,  0.00000000e+00],
       [-5.53142369e-01,  3.75652395e-04],
       [-5.51775575e-01,  1.36682065e-03],
       [-5.51400542e-01,  3.75043048e-04],
       [-5.51913500e-01, -5.12973871e-04],
       [-5.50954700e-01,  9.58819001e-04],
       [-5.49935400e-01,  1.01928855e-03],
       [-5.48296630e-01,  1.63875264e-03],
       [-5.46126664e-01,  2.16995459e-03],
       [-5.42930782e-01,  3.19589116e-03],
       [-5.40861130e-01,  2.06966931e-03],
       [-5.38667977e-01,  2.19315826e-03]])

In [116]:
# Scratch check: duplicate the first rollout's rewards into two columns and
# scale row t by t (12 rows).
np.column_stack([rewards[0]] * 2) * np.arange(12).reshape(-1, 1)

array([[-0.00000000e+00, -0.00000000e+00],
       [-2.63429839e-02, -2.63429839e-02],
       [-1.28609152e-01, -1.28609152e-01],
       [-1.60246455e-01, -1.60246455e-01],
       [-2.82100130e-01, -2.82100130e-01],
       [-4.63810342e-03, -4.63810342e-03],
       [-4.75216788e-02, -4.75216788e-02],
       [-3.72745830e-02, -3.72745830e-02],
       [-2.61202061e-01, -2.61202061e-01],
       [-6.46309192e-01, -6.46309192e-01],
       [-1.55811912e-05, -1.55811912e-05],
       [-2.80436117e-01, -2.80436117e-01]])

In [117]:
# Fresh, untrained prior model over 2-feature observations.
pm = PriorModelBellman(observation_dim=2)

In [118]:
# Fit on the first rollout: its observations and the matching reward row.
pm.train(observations=ob_seqs[0], rewards=rewards[0])



2022-08-04 19:21:28.791784: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
