In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [2]:
%load_ext tensorboard

In [3]:
class ContinuousActor(tf.keras.Model):
    """Actor that outputs a policy directly"""
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.means = layers.Dense(num_actions, 
        # activation = 'tanh'
        )
        # self.stds = lambda x: tf.zeros(num_actions)
        self.stds = layers.Dense(num_actions, activation='relu')
        # tf.ones(num_actions) * 0.25
    def call(self, inputs: tf.Tensor):
        means = self.means(inputs)
        stds = self.stds(inputs)
        stds = tf.clip_by_value(stds, 1.0e-3, 1)
        return tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = tf.exp(stds))


class ActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.common = layers.Dense(num_hidden_units, activation = 'relu')
        # outputs scale, location params for mvn
        self.actor = ContinuousActor(num_actions, num_hidden_units) #layers.Dense(num_actions)
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)




In [4]:
class Agent():

    def __init__(
        self, 
        gamma: float, 
        entropy_coef: float,
        vf_coef: float,
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer,
        debug = False):

        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.vf_coef = tf.constant(vf_coef)
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.NONE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        # switch for analysis
        self.DEBUG = debug
        if self.DEBUG:
            self.debug_val = 0

    def _init_memory(self):
        return {
                'action' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }

    
    def _get_action(self, state):

        policy, value = self.model(
                tf.expand_dims(tf.constant(state, tf.float32), 0)
            )
        action = policy.sample()[0]

        return tf.tanh(action), policy, value
        # return tf.clip_by_value(action, -0.99, 0.99), policy, value
    
    def get_action(self, state, step):

        # run the model
        action, policy, _ = self._get_action(state)

        if self.DEBUG:
            with self.summary_writer.as_default():
                tf.summary.scalar('mean1', policy.mean()[0][0], step = self.debug_val)
                tf.summary.scalar('mean2', policy.mean()[0][1], step = self.debug_val)
                tf.summary.scalar('std1', policy.stddev()[0][0], step = self.debug_val)
                tf.summary.scalar('std2', policy.stddev()[0][1], step = self.debug_val)

            self.debug_val += 1

        return action

    def log(self, action, state, next_state, reward, done, step):

        """
        Logs results into memory - not all used necessarily
        """

        self.memory['action'].write(
            step, 
            tf.constant(action, tf.float32)
            ).mark_used()

        self.memory['state'].write(
            step, 
            tf.constant(state, tf.float32)
            ).mark_used()

        self.memory['next_state'].write(
            step, 
            tf.constant(next_state, tf.float32)
            ).mark_used()

        self.memory['rewards'].write(
            step, 
            tf.constant(reward, tf.float32)
            ).mark_used()

        self.memory['done'].write(
            step, 
            tf.constant(done, tf.int32)
            ).mark_used()

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute expected returns"""

        n = tf.shape(rewards)[0] 
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            discounted_sum = reward + self.gamma * discounted_sum # discounted_sum= 0 for last reward (i.e. first element in loop)
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) / 
                        (tf.math.reduce_std(returns) + eps))
        
        return returns 

    def update(self, episode: int):

        states = self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())])
        # tf.expand_dims(self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())]), 1)
        rewards = self.memory['rewards'].gather([i for i in tf.range(self.memory['rewards'].size())])
        actions = self.memory['action'].gather([i for i in tf.range(self.memory['action'].size())])
        returns = self.get_expected_return(rewards = rewards, standardize=True)

        with tf.GradientTape() as tape:

            policy, values = self.model(states)

            # calculate the loss values
            loss = self.compute_loss(
                actions, 
                policy, 
                values,
                returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()
        
    def compute_loss(
        self,
        actions: tf.Tensor,
        policy: tf.Tensor,
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss"""
        # print(f'returns: {tf.shape(returns)}; values: {tf.shape(values)}')
        advantage = returns - tf.squeeze(values)

        critic_loss = self.vf_coef*self.loss(tf.squeeze(values), returns)#tf.math.square(advantage)

        entropy_loss = -self.entropy_coef * policy.entropy()

        # squashing correction
        log_probs = tf.expand_dims(policy.log_prob(actions), axis = 1)
        # stopping gradient for advantage dramatically improves stability!
        actor_loss = tf.reduce_sum(-log_probs - tf.squeeze(tf.math.log(1 - tf.math.pow(actions, 2) + 1.0e-10)), axis=1) * tf.stop_gradient(advantage)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

def get_next_run(log_dir):
    next_run = max([0]+[int(j) for j in [i.split('_')[-1] for i in os.listdir(log_dir)] if j.isdigit()]) + 1
    return log_dir + f'/run_{next_run}'

def train(agent, env, num_episodes, seed):
    
    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            agent.log(action, state, next_state, reward, done, step)
            state = next_state

            step += 1
        agent.update(i)

def test(agent, env, num_episodes, seed):

    for i in range(num_episodes):
        state, _ = env.reset(seed = seed)
        done = False
        step = 0
        while not done:
            action = agent.get_action(state, step)
            next_state, reward, terminated, truncated, info = env.step(np.array(action))
            done = terminated or truncated
            state = next_state

            step += 1
    env.close()


In [5]:
## create tensorboard logs
LOGS = './logs'
if not os.path.exists(LOGS):
    os.mkdir(LOGS)

In [6]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make('LunarLanderContinuous-v2')
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
num_actions = env.action_space.shape[0]
num_hidden_units = 256
model = ActorCritic(num_actions, num_hidden_units)

log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
agent = Agent(0.99, 0.0, 0.4, model, optimizer, summary_writer, debug=True)

NUM_EPISODES=600
train(agent, env, NUM_EPISODES, seed)


Saving logs to:  ./logs/run_143


In [123]:
from gymnasium.wrappers import NormalizeObservation
env = NormalizeObservation(env)
state, _ = env.reset()
state

array([-4.84584044e-05,  8.14826722e-03, -4.40632277e-03, -2.24742149e-03,
        5.62189279e-05,  1.10504436e-03,  0.00000000e+00,  0.00000000e+00])

In [7]:
NUM_EPISODES=5
env = gym.make('LunarLanderContinuous-v2', render_mode='human')
test(agent, env, NUM_EPISODES, seed)

## RNN / sequence model
Try to create an RL agent with an RNN network

In [8]:
lstm = tf.keras.Sequential(
    layers.LSTM(128)
)



In [119]:
class ContinuousActor(tf.keras.Model):
    """Actor that outputs a policy directly"""
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.means = layers.Dense(num_actions)
        self.stds = lambda x: tf.zeros(num_actions)
        # self.stds = layers.Dense(num_actions, activation='relu')
        # tf.ones(num_actions) * 0.25
    def call(self, inputs: tf.Tensor):
        means = self.means(inputs)
        stds = self.stds(inputs)
        # stds = tf.clip_by_value(stds, 1.0e-3, 1)
        return tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = tf.exp(stds))


class RecurrentActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.common = layers.LSTM(num_hidden_units)
        # outputs scale, location params for mvn
        self.actor = ContinuousActor(num_actions, num_hidden_units) 
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)




In [193]:
class RecurrentAgent():

    def __init__(
        self, 
        gamma: float, 
        entropy_coef: float,
        vf_coef: float,
        window_size: int,
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer,
        debug = False):

        # discount rate
        self.gamma = tf.constant(gamma)
        self.entropy_coef = tf.constant(entropy_coef)
        self.vf_coef = tf.constant(vf_coef)
        self.window_size = tf.constant(window_size)
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.NONE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        # switch for analysis
        self.DEBUG = debug
        if self.DEBUG:
            self.debug_val = 0

    def _init_memory(self):
        return {
                'action' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }

    def _pad_input(self, new_state):
        
        # get previous states and combine with new_state
        if self.memory['state'].size() > 0:
            last_n = self.memory['state'].size()
            first_n = np.max([0, last_n - self.window_size], 0)
            prev_states = self.memory['state'].gather([i for i in tf.range(first_n, last_n)])
            combined_states = tf.concat([tf.expand_dims(new_state, 0), prev_states], axis =0)
        else:
            combined_states = tf.expand_dims(new_state, 0)

        # get size of padding - 0 if we have enough
        pad_size = np.max([0, self.window_size-tf.shape(combined_states)[0]], axis = 0)
        
        # get the padded vals
        padded = tf.pad(combined_states, [[0,pad_size],[0,0]])
        padded = tf.expand_dims(padded, 0)

        return padded
    
    def _get_action(self, state):

        padded = self._pad_input(state)

        policy, value = self.model(padded)
        action = policy.sample()[0]

        return tf.tanh(action), policy, value
        # return tf.clip_by_value(action, -0.99, 0.99), policy, value
    
    def get_action(self, state, step):

        # run the model
        action, policy, _ = self._get_action(state)

        if self.DEBUG:
            with self.summary_writer.as_default():
                tf.summary.scalar('mean1', policy.mean()[0][0], step = self.debug_val)
                tf.summary.scalar('mean2', policy.mean()[0][1], step = self.debug_val)
                tf.summary.scalar('std1', policy.stddev()[0][0], step = self.debug_val)
                tf.summary.scalar('std2', policy.stddev()[0][1], step = self.debug_val)

            self.debug_val += 1

        return action

    def log(self, action, state, next_state, reward, done, step):

        """
        Logs results into memory - not all used necessarily
        """

        self.memory['action'].write(
            step, 
            tf.constant(action, tf.float32)
            ).mark_used()

        self.memory['state'].write(
            step, 
            tf.constant(state, tf.float32)
            ).mark_used()

        self.memory['next_state'].write(
            step, 
            tf.constant(next_state, tf.float32)
            ).mark_used()

        self.memory['rewards'].write(
            step, 
            tf.constant(reward, tf.float32)
            ).mark_used()

        self.memory['done'].write(
            step, 
            tf.constant(done, tf.int32)
            ).mark_used()

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute expected returns"""

        n = tf.shape(rewards)[0] 
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            discounted_sum = reward + self.gamma * discounted_sum # discounted_sum= 0 for last reward (i.e. first element in loop)
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) / 
                        (tf.math.reduce_std(returns) + eps))
        
        return returns

    def _prepare_states_for_lstm(self, states):
        """wraps states into lstm sequence format"""
        ## need the initial k 

        initial_padded_states = tf.convert_to_tensor([tf.pad(states[:(i+1)][::-1], [[0,np.max([0, self.window_size-(i+1)], axis = 0)],[0,0]]) for i in tf.range(0, self.window_size)])
        remaining_states = tf.convert_to_tensor([states[(i-self.window_size):i] for i in tf.range(self.window_size, tf.shape(states)[0])])
        combined_states = tf.concat([initial_padded_states, remaining_states], axis = 0)

        return combined_states
        

    def update(self, episode: int):

        states = self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())])
        states = self._prepare_states_for_lstm(states)
        rewards = self.memory['rewards'].gather([i for i in tf.range(self.memory['rewards'].size())])
        actions = self.memory['action'].gather([i for i in tf.range(self.memory['action'].size())])
        returns = self.get_expected_return(rewards = rewards, standardize=True)

        with tf.GradientTape() as tape:

            policy, values = self.model(states)

            # calculate the loss values
            loss = self.compute_loss(
                actions, 
                policy, 
                values,
                returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()
        
    def compute_loss(
        self,
        actions: tf.Tensor,
        policy: tf.Tensor,
        values: tf.Tensor,
        returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss"""
        # print(f'returns: {tf.shape(returns)}; values: {tf.shape(values)}')
        advantage = returns - tf.squeeze(values)

        critic_loss = self.vf_coef*self.loss(tf.squeeze(values), returns)#tf.math.square(advantage)

        entropy_loss = -self.entropy_coef * policy.entropy()

        # squashing correction
        log_probs = tf.expand_dims(policy.log_prob(actions), axis = 1)
        # stopping gradient for advantage dramatically improves stability!
        actor_loss = tf.reduce_sum(-log_probs - tf.squeeze(tf.math.log(1 - tf.math.pow(actions, 2) + 1.0e-10)), axis=1) * tf.stop_gradient(advantage)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

In [194]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make('LunarLanderContinuous-v2')
eps = np.finfo(np.float32).eps.item()

optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
num_actions = env.action_space.shape[0]
num_hidden_units = 256
model = RecurrentActorCritic(num_actions, num_hidden_units)

log_dir = get_next_run(LOGS) 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)
agent2 = RecurrentAgent(0.99, 0.0, 0.4, 8, model, optimizer, summary_writer, debug=True)

NUM_EPISODES=100
train(agent2, env, NUM_EPISODES, seed)

Saving logs to:  ./logs/run_153


In [189]:
num_episodes = 1
env = gym.make('LunarLanderContinuous-v2')

for i in range(num_episodes):
    state, _ = env.reset(seed = seed)
    done = False
    step = 0
    while not done:
        action = agent2.get_action(state, step)
        next_state, reward, terminated, truncated, info = env.step(np.array(action))
        done = terminated or truncated
        agent2.log(action, state, next_state, reward, done, step)
        state = next_state

        step += 1

In [191]:
agent2.update(i)

states before prep: [186   8]
states after prep: [186   8   8]


In [142]:
states = agent2.memory['state'].gather([i for i in tf.range(agent.memory['state'].size())])

In [None]:
pad_size = np.max([0, self.window_size-tf.shape(combined_states)[0]], axis = 0)
tf.pad(states[i], [[0,np.max([0, self.window_size-i], axis = 0)],[0,0]]) for i in tf.range()

In [179]:
states[1]

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 0.00442362,  1.4254959 ,  0.21411863,  0.32733026, -0.00409935,
       -0.02889132,  0.        ,  0.        ], dtype=float32)>

In [178]:
agent2._pad_input(states[1])

<tf.Tensor: shape=(1, 9, 8), dtype=float32, numpy=
array([[[ 0.00442362,  1.4254959 ,  0.21411863,  0.32733026,
         -0.00409935, -0.02889132,  0.        ,  0.        ],
        [ 0.76255953,  0.22457801,  1.5824549 , -1.438667  ,
         -1.4822356 , -0.53626955,  0.        ,  0.        ],
        [ 0.7790512 ,  0.19222791,  1.6510906 , -1.4542502 ,
         -1.506693  , -0.48915023,  0.        ,  0.        ],
        [ 0.7955519 ,  0.15916143,  1.6515049 , -1.4878939 ,
         -1.5337818 , -0.5417758 ,  0.        ,  0.        ],
        [ 0.8124922 ,  0.12568003,  1.694713  , -1.5066842 ,
         -1.5613644 , -0.55165184,  0.        ,  0.        ],
        [ 0.82943994,  0.09152406,  1.6946552 , -1.5378029 ,
         -1.590625  , -0.5852125 ,  0.        ,  0.        ],
        [ 0.8463917 ,  0.05668946,  1.6940836 , -1.5691475 ,
         -1.6216508 , -0.62051594,  0.        ,  0.        ],
        [ 0.8639492 ,  0.02130125,  1.7537113 , -1.593907  ,
         -1.6529601 , -0.62

In [185]:
# pad_size = np.max([0, self.window_size-tf.shape(combined_states)[0]], axis = 0)

initial_padded_states = tf.convert_to_tensor([tf.pad(states[:(i+1)][::-1], [[0,np.max([0, agent2.window_size-(i+1)], axis = 0)],[0,0]]) for i in tf.range(0, agent2.window_size)])
remaining_states = tf.convert_to_tensor([states[(i-agent2.window_size):i] for i in tf.range(agent2.window_size, tf.shape(states)[0])])
combined_states = tf.concat([initial_padded_states, remaining_states], axis = 0)

In [186]:
combined_states

<tf.Tensor: shape=(165, 8, 8), dtype=float32, numpy=
array([[[ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01, ...,
         -5.2698046e-02,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],

       [[ 4.4236183e-03,  1.4254959e+00,  2.1411863e-01, ...,
         -2.8891325e-02,  0.0000000e+00,  0.0000000e+00],
        [ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01, ...,
         -5.2698046e-02,  0.00

In [184]:
tf.concat([initial_padded_states, remaining_states], axis = 0)

<tf.Tensor: shape=(165, 8, 8), dtype=float32, numpy=
array([[[ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01, ...,
         -5.2698046e-02,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],

       [[ 4.4236183e-03,  1.4254959e+00,  2.1411863e-01, ...,
         -2.8891325e-02,  0.0000000e+00,  0.0000000e+00],
        [ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01, ...,
         -5.2698046e-02,  0.00

In [183]:
print(tf.shape(initial_padded_states), tf.shape(remaining_states))

tf.Tensor([8 8 8], shape=(3,), dtype=int32) tf.Tensor([157   8   8], shape=(3,), dtype=int32)


In [144]:
initial_padded_states

<tf.Tensor: shape=(8, 1, 9, 8), dtype=float32, numpy=
array([[[[ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01,
           3.2046661e-01, -2.6548801e-03, -5.2698046e-02,
           0.0000000e+00,  0.0000000e+00],
         [ 7.6255953e-01,  2.2457801e-01,  1.5824549e+00,
          -1.4386671e+00, -1.4822356e+00, -5.3626955e-01,
           0.0000000e+00,  0.0000000e+00],
         [ 7.7905118e-01,  1.9222791e-01,  1.6510906e+00,
          -1.4542502e+00, -1.5066930e+00, -4.8915023e-01,
           0.0000000e+00,  0.0000000e+00],
         [ 7.9555190e-01,  1.5916143e-01,  1.6515049e+00,
          -1.4878939e+00, -1.5337818e+00, -5.4177582e-01,
           0.0000000e+00,  0.0000000e+00],
         [ 8.1249219e-01,  1.2568003e-01,  1.6947130e+00,
          -1.5066842e+00, -1.5613644e+00, -5.5165184e-01,
           0.0000000e+00,  0.0000000e+00],
         [ 8.2943994e-01,  9.1524065e-02,  1.6946552e+00,
          -1.5378029e+00, -1.5906250e+00, -5.8521253e-01,
           0.0000000e+00,  0.000000

In [141]:
agent2.update(i)

states before prep: [152   8]


InvalidArgumentError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} ConcatOp : Ranks of all input tensors should match: shape[0] = [8,1,9,8] vs. shape[1] = [144,8,8] [Op:ConcatV2] name: concat

In [134]:
# I need to roll actions, states and rewards into batches so
# batch_size x timesteps x 3
actions = agent.memory['action'].gather([i for i in tf.range(agent.memory['action'].size())])
states = agent.memory['state'].gather([i for i in tf.range(agent.memory['state'].size())])
rewards = agent.memory['rewards'].gather([i for i in tf.range(agent.memory['rewards'].size())])


In [136]:
action

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.8053639, -0.5151801], dtype=float32)>

In [135]:
states

<tf.Tensor: shape=(165, 8), dtype=float32, numpy=
array([[ 2.2970201e-03,  1.4181306e+00,  2.3264711e-01, ...,
        -5.2698046e-02,  0.0000000e+00,  0.0000000e+00],
       [ 4.4236183e-03,  1.4254959e+00,  2.1411863e-01, ...,
        -2.8891325e-02,  0.0000000e+00,  0.0000000e+00],
       [ 6.5270425e-03,  1.4330266e+00,  2.1036837e-01, ...,
        -4.2330031e-04,  0.0000000e+00,  0.0000000e+00],
       ...,
       [ 9.5423925e-01,  2.4304004e-01,  1.9828390e+00, ...,
        -6.2625843e-01,  0.0000000e+00,  0.0000000e+00],
       [ 9.7409457e-01,  2.1030845e-01,  1.9827158e+00, ...,
        -6.2621605e-01,  0.0000000e+00,  0.0000000e+00],
       [ 9.9395734e-01,  1.7691380e-01,  1.9822598e+00, ...,
        -6.5380037e-01,  0.0000000e+00,  0.0000000e+00]], dtype=float32)>

In [94]:
new_state =states[-1]
prev_states = states[-4:-2]
print(new_state, prev_states)

tf.Tensor(
[ 0.02998333  0.02727829 -0.2577023  -1.9478024   2.714047    0.42441875
  0.          0.        ], shape=(8,), dtype=float32) tf.Tensor(
[[ 0.03765373  0.15462822 -0.19136551 -1.8191761   2.6443205   0.54569924
   0.          0.        ]
 [ 0.035429    0.1133526  -0.1999444  -1.8424113   2.6694772   0.5031384
   0.          0.        ]], shape=(2, 8), dtype=float32)


In [114]:

combined_states = tf.concat([tf.expand_dims(new_state, 0), prev_states], axis =0)
padded = tf.pad(combined_states, [[0,np.max([0,window_size - tf.shape(combined_states)[0]],axis=0)],[0,0]])
tf.expand_dims(padded, 0)

<tf.Tensor: shape=(1, 5, 8), dtype=float32, numpy=
array([[[ 0.02998333,  0.02727829, -0.2577023 , -1.9478024 ,
          2.714047  ,  0.42441875,  0.        ,  0.        ],
        [ 0.03765373,  0.15462822, -0.19136551, -1.8191761 ,
          2.6443205 ,  0.54569924,  0.        ,  0.        ],
        [ 0.035429  ,  0.1133526 , -0.1999444 , -1.8424113 ,
          2.6694772 ,  0.5031384 ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ]]],
      dtype=float32)>

In [111]:
tf.shape(combined_states)[0]

<tf.Tensor: shape=(), dtype=int32, numpy=3>

In [112]:
np.max([0,window_size - tf.shape(combined_states)[0]],axis=0)

2

In [68]:
lstm = tf.keras.Sequential(
    layers.LSTM(2)
)
window_size = 5

inputs = tf.convert_to_tensor([tf.concat([actions[(i-window_size):i], states[(i-window_size):i], tf.expand_dims(rewards[(i-window_size):i], 1)], axis = 1) for i in tf.range(window_size, tf.shape(actions)[0])])
outputs = lstm(inputs)


In [77]:
window_size - tf.shape(tensor1)[1]

<tf.Tensor: shape=(), dtype=int32, numpy=3>

In [91]:
tensor1 = inputs[:,0:10,:]#tf.constant([[[1,2,3], [4,5,6], [9,9,9]],[[1,2,3], [4,5,6], [9,9,9]]])
pad_tensor1 = tf.pad(tensor1, [[0,0],[0,np.max([0,window_size - tf.shape(tensor1)[1]],axis=0)],[0,0]])
print(tf.shape(pad_tensor1))
# window_size - tf.shape(tensor1)[1]

tf.Tensor([76  5 11], shape=(3,), dtype=int32)


In [92]:
tensor1

<tf.Tensor: shape=(76, 5, 11), dtype=float32, numpy=
array([[[-9.9999952e-01, -2.1917503e-01,  2.2970201e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.1449804e+00],
        [-9.9975705e-01,  2.1055248e-01,  4.5939446e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.1880542e+00],
        [-9.9997276e-01, -7.9044843e-01,  6.8910597e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.8632145e+00],
        [-9.9999756e-01, -4.1611990e-01,  9.1216089e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.3250611e+00],
        [-9.9999225e-01,  4.8514101e-01,  1.1352062e-02, ...,
          0.0000000e+00,  0.0000000e+00,  1.2652316e+00]],

       [[-9.9975705e-01,  2.1055248e-01,  4.5939446e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.1880542e+00],
        [-9.9997276e-01, -7.9044843e-01,  6.8910597e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.8632145e+00],
        [-9.9999756e-01, -4.1611990e-01,  9.1216089e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1

In [73]:
pad_tensor1

<tf.Tensor: shape=(76, 6, 11), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [-9.9975705e-01,  2.1055248e-01,  4.5939446e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.1880542e+00],
        [-9.9997276e-01, -7.9044843e-01,  6.8910597e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1.8632145e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],

       [[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [-9.9997276e-01, -7.9044843e-01,  6.8910597e-03, ...,
          0.0000000e+00,  0.0000000e+00,  1

In [256]:
examp = tf.expand_dims(inputs[:, 1,:], 1)
pad_examp = tf.keras.utils.pad_sequences(examp)
print(tf.shape(examp), tf.shape(pad_examp))

tf.Tensor([133   1  11], shape=(3,), dtype=int32) tf.Tensor([133   1  11], shape=(3,), dtype=int32)


In [270]:
tf.pad(examp, paddings= [[1,1],[1,1]],constant_values=0)

InvalidArgumentError: {{function_node __wrapped__Pad_device_/job:localhost/replica:0/task:0/device:CPU:0}} The first dimension of paddings must be the rank of inputs[2,2] [133,1,11] [Op:Pad]

In [187]:
# so I want to pad inputs during training

TypeError: Missing required positional argument

In [176]:
reshaped = tf.convert_to_tensor([tf.stack(actions[(i-window_size):i], states[(i-window_size):i],rewards[(i-window_size):i]) for i in tf.range(window_size, tf.shape(actions)[0])])
reshaped
# lstm(reshaped)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [163]:
window_size = 4
for i in tf.range(window_size, tf.shape(actions)[0]):
    print(actions[(i-window_size):i])

tf.Tensor(
[[ 9.2616910e-01  5.6924653e-04]
 [-5.8331132e-01  9.5472580e-01]
 [-3.2710966e-01 -8.0249214e-01]
 [-9.0271854e-01 -2.4973607e-01]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.5833113   0.9547258 ]
 [-0.32710966 -0.80249214]
 [-0.90271854 -0.24973607]
 [ 0.36762196 -0.2197899 ]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.32710966 -0.80249214]
 [-0.90271854 -0.24973607]
 [ 0.36762196 -0.2197899 ]
 [-0.49681523  0.6425888 ]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.90271854 -0.24973607]
 [ 0.36762196 -0.2197899 ]
 [-0.49681523  0.6425888 ]
 [-0.3458454  -0.5184323 ]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[ 0.36762196 -0.2197899 ]
 [-0.49681523  0.6425888 ]
 [-0.3458454  -0.5184323 ]
 [-0.8733477  -0.10362962]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.49681523  0.6425888 ]
 [-0.3458454  -0.5184323 ]
 [-0.8733477  -0.10362962]
 [-0.39454803  0.500879  ]], shape=(4, 2), dtype=float32)
tf.Tensor(
[[-0.3458454  -0.5184323 ]
 [-0.8733477  -0.10362962]
 [-0.394548