In [1]:
import gymnasium as gym
import collections
import numpy as np
import statistics
import tensorflow as tf
import tensorflow_probability as tfp
import tqdm
import os

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [4]:
class ContinuousActor(tf.keras.Model):
    """Actor that outputs a policy directly"""
    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int,
        std_init = -2
    ):
        super().__init__()
        self.means = layers.Dense(num_actions)
        # self.stds = tf.Variable(tf.ones(num_actions))*std_init#lambda x: tf.zeros(num_actions)
        # self.stds = layers.Dense(num_actions, activation='relu')
        # tf.ones(num_actions) * 0.25
    def call(self, inputs: tf.Tensor):
        means = self.means(inputs)
        # stds = self.stds(inputs)
        # stds = tf.clip_by_value(stds, 1.0e-3, 1)
        return means
        # return tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = tf.exp(self.stds))


class ActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.common = layers.Dense(num_hidden_units, activation = 'relu')
        # outputs scale, location params for mvn
        self.actor = ContinuousActor(num_actions, num_hidden_units) #layers.Dense(num_actions)
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

class sepActorCritic(tf.keras.Model):
    """combined actor-critic network. """

    def __init__(
        self,
        num_actions: int,
        num_hidden_units: int
    ):
        super().__init__()
        self.actor1 = layers.Dense(num_hidden_units, activation = 'relu')
        # outputs scale, location params for mvn
        self.actor2 = ContinuousActor(num_actions, num_hidden_units) 
        self.critic1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.critic2 = layers.Dense(1)

    def call(self, inputs: tf.Tensor):
        a = self.actor1(inputs)
        c = self.critic1(inputs)
        return self.actor2(a), self.critic2(c)


In [12]:
class Actor(tf.keras.Model):

    def __init__(
        self,
        num_actions,
        num_hidden_units
        ):
        super().__init__()
        self.fc1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.fc2 = layers.Dense(num_actions)

    def call(self, x):
        x = self.fc1(x)
        return self.fc2(x)

class Critic(tf.keras.Model):

    def __init__(
        self,
        num_hidden_units
        ):
        super().__init__()
        self.fc1 = layers.Dense(num_hidden_units, activation = 'relu')
        self.fc2 = layers.Dense(1)

    def call(self, x):
        x = self.fc1(x)
        return self.fc2(x)

In [13]:
alist = [1,2,3,4]
atensor = tf.convert_to_tensor(alist)
print(atensor)

tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)


In [110]:
class PPO:

    def __init__(
        self,
        env,
        gamma,
        lam,
        ent_coef,
        vf_coef,
        clip,
        timesteps_per_batch,
        max_timesteps_per_episode,
        n_updates_per_iteration,
        actor,
        optimizer_actor,
        critic,
        optimizer_critic,
        summary_writer
        ):

        # env
        self.env_name = env
        self.env = gym.make(self.env_name)
        
        # learning params
        self.gamma = gamma
        self.lam = lam
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.clip = clip

        # rollout params
        self.timesteps_per_batch = timesteps_per_batch
        self.max_timesteps_per_episode = max_timesteps_per_episode
        self.n_updates_per_iteration = n_updates_per_iteration

        # model 
        self.actor = actor
        self.optimizer_actor = optimizer_actor
        self.critic = critic
        self.optimizer_critic = optimizer_critic

        # other params
        # TODO: generalise dimensions
        self.stds = tf.constant([0.5, 0.5])
        self.summary_writer = summary_writer

    def learn(self, total_timesteps):
        t_so_far = 0
        while t_so_far < total_timesteps:
            batch_states, batch_actions, batch_log_probs, batch_returns, batch_lens = self.rollout()

            V, _ = self.evaluate(batch_states, batch_actions)

            A_k = batch_returns - V

            # normalise advantages
            A_k = (A_k - tf.reduce_mean(A_k)) / (tf.math.reduce_std(A_k) + 1.0e-10)

            ## set up loss records
            running_actor_loss = 0
            running_critic_loss = 0
            for _ in range(self.n_updates_per_iteration):
                with tf.GradientTape(persistent=True) as tape:
                    V, curr_log_probs = self.evaluate(batch_states, batch_actions)
                    # this is the start of our computation graph?
                    ratios = tf.exp(curr_log_probs - tf.squeeze(batch_log_probs))

                    # calc surrogate losses
                    surr1 = ratios * A_k
                    surr2 = tf.clip_by_value(ratios, 1 - self.clip, 1 + self.clip) * A_k

                    # get actor / critic losses 
                    actor_loss = tf.reduce_mean(-tf.math.minimum(surr1, surr2))
                    critic_loss = tf.reduce_mean((V - batch_returns)**2)

                # backprop actor loss
                actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
                self.optimizer_actor.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

                # backprop critic loss
                critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
                self.optimizer_critic.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
                running_actor_loss += tf.stop_gradient(actor_loss)
                running_critic_loss += tf.stop_gradient(critic_loss)            
            
            t_so_far += np.sum(batch_lens)
            with self.summary_writer.as_default():
                tf.summary.scalar('actor_loss', running_actor_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('critic_loss', running_critic_loss / self.n_updates_per_iteration, step = t_so_far)
                tf.summary.scalar('returns', tf.reduce_mean(batch_returns), step = t_so_far)

            
            # with self.summary_writer.as_default():
            #     tf.summary.scalar('total_reward', tf.reduce_mean(actor_loss), step = t_so_far)


    def rollout(self):
        """
        Runs a single episode in the environment and accumulates data
        """

        # store results
        batch_states = []
        batch_actions = []
        batch_log_probs = []
        batch_rewards = []
        batch_returns = []
        batch_lens = []

        # num timesteps so far
        t = 0

        while t <  self.timesteps_per_batch:
            ep_rewards = []

            state, _ = self.env.reset()
            done = False

            for ep_t in range(self.max_timesteps_per_episode):

                # increment timesteps
                t += 1

                # collect observations
                batch_states.append(state)
                
                action, log_prob = self.get_action(state)
                state, reward, truncated, terminated, _ = self.env.step(action)
                done = truncated or terminated

                # collect reward, action and log prob
                ep_rewards.append(reward)
                batch_actions.append(action)
                batch_log_probs.append(log_prob)

                if done:
                    break
            
            # collect episode length and rewards
            batch_lens.append(ep_t + 1)
            batch_rewards.append(ep_rewards)

        # convert to tensors
        batch_states = tf.convert_to_tensor(batch_states, dtype = tf.float32)
        batch_actions = tf.convert_to_tensor(batch_actions, dtype = tf.float32)
        batch_log_probs = tf.convert_to_tensor(batch_log_probs, dtype = tf.float32)

        batch_returns = self.compute_returns(batch_rewards)

        return batch_states, batch_actions, batch_log_probs, batch_returns, batch_lens


    def get_action(self, state):

        means = self.actor(tf.expand_dims(state, 0))
        policy = tfp.distributions.MultivariateNormalDiag(loc = means, scale_diag = self.stds)

        # TODO: squash / scale action
        action = policy.sample()[0]
        log_prob = policy.log_prob(action)

        return action, log_prob

    def compute_returns(self, rewards):
        batch_returns = []
        for ep_rewards in reversed((rewards)):

            discounted_reward = 0 # reward so far

            for reward in reversed(ep_rewards):
                discounted_reward = reward + self.gamma * discounted_reward
                batch_returns.insert(0, discounted_reward)
            
        # convert to tensor
        batch_returns = tf.convert_to_tensor(batch_returns, dtype = tf.float32)

        return batch_returns


    def compute_gae(self, rewards, values, dones):
        batch_advantages = []

        for ep_rewards, ep_values, ep_dones in zip(rewards, values, dones):
            advantages = []
            last_advantage = 0

            for t in reversed(range(len(ep_rewards))):
                if t + 1 < len(ep_rewards):
                    delta = ep_rewards[t] + self.gamma * ep_values[t+1] * (1 - ep_dones[t+1]) - ep_values[t]
                else:
                    delta = ep_rewards[t] - ep_values[t]

                advantage = delta + self.gamma * self.lam * (1 - ep_dones[t]) * last_advantage
                last_advantage = advantage
                advantages.insert(0, advantage)

            batch_advantages.extend(advantages)
        
        return tf.convert_to_tensor(batch_advantages, dtype = tf.float32)
    
    def evaluate(self, batch_states, batch_actions):
        V = tf.squeeze(self.critic(batch_states))
        mean = self.actor(batch_states)
        policy = tfp.distributions.MultivariateNormalDiag(mean, self.stds)

        log_probs = policy.log_prob(batch_actions)

        return V, log_probs

    def test(self, num_episodes):
        env = gym.make(self.env_name, render_mode = 'human')
        for i in range(num_episodes):
            state, _ = env.reset(seed = seed)
            done = False
            step = 0
            while not done:
                action, _ = self.get_action(state)
                next_state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                state = next_state

                step += 1
        env.close()

def get_next_run(log_dir):
    next_run = max([0]+[int(j) for j in [i.split('_')[-1] for i in os.listdir(log_dir)] if j.isdigit()]) + 1
    return log_dir + f'/run_{next_run}'


            # if standardize:
            # advantages = ((advantages - tf.reduce_mean(advantages)) /
            #                 (tf.math.reduce_std(advantages) + eps))

In [111]:
ENV = 'LunarLanderContinuous-v2'#'Pendulum-v1'#

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
env = gym.make(ENV)
eps = np.finfo(np.float32).eps.item()



optimizer_actor = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
optimizer_critic = tf.keras.optimizers.RMSprop(learning_rate=0.0007, clipnorm=0.5)
num_actions = env.action_space.shape[0]
num_hidden_units = 256

actor = Actor(num_actions, num_hidden_units)
critic = Critic(num_hidden_units)


log_dir = get_next_run('./logs') 
print('Saving logs to: ', log_dir)
summary_writer = tf.summary.create_file_writer(logdir = log_dir)


agent = PPO(        
    env = ENV,
    gamma = 0.99,
    lam = 0.9,
    ent_coef = 0.01,
    vf_coef = 0.4,
    clip = 0.2,
    timesteps_per_batch = 10,
    max_timesteps_per_episode = 20,
    n_updates_per_iteration = 1,
    actor = actor,
    optimizer_actor = optimizer_actor,
    critic = critic,
    optimizer_critic = optimizer_critic,
    summary_writer= summary_writer)

Saving logs to:  ./logs/run_195


In [112]:
agent.learn(500)

In [113]:
agent.test(5)

In [107]:
env = gym.make(ENV, render_mode='human')
for i in range(30):
    state, _ = env.reset()
env.close()

In [104]:
env.render_mode = 'human'
env.render()
state, _ = env.reset()
env.close()

AttributeError: property 'render_mode' of 'TimeLimit' object has no setter

In [61]:
batch_states, batch_actions, batch_log_probs, batch_returns, batch_lens = agent.rollout()

### TODO: Where does the gradient tape start for tf? here?
V, _ = agent.evaluate(batch_states, batch_actions)

A_k = batch_returns - V

# normalise advantages
A_k = (A_k - tf.reduce_mean(A_k)) / (tf.math.reduce_std(A_k) + 1.0e-10)

for _ in range(1):
    _, curr_log_probs = agent.evaluate(batch_states, batch_actions)
    # this is the start of our computation graph?
    ratios = tf.exp(curr_log_probs - tf.squeeze(batch_log_probs))

    # calc surrogate losses
    surr1 = ratios * A_k
    surr2 = tf.clip_by_value(ratios, 1 - agent.clip, 1 + agent.clip) * A_k

    # actor_loss = tf.math.minimum(surr1, surr2)

In [68]:
tf.reduce_mean(-tf.math.minimum(surr1, surr2))

<tf.Tensor: shape=(), dtype=float32, numpy=-8.606018e-08>

In [63]:
batch_log_probs

<tf.Tensor: shape=(187, 1), dtype=float32, numpy=
array([[-0.86020947],
       [-0.8257072 ],
       [-0.75084114],
       [-0.75237584],
       [-0.670789  ],
       [-0.4941579 ],
       [-1.2827153 ],
       [-2.4005966 ],
       [-2.038274  ],
       [-2.50885   ],
       [-1.0050769 ],
       [-0.535179  ],
       [-0.72144794],
       [-0.9657695 ],
       [-0.9282563 ],
       [-1.4808846 ],
       [-0.987303  ],
       [-1.4309921 ],
       [-0.50107646],
       [-0.648643  ],
       [-1.7586565 ],
       [-0.65567756],
       [-1.0893621 ],
       [-0.591717  ],
       [-1.2166104 ],
       [-0.85562086],
       [-0.5078547 ],
       [-2.2736385 ],
       [-0.7293763 ],
       [-1.2845066 ],
       [-1.0116034 ],
       [-1.8786213 ],
       [-1.1035981 ],
       [-0.5052619 ],
       [-0.5018351 ],
       [-1.9937894 ],
       [-1.3588228 ],
       [-1.5594311 ],
       [-2.0590606 ],
       [-0.51939607],
       [-0.6041256 ],
       [-0.4598049 ],
       [-0.93824697],
    

In [55]:
ratios

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1.        , 1.0000002 , 0.99999976, 1.        , 1.        ,
       1.        , 1.        , 1.0000002 , 1.        , 1.        ,
       1.        , 1.0000005 , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.0000002 , 1.0000002 , 1.        ],
      dtype=float32)>

In [None]:
class PPOAgent():

    def __init__(
        self,
        env, 
        gamma: float, 
        lam: float,
        entropy_coef: float,
        vf_coef: float,
        scale: float,
        timesteps_per_batch: int,
        max_timesteps_per_episode: int,
        model: tf.keras.Model, 
        optimizer: tf.keras.optimizers.Optimizer,
        summary_writer,
        normalise_rewards = False):

        self.env = env

        # discount rate
        self.gamma = tf.constant(gamma)
        self.lam = tf.constant(lam)
        self.entropy_coef = tf.constant(entropy_coef)
        self.vf_coef = tf.constant(vf_coef)
        self.scale = scale

        # rollout params
        self.timesteps_per_batch = timesteps_per_batch
        self.max_timesteps_per_episode = max_timesteps_per_episode

        # model setup
        self.model = model
        self.optimizer = optimizer

        # creates a dictionary of tensor arrays to write to
        self.memory = self._init_memory()
        self.loss = tf.keras.losses.Huber(reduction = tf.keras.losses.Reduction.NONE)

        self.model.compile(optimizer=self.optimizer)
        self.summary_writer = summary_writer

        # switch for analysis
        self.DEBUG = debug
        if self.DEBUG:
            self.debug_val = 0

        self.normalise_rewards = normalise_rewards

    def _init_memory(self):
        return {
                'action' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'next_state' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'action_probs': tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'values' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'rewards' : tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True),
                'done' : tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        }


    def learn(self, total_timesteps):
        t_so_far = 0

        while t_so_far < total_timesteps:


    def rollout(self):
        ## for each rollout we want to collect
        # states (observations), actions, log_probs, rewards, rewards-to-go, length
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []
        batch_rtgs = []
        batch_lens = []

        while t <  self.timesteps_per_batch:
            ep_rews = []
            obs, _ = self.env.reset()
            done = False

            for ep_t in range(self.max_timesteps_per_episode):
                
                t+=1

                batch_obs.append(obs)

                action, log_prob = self.get_action(obs)
                obs, rew, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                if done:
                    break

            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

    
    def _get_action(self, state):

        policy, value = self.model(
                tf.expand_dims(tf.constant(state, tf.float32), 0)
            )
        action = policy.sample()[0]

        return self.scale * tf.tanh(action), policy, value
        # return tf.clip_by_value(action, -0.99, 0.99), policy, value

    def _squash_correction(self, action, dist):
        log_probs = dist.log_prob(action)
        tf.math.loc(1 - tf.math.pow(action, 2) + eps)
        #tf.reduce_sum(-log_probs - tf.squeeze(tf.math.log(1 - tf.math.pow(converted_actions, 2) + 1.0e-10)), axis=1)

    
    def get_action(self, state, step):

        # run the model
        action, policy, _ = self._get_action(state)
        log_probs = self._squash_correction(action, policy)
        

        # if self.DEBUG:
        #     with self.summary_writer.as_default():
        #         tf.summary.scalar('mean1', policy.mean(), step = self.debug_val)
        #         tf.summary.scalar('mean2', policy.mean(), step = self.debug_val)
        #         # tf.summary.scalar('std1', policy.stddev()[0], step = self.debug_val)
        #         # tf.summary.scalar('std2', policy.stddev()[1], step = self.debug_val)

        #     self.debug_val += 1

        return action

    def log(self, action, state, next_state, reward, done, step):

        """
        Logs results into memory - not all used necessarily
        """

        self.memory['action'].write(
            step, 
            tf.constant(action, tf.float32)
            ).mark_used()

        self.memory['state'].write(
            step, 
            tf.constant(state, tf.float32)
            ).mark_used()

        self.memory['next_state'].write(
            step, 
            tf.constant(next_state, tf.float32)
            ).mark_used()

        self.memory['rewards'].write(
            step, 
            tf.constant(reward, tf.float32)
            ).mark_used()

        self.memory['done'].write(
            step, 
            tf.constant(done, tf.int32)
            ).mark_used()

    def get_gae(self, rewards, values, dones, standardize):
        
        n = tf.shape(dones)[0] 
        advantages = tf.TensorArray(dtype=tf.float32, size=n)
        last_advantage = tf.constant(0.)

        for t in reversed(tf.range(tf.shape(rewards))):
            if t + 1 < tf.shape(rewards):
                
                delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t + 1]) - values[t]

            else:
                
                delta = rewards[t] - values[t]

            
            advantage = delta + self.gamma * self.lam * (1 - dones[t]) * last_advantage
            advantages.write(t, advantage).mark_used()

        advantages = advantages.stack()[::-1]
        if standardize:
            advantages = ((advantages - tf.reduce_mean(advantages)) /
                            (tf.math.reduce_std(advantages) + eps))

        return advantages

    def get_expected_return(
        self,
        rewards: tf.Tensor,
        standardize: bool = True):
        """Compute expected returns"""

        ### could convert this to generalised advantage estimator
        ### get 'done' flags and multiply discounted sum and gamma by (1-done)
        ### also need value estimates
        ### do some checking, but if you're careful, the done value could enable you to do this process across batches
        ### means you could do multiple batches per update - this might help the algorithm learn
        n = tf.shape(rewards)[0] 
        returns = tf.TensorArray(dtype=tf.float32, size=n)

        # start at last reward and then accumulate reward sums into returns array
        rewards = rewards[::-1]
        discounted_sum = tf.constant(0.0)
        discounted_sum_shape = discounted_sum.shape
        for i in tf.range(n):
            reward = rewards[i]
            discounted_sum = reward + self.gamma * discounted_sum # discounted_sum= 0 for last reward (i.e. first element in loop)
            discounted_sum.set_shape(discounted_sum_shape)
            returns = returns.write(i, discounted_sum)
        returns = returns.stack()[::-1] # reverse order back to original

        if standardize:
            returns = ((returns - tf.reduce_mean(returns)) / 
                        (tf.math.reduce_std(returns) + eps))
        
        return returns 

    def update(self, episode: int):

        states = self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())])
        # tf.expand_dims(self.memory['state'].gather([i for i in tf.range(self.memory['state'].size())]), 1)
        rewards = self.memory['rewards'].gather([i for i in tf.range(self.memory['rewards'].size())])
        actions = self.memory['action'].gather([i for i in tf.range(self.memory['action'].size())])
        dones = tf.cast(self.memory['done'].gather([i for i in tf.range(self.memory['done'].size())]), tf.float32)
        # returns = self.get_expected_return(rewards = rewards, standardize=self.normalise_rewards)

        with tf.GradientTape() as tape:

            policy, values = self.model(states)

            advantages = self.get_gae(rewards, values, dones, self.normalise_rewards)

            # calculate the loss values
            loss = self.compute_loss(
                actions, 
                policy, 
                advantages,
                # values,
                # returns,
                episode)

        # compute the gradients from the loss
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('loss', loss, step = episode)
            tf.summary.scalar('episode_reward', tf.reduce_sum(rewards), step = episode)

        # wipe memory for next episode
        self.memory = self._init_memory()
        
    def compute_loss(
        self,
        actions: tf.Tensor,
        policy: tf.Tensor,
        advantages: tf.Tensor,
        # values: tf.Tensor,
        # returns: tf.Tensor,
        episode: int
    ) -> tf.Tensor:
        """Computes combined actor-critic loss"""
        # print(f'returns: {tf.shape(returns)}; values: {tf.shape(values)}')
        # advantage = returns - tf.squeeze(values)

        critic_loss = self.vf_coef*tf.pow(advantages, 2)#self.loss(tf.squeeze(values), returns)#tf.math.square(advantage)

        entropy_loss = -self.entropy_coef * policy.entropy()

        # back to original scale
        converted_actions = actions / self.scale
        # squashing correction
        log_probs = tf.expand_dims(policy.log_prob(converted_actions), axis = 1)
        # stopping gradient for advantage dramatically improves stability!
        actor_loss = tf.reduce_sum(-log_probs - tf.squeeze(tf.math.log(1 - tf.math.pow(converted_actions, 2) + 1.0e-10)), axis=1) * tf.stop_gradient(advantages)

        with self.summary_writer.as_default():
            tf.summary.scalar('actor_loss', tf.reduce_mean(actor_loss), step = episode)
            tf.summary.scalar('critic_loss', tf.reduce_mean(critic_loss), step = episode)
            tf.summary.scalar('entropy_loss', tf.reduce_mean(entropy_loss), step = episode)

        return tf.reduce_mean(actor_loss + critic_loss + entropy_loss)

def get_next_run(log_dir):
    next_run = max([0]+[int(j) for j in [i.split('_')[-1] for i in os.listdir(log_dir)] if j.isdigit()]) + 1
    return log_dir + f'/run_{next_run}'

# def train(agent, env, num_episodes, seed):
    
#     for i in range(num_episodes):
#         state, _ = env.reset(seed = seed)
#         done = False
#         step = 0
#         while not done:
#             action = agent.get_action(state, step)
#             next_state, reward, terminated, truncated, info = env.step(np.array(action))
#             done = terminated or truncated
#             agent.log(action, state, next_state, reward, done, step)
#             state = next_state

#             step += 1
#         agent.update(i)

# def test(agent, env, num_episodes, seed):

#     for i in range(num_episodes):
#         state, _ = env.reset(seed = seed)
#         done = False
#         step = 0
#         while not done:
#             action = agent.get_action(state, step)
#             next_state, reward, terminated, truncated, info = env.step(np.array(action))
#             done = terminated or truncated
#             state = next_state

#             step += 1
#     env.close()
