In [1]:
import agent, environments

import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import t, mode
from tqdm import tqdm
import cProfile

from utility import print_gridworld_with_policy, print_heatmap, plot_graph

# Global Variables

In [2]:
# Confidence level forwarded to plot_graph as its last argument.
# NOTE(review): presumably the level of the confidence band drawn around the
# mean entropy curves — confirm against utility.plot_graph.
confidence = 0.95

# Register the environment

In [3]:
# Publish the goal-less gridworld POMDP under a gym id; the same id string is
# later handed to gym.vector.make to build the parallel environments.
gym.envs.register(id='GridworldPOMDPEnvGoalless-v0', entry_point=environments.GridworldPOMDPEnvGoalless)

# Gridworld Vector First

In [4]:
# Number of training episodes (policy updates) in each independent run
n_episodes = 2000
# Number of trajectories collected in parallel per episode (one per vectorized env)
n_traj = 100
# Number of independent training runs; every run trains a fresh agent
n_run = 4
# Constructor arguments shared by every environment instance
time_horizon = 25
steepness = 15
prob = 0
# Build both the vectorized and the standalone environment from the same
# keyword arguments so that editing a value above can never leave them out of sync.
env_kwargs = dict(time_horizon=time_horizon, steepness=steepness, prob=prob)
# Parallel environments that are actually stepped during training
envs = gym.vector.make('GridworldPOMDPEnvGoalless-v0', num_envs=n_traj, **env_kwargs)
# Standalone instance handed to the agent and queried for observation_space,
# state_to_index and transition_matrix
env = environments.GridworldPOMDPEnvGoalless(**env_kwargs)

In [5]:
# Keep a handle on the transition matrix exposed by the standalone environment.
# NOTE(review): `transition` is not referenced by any other visible cell —
# confirm it is still needed.
transition = env.transition_matrix

In [6]:
def _empirical_entropy(visit_counts, horizon):
    """Entropy (natural log) of each row's empirical visitation distribution.

    Args:
        visit_counts: 2-D array of visit counts, one row per parallel trajectory
            and one column per entry of env.observation_space.
        horizon: value the counts are divided by to turn them into frequencies.

    Returns:
        1-D array holding one entropy value per row.
    """
    freqs = visit_counts / horizon
    # `where=` on its own leaves the masked-out entries of the result
    # uninitialized, so 0 * <garbage> could inject NaN/inf into the sum.
    # Writing into a zero-filled `out` makes every unvisited state contribute
    # exactly 0, i.e. the usual 0 * log(0) = 0 convention.
    log_freqs = np.log(freqs, out=np.zeros_like(freqs), where=freqs > 0)
    return -np.sum(freqs * log_freqs, axis=1)


with tqdm(total=n_run * n_episodes, ncols=80) as pbar:
    # Train one fresh agent per run and record the per-episode mean entropies
    list_entropies = []
    list_true_entropies = []
    n_states = env.observation_space.n
    for _run in range(n_run):
        ag = agent.REINFORCEAgentEPOMDPVec(env, alpha=0.5, n_traj=n_traj)
        avg_entropies = []
        avg_true_entropies = []
        for _episode in range(n_episodes):
            # One list of transitions per parallel trajectory
            episodes = [[] for _ in range(n_traj)]
            # Visitation counts: states sampled from the belief vs. true states
            d_t = np.zeros((n_traj, n_states))
            true_d_t = np.zeros((n_traj, n_states))
            envs.reset()
            # Every episode starts from a uniform belief
            ag.beliefs = np.ones((n_traj, n_states)) / n_states
            done = np.zeros(n_traj, dtype=bool)
            while not np.all(done):
                # Sample actions (and their probabilities) from the current beliefs
                actions, probs = ag.get_actions()
                # Sample states from the current beliefs
                sampled_states = ag.get_states()
                # Step all parallel environments at once
                next_obs, rewards, done, _, true_states = envs.step(actions)
                state_indices = [env.state_to_index(state) for state in sampled_states]
                true_state_indices = true_states['true_state']
                # Distinct index names keep the episode counter from being shadowed
                for traj_idx, state_index in enumerate(state_indices):
                    d_t[traj_idx][state_index] += 1
                for traj_idx, true_state_index in enumerate(true_state_indices):
                    true_d_t[traj_idx][true_state_index] += 1
                # Record the pre-update belief with the transition of each trajectory.
                # NOTE(review): ag.beliefs[k] may be a view; if belief_update mutates
                # ag.beliefs in place the stored beliefs would alias — confirm.
                for traj_idx in range(n_traj):
                    episodes[traj_idx].append((
                        ag.beliefs[traj_idx],
                        actions[traj_idx],
                        probs[traj_idx],
                        rewards[traj_idx],
                        true_state_indices[traj_idx],
                    ))
                ag.belief_update(actions, next_obs)
            # Entropy of the true and of the believed state visitation, per trajectory
            true_entropies = _empirical_entropy(true_d_t, time_horizon)
            entropies = _empirical_entropy(d_t, time_horizon)
            # The believed entropy is the signal handed to the policy update
            trajectories = list(zip(episodes, entropies))
            ag.update_multiple_sampling(trajectories)
            avg_entropies.append(np.mean(entropies))
            avg_true_entropies.append(np.mean(true_entropies))
            pbar.update(1)
        ag.print_visuals(envs=envs, env=env, n_traj=n_traj)
        list_entropies.append(avg_entropies)
        list_true_entropies.append(avg_true_entropies)
    # Reorder from (run, episode) to (episode, run) before plotting
    list_entropies = np.transpose(np.array(list_entropies), (1, 0))
    list_true_entropies = np.transpose(np.array(list_true_entropies), (1, 0))

plot_graph(n_run, n_episodes, list_entropies, list_true_entropies, confidence)

  0%|                                       | 12/8000 [00:08<1:30:09,  1.48it/s]


  logger.warn(
  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


KeyboardInterrupt: 