In [1]:
import os 
import sys
import time


sys.path.append(os.path.abspath('../..'))


# DQN

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import DQN
from environment.env import Grid
from utils.checkpoints import find_last_checkpoint

# Obstacle rectangles passed to Grid.
# NOTE(review): presumably ((x_min, y_min), (x_max, y_max)) corners in normalised
# [0, 1] coordinates -- confirm against environment.env.Grid.
obstacles = [((0.14625, 0.3325), (0.565, 0.55625)),
             ((0.52875, 0.5375), (0.7375, 0.84125)),
             ((0.0, 0.00125), (0.01625, 0.99125)),
             ((0.0075, 0.00125), (0.99875, 0.04)),
             ((0.98875, 0.0075), (0.99875, 1.0)),
             ((0.00125, 0.9825), (0.99875, 1.0))]

# Directory holding the checkpoints of the DQN run being evaluated
# (single source of truth instead of repeating the literal).
dqn_run_dir = "../../agents/pretrained/MDP/DQN_continous_1vbmjd2a"

env = Grid(
    obstacles=obstacles,
    shear_range=(-.2, .2),
    stretch_range=(.4, 1),
    render_mode="human",
)

try:
    # Load the most recent checkpoint of the run and bind it to the env.
    last_checkpoint = find_last_checkpoint(dqn_run_dir)
    model = DQN.load(f"{dqn_run_dir}/{last_checkpoint}", env=env)

    # Greedy (deterministic) evaluation over 20 episodes.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
    print(mean_reward, std_reward)
finally:
    # Always release the environment (and whatever rendering resources it
    # holds), even when loading or evaluation raises.
    env.close()

# PPO

In [3]:
# Name of the PPO run to evaluate; it is used further down as the sub-directory
# of ../../agents/pretrained/MDP/ from which the last checkpoint is loaded.
run = "PPO_continous_" + "enh53x0u"

In [6]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from environment.env import Grid
from utils.checkpoints import find_last_checkpoint

# Obstacle rectangles passed to Grid.
# NOTE(review): presumably ((x_min, y_min), (x_max, y_max)) corners in normalised
# [0, 1] coordinates -- confirm against environment.env.Grid.
obstacles = [((0.14625, 0.3325), (0.565, 0.55625)),
             ((0.52875, 0.5375), (0.7375, 0.84125)),
             ((0.0, 0.00125), (0.01625, 0.99125)),
             ((0.0075, 0.00125), (0.99875, 0.04)),
             ((0.98875, 0.0075), (0.99875, 1.0)),
             ((0.00125, 0.9825), (0.99875, 1.0))]

# Directory holding the checkpoints of the selected PPO run
# (built once from `run` instead of repeating the f-string).
ppo_run_dir = f"../../agents/pretrained/MDP/{run}"

env = Grid(
    obstacles=obstacles,
    shear_range=(-.2, .2),
    stretch_range=(.4, 1),
    render_mode="human",
)

try:
    # Load the most recent checkpoint of the run and bind it to the env.
    last_checkpoint = find_last_checkpoint(ppo_run_dir)
    model = PPO.load(f"{ppo_run_dir}/{last_checkpoint}", env=env)

    # Greedy (deterministic) evaluation over 20 episodes.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
    print(mean_reward, std_reward)
finally:
    # Always release the environment (and whatever rendering resources it
    # holds), even when loading or evaluation raises.
    env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-9.95 7.012310603502956


In [8]:
def eval_agent_mdp(agent,env,num_episodes,max_episode_steps,render):
    """Roll out ``agent`` greedily in ``env`` and record every transition.

    Args:
        agent: object exposing ``predict(state, deterministic=True)`` that
            returns a pair whose first element is the action to take.
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
            ``env.render()`` is only called when ``render`` is true.
        num_episodes: number of episodes to run.
        max_episode_steps: hard cap on the number of steps per episode.
        render: when true, render after every step and pause 50 ms so the
            animation is watchable.

    Returns:
        episode_transitions: list with one entry per episode; entry ``i`` is
        the list of ``(s, a, r, s', done)`` tuples of the i-th episode, where
        ``done`` is ``terminated or truncated``.
    """
    episode_transitions = []

    for _ in range(num_episodes):
        s, _info = env.reset()

        transitions = []
        done = False
        steps = 0

        while not done and steps < max_episode_steps:
            best_action, _extra = agent.predict(s, deterministic=True)
            next_state, reward, terminated, truncated, info = env.step(best_action)

            done = terminated or truncated
            transitions.append((s, best_action, reward, next_state, done))

            if render:
                env.render()
                # Slow the loop down only when something is actually drawn;
                # headless evaluation should run at full speed.
                time.sleep(0.05)

            s = next_state
            steps += 1

        episode_transitions.append(transitions)

    return episode_transitions

# Fresh Grid with its default configuration, rendered on screen.
env = Grid(render_mode="human")

# Ten rendered episodes of at most 100 steps each with the loaded model.
# The trailing ';' keeps the notebook from echoing the call's result.
eval_agent_mdp(model, env, num_episodes=10, max_episode_steps=100, render=True);

error: display Surface quit

In [6]:
# Default-configured Grid with on-screen rendering; the value returned by
# reset() is left as the last expression so the notebook displays it.
env = Grid(render_mode="human")
env.reset()

(OrderedDict([('pos', array([0.52946089, 0.01536412])),
              ('theta',
               array([ 0.69716857,  0.16414655, -0.17197535,  0.46720087]))]),
 {})

In [24]:
from environment.env import POMDPDeformedGridworld
import torch
import numpy as np
from collections import OrderedDict

class POMDPAgent():
    """Agent acting in a deformed gridworld under a discrete belief over the deformation.

    The belief is a categorical distribution over a ``discretization x
    discretization`` grid of two deformation parameters, both sampled from
    ``env.stretch_range``. At every step one hypothesis is sampled from the
    belief, the wrapped policy ``model`` is queried with it, and the belief is
    updated from the resulting observation.

    Attributes:
        belief_points: float32 tensor of shape (n_points, 2), one row per hypothesis.
        belief_values: float32 tensor of shape (n_points,), the belief weights.
        belief_update: bound method used to update the belief
            (``exact_belief_update`` or ``discrete_belief_update``).
        original_def: diagonal entries of ``env.transformation_matrix`` read at
            construction time; used to restore the env after an exact update.
    """

    def __init__(self, env: "POMDPDeformedGridworld", model, discretization=10, update='discrete_exact', obs_model=None):
        """Set up the uniform belief and select the belief-update rule.

        Args:
            env: environment exposing ``stretch_range``, ``transformation_matrix``,
                ``state``, ``step``, ``observe``, ``set_deformation``,
                ``get_state`` and ``render``.
            model: policy exposing ``predict(obs, deterministic=True)``.
            discretization: number of grid values per deformation parameter.
            update: ``'discrete_exact'`` (likelihood from the env itself) or
                ``'discrete_modelled'`` (likelihood from ``obs_model``).
            obs_model: observation model, required for ``'discrete_modelled'``.

        Raises:
            AssertionError: ``update == 'discrete_modelled'`` without ``obs_model``.
            NotImplementedError: any other ``update`` value.
        """
        self.env = env
        self.model = model

        if update == 'discrete_modelled':
            assert obs_model is not None, "update='discrete_modelled' requires an obs_model"
            self.obs_model = obs_model
            self._init_uniform_belief(discretization)
            self.belief_update = self.discrete_belief_update

        elif update == 'discrete_exact':
            self._init_uniform_belief(discretization)
            self.belief_update = self.exact_belief_update
        else:
            raise NotImplementedError('Only discrete belief update is supported')

        self.original_def = env.transformation_matrix[0][0], env.transformation_matrix[1][1]

    def _init_uniform_belief(self, discretization):
        """Build the 2-parameter hypothesis grid and a uniform belief over it."""
        import itertools

        # Both parameters share the same axis, taken from env.stretch_range.
        axis = np.linspace(self.env.stretch_range[0], self.env.stretch_range[1], discretization)
        # Every (first, second) combination; the dict keeps insertion order and
        # collapses duplicates (e.g. when the range is degenerate).
        self.tmp_belief_points = {bp: 0 for bp in itertools.product(axis, axis)}
        self.belief_points = torch.tensor(list(self.tmp_belief_points.keys()), dtype=torch.float32)

        self.belief_values = torch.ones(self.belief_points.shape[0], dtype=torch.float32) / len(self.tmp_belief_points)

    def _normalized_or_previous(self, unnormalized):
        """Normalise an unnormalised posterior, keeping the old belief if it has no mass.

        When the observation is inconsistent with every hypothesis the sum is
        zero; dividing would fill the belief with NaN and make the next
        ``torch.multinomial`` call fail, so the previous belief is kept instead.
        """
        total = unnormalized.sum()
        if not torch.isfinite(total) or total <= 0:
            import warnings
            warnings.warn(
                "Observation has zero likelihood under every belief point; "
                "keeping the previous belief.",
                RuntimeWarning,
            )
            return self.belief_values
        return unnormalized / total

    def act(self):
        """Pick an action, step the env with it and update the belief.

        Returns:
            (action, pomdp_state, reward, terminated, truncated, info), the
            action followed by the five values returned by ``env.step``.
        """
        action = self.get_action()
        pomdp_state, reward, terminated,truncated, info = self.env.step(int(action))
        self.belief_update(pomdp_state)
        return action, pomdp_state, reward, terminated, truncated, info

    def get_action(self):
        """Sample one deformation hypothesis from the belief and query the policy with it."""
        sampled_deformation = self.belief_points[torch.multinomial(self.belief_values, 1).item()]
        # Diagonal 2x2 matrix built from the sampled pair; flattened it reads
        # [first, 0, 0, second].
        theta = torch.tensor([[sampled_deformation[0],0],[0,sampled_deformation[1]]])
        s = OrderedDict({'pos': torch.tensor(self.env.state),
                        'theta' : theta.flatten()})

        # Agent takes an action using a greedy policy (without exploration)
        action = self.model.predict(s,deterministic=True)[0]
        return action

    def discrete_belief_update(self, pomdp_state):
        """Belief update using the learned observation model ``obs_model``.

        Args:
            pomdp_state: mapping with tensors under ``'obs'`` and ``'pos'``.
        """
        obs = pomdp_state['obs']
        pos = pomdp_state['pos']

        batch_pos = pos.repeat(len(self.belief_points), 1)
        batch_obs = obs.repeat(len(self.belief_points), 1)

        # theta is needed because only two parameters are used in this example:
        # pad each hypothesis with two zeros -> [first, second, 0, 0].
        # TODO: are we sure this is the right order?
        theta = torch.cat([self.belief_points, torch.zeros(len(self.belief_points), 2)], dim=1)
        # Reorder to [first, 0, 0, second], the same layout get_action feeds
        # to the policy as 'theta'.
        theta = theta[:, [0,3,2,1]]

        # NOTE(review): the likelihood is a Bernoulli *sample* of the model
        # output rather than the probability itself, and the einsum contracts
        # its second axis against the belief (so that axis must have one entry
        # per belief point), whereas exact_belief_update multiplies
        # element-wise. Confirm this matches obs_model's output shape and intent.
        likelihood = torch.distributions.Bernoulli(self.obs_model(batch_pos,batch_obs,theta)).sample()
        unnormalized = torch.einsum("ij,j->i",likelihood, self.belief_values)
        self.belief_values = self._normalized_or_previous(unnormalized)

    def exact_belief_update(self, pomdp_state):
        """Belief update using the environment itself as the observation model.

        Each hypothesis is applied to the env with ``set_deformation``; its
        likelihood is 1 if ``env.observe`` at the current position reproduces
        the received observation exactly and 0 otherwise.

        Args:
            pomdp_state: mapping with tensors under ``'obs'`` and ``'pos'``.
        """
        obs = pomdp_state['obs']
        pos = pomdp_state['pos']

        # theta is needed because only two parameters are used in this example:
        # pad each hypothesis with two zeros -> [first, second, 0, 0].
        # TODO: are we sure this is the right order?
        theta = torch.cat([self.belief_points, torch.zeros(len(self.belief_points), 2)], dim=1)
        # Reorder to [first, 0, 0, second].
        theta = theta[:, [0,3,2,1]]

        matches = []
        try:
            for x in theta:
                # First argument gets the hypothesis pair, second the two zeros.
                # NOTE(review): presumably (stretch, shear) -- confirm against the env.
                self.env.set_deformation([x[0], x[3]],[x[2],x[1]])
                matches.append(bool(torch.all(torch.tensor(self.env.observe(list(pos))) == obs)))
        finally:
            # Put the env back even if observe() raises, so a failed update
            # cannot leave it stuck on a hypothesis.
            # NOTE(review): only the diagonal captured in __init__ is restored
            # and the second argument is reset to [0, 0] -- confirm the env
            # never starts with a non-zero value there.
            self.env.set_deformation(self.original_def, [0,0])

        likelihood = torch.tensor(matches, dtype=torch.float32)
        self.belief_values = self._normalized_or_previous(likelihood * self.belief_values)

    def render_act(self):
        """For testing belief convergence: update the belief from the current state and render."""
        pomdp_state = self.env.get_state()
        self.belief_update(pomdp_state)
        self.env.render()

In [30]:
# Build the partially observable gridworld with the 'cardinal' observation type
# and reset it; the value returned by reset() is echoed as the cell output.
pomdp_env = POMDPDeformedGridworld(obs_type='cardinal')
pomdp_env.reset()


({'obs': tensor([0., 0., 0., 0.]), 'pos': tensor([0.1057, 0.1537])}, {})

In [31]:
# `model` is whichever policy an earlier cell loaded. discretization=100 yields a
# 100 x 100 grid (10,000 belief points), and the 'discrete_exact' update loops
# over every one of them on each step, so each agent.act() call is expensive.
agent = POMDPAgent(pomdp_env, model,discretization=100, update='discrete_exact')

In [32]:
# Run one episode: keep acting until the env reports termination or truncation.
done = False
while not done:
    action, pomdp_state, reward, terminated, truncated, info = agent.act()
    done = terminated or truncated

In [90]:
import numpy as np

# Inclusive (low, high) bounds of the four parameters p0..p3
(l0, h0), (l1, h1), (l2, h2), (l3, h3) = (0, 10), (0, 5), (-5, 5), (1, 20)

# Number of samples taken along every axis
n_points = 10

# One evenly spaced axis per parameter
p0, p1, p2, p3 = (
    np.linspace(low, high, n_points)
    for low, high in ((l0, h0), (l1, h1), (l2, h2), (l3, h3))
)

# Cartesian product of the axes; 'ij' indexing keeps axis k tied to parameter k
grid = np.meshgrid(p0, p1, p2, p3, indexing='ij')

# Flatten to one row per grid point and one column per parameter
points = np.stack(grid, axis=-1).reshape(-1, len(grid))

# Display the results
print(f"Grid points shape: {points.shape}")
print(points[:5])  # Show the first 5 points


Grid points shape: (10000, 4)
[[ 0.          0.         -5.          1.        ]
 [ 0.          0.         -5.          3.11111111]
 [ 0.          0.         -5.          5.22222222]
 [ 0.          0.         -5.          7.33333333]
 [ 0.          0.         -5.          9.44444444]]


In [112]:
grid[0]

array([[[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         ...,
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ]],

        [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ],
         ...,
         [ 0.        ,  0.        

In [106]:
# Uniform prior: one float32 weight per grid point, each equal to 1 / n_hypotheses.
n_hypotheses = points.shape[0]
belief_values = np.ones(n_hypotheses, dtype=np.float32) / n_hypotheses
print(f"Initial belief values: {belief_values[:5]}")

Initial belief values: [1.e-04 1.e-04 1.e-04 1.e-04 1.e-04]


In [107]:
# Placeholder likelihood: one uniform [0, 1) draw per grid point. A seeded
# Generator is used so the cell gives the same values after Restart & Run All.
rng = np.random.default_rng(0)
likelihood = rng.random(points.shape[0])

In [108]:
# Bayes update: weight the prior by the likelihood, then renormalise so the
# result is again a probability distribution (sums to 1).
# Note: this yields NaN if every likelihood entry is zero.
unnormalized_belief = likelihood * belief_values
updated_belief_values = unnormalized_belief / unnormalized_belief.sum()

In [65]:
# np.array(grid) stacks the per-parameter coordinate arrays along a new leading
# axis, so belief_points has shape (n_params, n, ..., n).
belief_points = torch.tensor(np.array(grid))
# One belief weight per grid point: drop the leading parameter axis instead of
# using ones_like, which would allocate one weight per coordinate.
belief_values = torch.ones(belief_points.shape[1:], dtype=belief_points.dtype)
belief_values = belief_values / belief_values.sum()

Value at (2, 3, 1, 0): None
Value at (5, 5, 5, 5) (out of bounds): None


In [130]:
random_function.evaluate(5,5,5,5)

1.268655659592826e-08

In [89]:
points

array([[ 0.        ,  0.        ],
       [ 1.11111111,  0.        ],
       [ 2.22222222,  0.        ],
       [ 3.33333333,  0.        ],
       [ 4.44444444,  0.        ],
       [ 5.55555556,  0.        ],
       [ 6.66666667,  0.        ],
       [ 7.77777778,  0.        ],
       [ 8.88888889,  0.        ],
       [10.        ,  0.        ],
       [ 0.        ,  0.55555556],
       [ 1.11111111,  0.55555556],
       [ 2.22222222,  0.55555556],
       [ 3.33333333,  0.55555556],
       [ 4.44444444,  0.55555556],
       [ 5.55555556,  0.55555556],
       [ 6.66666667,  0.55555556],
       [ 7.77777778,  0.55555556],
       [ 8.88888889,  0.55555556],
       [10.        ,  0.55555556],
       [ 0.        ,  1.11111111],
       [ 1.11111111,  1.11111111],
       [ 2.22222222,  1.11111111],
       [ 3.33333333,  1.11111111],
       [ 4.44444444,  1.11111111],
       [ 5.55555556,  1.11111111],
       [ 6.66666667,  1.11111111],
       [ 7.77777778,  1.11111111],
       [ 8.88888889,