In [1]:
### Imports for converting the search-and-rescue problem into a Gymnasium-compatible environment
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
import pickle
import json
import os
import tensorflow as tf
from tqdm.auto import tqdm
from termcolor import colored

import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.registration import register, registry, EnvSpec
from gymnasium.utils.env_checker import check_env

from enum import Enum
import collections
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

2024-09-19 16:22:09.407821: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-19 16:22:09.435493: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define Actions for Each Option
class NavigationActions(Enum):
    """Primitive grid moves available under the NAVIGATION option.

    The integer values are the action indices that ``perform_action`` compares
    against; positions are ``[row, col]`` lists.
    """
    UP = 0     # row index - 1 (blocked at row 0)
    DOWN = 1   # row index + 1 (blocked at the last row)
    LEFT = 2   # column index - 1 (blocked at column 0)
    RIGHT = 3  # column index + 1 (blocked at the last column)

class InformationCollectionActions(Enum):
    """Actions available under the INFORMATION_COLLECTION option.

    ``perform_action`` only accepts COLLECT_X, COLLECT_Y and COLLECT_Z; any
    other index (COLLECT_A, COLLECT_B, COLLECT_C) is flagged as an illegal action.
    """
    COLLECT_X = 0  # takes effect at info_pos1 while has_info < 1
    COLLECT_Y = 1  # takes effect at info_pos2 while has_info == 1
    COLLECT_Z = 2  # takes effect at info_pos3 while has_info == 2
    COLLECT_A = 3  # not accepted by perform_action (illegal)
    COLLECT_B = 4  # not accepted by perform_action (illegal)
    COLLECT_C = 5  # not accepted by perform_action (illegal)

class OperationTriageActions(Enum):
    """Actions available under the OPERATION_TRIAGE option.

    In ``perform_action`` only SAVE can complete the mission; USE, REMOVE and
    CARRY are recognised indices but are flagged as illegal actions.
    """
    SAVE = 0    # sets has_saved when at the target with all info collected
    USE = 1     # flagged illegal by perform_action
    REMOVE = 2  # flagged illegal by perform_action
    CARRY = 3   # flagged illegal by perform_action

# Define Robot Options (Subtasks)
class RobotOption(Enum):
    """High-level options (subtasks) of the hierarchical agent.

    The integer values are what the robot stores in ``current_option`` and what
    the training loop uses as keys into the per-option worker dictionary.
    """
    NAVIGATION = 0              # actions come from NavigationActions
    INFORMATION_COLLECTION = 1  # actions come from InformationCollectionActions
    OPERATION_TRIAGE = 2        # actions come from OperationTriageActions

class GridTile(Enum):
    """Tile kinds printed by the robot's text renderer."""
    _FLOOR = 0
    ROBOT = 1
    TARGET = 2
    X_INFO = 3
    Y_INFO = 4
    Z_INFO = 5
    DITCH = 6

    def __str__(self):
        """Return the one-character glyph for this tile: the first letter of its name."""
        glyph = self.name[0]
        return glyph

In [9]:
import os
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import JSONLoader
from pprint import pprint

def get_file_type(document_path):
    """Return the lower-cased extension of ``document_path`` without the dot.

    Args:
        document_path: Path (or bare file name) of the document.

    Returns:
        The extension in lower case (e.g. ``"pdf"`` or ``"json"``), or ``None``
        when the path has no extension.
    """
    _, file_extension = os.path.splitext(document_path)
    # Normalise the case so that "REPORT.PDF" still matches the lower-case type
    # names ('pdf' / 'json') that the document loader compares against.
    return file_extension[1:].lower() if file_extension else None

### Class that processes disaster-related verbal inputs: it analyzes them with a RAG architecture and generates a
# response in a specified format. It leverages models like ChatOllama and techniques like vector storage and retrieval for its operations.
class DisasterResponseAssistant:
    """RAG assistant that turns verbal disaster reports into hazard / POI coordinates.

    A document (PDF or JSON) is loaded, embedded into a Chroma vector store and
    exposed through a retriever. ``generate_response`` sends a verbal input plus
    the retrieved context to a ChatOllama model; ``refine_response`` parses the
    model's JSON answer into coordinate lists.

    Args:
        data_path: Path of the document to load.
        data_type: ``'pdf'`` or ``'json'``; anything else raises ``ValueError``.
        model_name: Model name passed to ``ChatOllama``.
        embedding_model: Model name passed to ``OllamaEmbeddings``.
        collection_name: Name of the Chroma collection.
    """

    def __init__(self, data_path, data_type, model_name="mistral", embedding_model='nomic-embed-text', collection_name="rag-chroma"):
        self.model_name = model_name
        self.embedding_model = embedding_model
        self.collection_name = collection_name
        self.data_path = data_path
        self.data_type = data_type

        self.llm = None
        self.loader = None
        self.data = None              # Loaded documents, filled by _load_documents
        self.vectorstore = None
        self.retriever = None

        self.hazard_coordinates = []  # To store hazard coordinates
        self.poi_coordinates = []     # To store points of interest coordinates

        self._load_model()            # Initializes an instance of the ChatOllama model
        self._load_documents()        # Loads the PDF or JSON document
        self._create_vectorstore()    # Creates a vector store using Chroma from the documents
        self._create_retriever()      # Creates a retriever from the vector store

    def _load_model(self):
        """Instantiate the chat model named by ``self.model_name``."""
        self.llm = ChatOllama(model=self.model_name)

    def _load_documents(self):
        """Load ``self.data_path`` into ``self.data`` according to ``self.data_type``.

        Raises:
            ValueError: If ``self.data_type`` is neither ``'pdf'`` nor ``'json'``.
        """
        print(f"document {self.data_type} will be infused")
        if self.data_type == 'pdf':
            self.loader = PyPDFLoader(self.data_path)
            self.data = self.loader.load_and_split()
        elif self.data_type == 'json':
            self.loader = JSONLoader(
                file_path=self.data_path,
                jq_schema='.',
                text_content=False)
            self.data = self.loader.load()
        else:
            raise ValueError("Unsupported document type. Please choose either 'pdf' or 'json'.")

    def _create_vectorstore(self):
        """Embed the loaded documents into a Chroma collection."""
        self.vectorstore = Chroma.from_documents(
            documents=self.data,
            collection_name=self.collection_name,
            embedding=embeddings.OllamaEmbeddings(model=self.embedding_model),
        )

    def _create_retriever(self):
        """Expose the vector store as a retriever for the RAG chain."""
        self.retriever = self.vectorstore.as_retriever()

    ### generate a response based on a verbal input
    ### construct a template for the response using RAG architecture
    def generate_response(self, verbal_input):
        """Run the RAG chain on ``verbal_input`` and return the model's text answer.

        The retriever supplies ``{context}`` and the raw input is passed through
        as ``{verbal_input}``; the chat model output is parsed to a plain string.
        """
        prompt_template = """You are an assistant, who carefully listens to verbal inputs: {verbal_input} and specialized in analyzing disaster-related inputs. Your task is 
to identify physical locations mentioned in the text and classify them as either points of interest (POI) or as hazards/dangers (HAZARD) for rescue operations. Use the
information provided in the documents: {context}, such as KEYWORDS, descriptions and context when locations are mentioned, to make your classification.
Output the classification in the form of a JSON array dictionary with keys 'location', 'coordinates', and 'category'. Here are some rules you always follow:
- Focus strictly on physical locations. Avoid including entities that do not represent physical, geographical places (such as individuals, conditions, or 
  abstract concepts).
- Generate human-readable output in the specified dictionary format.
- Generate only the requested output, strictly following the dictionary structure.
- Within the dictionary, the value of the `category` key must be either 'POI' or 'HAZARD'. 
- Never generate offensive or foul language.
- Never give explanations over your output.
Input: {verbal_input}
"""
        system_template = ChatPromptTemplate.from_template(prompt_template)
        output_parser = StrOutputParser()
        after_rag_chain = (
            {"context": self.retriever, "verbal_input": RunnablePassthrough()}
            | system_template
            | self.llm  # chat model created in _load_model
            | output_parser
        )
        response = after_rag_chain.invoke(verbal_input)
        return response

    @staticmethod
    def _parse_llm_json(output):
        """Parse the model's answer into a list of classification dicts.

        Parentheses are converted to brackets so tuple-style coordinates such as
        ``(1, 2)`` become valid JSON arrays. If the text as a whole is not valid
        JSON (e.g. it is wrapped in a markdown code fence or surrounded by prose),
        the outermost ``[...]`` span is parsed instead. A single JSON object is
        wrapped in a list.

        Raises:
            json.JSONDecodeError: If no JSON payload can be recovered.
        """
        cleaned = output.strip().replace('\n', '').replace('(', '[').replace(')', ']')
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            start, end = cleaned.find('['), cleaned.rfind(']')
            if start == -1 or end < start:
                raise
            parsed = json.loads(cleaned[start:end + 1])
        return [parsed] if isinstance(parsed, dict) else parsed

    def refine_response(self, output):
        """Sort the coordinates in the model's answer into hazards and POIs.

        Entries whose ``category`` is ``'HAZARD'`` go to ``hazard_coordinates``;
        every other entry goes to ``poi_coordinates``. Both lists live on the
        instance and accumulate across calls.

        Args:
            output: Text returned by ``generate_response``.

        Returns:
            Tuple ``(hazard_coordinates, poi_coordinates)`` (the instance lists).

        Raises:
            json.JSONDecodeError: If the answer contains no parseable JSON.
        """
        for item in self._parse_llm_json(output):
            coord = tuple(item['coordinates'])
            if item['category'] == 'HAZARD':
                self.hazard_coordinates.append(coord)
            else:
                self.poi_coordinates.append(coord)

        print("Hazardous Coordinates:", self.hazard_coordinates)
        print("Point of Interest Coordinates:", self.poi_coordinates)
        return self.hazard_coordinates, self.poi_coordinates

In [10]:
def evaluate_HRL_agent(env, manager, workers, test_episodes=1):
    """Roll out the trained hierarchy greedily and log every step.

    In each step the manager's Q-table picks the option, the matching worker's
    Q-table picks the action, and the transition is printed. Steps that end on a
    cell listed in ``env.sar_robot.fires`` are counted as collisions.

    Args:
        env: Environment exposing ``reset``, ``step`` and ``sar_robot.fires``.
        manager: Agent providing ``get_state`` and a ``Q_table`` over options.
        workers: Mapping from option index to the agent holding that option's ``Q_table``.
        test_episodes: Number of evaluation episodes; episode ``i`` resets with ``seed=i``.

    Returns:
        List with the total (undiscounted) reward of each episode.
    """
    total_rewards = []
    for episode in range(test_episodes):
        obs, _ = env.reset(seed=episode)
        state = manager.get_state(obs)
        done = False
        total_return, step, cnt = 0, 0, 0
        collisions = []
        while not done:
            current_option = np.argmax(manager.Q_table[state])  # Manager chooses option greedily
            worker = workers[current_option]                     # Get the worker for this option
            action = np.argmax(worker.Q_table[state])            # Worker chooses action greedily
            next_obs, reward, terminated, truncated, _ = env.step(action)
            # Stop on either flag so a time-limit truncation can never loop forever.
            done = terminated or truncated
            next_state = manager.get_state(next_obs)

            option_name = RobotOption(current_option).name
            action_name = _describe_action(current_option, action)

            print(f"Step {step+1}: || State={state} || Option={option_name} || Action={action_name} || Reward={reward} || Next State={next_state} || Done={done}")
            total_return += reward
            state = next_state
            step += 1

            position = (state[0], state[1])
            if position in env.sar_robot.fires:
                print(colored("Robot is in fire!", "red"))
                cnt += 1
                collisions.append(position)
        total_rewards.append(total_return)
        print(f"Test {episode}: Finished after {step} steps with total reward {total_return} and {cnt} collisions at {collisions}.")
    # Guard the average so that test_episodes=0 returns an empty list instead of dividing by zero.
    avg_reward = sum(total_rewards) / len(total_rewards) if total_rewards else 0.0
    print(f"Average reward over {test_episodes} testing episodes: {avg_reward}")
    return total_rewards


def _describe_action(option, action):
    """Return the enum member name of ``action`` under ``option`` (for logging only)."""
    if option == RobotOption.NAVIGATION.value:
        return NavigationActions(action).name
    if option == RobotOption.INFORMATION_COLLECTION.value:
        return InformationCollectionActions(action).name
    if option == RobotOption.OPERATION_TRIAGE.value:
        return OperationTriageActions(action).name
    return f"Unknown Action ({action})"

In [12]:
#hier + attention 
class searchANDrescueRobot_HRL:
    """Grid-world search-and-rescue robot driven by three options.

    The robot navigates a ``grid_rows`` x ``grid_cols`` grid, collects three
    pieces of information in a fixed order (info_pos1, info_pos2, info_pos3) and
    then performs SAVE at the target. Collecting the third piece triggers the
    LLM assistant, whose hazards / POIs are written into ``sensor_readings``,
    ``fires`` and ``POIs``. The grid layout is hard-coded in ``reset``.

    Args:
        grid_rows: Number of grid rows.
        grid_cols: Number of grid columns.
        info_number_needed: ``has_info`` value required before heading to the target.
        document_path: Document handed to ``DisasterResponseAssistant``; the
            default is the path that used to be hard-coded here.
    """

    def __init__(self, grid_rows=7, grid_cols=7, info_number_needed=3,
                 document_path="/home/dimi/HRL-LLM/data/sar_data.json"):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.info_number_needed = info_number_needed
        self.reset()
        # for LLM integration
        self.ask_action_counter = 0
        self.visited_information_state = False
        self.input_received = False
        self.POIs, self.fires, self.hazards, self.pois = [], [], [], []
        document_type = get_file_type(document_path)
        self.assistant = DisasterResponseAssistant(document_path, document_type)
        self.sensor_readings = {}

    def reset(self, seed=None):
        """Restore the start-of-episode state (positions, progress counters, option).

        Args:
            seed: Seed for Python's ``random`` module; ``None`` reseeds from the OS.
        """
        # Seed BEFORE sampling so the start position is governed by this reset's seed.
        random.seed(seed)
        self.init_positions = [[4, 1]]
        # Copy the chosen start so moving the robot never edits init_positions.
        self.robot_pos = list(random.choice(self.init_positions))
        self.has_info = 0
        self.has_saved = 0
        self.current_option = RobotOption.NAVIGATION.value
        self.target_pos = [0, 3]
        self.info_pos1 = [4, 4]
        self.info_pos2 = [6, 2]
        self.info_pos3 = [5, 5]
        self.ditches = [(1, 6), (2, 2), (2, 4), (3, 2), (3, 3), (3, 4), (4, 5),
                        (5, 0), (5, 1), (5, 2), (6, 0), (0, 2), (0, 4)]

        self.POIs, self.fires = [], []
        self.visited_information_state = False

    def _shift_position(self, pos, action_value):
        """Move ``pos`` ([row, col]) in place by one cell, staying inside the grid."""
        if action_value == NavigationActions.UP.value:
            if pos[0] > 0:
                pos[0] -= 1
        elif action_value == NavigationActions.DOWN.value:
            if pos[0] < self.grid_rows - 1:
                pos[0] += 1
        elif action_value == NavigationActions.LEFT.value:
            if pos[1] > 0:
                pos[1] -= 1
        elif action_value == NavigationActions.RIGHT.value:
            if pos[1] < self.grid_cols - 1:
                pos[1] += 1

    # Update the robot's position
    def next_state_vision(self, target, robot_action):
        """Apply a navigation action to ``target`` and return it.

        ``target`` is modified in place and also returned. Only
        ``NavigationActions`` members move it; any other action leaves it
        unchanged. ``self.last_action`` is set as a side effect.

        Args:
            target: Mutable ``[row, col]`` position.
            robot_action: An action enum member (not its integer value).
        """
        self.last_action = robot_action
        if isinstance(robot_action, NavigationActions):
            self._shift_position(target, robot_action.value)
        return target

    def perform_action(self, robot_action):
        """Execute ``robot_action`` (an integer index) under the current option.

        Args:
            robot_action: Action index interpreted by the enum of ``self.current_option``.

        Returns:
            Tuple ``(mission_complete, info_collected_X, info_collected_Y,
            total_info_collected, illegal_action)`` where ``mission_complete`` is
            ``self.has_saved`` (0 or 1) and the rest are booleans.
        """
        robot_option = self.current_option  # hrl
        self.last_action = robot_action
        info_collected_X, info_collected_Y = False, False
        total_info_collected = False
        illegal_action = False

        if robot_option == RobotOption.NAVIGATION.value:  ### option 0 (subtask 0)
            if robot_action in [NavigationActions.UP.value, NavigationActions.DOWN.value, NavigationActions.LEFT.value, NavigationActions.RIGHT.value]:
                # The robot only moves while it is NOT standing on the site it currently needs.
                if (self.has_info < 1 and self.robot_pos != self.info_pos1) or \
                    (self.has_info == 1 and self.robot_pos != self.info_pos2) or \
                    (self.has_info == 2 and self.robot_pos != self.info_pos3) or \
                    (self.has_info == self.info_number_needed and self.robot_pos != self.target_pos):
                    self._shift_position(self.robot_pos, robot_action)
                # Arriving at the needed info site hands control to the collection option.
                if (self.has_info < 1 and self.robot_pos == self.info_pos1) or \
                    (self.has_info == 1 and self.robot_pos == self.info_pos2) or \
                    (self.has_info == 2 and self.robot_pos == self.info_pos3):
                    robot_option = RobotOption.INFORMATION_COLLECTION.value
                # Arriving at the target with all info hands control to the triage option.
                if self.has_info == self.info_number_needed and self.robot_pos == self.target_pos:
                    robot_option = RobotOption.OPERATION_TRIAGE.value
            else:
                illegal_action = True

        elif robot_option == RobotOption.INFORMATION_COLLECTION.value:   ### option 1 (subtask 1)
            if robot_action in [InformationCollectionActions.COLLECT_X.value, InformationCollectionActions.COLLECT_Y.value, InformationCollectionActions.COLLECT_Z.value]:
                if robot_action == InformationCollectionActions.COLLECT_X.value:
                    if self.robot_pos == self.info_pos1 and self.has_info < 1:
                        self.has_info += 1
                        info_collected_X = True
                        robot_option = RobotOption.NAVIGATION.value
                if robot_action == InformationCollectionActions.COLLECT_Y.value:
                    if self.robot_pos == self.info_pos2 and self.has_info == 1:
                        self.has_info += 1
                        info_collected_Y = True
                        robot_option = RobotOption.NAVIGATION.value
                if robot_action == InformationCollectionActions.COLLECT_Z.value:
                    if self.robot_pos == self.info_pos3 and self.has_info == 2:
                        # Must run before has_info is incremented: it checks has_info == 2.
                        self.perform_collect_action()  # Collect the third info and exploit the knowledge from this moment on
                        self.has_info += 1
                        total_info_collected = True
                        robot_option = RobotOption.NAVIGATION.value
            else:
                illegal_action = True

        elif robot_option == RobotOption.OPERATION_TRIAGE.value:  ### option 2 (subtask 2)
            if robot_action in [OperationTriageActions.SAVE.value, OperationTriageActions.USE.value, OperationTriageActions.REMOVE.value, OperationTriageActions.CARRY.value]:
                if robot_action == OperationTriageActions.SAVE.value:
                    if self.robot_pos == self.target_pos and self.has_info == self.info_number_needed:
                        self.has_saved = 1
                else:
                    illegal_action = True
            else:
                illegal_action = True

        self.current_option = robot_option

        mission_complete = self.has_saved
        return mission_complete, info_collected_X, info_collected_Y, total_info_collected, illegal_action

    def perform_collect_action(self):
        """Obtain hazards / POIs for the final information site and apply them.

        Does nothing unless ``has_info == 2``. The assistant (LLM) is queried only
        on the first call; later calls re-apply the ``hazards`` / ``pois`` cached
        from that first answer.
        """
        self.ask_action_counter += 1
        verbal_inputs = []
        if self.has_info == 2:  ## should be 2 if total number of infos are 3
            verbal_input = ("Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Police told us that no access allowed around the petrol station.")
            verbal_inputs.append(verbal_input)

            if self.ask_action_counter <= 1:
                print(f"real LLM is about to start handling the input {verbal_input}")
                for input_text in verbal_inputs:
                    response = self.assistant.generate_response(input_text)
                    if response:
                        self.visited_information_state = True
                    self.hazards, self.pois = self.assistant.refine_response(response)
                    print(f"real LLM is about to end handling the input {verbal_input}")
                    self.update_environment_REAL(self.hazards, self.pois)
            else:
                self.visited_information_state = True
                self.update_environment_REAL(self.hazards, self.pois)

    def update_environment_REAL(self, haz, poi):
        """Record hazards (reading -10.0) and POIs (reading 10.0).

        Readings are stored under the key ``(row, col, 3, 0)``; the locations are
        also appended to ``self.fires`` and ``self.POIs`` respectively.
        """
        for hazardous_location in haz:
            self.sensor_readings[(hazardous_location[0], hazardous_location[1], 3, 0)] = -10.0
            self.fires.append(hazardous_location)
        for safe_location in poi:
            self.sensor_readings[(safe_location[0], safe_location[1], 3, 0)] = 10.0
            self.POIs.append(safe_location)

    def is_in_ditch(self):
        """Return True when the robot's current cell is one of the ditches."""
        return tuple(self.robot_pos) in self.ditches

    def render(self):
        """Print the grid as text, one ``GridTile`` glyph per cell."""
        for x in range(self.grid_rows):
            for y in range(self.grid_cols):
                if [x, y] == self.robot_pos:
                    print(GridTile.ROBOT, end=' ')
                elif [x, y] == self.target_pos:
                    print(GridTile.TARGET, end=' ')
                elif [x, y] == self.info_pos1:
                    print(GridTile.X_INFO, end=' ')
                elif [x, y] == self.info_pos2:
                    print(GridTile.Y_INFO, end=' ')
                elif [x, y] == self.info_pos3:
                    print(GridTile.Z_INFO, end=' ')
                elif (x, y) in self.ditches:
                    print(GridTile.DITCH, end=' ')
                else:
                    print(GridTile._FLOOR, end=' ')
            print()
        print()


class SARrobotEnv_HRL(gym.Env):
    """Gymnasium environment wrapping ``searchANDrescueRobot_HRL``.

    Observations are ``[row, col, has_info, has_saved]`` as ``int32``. The
    meaning of an action index depends on the robot's current option, which is
    reported in the ``info`` dict under ``'option'`` as an integer.

    NOTE(review): no ``action_space`` is defined here, only ``option_space``;
    confirm whether external tooling (e.g. ``check_env``) needs one.
    """
    metadata = {"render_modes": ["human"], 'render_fps': 1}

    def __init__(self, grid_rows=7, grid_cols=7, render_mode=None, info_number_needed=3):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.render_mode = render_mode

        self.sar_robot = searchANDrescueRobot_HRL(grid_rows, grid_cols, info_number_needed)
        self.option_space = spaces.Discrete(len(RobotOption))

        self.observation_space = spaces.Box(
            low = 0,
            high = np.array([self.grid_rows-1, self.grid_cols-1, info_number_needed, 1]),
            shape = (4,),
            dtype = np.int32
        )

        self.max_steps = 50
        self.current_step = 0
        self.turnPenalty = -1
        self.stepsPenalty = -5
        self.ditchPenalty = -30
        self.illegalActionPenalty = -5  # Penalty for illegal actions
        self.winReward = 100

    def _get_obs(self):
        """Build the ``[row, col, has_info, has_saved]`` observation from the robot state."""
        robot = self.sar_robot
        return np.concatenate((robot.robot_pos, [robot.has_info], [robot.has_saved])).astype(np.int32)

    def reset(self, seed=None, options=None):
        """Reset robot and step counter; return ``(obs, info)``."""
        super().reset(seed=seed)
        self.sar_robot.reset(seed=seed)
        self.current_step = 0
        info = {'option': self.sar_robot.current_option}
        return self._get_obs(), info

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        The base reward is chosen by the checks below, where a later check
        overrides an earlier one; ``turnPenalty`` is then added on every step.
        Reaching ``max_steps`` is reported through ``terminated`` and
        ``truncated`` is always ``False``.
        """
        reward = 0
        self.current_step += 1
        target_reached, info_collected_X, info_collected_Y, total_info_collected, illegal_action = self.sar_robot.perform_action(action)
        terminated = False

        if self.sar_robot.is_in_ditch():
            reward = self.ditchPenalty
            terminated = True
            # Store the integer value, like every other assignment to current_option.
            self.sar_robot.current_option = RobotOption.NAVIGATION.value

        if self.is_max_steps_exceeded():
            reward = self.stepsPenalty
            terminated = True
            self.sar_robot.current_option = RobotOption.NAVIGATION.value

        if info_collected_X or info_collected_Y or total_info_collected:
            reward = 6  # Reward for collecting info

        if target_reached:
            reward = self.winReward
            terminated = True

        if illegal_action:
            reward = self.illegalActionPenalty

        reward += self.turnPenalty

        obs = self._get_obs()
        info = {'option': self.sar_robot.current_option}

        if self.render_mode == 'human':
            print(f"Option: {self.sar_robot.current_option}, Action: {action}, Reward: {reward}, Terminated: {terminated}")
            self.render()

        return obs, reward, terminated, False, info

    def is_max_steps_exceeded(self):
        """Return True once ``current_step`` has reached ``max_steps``."""
        return self.current_step >= self.max_steps

    def render(self):
        """Print the grid when ``render_mode`` is ``'human'``; otherwise do nothing."""
        if self.render_mode == 'human':
            self.sar_robot.render()


# Build the environment with rendering disabled. render_mode must be the None
# object: the string 'None' is not one of the declared render modes.
env = SARrobotEnv_HRL(grid_rows=7, grid_cols=7, render_mode=None, info_number_needed=3)

document json will be infused


In [6]:
class QLearningAgentHierarchical:
    """Tabular Q-learning agent used both as manager (over options) and as worker (over actions).

    The Q-table is indexed by the 4-component observation tuple followed by the
    action index; its state dimensions are ``observation_space.high + 1``.

    Args:
        env: Environment providing ``observation_space`` (and, for ``train``, ``sar_robot``).
        action_space_size: Number of actions (or options) this agent chooses among.
        ALPHA: Learning rate.
        GAMMA: Discount factor.
        EPSILON_MAX: Initial exploration rate.
        DECAY_RATE: Total linear decrease of epsilon spread over the ``episodes``
            passed to ``decay_epsilon``.
        EPSILON_MIN: Floor of the exploration rate.
        epsilon_decay: Stored but not used by any method of this class.
        log_dir: Directory for the TensorFlow summary writer.
    """

    def __init__(self, env, action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay, log_dir="testing_HRLgym/HierQ"):
        self.env = env
        self.ALPHA = ALPHA
        self.GAMMA = GAMMA
        self.EPSILON_MAX = EPSILON_MAX
        self.EPSILON = EPSILON_MAX
        self.DECAY_RATE = DECAY_RATE
        self.EPSILON_MIN = EPSILON_MIN
        self.epsilon_decay = epsilon_decay
        self.num_states = (self.env.observation_space.high[0] + 1,
                           self.env.observation_space.high[1] + 1,
                           self.env.observation_space.high[2] + 1,
                           self.env.observation_space.high[3] + 1)  # 7*7*4*2
        self.action_space_size = action_space_size
        self.Q_table = np.zeros((*self.num_states, self.action_space_size))
        self.writer = tf.summary.create_file_writer(log_dir)

    def epsilon_greedy_policy(self, state):
        """Return a random action with probability ``EPSILON``, else the greedy one."""
        if np.random.rand() < self.EPSILON:
            return np.random.randint(self.action_space_size)  # Explore
        else:
            return np.argmax(self.Q_table[state])  # Exploit

    def get_state(self, observation):
        """Convert an observation array into a tuple usable as a Q-table index."""
        return tuple(observation)

    def decay_epsilon(self, episodes):
        """Advance the exploration schedule by one episode and return the new epsilon.

        While epsilon is above the hard-coded 0.1 threshold it decreases linearly
        by ``DECAY_RATE / episodes``, never going below ``EPSILON_MIN``; once at
        or below 0.1 it is set straight to ``EPSILON_MIN``.
        """
        if self.EPSILON > 0.1:
            # Clamp so that a large step cannot produce an epsilon below the floor
            # (or a negative one) for an episode.
            self.EPSILON = max(self.EPSILON_MIN, self.EPSILON - self.DECAY_RATE / episodes)
        else:
            self.EPSILON = self.EPSILON_MIN
        return self.EPSILON

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition (state, action, reward, next_state)."""
        best_next_action = np.argmax(self.Q_table[next_state])
        td_target = reward + self.GAMMA * self.Q_table[next_state][best_next_action]
        td_error = td_target - self.Q_table[state][action]
        self.Q_table[state][action] += self.ALPHA * td_error

    def train(self, manager, workers, num_episodes):
        """Train the workers and the manager for ``num_episodes`` episodes.

        The active option is read from the environment (``env.sar_robot.current_option``)
        rather than chosen by the manager; the matching worker picks the action
        epsilon-greedily, and both that worker and the manager are updated with
        the same reward. Episode return and length are logged to ``self.writer``.

        Args:
            manager: Agent whose Q-table over options is updated.
            workers: Mapping from option index to worker agent.
            num_episodes: Number of training episodes; episode ``i`` resets with ``seed=i``.

        Returns:
            Tuple ``(total_rewards_per_episode, workers)`` where the first item is
            an array of per-episode returns.
        """
        total_rewards_per_episode = np.zeros(num_episodes)
        episode_return = 0

        for episode in tqdm(range(num_episodes)):
            if episode % 100 == 0:
                # Reports the return of the previous episode (0 before the first one).
                print(f"Episode: {episode} | Reward: {episode_return} | epsilon: {self.EPSILON}")

            obs, _ = self.env.reset(seed=episode)
            s = self.get_state(obs)
            terminated = False
            episode_return, steps_cnt = 0, 0

            while not terminated:
                option = self.env.sar_robot.current_option
                worker = workers[option]
                a = worker.epsilon_greedy_policy(s)
                obs_, r, terminated, _, _ = self.env.step(a)
                s_ = self.get_state(obs_)

                episode_return += r

                worker.update(s, a, r, s_)
                manager.update(s, option, r, s_)

                s = s_
                steps_cnt += 1

            with self.writer.as_default():
                tf.summary.scalar('Episode Return', episode_return, step=episode)
                tf.summary.scalar('Steps per Episode', steps_cnt, step=episode)

            manager.EPSILON = self.decay_epsilon(num_episodes)
            for w in workers.values():
                w.decay_epsilon(num_episodes)

            total_rewards_per_episode[episode] = episode_return

        return total_rewards_per_episode, workers


In [7]:
#hier
# Define action space sizes for each worker
# Manager action space size (Options)
# The manager chooses among options, so its action space is the RobotOption enum.
manager_action_space_size = len(RobotOption)

# Each worker acts within the action enum of its own option.
(
    explore_action_space_size,
    collect_action_space_size,
    operate_action_space_size,
) = map(len, (NavigationActions, InformationCollectionActions, OperationTriageActions))

In [8]:
#hier
all_total_rewards_AGENT_hier = []  # List to store total rewards from each run
EPISODES = 1500
ALPHA = 0.1
GAMMA = 0.98
EPSILON_MAX = 1.0
EPSILON_MIN = 0.01
epsilon_decay = 0.95
DECAY_RATE = 2
N_RUNS = 1  # Number of independent training runs that are averaged below
SEED = 0    # Base seed for numpy's global RNG, which drives epsilon-greedy exploration
for run_idx in range(N_RUNS):
    # Seed per run so that training is reproducible after Restart & Run All.
    np.random.seed(SEED + run_idx)

    # Manager for choosing options
    manager_hier = QLearningAgentHierarchical(env, manager_action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay)
    explore_worker_hier = QLearningAgentHierarchical(env, explore_action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay)  # [up, down, left, right]
    collect_worker_hier = QLearningAgentHierarchical(env, collect_action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay)  # [X, Y, Z, A, B, C]
    operate_worker_hier = QLearningAgentHierarchical(env, operate_action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay)  # [save, use, remove, carry]

    # Keys are the RobotOption values reported by the environment.
    workers_hier = {
        0: explore_worker_hier,  # Worker for EXPLORE
        1: collect_worker_hier,  # Worker for COLLECT
        2: operate_worker_hier   # Worker for OPERATE
    }
    hier_returns, workers_simple = manager_hier.train(manager_hier, workers_hier, EPISODES)
    all_total_rewards_AGENT_hier.append(hier_returns)

# Per-episode return averaged over the runs.
avg_total_rewards_AGENT_hier = np.mean(all_total_rewards_AGENT_hier, axis=0)

2024-09-19 16:23:07.188318: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-19 16:23:07.191879: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-19 16:23:07.192012: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

  0%|          | 0/1500 [00:00<?, ?it/s]

Episode: 0 | Reward: 0 | epsilon: 1.0
Episode: 100 | Reward: -52 | epsilon: 0.8666666666666702
Episode: 200 | Reward: -32 | epsilon: 0.7333333333333405
Episode: 300 | Reward: -34 | epsilon: 0.6000000000000107
Episode: 400 | Reward: -31 | epsilon: 0.4666666666666796
Episode: 500 | Reward: -43 | epsilon: 0.3333333333333443
Episode: 600 | Reward: -49 | epsilon: 0.20000000000001006
Episode: 700 | Reward: -37 | epsilon: 0.01
Episode: 800 | Reward: -37 | epsilon: 0.01
Episode: 900 | Reward: 85 | epsilon: 0.01
Episode: 1000 | Reward: 94 | epsilon: 0.01
Episode: 1100 | Reward: 94 | epsilon: 0.01
Episode: 1200 | Reward: 93 | epsilon: 0.01
Episode: 1300 | Reward: 94 | epsilon: 0.01
Episode: 1400 | Reward: 94 | epsilon: 0.01


In [17]:
# Bind the result to a real name: assigning to `_` would shadow IPython's
# last-output variable and stop it from being updated.
evaluation_returns = evaluate_HRL_agent(env, manager_hier, workers_hier)

Step 1: || State=(4, 1, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 2: || State=(4, 2, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 3, 0, 0) || Done=False
Step 3: || State=(4, 3, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 4, 0, 0) || Done=False
Step 4: || State=(4, 4, 0, 0) || Option=INFORMATION_COLLECTION || Action=COLLECT_X || Reward=5 || Next State=(4, 4, 1, 0) || Done=False
Step 5: || State=(4, 4, 1, 0) || Option=NAVIGATION || Action=DOWN || Reward=-1 || Next State=(5, 4, 1, 0) || Done=False
Step 6: || State=(5, 4, 1, 0) || Option=NAVIGATION || Action=LEFT || Reward=-1 || Next State=(5, 3, 1, 0) || Done=False
Step 7: || State=(5, 3, 1, 0) || Option=NAVIGATION || Action=DOWN || Reward=-1 || Next State=(6, 3, 1, 0) || Done=False
Step 8: || State=(6, 3, 1, 0) || Option=NAVIGATION || Action=LEFT || Reward=-1 || Next State=(6, 2, 1, 0) || Done=False
Step 9: || State=(6, 

In [None]:
# def evaluate_policy(env, manager, workers, num_episodes=1):
#     for episode in range(num_episodes):
#         obs, _ = env.reset()
#         s = manager.get_state(obs)
#         terminated = False
#         total_reward = 0
#         steps = 0
#         path = []
#         while not terminated:
#             # Manager chooses option greedily
#             option = np.argmax(manager.Q_table[s])
#             # Get the worker for this option
#             worker = workers[option]
#             # Worker chooses action greedily
#             action = np.argmax(worker.Q_table[s])
#             # Take action in environment
#             obs_, reward, terminated, truncated, info = env.step(action)
#             s_ = manager.get_state(obs_)
#             # Log the transition
#             path.append({
#                 'state': s,
#                 'option': option,
#                 'action': action,
#                 'next_state': s_,
#                 'reward': reward,
#                 'terminated': terminated
#             })
#             s = s_
#             total_reward += reward
#             steps += 1
#         # After episode ends, log the path
#         print(f"Episode {episode}: Total reward: {total_reward}, Steps: {steps}")
#         for idx, t in enumerate(path):
#             print(f"Step {idx + 1}:")
#             print(f"  State: {t['state']}")
#             print(f"  Option chosen by Manager: {t['option']}")
#             print(f"  Action taken by Worker: {t['action']}")
#             print(f"  Next State: {t['next_state']}")
#             print(f"  Reward: {t['reward']}")
#             print(f"  Terminated: {t['terminated']}\n")
#     return
# evaluate_policy(env, manager_hier, workers_hier, num_episodes=1)

In [13]:
class AttentionSpace:
    """Per-(state, action) table of hint values derived from sensor readings.

    The table ``attention_space_low`` has one axis per observation component
    (the first four entries of ``env.observation_space.high``) plus a final
    action axis.  Entries start at zero; non-zero entries can be copied over the
    matching entries of a Q-table with ``apply_attention_to_q_table``.

    Args:
        env: Environment exposing ``observation_space.high`` and a ``sar_robot``
            with ``next_state_vision``, ``ditches`` and ``target_pos``.
        action_space_size: Length of the action axis of the table.
        attract_value: Hint stored for moves into a cell whose reading is > 0.
        avoid_value: Hint stored for moves into a cell whose reading is <= 0.
        target_value: Hint stored for action index 0 at the target position.
    """

    def __init__(self, env, action_space_size, attract_value=2.0, avoid_value=-100.0, target_value=100.0):
        self.env = env
        upper_bounds = self.env.observation_space.high
        # Inclusive upper bounds -> axis lengths (original note: 7*7*4*2).
        # int() also accepts integer-valued float bounds, which np.zeros rejects.
        self.num_states = tuple(int(upper_bounds[axis]) + 1 for axis in range(4))
        self.action_space_size = action_space_size
        self.attract_value = attract_value
        self.avoid_value = avoid_value
        self.target_value = target_value
        self.attention_space_low = np.zeros((*self.num_states, self.action_space_size))

    def identify_changed_states(self, readings):
        """Return the keys of ``readings`` whose value differs from 1, in dict order."""
        return [state for state, value in readings.items() if value != 1]

    def get_connected_states(self, target_state):
        """List ``(position, action)`` pairs neighbouring ``target_state``.

        For every navigation action, the opposite move is applied to the target
        position through ``env.sar_robot.next_state_vision``; the resulting
        position is kept when it differs from the target position and is not
        listed in ``env.sar_robot.ditches``.

        Args:
            target_state: Sequence whose first two entries are the grid position.

        Returns:
            List of ``((row, col), action_value)`` tuples, in enum order.
        """
        opposite_move = {
            NavigationActions.UP: NavigationActions.DOWN,
            NavigationActions.DOWN: NavigationActions.UP,
            NavigationActions.LEFT: NavigationActions.RIGHT,
            NavigationActions.RIGHT: NavigationActions.LEFT,
        }
        robot = self.env.sar_robot
        target_pos = tuple(target_state[:2])
        # Normalise to tuples so ditches stored as lists are still recognised.
        blocked = {tuple(ditch) for ditch in robot.ditches}

        connected_states_pairs = []
        for move, reverse_move in opposite_move.items():
            previous_pos = tuple(robot.next_state_vision(list(target_state[:2]), reverse_move))
            if previous_pos != target_pos and previous_pos not in blocked:
                connected_states_pairs.append((previous_pos, move.value))
        return connected_states_pairs

    def update_attention_space(self, connection, readings):
        """Record hints for the moves that lead into ``connection``.

        Args:
            connection: Full state tuple (position plus two further components)
                that is also a key of ``readings``.
            readings: Mapping from state tuple to sensor value; a value > 0
                selects ``attract_value``, anything else ``avoid_value``.
        """
        connected_states = self.get_connected_states(connection)
        value_to_add = self.attract_value if readings[connection] > 0 else self.avoid_value
        for connected_state, action in connected_states:
            full_state = (*connected_state, connection[2], connection[3])
            # Only fill empty slots so an earlier hint is never overwritten.
            if self.attention_space_low[full_state][action] == 0:
                self.attention_space_low[full_state][action] = value_to_add
        # Compare as tuples: works whether target_pos is a list, tuple or array.
        if (connection[0], connection[1]) == tuple(self.env.sar_robot.target_pos):
            self.attention_space_low[connection][0] = self.target_value  # 'save' is highly favored

    def apply_attention_to_q_table(self, Q_table, verbose=True):
        """Overwrite ``Q_table`` entries with every non-zero hint.

        Args:
            Q_table: Array indexed like ``attention_space_low``; modified in place.
            verbose: When true (default), print one line per overwritten entry.
        """
        # argwhere walks the non-zero entries in the same row-major order as a
        # full scan, and tolist() yields plain Python ints for indexing/printing.
        for *state_indices, action in np.argwhere(self.attention_space_low != 0).tolist():
            state = tuple(state_indices)
            value = self.attention_space_low[state][action]
            Q_table[state][action] = value
            if verbose:
                print(f"Updated Q-table at {state}, action {action} with value {value}")


In [14]:
class QLearningAgentHierarchicalAttention:
    """Tabular Q-learning agent used both as manager and as option worker.

    Each instance owns a Q-table indexed by the four observation components
    plus an action axis, an ``AttentionSpace`` of the same shape, and a
    TensorBoard writer.  ``train`` drives a manager instance together with a
    dict of worker instances keyed by option index.

    Args:
        env: Environment exposing ``observation_space.high``, ``reset``, ``step``
            and a ``sar_robot`` with ``visited_information_state``,
            ``sensor_readings``, ``current_option`` and ``target_pos``.
        action_space_size: Number of actions (length of the Q-table's last axis).
        ALPHA: Learning rate.
        GAMMA: Discount factor.
        EPSILON_MAX: Initial exploration rate.
        DECAY_RATE: Numerator of the per-episode linear epsilon decrement.
        EPSILON_MIN: Lower bound for epsilon.
        epsilon_decay: Stored on the instance; not used by any method of this class.
        log_dir: Directory passed to ``tf.summary.create_file_writer``.
    """

    # While epsilon is above this value it decays linearly; at or below it,
    # decay_epsilon snaps epsilon to EPSILON_MIN.
    LINEAR_DECAY_THRESHOLD = 0.1
    # Exploration rate used once the information state has been visited.
    EXPLOIT_EPSILON = 0.01

    def __init__(self, env, action_space_size, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay, log_dir="testing_HRLgym/HierQ-Att"):
        self.env = env
        self.ALPHA = ALPHA
        self.GAMMA = GAMMA
        self.EPSILON_MAX = EPSILON_MAX
        self.EPSILON = EPSILON_MAX
        self.DECAY_RATE = DECAY_RATE
        self.EPSILON_MIN = EPSILON_MIN
        self.epsilon_decay = epsilon_decay
        self.num_states = (self.env.observation_space.high[0] + 1,
                           self.env.observation_space.high[1] + 1,
                           self.env.observation_space.high[2] + 1,
                           self.env.observation_space.high[3] + 1)  # 7*7*4*2
        self.action_space_size = action_space_size
        self.Q_table = np.zeros((*self.num_states, self.action_space_size))
        self.attention_space = AttentionSpace(self.env, self.action_space_size)
        # Set to True by train() once the attention hints have been applied.
        self.input_received = False
        self.writer = tf.summary.create_file_writer(log_dir)

    def epsilon_greedy_policy(self, state):
        """Pick an action for ``state``.

        Explores with probability ``EPSILON`` only while the robot has not yet
        visited the information state; otherwise always returns the greedy action.
        """
        may_explore = not self.env.sar_robot.visited_information_state
        if may_explore and np.random.rand() < self.EPSILON:
            return np.random.randint(self.action_space_size)  # Explore
        return np.argmax(self.Q_table[state])  # Exploit

    def get_state(self, observation):
        """Convert an observation into a tuple usable as a Q-table index."""
        return tuple(observation)

    def decay_epsilon(self, episodes):
        """Lower epsilon by ``DECAY_RATE / episodes`` and return the new value.

        The result is never allowed below ``EPSILON_MIN`` (a large decrement
        could otherwise make epsilon negative for one episode).
        """
        if self.EPSILON > self.LINEAR_DECAY_THRESHOLD:
            self.EPSILON = max(self.EPSILON_MIN, self.EPSILON - self.DECAY_RATE / episodes)
        else:
            self.EPSILON = self.EPSILON_MIN
        return self.EPSILON

    def decay_epsilon_exploit(self):
        """Switch to the fixed near-greedy exploration rate and return it."""
        self.EPSILON = self.EXPLOIT_EPSILON
        return self.EPSILON

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition ``(state, action, reward, next_state)``."""
        best_next_action = np.argmax(self.Q_table[next_state])
        td_target = reward + self.GAMMA * self.Q_table[next_state][best_next_action]
        td_error = td_target - self.Q_table[state][action]
        self.Q_table[state][action] += self.ALPHA * td_error

    def update_attention(self, manager, workers, sensor_readings):
        """Turn sensor readings into hints and write them into worker Q-tables.

        Changed states at the target position are recorded in ``workers[2]``'s
        attention space, all others in ``workers[0]``'s; both attention spaces
        are then applied to their worker's Q-table.

        Args:
            manager: Unused; kept for interface compatibility.
            workers: Mapping with at least the keys 0 and 2.
            sensor_readings: Mapping from state tuple to sensor value.
        """
        # Tuple comparison works whether target_pos is a list, tuple or array.
        target_pos = tuple(self.env.sar_robot.target_pos)
        for state in self.attention_space.identify_changed_states(sensor_readings):
            worker_key = 2 if (state[0], state[1]) == target_pos else 0
            workers[worker_key].attention_space.update_attention_space(state, sensor_readings)
        workers[0].attention_space.apply_attention_to_q_table(workers[0].Q_table)
        workers[2].attention_space.apply_attention_to_q_table(workers[2].Q_table)

    def train(self, manager, workers, num_episodes):
        """Run ``num_episodes`` training episodes.

        In every step the option is read from ``env.sar_robot.current_option``,
        the matching worker picks the action, and both that worker and the
        manager receive a Q-learning update.  An episode ends when the
        environment reports either termination or truncation.

        Args:
            manager: Agent whose Q-table is updated over options.
            workers: Mapping from option index to worker agent.
            num_episodes: Number of episodes to run.

        Returns:
            Tuple ``(total_rewards_per_episode, attention_space, workers)`` where
            the first item is an array of episode returns and the second is an
            empty dict kept so callers can keep unpacking three values.
        """
        total_rewards_per_episode = np.zeros(num_episodes)
        attention_space = {}
        Rewards = 0

        for episode in tqdm(range(num_episodes)):
            if episode % 100 == 0:
                # Reports the return of the previous episode.
                print(f"Episode: {episode} | Reward: {Rewards} | epsilon: {self.EPSILON}")

            obs, _ = self.env.reset(seed=episode)
            s = self.get_state(obs)
            done = False
            Rewards, steps_cnt = 0, 0

            while not done:
                # Apply the attention hints once, the first time the information state is visited.
                if self.env.sar_robot.visited_information_state and not manager.input_received:
                    self.update_attention(manager, workers, self.env.sar_robot.sensor_readings)
                    manager.input_received = True
                    print(f"Updated attention space with new information at episode {episode}")

                option = self.env.sar_robot.current_option
                worker = workers[option]
                a = worker.epsilon_greedy_policy(s)
                obs_, r, terminated, truncated, _ = self.env.step(a)
                # A truncated episode must also stop; stepping on is undefined in Gymnasium.
                done = bool(terminated or truncated)
                s_ = self.get_state(obs_)

                Rewards += r

                worker.update(s, a, r, s_)
                manager.update(s, option, r, s_)

                s = s_
                steps_cnt += 1

            with self.writer.as_default():
                tf.summary.scalar('Episode Return', Rewards, step=episode)
                tf.summary.scalar('Steps per Episode', steps_cnt, step=episode)

            # Linear decay until the information state has been visited, near-greedy afterwards.
            if self.env.sar_robot.visited_information_state:
                manager.EPSILON = self.decay_epsilon_exploit()
                for w in workers.values():
                    w.decay_epsilon_exploit()
            else:
                manager.EPSILON = self.decay_epsilon(num_episodes)
                for w in workers.values():
                    w.decay_epsilon(num_episodes)

            total_rewards_per_episode[episode] = Rewards

        return total_rewards_per_episode, attention_space, workers


In [15]:
# Hierarchical Q-learning with attention: build one manager and three option workers, then train.
all_total_rewards_AGENT_hier_att = []  # one array of episode returns per run
EPISODES = 1500
ALPHA = 0.1
GAMMA = 0.98
EPSILON_MAX = 1.0
EPSILON_MIN = 0.01
epsilon_decay = 0.95
DECAY_RATE = 2
for _ in range(1):
    # Every agent shares the same learning hyperparameters; only the action count differs.
    _hier_att_hyperparams = (ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, epsilon_decay)

    # The manager's Q-table is over options.
    manager_hier_att = QLearningAgentHierarchicalAttention(env, manager_action_space_size, *_hier_att_hyperparams)
    # Navigation worker: [up, down, left, right]
    explore_worker_hier_att = QLearningAgentHierarchicalAttention(env, explore_action_space_size, *_hier_att_hyperparams)
    # Information-collection worker: [X, Y, Z, A, B, C]
    collect_worker_hier_att = QLearningAgentHierarchicalAttention(env, collect_action_space_size, *_hier_att_hyperparams)
    # Operation/triage worker: [save, use, remove, carry]
    operate_worker_hier_att = QLearningAgentHierarchicalAttention(env, operate_action_space_size, *_hier_att_hyperparams)

    # Keys are the option indices: 0 = navigation, 1 = information collection, 2 = operation/triage.
    workers_hier_att = dict(enumerate((
        explore_worker_hier_att,
        collect_worker_hier_att,
        operate_worker_hier_att,
    )))
    hier_returns_att, attention, workers_simple_att = manager_hier_att.train(
        manager_hier_att, workers_hier_att, EPISODES
    )
    all_total_rewards_AGENT_hier_att.append(hier_returns_att)

# Average the per-episode returns across runs.
avg_total_rewards_AGENT_hier_att = np.mean(all_total_rewards_AGENT_hier_att, axis=0)

  0%|          | 0/1500 [00:00<?, ?it/s]

Episode: 0 | Reward: 0 | epsilon: 1.0
Episode: 100 | Reward: -34 | epsilon: 0.8666666666666702
Episode: 200 | Reward: -31 | epsilon: 0.7333333333333405


Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Episode: 300 | Reward: -41 | epsilon: 0.6000000000000107
real LLM is about to start handling the input Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Police told us that no access allowed around the petrol station.
Hazardous Coordinates: [(5, 6), (6, 5), (3, 6), (2, 5)]
Point of Interest Coordinates: [(0, 3), (4, 1), (3, 0), (2, 0), (1, 2)]
real LLM is about to end handling the input Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Po

In [16]:
# Roll out the trained attention-augmented manager and workers with evaluate_HRL_agent
# (defined elsewhere in this notebook); the per-step trace is printed and the return value discarded.
_ = evaluate_HRL_agent(env, manager_hier_att, workers_hier_att)

Step 1: || State=(4, 1, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 2: || State=(4, 2, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 3, 0, 0) || Done=False
Step 3: || State=(4, 3, 0, 0) || Option=NAVIGATION || Action=RIGHT || Reward=-1 || Next State=(4, 4, 0, 0) || Done=False
Step 4: || State=(4, 4, 0, 0) || Option=INFORMATION_COLLECTION || Action=COLLECT_X || Reward=5 || Next State=(4, 4, 1, 0) || Done=False
Step 5: || State=(4, 4, 1, 0) || Option=NAVIGATION || Action=DOWN || Reward=-1 || Next State=(5, 4, 1, 0) || Done=False
Step 6: || State=(5, 4, 1, 0) || Option=NAVIGATION || Action=DOWN || Reward=-1 || Next State=(6, 4, 1, 0) || Done=False
Step 7: || State=(6, 4, 1, 0) || Option=NAVIGATION || Action=LEFT || Reward=-1 || Next State=(6, 3, 1, 0) || Done=False
Step 8: || State=(6, 3, 1, 0) || Option=NAVIGATION || Action=LEFT || Reward=-1 || Next State=(6, 2, 1, 0) || Done=False
Step 9: || State=(6, 