# SMC 2024 submission (notebook version)

## Flat Agent and Flat Attention Agent (with policy shaping) - SPARSE reward environment

In [1]:
### this is where we convert the problem into compatible mode for gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
import pickle
import json
import os
import tensorflow as tf
from tqdm.auto import tqdm
from termcolor import colored

import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.registration import register, registry, EnvSpec
from gymnasium.utils.env_checker import check_env

from enum import Enum
import collections
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

2024-09-24 15:11:52.493009: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-24 15:11:52.521059: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## LLM assistant setup

In [2]:
import os
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import JSONLoader
from pprint import pprint

def get_file_type(document_path):
    """Return the extension of ``document_path`` without the leading dot.

    Returns ``None`` when the path has no extension at all.
    """
    suffix = os.path.splitext(document_path)[1]
    if not suffix:
        return None
    # Drop the leading period reported by ``os.path.splitext``.
    return suffix[1:]

### Class that handles disaster-related verbal inputs: it analyzes them using a RAG (retrieval-augmented generation) architecture and generates a
# response in a specified format. It leverages models like ChatOllama and techniques like vector storage and retrieval for its operations.
class DisasterResponseAssistant:
    """RAG-based assistant that classifies locations mentioned in verbal input.

    On construction it loads a chat model, loads the reference document
    (PDF or JSON), embeds it into a Chroma vector store and builds a
    retriever. ``generate_response`` runs the retrieval + prompt + LLM chain
    and ``refine_response`` turns the model's textual answer into two lists of
    coordinate tuples (hazards and points of interest).

    Args:
        data_path: Path of the reference document.
        data_type: ``'pdf'`` or ``'json'``; anything else raises ``ValueError``.
        model_name: Model name passed to ``ChatOllama``.
        embedding_model: Model name passed to ``OllamaEmbeddings``.
        collection_name: Chroma collection name.
    """

    def __init__(self, data_path, data_type, model_name="mistral", embedding_model='nomic-embed-text', collection_name="rag-chroma"):
        self.model_name = model_name
        self.embedding_model = embedding_model
        self.collection_name = collection_name
        self.data_path = data_path
        self.data_type = data_type

        self.llm = None
        self.loader = None
        self.data = None
        self.vectorstore = None
        self.retriever = None

        self._load_model()            # Initializes an instance of the ChatOllama model
        self._load_documents()        # Loads the PDF/JSON document
        self._create_vectorstore()    # Creates a vector store using Chroma from the loaded documents
        self._create_retriever()      # Creates a retriever from the vector store

        self.hazard_coordinates = []  # To store hazard coordinates
        self.poi_coordinates = []     # To store points of interest coordinates

    def _load_model(self):
        """Instantiate the chat model."""
        self.llm = ChatOllama(model=self.model_name)

    def _load_documents(self):
        """Load the reference document into ``self.data``.

        Raises:
            ValueError: If ``self.data_type`` is neither ``'pdf'`` nor ``'json'``.
        """
        print(f"document {self.data_type} will be infused")
        if self.data_type == 'pdf':
            self.loader = PyPDFLoader(self.data_path)
            self.data = self.loader.load_and_split()
        elif self.data_type == 'json':
            self.loader = JSONLoader(
                file_path=self.data_path,
                jq_schema='.',
                text_content=False)
            self.data = self.loader.load()
        else:
            raise ValueError("Unsupported document type. Please choose either 'pdf' or 'json'.")

    def _create_vectorstore(self):
        """Embed the loaded documents into a Chroma collection."""
        self.vectorstore = Chroma.from_documents(
            documents=self.data,
            collection_name=self.collection_name,
            embedding=embeddings.OllamaEmbeddings(model=self.embedding_model),
        )

    def _create_retriever(self):
        """Expose the vector store as a retriever."""
        self.retriever = self.vectorstore.as_retriever()

    ### generate a response based on a verbal input
    ### construct a template for the response using RAG architecture
    def generate_response(self, verbal_input):
        """Run the retrieval + prompt + LLM chain on ``verbal_input``.

        Returns:
            The model's answer as a string (output of ``StrOutputParser``).
        """
        prompt_template = """You are an assistant, who carefully listens to verbal inputs: {verbal_input} and specialized in analyzing disaster-related inputs. Your task is 
to identify physical locations mentioned in the text and classify them as either points of interest (POI) or as hazards/dangers (HAZARD) for rescue operations. Use the
information provided in the documents: {context}, such as KEYWORDS, descriptions and context when locations are mentioned, to make your classification.
Output the classification in the form of a JSON array dictionary with keys 'location', 'coordinates', and 'category'. Here are some rules you always follow:
- Focus strictly on physical locations. Avoid including entities that do not represent physical, geographical places (such as individuals, conditions, or 
  abstract concepts).
- Generate human-readable output in the specified dictionary format.
- Generate only the requested output, strictly following the dictionary structure.
- Within the dictionary, the value of the `category` key must be either 'POI' or 'HAZARD'. 
- Never generate offensive or foul language.
- Never give explanations over your output.
Input: {verbal_input}
"""
        system_template = ChatPromptTemplate.from_template(prompt_template)
        output_parser = StrOutputParser()
        after_rag_chain = (
            {"context": self.retriever, "verbal_input": RunnablePassthrough()}
            | system_template
            | self.llm  # chat model created in _load_model
            | output_parser
        )
        response = after_rag_chain.invoke(verbal_input)
        return response

    def refine_response(self, output):
        """Parse the model's answer and accumulate the extracted coordinates.

        Newlines are removed and parentheses are turned into brackets so that
        tuple-style coordinates such as ``(1, 2)`` become valid JSON. If the
        text as a whole is not valid JSON (for example because the model
        wrapped it in a Markdown code fence or added surrounding prose), the
        span from the first ``[`` to the last ``]`` is parsed instead. A
        single JSON object is treated as a one-element array.

        Entries whose ``category`` is ``'HAZARD'`` go to
        ``self.hazard_coordinates``; every other entry goes to
        ``self.poi_coordinates``. Both lists keep growing across calls.

        Returns:
            ``(self.hazard_coordinates, self.poi_coordinates)``.

        Raises:
            json.JSONDecodeError: If no JSON payload can be recovered.
        """
        cleaned_output_str = output.strip().replace('\n', '').replace('(', '[').replace(')', ']')
        try:
            parsed = json.loads(cleaned_output_str)
        except json.JSONDecodeError:
            # Fall back to the outermost bracketed span (drops code fences / chatter).
            start = cleaned_output_str.find('[')
            end = cleaned_output_str.rfind(']')
            if start == -1 or end <= start:
                raise
            parsed = json.loads(cleaned_output_str[start:end + 1])

        if isinstance(parsed, dict):
            parsed = [parsed]

        for item in parsed:
            coord = tuple(item['coordinates'])
            if item['category'] == 'HAZARD':
                self.hazard_coordinates.append(coord)
            else:
                self.poi_coordinates.append(coord)

        print("Hazardous Coordinates:", self.hazard_coordinates)
        print("Point of Interest Coordinates:", self.poi_coordinates)
        return self.hazard_coordinates, self.poi_coordinates

## Robot and Environment setup

In [3]:
class RobotAction(Enum):
    """Discrete actions available to the robot.

    The integer values double as action indices (the environment's action
    space has ``len(RobotAction)`` entries and Q-tables are indexed by them).
    """
    # Movement: UP/DOWN change the row index, LEFT/RIGHT the column index.
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3
    # Information collection: X, Y and Z are the actions accepted at the
    # first, second and third information cell respectively.
    COLLECT_X = 4
    COLLECT_Y = 5
    COLLECT_Z = 6
    # The remaining COLLECT_* actions are never the accepted action in
    # searchANDrescueRobot.perform_action.
    COLLECT_A = 7
    COLLECT_B = 8
    COLLECT_C = 9
    # SAVE is the accepted action at the target cell once all information is held.
    SAVE = 10
    # USE, REMOVE and CARRY are never the accepted action in perform_action either.
    USE = 11
    REMOVE = 12
    CARRY = 13

class GridTile(Enum):
    """Tile kinds used when printing the grid as text."""
    _FLOOR = 0
    ROBOT = 1
    TARGET = 2
    X_INFO = 3
    Y_INFO = 4
    Z_INFO = 5
    DITCH = 6

    def __str__(self):
        # A tile is rendered as the first character of its member name.
        return self.name[0]

In [4]:
def evaluate_agent(env, agent, test_episodes=1):
    """Roll out the agent's greedy policy and report per-episode returns.

    Each episode resets ``env`` with ``seed=episode`` and repeatedly takes the
    arg-max action of ``agent.Q_table`` until the environment reports the
    episode as terminated *or* truncated. Every step is printed, and steps that
    end on a cell listed in ``env.sar_robot.fires`` are counted as collisions.

    Args:
        env: Environment exposing ``reset``, ``step`` and ``sar_robot.fires``.
        agent: Agent exposing ``get_state`` and ``Q_table``.
        test_episodes: Number of evaluation episodes.

    Returns:
        List with the total return of each episode (empty when
        ``test_episodes`` is 0).
    """
    total_rewards = []
    for episode in range(test_episodes):
        obs, _ = env.reset(seed=episode)
        state = agent.get_state(obs)
        done = False
        total_return, step, cnt = 0, 0, 0
        collisions = []
        while not done:
            action = np.argmax(agent.Q_table[state])  # greedy action
            next_obs, reward, terminated, truncated, _ = env.step(action)
            # Honour truncation as well, otherwise a time-limited env never ends the loop.
            done = bool(terminated or truncated)
            next_state = agent.get_state(next_obs)

            print(f"Step {step+1}: || State={state} || Action={RobotAction(action).name}|| Reward={reward} || Next State={next_state} || Done={done}")
            total_return += reward
            state = next_state
            step += 1

            position = (state[0], state[1])
            if position in env.sar_robot.fires:
                print(colored("Robot is in fire!", "red"))
                cnt += 1
                collisions.append(position)
        total_rewards.append(total_return)
        print(f"Test {episode}: Finished after {step} steps with total reward {total_return} and {cnt} collisions at {collisions}.")
    # Avoid dividing by zero when no episode was run.
    avg_reward = sum(total_rewards) / len(total_rewards) if total_rewards else float("nan")
    print(f"Average reward over {test_episodes} testing episodes: {avg_reward}")
    return total_rewards

In [12]:
class searchANDrescueRobot:
    """Grid-world model of the search-and-rescue robot.

    The robot has to collect three pieces of information in order (at
    ``info_pos1``, ``info_pos2``, ``info_pos3``) and then perform ``SAVE`` at
    ``target_pos``. Collecting the third piece triggers a verbal report that
    is processed by a ``DisasterResponseAssistant``; the resulting hazard and
    point-of-interest coordinates are stored in ``fires`` / ``POIs`` and in
    ``sensor_readings``.

    Args:
        grid_rows: Number of grid rows.
        grid_cols: Number of grid columns.
        info_number_needed: Amount of information required before ``SAVE``
            is accepted at the target.
        document_path: Reference document handed to the assistant. The default
            keeps the original hard-coded location.
    """

    def __init__(self, grid_rows=7, grid_cols=7, info_number_needed=3,
                 document_path="/home/dimi/HRL-LLM/data/sar_data.json"):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.info_number_needed = info_number_needed
        self.reset()
        # for LLM integration
        self.ask_action_counter = 0
        self.visited_information_state = False
        self.input_received = False
        self.POIs, self.fires, self.hazards, self.pois = [], [], [], []
        document_type = get_file_type(document_path)
        self.assistant = DisasterResponseAssistant(document_path, document_type)
        self.sensor_readings = {}

    def reset(self, seed=None):
        """Put the robot back at its start cell and clear per-episode state.

        When ``seed`` is given, the ``random`` module is seeded *before* the
        start position is drawn so that the draw itself is reproducible. With
        ``seed=None`` the global generator is left untouched.
        """
        if seed is not None:
            random.seed(seed)
        self.init_positions = [[4, 1]]
        # Copy so that moving the robot never mutates the entry in init_positions.
        self.robot_pos = list(random.choice(self.init_positions))
        self.has_info = 0
        self.has_saved = 0
        self.target_pos = [0, 3]
        self.info_pos1 = [4, 4]
        self.info_pos2 = [6, 2]
        self.info_pos3 = [5, 5]
        self.ditches = [(1, 6), (2, 2), (2, 4), (3, 2), (3, 3), (3, 4), (4, 5), \
                        (5, 0), (5, 1), (5, 2), (6, 0), (0, 2), (0, 4)]

        self.POIs, self.fires = [], []
        self.visited_information_state = False

    def _move(self, pos, robot_action):
        """Apply a movement action to ``pos`` in place, respecting the grid borders."""
        if robot_action == RobotAction.UP:
            if pos[0] > 0:
                pos[0] -= 1
        elif robot_action == RobotAction.DOWN:
            if pos[0] < self.grid_rows-1:
                pos[0] += 1
        elif robot_action == RobotAction.LEFT:
            if pos[1] > 0:
                pos[1] -= 1
        elif robot_action == RobotAction.RIGHT:
            if pos[1] < self.grid_cols-1:
                pos[1] += 1
        return pos

    def next_state_vision(self, target, robot_action:RobotAction) -> list:
        """Return the cell reached from ``target`` by ``robot_action``.

        ``target`` is a mutable ``[row, col]`` list; it is updated in place and
        also returned. Non-movement actions leave it unchanged.

        NOTE(review): this look-ahead also overwrites ``self.last_action``;
        kept for compatibility with the original behaviour.
        """
        self.last_action = robot_action
        return self._move(target, robot_action)

    def perform_action(self, robot_action:RobotAction):
        """Execute ``robot_action`` and report what happened.

        Movement actions shift the robot (clipped at the grid borders). Any
        other action is only meaningful on an information cell (in the right
        order) or on the target cell; there, the single accepted action
        updates the progress counters and every other non-movement action is
        flagged as illegal.

        Returns:
            Tuple ``(mission_complete, info_collected_X, info_collected_Y,
            total_info_collected, illegal_action)`` of booleans.
        """
        self.last_action = robot_action
        info_collected_X, info_collected_Y = False, False
        total_info_collected = False
        illegal_action = False
        movement_actions = (RobotAction.UP, RobotAction.DOWN, RobotAction.LEFT, RobotAction.RIGHT)
        interaction_actions = (RobotAction.COLLECT_A, RobotAction.COLLECT_B, RobotAction.COLLECT_C,
                               RobotAction.COLLECT_X, RobotAction.COLLECT_Y, RobotAction.COLLECT_Z,
                               RobotAction.SAVE, RobotAction.USE, RobotAction.REMOVE, RobotAction.CARRY)
        if robot_action in movement_actions:
            self._move(self.robot_pos, robot_action)

        elif robot_action in interaction_actions:
            if self.robot_pos == self.info_pos1 and self.has_info < 1:
                if robot_action == RobotAction.COLLECT_X:
                    self.has_info += 1
                    info_collected_X = True
                else:
                    illegal_action = True
            elif self.robot_pos == self.info_pos2 and self.has_info == 1:
                if robot_action == RobotAction.COLLECT_Y:
                    self.has_info += 1
                    info_collected_Y = True
                else:
                    illegal_action = True
            elif self.robot_pos == self.info_pos3 and self.has_info == 2:
                if robot_action == RobotAction.COLLECT_Z:
                    # Must run before has_info is incremented: it checks has_info == 2.
                    self.perform_collect_action()
                    self.has_info += 1
                    total_info_collected = True
                else:
                    illegal_action = True
            elif self.robot_pos == self.target_pos and self.has_info == self.info_number_needed:
                if robot_action == RobotAction.SAVE:
                    self.has_saved = 1
                else:
                    illegal_action = True

        # bool(...) so callers always get True/False rather than the raw has_saved int.
        mission_complete = bool(self.robot_pos == self.target_pos
                                and self.has_info == self.info_number_needed
                                and self.has_saved)
        return mission_complete, info_collected_X, info_collected_Y, total_info_collected, illegal_action

    def perform_collect_action(self):
        """Process the verbal report available at the last information cell.

        The first time this runs, the report is sent to the assistant and the
        returned hazards / points of interest are cached in ``self.hazards`` /
        ``self.pois``. Later calls reuse the cached lists. In both cases the
        coordinates are pushed into the environment via
        ``update_environment_REAL``.
        """
        self.ask_action_counter += 1
        if self.has_info != 2:  ## should be 2 if total number of infos are 3
            return
        verbal_input = ("Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Police told us that no access allowed around the petrol station.")

        if self.ask_action_counter <= 1:
            try:
                print(f"real LLM is about to start handling the input {verbal_input}")
                response = self.assistant.generate_response(verbal_input)
                if response:
                    self.visited_information_state = True
                self.hazards, self.pois = self.assistant.refine_response(response)
                print(f"real LLM is about to end handling the input {verbal_input}")
            except Exception:
                # A failed first query must not consume the single "ask":
                # undo the count so the next collect retries the LLM.
                self.ask_action_counter -= 1
                raise
            self.update_environment_REAL(self.hazards, self.pois)
        else:
            # Reuse the hazards / POIs obtained from the first query.
            self.visited_information_state = True
            self.update_environment_REAL(self.hazards, self.pois)

    def update_environment_REAL(self, haz, poi):
        """Record hazards and points of interest in the sensor readings.

        Readings are keyed by ``(row, col, 3, 0)``; hazards get ``-10.0`` and
        points of interest ``10.0``. The coordinates are also appended to
        ``self.fires`` and ``self.POIs`` respectively.
        """
        for hazardous_location in haz:
            self.sensor_readings[(hazardous_location[0], hazardous_location[1], 3, 0)] = -10.0
            self.fires.append(hazardous_location)
        for safe_location in poi:
            self.sensor_readings[(safe_location[0], safe_location[1], 3, 0)] = 10.0
            self.POIs.append(safe_location)

    def is_in_ditch(self):
        """Return True when the robot currently stands on a ditch cell."""
        return tuple(self.robot_pos) in self.ditches

    def _tile_at(self, cell):
        """Return the tile to draw for ``cell`` (robot first, floor last)."""
        if cell == self.robot_pos:
            return GridTile.ROBOT
        if cell == self.target_pos:
            return GridTile.TARGET
        if cell == self.info_pos1:
            return GridTile.X_INFO
        if cell == self.info_pos2:
            return GridTile.Y_INFO
        if cell == self.info_pos3:
            return GridTile.Z_INFO
        if tuple(cell) in self.ditches:
            return GridTile.DITCH
        return GridTile._FLOOR

    def render(self):
        """Print the grid as text, one row per line, followed by a blank line."""
        for x in range(self.grid_rows):
            for y in range(self.grid_cols):
                print(self._tile_at([x, y]), end=' ')
            print()
        print()


class SARrobotEnv(gym.Env):
    """Gymnasium wrapper around ``searchANDrescueRobot`` with a sparse reward.

    Observations are ``[row, col, has_info, has_saved]`` as ``int32``. Only
    ditches, the step limit, illegal actions and mission completion change the
    reward; collecting information is not rewarded. Every step additionally
    costs ``turnPenalty``.
    """
    metadata = {"render_modes": ["human"], 'render_fps': 1}

    def __init__(self, grid_rows=7, grid_cols=7, render_mode=None, info_number_needed=3):
        self.grid_rows = grid_rows
        self.grid_cols = grid_cols
        self.render_mode = render_mode

        self.sar_robot = searchANDrescueRobot(grid_rows, grid_cols, info_number_needed)
        self.action_space = spaces.Discrete(len(RobotAction))

        # Upper bounds for [row, col, has_info, has_saved].
        obs_upper = np.array([self.grid_rows-1, self.grid_cols-1, info_number_needed, 1])
        self.observation_space = spaces.Box(low=0, high=obs_upper, shape=(4,), dtype=np.int32)

        # Episode bookkeeping.
        self.max_steps = 50
        self.current_step = 0

        # Reward constants.
        self.turnPenalty = -1
        self.stepsPenalty = -5
        self.ditchPenalty = -30
        self.illegalActionPenalty = -5  # Penalty for illegal actions
        self.winReward = 100

    def _observation(self):
        """Assemble the current ``[row, col, has_info, has_saved]`` observation."""
        robot = self.sar_robot
        parts = (robot.robot_pos, [robot.has_info], [robot.has_saved])
        return np.concatenate(parts).astype(np.int32)

    def reset(self, seed=None, options=None):
        """Reset the robot and the step counter; return ``(obs, info)``."""
        super().reset(seed=seed)
        self.sar_robot.reset(seed=seed)
        self.current_step = 0
        return self._observation(), {}

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, terminated, False, info)``.

        The reward is decided by successive overrides, in this order: ditch,
        step limit, mission complete, illegal action. The per-step
        ``turnPenalty`` is then added on top. Reaching the step limit is
        reported through ``terminated``; the truncation flag is always False.
        """
        self.current_step += 1
        outcome = self.sar_robot.perform_action(RobotAction(action))
        target_reached, _, _, _, illegal_action = outcome

        reward, terminated = 0, False

        if self.sar_robot.is_in_ditch():
            reward, terminated = self.ditchPenalty, True

        if self.is_max_steps_exceeded():
            reward, terminated = self.stepsPenalty, True

        # Sparse-reward variant: no bonus is granted for collecting information.

        if target_reached:
            reward, terminated = self.winReward, True

        if illegal_action:
            reward = self.illegalActionPenalty

        reward += self.turnPenalty

        obs = self._observation()

        if self.render_mode == 'human':
            print(f"Action: {RobotAction(action)}, Reward: {reward}, Terminated: {terminated}")
            self.render()

        return obs, reward, terminated, False, {}

    def is_max_steps_exceeded(self):
        """Return True once ``current_step`` has reached ``max_steps``."""
        return self.current_step >= self.max_steps

    def render(self):
        """Print the grid when the render mode is ``'human'``."""
        if self.render_mode == 'human':
            self.sar_robot.render()

# Build the shared environment. render_mode must be the None object (no
# rendering), not the string 'None', which is not a declared render mode.
env = SARrobotEnv(grid_rows=7, grid_cols=7, render_mode=None, info_number_needed=3)

document json will be infused


## Flat learning agent (Q-learning)

In [6]:
class QLearningAgentFlat:
    """Tabular Q-learning agent acting directly on the flat state space.

    The Q-table has one axis per observation component plus a final axis for
    the actions. Episode returns and lengths are written to a TensorFlow
    summary writer under ``log_dir``.
    """

    def __init__(self, env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, log_dir="curve-SMC-HRL_sparse/flatQ"):
        self.env = env
        # Learning hyper-parameters.
        self.ALPHA = ALPHA
        self.GAMMA = GAMMA
        # Exploration schedule.
        self.EPSILON_MAX = EPSILON_MAX
        self.EPSILON = EPSILON_MAX
        self.DECAY_RATE = DECAY_RATE
        self.EPSILON_MIN = EPSILON_MIN
        # One table axis per observation component: 7*7*4*2 for the default env.
        upper_bounds = self.env.observation_space.high
        self.num_states = tuple(upper_bounds[axis] + 1 for axis in range(4))
        self.num_actions = self.env.action_space.n
        self.Q_table = np.zeros((*self.num_states, self.num_actions))
        self.writer = tf.summary.create_file_writer(log_dir)

    def epsilon_greedy_policy(self, state):
        """Pick a random action with probability EPSILON, else the greedy one."""
        if np.random.rand() >= self.EPSILON:
            return np.argmax(self.Q_table[state])  # exploit
        return self.env.action_space.sample()      # explore

    def get_state(self, observation):
        """Convert an observation into a hashable Q-table index."""
        return tuple(observation)

    def decay_epsilon(self, episodes):
        """Lower EPSILON by DECAY_RATE/episodes; at 0.1 or below, snap to EPSILON_MIN."""
        if not self.EPSILON > 0.1:
            self.EPSILON = self.EPSILON_MIN
        else:
            self.EPSILON -= self.DECAY_RATE/episodes
        return self.EPSILON

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning backup for the transition (state, action, reward, next_state)."""
        bootstrap = self.Q_table[next_state].max()
        td_target = reward + self.GAMMA * bootstrap
        td_error = td_target - self.Q_table[state][action]
        self.Q_table[state][action] += self.ALPHA * td_error

    def train(self, num_episodes):
        """Train for ``num_episodes`` episodes.

        Returns:
            ``(returns_per_episode, steps_per_episode, return_list)``: the
            first two are NumPy arrays, the last a plain list of returns.
        """
        episode_returns = []
        returns_array = np.zeros(num_episodes)
        steps_array = np.zeros(num_episodes)
        # Holds the previous episode's return for the periodic progress line.
        last_return, n_steps = 0, 0
        for episode in tqdm(range(num_episodes)):
            if episode % 100 == 0:
                print(f"episode: {episode} | reward: {last_return} | epsilon: {self.EPSILON}")

            observation, _ = self.env.reset(seed=episode)
            state = self.get_state(observation)

            last_return, n_steps = 0, 0
            terminated = False
            while not terminated:
                action = self.epsilon_greedy_policy(state)
                next_observation, reward, terminated, _, _ = self.env.step(action)
                next_state = self.get_state(next_observation)
                last_return += reward
                self.update(state, action, reward, next_state)
                state = next_state
                n_steps += 1

            # Log the rewards and steps to Tensorboard
            with self.writer.as_default():
                tf.summary.scalar('Episode Return', last_return, step=episode)
                tf.summary.scalar('Steps per Episode', n_steps, step=episode)

            self.EPSILON = self.decay_epsilon(num_episodes)
            returns_array[episode] = last_return
            steps_array[episode] = n_steps
            episode_returns.append(last_return)

        return returns_array, steps_array, episode_returns

### Run (flat learning agent)

In [7]:
# Hyper-parameters of the flat Q-learning run.
EPISODES = 1000
ALPHA = 0.1
GAMMA = 0.98
EPSILON_MAX = 1.0
EPSILON_MIN = 0.01
DECAY_RATE = 2

all_total_rewards_AGENT_flat = []  # per-run arrays of episode returns
all_total_steps_AGENT_flat = []    # per-run arrays of episode lengths
for _ in range(1):
    agent_flat = QLearningAgentFlat(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN)
    rewards_flat, steps_flat, returns_flat = agent_flat.train(EPISODES)

    all_total_rewards_AGENT_flat.append(rewards_flat)
    all_total_steps_AGENT_flat.append(steps_flat)

# Average the learning curves over the independent runs.
avg_total_rewards_AGENT_flat = np.mean(all_total_rewards_AGENT_flat, axis=0)
avg_total_steps_AGENT_flat = np.mean(all_total_steps_AGENT_flat, axis=0)

2024-09-24 15:13:09.908064: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-24 15:13:09.911801: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-24 15:13:09.911924: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

  0%|          | 0/1000 [00:00<?, ?it/s]

episode: 0 | reward: 0 | epsilon: 1.0
episode: 100 | reward: -55 | epsilon: 0.7999999999999998
episode: 200 | reward: -55 | epsilon: 0.5999999999999996
episode: 300 | reward: -55 | epsilon: 0.39999999999999947
episode: 400 | reward: -59 | epsilon: 0.1999999999999993
episode: 500 | reward: -55 | epsilon: 0.01
episode: 600 | reward: -55 | epsilon: 0.01
episode: 700 | reward: -55 | epsilon: 0.01
episode: 800 | reward: -55 | epsilon: 0.01
episode: 900 | reward: -55 | epsilon: 0.01


### Evaluate (flat learning agent)

In [17]:
# Greedy roll-out of the trained flat agent (one episode by default);
# the returned list of episode returns is discarded.
_ = evaluate_agent(env, agent_flat)

Step 1: || State=(4, 1, 0, 0) || Action=RIGHT|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 2: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 3: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 4: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 5: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 6: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 7: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 8: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 9: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 10: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False


## Flat learning agent (Q-learning) + Attention

In [13]:
class AttentionSpace:
    """Table of action preferences derived from sensor readings.

    ``attention_space_low`` has the same shape as the agent's Q-table. Entries
    are filled for state/action pairs that lead into a cell with a changed
    sensor reading, and can then be copied over a Q-table.
    """

    def __init__(self, env):
        self.env = env
        self.num_states = (self.env.observation_space.high[0] + 1,
                           self.env.observation_space.high[1] + 1,
                           self.env.observation_space.high[2] + 1,
                           self.env.observation_space.high[3] + 1)  # 7*7*4*2
        self.num_actions = self.env.action_space.n
        self.attention_space_low = np.zeros((*self.num_states, self.num_actions))

    def identify_changed_states(self, readings):
        """Return the keys of ``readings`` whose value differs from 1."""
        return [state for state, value in readings.items() if value != 1]

    def get_connected_states(self, target_state):
        """List ``(cell, action)`` pairs such that ``action`` taken in ``cell`` enters the target cell.

        Each movement action is undone from the target cell with its inverse.
        A predecessor is dropped when the inverse move is blocked by the grid
        border (the position does not change) or when it is a ditch.
        """
        inverse_actions = {
            RobotAction.UP.value: RobotAction.DOWN.value,
            RobotAction.DOWN.value: RobotAction.UP.value,
            RobotAction.LEFT.value: RobotAction.RIGHT.value,
            RobotAction.RIGHT.value: RobotAction.LEFT.value
        }
        target_cell = tuple(target_state[:2])
        ditches = self.env.sar_robot.ditches
        connected_states_pairs = []
        # Iterate over the movement actions themselves instead of relying on
        # "num_actions - 10" happening to equal the number of movement actions.
        for action, inverse in inverse_actions.items():
            previous_cell = tuple(self.env.sar_robot.next_state_vision(
                list(target_state[:2]),
                RobotAction(inverse)
            ))
            if previous_cell != target_cell and previous_cell not in ditches:
                connected_states_pairs.append((previous_cell, action))
        return connected_states_pairs

    def update_attention_space(self, connection, readings):
        """Write attention values for every move that leads into ``connection``.

        A positive reading marks the incoming moves with ``2.0``, any other
        reading with ``-100.0``. If ``connection`` lies on the target cell, the
        ``SAVE`` action in that state is additionally set to ``100``.
        """
        value_to_add = 2.0 if readings[connection] > 0 else -100.0
        for connected_state, action in self.get_connected_states(connection):
            full_state = tuple([*connected_state, connection[2], connection[3]])
            self.attention_space_low[full_state][action] = value_to_add
        # Special handling for victim state
        if [connection[0], connection[1]] == self.env.sar_robot.target_pos:
            self.attention_space_low[connection][RobotAction.SAVE.value] = 100

    def apply_attention_to_q_table(self, Q_table):
        """Overwrite ``Q_table`` entries with every non-zero attention value (in place)."""
        for index, value in np.ndenumerate(self.attention_space_low):
            *state_indices, action = index
            if value != 0:
                Q_table[tuple(state_indices)][action] = value
                print(f"Updated Q-table at {tuple(state_indices)}, action {action} with value {value}")

In [14]:
class QLearningAgentFlatAttention:
    """Flat tabular Q-learning agent with one-shot policy shaping from an attention space.

    The Q-table is indexed by the four observation components plus the action. The
    first time ``env.sar_robot.visited_information_state`` is observed to be true during
    training, the robot's ``sensor_readings`` are turned into attention values by
    ``AttentionSpace`` and written directly into the Q-table.
    """

    def __init__(self, env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN, log_dir="curve-SMC-HRL_sparse/flatQ-Att"):
        """Set up hyperparameters, an all-zero Q-table, the attention space and the log writer.

        Args:
            env: Environment exposing ``observation_space.high`` (at least 4 entries),
                ``action_space.n`` and a ``sar_robot`` attribute.
            ALPHA: Learning rate.
            GAMMA: Discount factor.
            EPSILON_MAX: Initial exploration rate.
            DECAY_RATE: Total amount subtracted from epsilon over ``episodes`` calls
                to ``decay_epsilon``.
            EPSILON_MIN: Exploration rate used once epsilon has decayed.
            log_dir: Directory handed to ``tf.summary.create_file_writer``.
        """
        self.env = env
        self.ALPHA = ALPHA
        self.GAMMA = GAMMA
        self.EPSILON_MAX = EPSILON_MAX
        self.EPSILON = EPSILON_MAX
        self.DECAY_RATE = DECAY_RATE
        self.EPSILON_MIN = EPSILON_MIN
        self.num_states = (self.env.observation_space.high[0] + 1,
                           self.env.observation_space.high[1] + 1,
                           self.env.observation_space.high[2] + 1,
                           self.env.observation_space.high[3] + 1)  # 7*7*4*2
        self.num_actions = self.env.action_space.n
        self.Q_table = np.zeros((*self.num_states, self.num_actions))
        self.attention_space = AttentionSpace(self.env)  # Instantiate the new AttentionSpace class
        # Never reset after being set, so attention is applied at most once per agent.
        self.input_received = False
        self.writer = tf.summary.create_file_writer(log_dir)

    def epsilon_greedy_policy(self, state):
        """Pick an action: epsilon-greedy before the information state, greedy after.

        Args:
            state: Tuple index into the Q-table.

        Returns:
            A sampled action while exploring, otherwise the argmax action for ``state``.
        """
        still_exploring = not self.env.sar_robot.visited_information_state
        if still_exploring and np.random.rand() < self.EPSILON:
            return self.env.action_space.sample()  # Explore: choose a random action
        return np.argmax(self.Q_table[state])  # Exploit: choose the action with max Q-value

    def get_state(self, observation):
        """Convert an observation into a hashable tuple usable as a Q-table index."""
        return tuple(observation)

    def decay_epsilon(self, episodes):
        """Linearly decay epsilon, snapping to ``EPSILON_MIN`` once it reaches 0.1.

        Args:
            episodes: Number of episodes over which ``DECAY_RATE`` is spread.

        Returns:
            The updated epsilon, never below ``EPSILON_MIN``.
        """
        if self.EPSILON > 0.1:
            # Clamp so that a large step cannot push epsilon below the floor
            # (or negative) for the following episode.
            self.EPSILON = max(self.EPSILON_MIN, self.EPSILON - self.DECAY_RATE / episodes)
        else:
            self.EPSILON = self.EPSILON_MIN
        return self.EPSILON

    def decay_epsilon_exploit(self):
        """Force epsilon to the fixed exploitation value 0.01 and return it."""
        self.EPSILON = 0.01
        return self.EPSILON

    def update(self, state, action, reward, next_state, terminated=False):
        """Apply one Q-learning update to ``Q_table[state][action]``.

        Args:
            state: Tuple index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Tuple index of the resulting state.
            terminated: True when the transition ended the episode in a terminal
                state. No future value is bootstrapped in that case, so values stored
                in the terminal state's row cannot leak into the target.
        """
        if terminated:
            td_target = reward
        else:
            best_next_action = np.argmax(self.Q_table[next_state])
            td_target = reward + self.GAMMA * self.Q_table[next_state][best_next_action]
        td_error = td_target - self.Q_table[state][action]
        self.Q_table[state][action] += self.ALPHA * td_error

    def update_attention(self, sensor_readings):
        """Build attention values from ``sensor_readings`` and write them into the Q-table.

        Nothing happens when ``identify_changed_states`` reports no changed states.
        """
        changed_states = self.attention_space.identify_changed_states(sensor_readings)
        if changed_states:
            for state in changed_states:
                self.attention_space.update_attention_space(state, sensor_readings)
            self.attention_space.apply_attention_to_q_table(self.Q_table)

    def train(self, num_episodes):
        """Run Q-learning for ``num_episodes`` episodes.

        Args:
            num_episodes: Number of episodes; also used as the epsilon decay horizon.

        Returns:
            tuple: ``(total_rewards_per_episode, total_steps_per_episode)``, two arrays
            of length ``num_episodes``.
        """
        total_rewards_per_episode = np.zeros(num_episodes)
        total_steps_per_episode = np.zeros(num_episodes)
        Rewards = 0
        for episode in tqdm(range(num_episodes)):
            if episode % 100 == 0:
                # `Rewards` still holds the return of the previous episode here.
                print(f"episode: {episode} | reward: {Rewards} | epsilon: {self.EPSILON}")

            obs, _ = self.env.reset(seed=episode)
            s = self.get_state(obs)
            done = False
            Rewards, steps_cnt = 0, 0

            while not done:
                # Check if we have new sensor readings and update attention
                if self.env.sar_robot.visited_information_state and not self.input_received:
                    self.update_attention(self.env.sar_robot.sensor_readings)
                    self.input_received = True
                    print(f"Updated attention space with new information at episode {episode}")

                # Choose action using epsilon-greedy policy
                a = self.epsilon_greedy_policy(s)

                # Step the environment. `truncated` (e.g. a step limit) also ends the
                # episode, but only `terminated` marks a true terminal state.
                obs_, r, terminated, truncated, _ = self.env.step(a)
                s_ = self.get_state(obs_)
                done = terminated or truncated

                # Update Q-table
                self.update(s, a, r, s_, terminated)
                s = s_

                Rewards += r
                steps_cnt += 1

            # Log rewards and steps to Tensorboard
            if self.writer:
                with self.writer.as_default():
                    tf.summary.scalar('Episode Return', Rewards, step=episode)
                    tf.summary.scalar('Steps per Episode', steps_cnt, step=episode)

            # Adjust epsilon for exploration
            if not self.env.sar_robot.visited_information_state:
                self.decay_epsilon(num_episodes)
            else:
                self.decay_epsilon_exploit()

            total_rewards_per_episode[episode] = Rewards
            total_steps_per_episode[episode] = steps_cnt

        # Make sure buffered summaries reach disk before the caller inspects the logs.
        if self.writer:
            self.writer.flush()

        return total_rewards_per_episode, total_steps_per_episode


### Run (flat learning agent + attention)

In [15]:
### flat agent + attention (1 run -- 1000 episodes)
# Hyperparameters are identical for every run, so they are defined once up front.
EPISODES = 1000
ALPHA = 0.1
GAMMA = 0.98
EPSILON_MAX = 1.0
EPSILON_MIN = 0.01
DECAY_RATE = 2

all_total_rewards_AGENT_att = []  # One array of per-episode returns for each run
all_total_steps_AGENT_att = []  # One array of per-episode step counts for each run
for _ in range(1):
    # A fresh agent per run; `env` is shared across runs.
    agent_att = QLearningAgentFlatAttention(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN)
    returns_att, steps_att = agent_att.train(EPISODES)
    all_total_rewards_AGENT_att.append(returns_att)
    all_total_steps_AGENT_att.append(steps_att)

# Average across runs (axis 0), keeping one value per episode.
avg_total_rewards_AGENT_att = np.mean(all_total_rewards_AGENT_att, axis=0)
avg_total_steps_AGENT_att = np.mean(all_total_steps_AGENT_att, axis=0)

  0%|          | 0/1000 [00:00<?, ?it/s]

episode: 0 | reward: 0 | epsilon: 1.0
episode: 100 | reward: -115 | epsilon: 0.7999999999999998
episode: 200 | reward: -33 | epsilon: 0.5999999999999996
episode: 300 | reward: -39 | epsilon: 0.39999999999999947
episode: 400 | reward: -52 | epsilon: 0.1999999999999993
episode: 500 | reward: -60 | epsilon: 0.01
episode: 600 | reward: -55 | epsilon: 0.01
episode: 700 | reward: -70 | epsilon: 0.01
episode: 800 | reward: -55 | epsilon: 0.01


Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


episode: 900 | reward: -55 | epsilon: 0.01
real LLM is about to start handling the input Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Police told us that no access allowed around the petrol station.
Hazardous Coordinates: [(5, 6), (6, 5), (3, 6), (2, 5)]
Point of Interest Coordinates: [(0, 3), (4, 1), (3, 0), (2, 0), (1, 2)]
real LLM is about to end handling the input Hey, there's a victim at the hospital. A fire was reported at the train station. There is a fire at the bank. A safe area is the mall. You must go to the access route in the school. Another access route at the restaurant. And there is a shelter in the shop. There are also reports of significant instances of heat at the bakery. Police told us t

### Evaluate (flat learning agent + attention)

In [16]:
# Run evaluate_agent on the attention agent trained above; binding the result to `_`
# discards the return value.
_ = evaluate_agent(env, agent_att)

Step 1: || State=(4, 1, 0, 0) || Action=RIGHT|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 2: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 3: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 4: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 5: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 6: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 7: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 8: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 9: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
Step 10: || State=(4, 2, 0, 0) || Action=REMOVE|| Reward=-1 || Next State=(4, 2, 0, 0) || Done=False
