# Loading Data

In [12]:
import pandas as pd

df = pd.read_json('results/PaperTab/rag1/2025-05-19-08-41_results.json')

In [13]:
import json
from dataclasses import dataclass

@dataclass
class ExperimentResult:
    user_chat: str
    response_chat: str

    user_prompt: str
    general_agent: str
    text_agent: str
    image_agent: str

    @staticmethod
    def from_json(user_ai_interaction: dict):
        user_chat = user_ai_interaction[0]['content'][0]['text']
        response_chat = user_ai_interaction[1]['content'][0]['text']
        
        # Extract agent responses
        user_prompt = ""
        general_agent = ""
        text_agent = ""
        image_agent = ""
        
        if "General Agent:" in user_chat:
            parts = user_chat.split("General Agent:")
            if len(parts) > 1:
                user_prompt = parts[0].strip()
                general_part = parts[1].split("Text Agent:")[0].strip()
                general_agent = general_part
                
        if "Text Agent:" in user_chat:
            parts = user_chat.split("Text Agent:")
            if len(parts) > 1:
                text_part = parts[1].split("Image Agent:")[0].strip()
                text_agent = text_part
                
        if "Image Agent:" in user_chat:
            parts = user_chat.split("Image Agent:")
            if len(parts) > 1:
                image_part = parts[1].strip()
                image_agent = image_part
        
        return ExperimentResult(
            user_chat=user_chat,
            user_prompt=user_prompt,
            general_agent=general_agent,
            text_agent=text_agent,
            image_agent=image_agent,
            response_chat=response_chat
        )

    def __str__(self):
        return \
            f"{'='*20} User Prompt: \n{self.user_prompt}\n" \
            f"{'='*20} General Agent: \n{self.general_agent}\n" \
            f"{'='*20} Text Agent: \n{self.text_agent}\n" \
            f"{'='*20} Image Agent: \n{self.image_agent}\n" \
            f"{'='*20} Response Chat: \n{self.response_chat}"

sample_result = ExperimentResult.from_json(df['ans_2_message'].iloc[18])
print(sample_result)

You are tasked with summarizing and evaluating the collective responses provided by multiple agents. You have access to the following information:
Answers: The individual answers from all agents.
Using this information, perform the following tasks:
Analyze: Evaluate the quality, consistency, and relevance of each answer. Identify commonalities, discrepancies, or gaps in reasoning.
Synthesize: Summarize the most accurate and reliable information based on the evidence provided by the agents and their discussions.
Conclude: Provide a final, well-reasoned answer to the question or task. Your conclusion should reflect the consensus (if one exists) or the most credible and well-supported answer.
Based on the provided answers from all agents, summarize the final decision clearly. You should only return the final answer in this dictionary format: {"Answer": <Your final answer here>}. Don't give other information.
The datasets used in the experiments are ShapeWorldICE datasets. These datasets a

In [60]:
correct_df = df[df['binary_correctness'] == 1]
sample_result = ExperimentResult.from_json(correct_df['ans_2_message'].iloc[cnt])
cnt += 1
print(sample_result)

You are tasked with summarizing and evaluating the collective responses provided by multiple agents. You have access to the following information:
Answers: The individual answers from all agents.
Using this information, perform the following tasks:
Analyze: Evaluate the quality, consistency, and relevance of each answer. Identify commonalities, discrepancies, or gaps in reasoning.
Synthesize: Summarize the most accurate and reliable information based on the evidence provided by the agents and their discussions.
Conclude: Provide a final, well-reasoned answer to the question or task. Your conclusion should reflect the consensus (if one exists) or the most credible and well-supported answer.
Based on the provided answers from all agents, summarize the final decision clearly. You should only return the final answer in this dictionary format: {"Answer": <Your final answer here>}. Don't give other information.
The authors compared the performance of deep LSTM models with other models, inclu

# Ablation Study

In [1]:
from mydatasets.base_dataset import BaseDataset
import hydra
from omegaconf import OmegaConf
import os

# Initialize Hydra
hydra.initialize(config_path="config")

# Compose the configuration
cfg = hydra.compose(config_name="ptab")

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = cfg.mdoc_agent.cuda_visible_devices
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

# Create dataset instance
dataset = BaseDataset(cfg.dataset)
samples = dataset.load_data(use_retreival=True)

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  hydra.initialize(config_path="config")


In [18]:
sample_index = 4
sample = samples[sample_index]

sample_result = ExperimentResult.from_json(df['ans_2_message'].iloc[sample_index])
print(sample_result)
print('SCORE : ', df['binary_correctness'].iloc[sample_index])

print('='*200)

print(sample)
question, texts, images = dataset.load_sample_retrieval_data(sample)
print('question: ', question)
print('texts: ', texts)
print('images: ', images)

You are tasked with summarizing and evaluating the collective responses provided by multiple agents. You have access to the following information:
Answers: The individual answers from all agents.
Using this information, perform the following tasks:
Analyze: Evaluate the quality, consistency, and relevance of each answer. Identify commonalities, discrepancies, or gaps in reasoning.
Synthesize: Summarize the most accurate and reliable information based on the evidence provided by the agents and their discussions.
Conclude: Provide a final, well-reasoned answer to the question or task. Your conclusion should reflect the consensus (if one exists) or the most credible and well-supported answer.
Based on the provided answers from all agents, summarize the final decision clearly. You should only return the final answer in this dictionary format: {"Answer": <Your final answer here>}. Don't give other information.
The performance of BERT on the task is not directly provided in the text. However

## Self-Reflection Pattern

In [1]:
from agents.multi_agent_system import MultiAgentSystem
from tqdm import tqdm
import torch
import json
from mydatasets.base_dataset import BaseDataset
import hydra
import os

class ReflectionAgent(MultiAgentSystem):
    def __init__(self, config):
        super().__init__(config)
    
    def extract_confidence(self, messages: str) -> float:
        try:
            if "confidence" in messages.lower():
                confidence_str = messages.split("confidence:")[1].split()[0]
                return float(confidence_str)
            else:
                return 0.0
        except Exception as e:
            print(f"Error extracting confidence: {e}")
            return 0.0
    def predict(self, question: str, texts: list[str], images: list[str]) -> tuple[str, list[dict]]:
        general_agent = self.agents[0]

        current_iter, current_confidence = 0, 0.0
        all_messages = []

        while current_iter < self.config.max_reflection_iter:
            if current_iter == 0:
                current_ans, messages = general_agent.predict(question, texts, images)
            else:
                reflection_prompt = f"""
                    Based on the following question and my previous answer, please perform a technical analysis and reflection:

                    Question: {question}

                    My previous answer: {current_ans}

                    Please perform a detailed technical analysis:
                    1. Technical Accuracy:
                    - Are there any technical inaccuracies or misconceptions?
                    - Are the technical terms and concepts used correctly?
                    - Is the technical depth appropriate for the question?

                    2. Technical Completeness:
                    - Are there missing technical details or specifications?
                    - Are all relevant technical components addressed?
                    - Are there any technical dependencies or requirements not mentioned?

                    3. Technical Precision:
                    - Can the technical explanation be more precise?
                    - Are there more specific technical terms that could be used?
                    - Are the technical relationships and interactions clearly explained?

                    4. Information Gathering:
                    - What additional technical information would help improve the answer?
                    - Are there specific technical aspects that need more research?
                    - What technical details from the provided context (texts/images) could be better utilized?

                    If you're uncertain about any technical aspect:
                    1. Identify the specific technical points that need clarification
                    2. Explain what additional information would help resolve the uncertainty
                    3. If possible, gather more information from the provided context (texts/images)
                    4. If still uncertain, explicitly state what information is missing and why it's important

                    Provide an improved technical answer if needed, or confirm if the original answer is technically sufficient.
                    Also, provide your confidence in this technical analysis (0.0 to 1.0) by adding "Confidence: X.X" at the end of your response.
                """
                current_ans, messages = general_agent.predict(reflection_prompt, texts, images)

            all_messages.extend(messages)
            confidence = self.extract_confidence(current_ans)

            # Remove confidence score from the answer
            if "confidence:" in current_ans.lower():
                current_ans = current_ans.lower().split("confidence:")[0].strip()
            
            # Check if we should stop reflecting
            if confidence >= self.confidence_threshold:
                break
                
            current_iter += 1
        
        return current_ans, all_messages

    def predict_dataset(self, dataset:BaseDataset, resume_path = None):
        samples = dataset.load_data(use_retreival=True)
        if resume_path:
            assert os.path.exists(resume_path)
            with open(resume_path, 'r') as f:
                samples = json.load(f)
            
        sample_no = 0
        for sample in tqdm(samples):
            if resume_path and self.config.ans_key in sample:
                continue
            question, texts, images = dataset.load_sample_retrieval_data(sample)
            try:
                final_ans, final_messages = self.predict(question, texts, images)
            except RuntimeError as e:
                print(e)
                if "out of memory" in str(e):
                    torch.cuda.empty_cache()
                final_ans, final_messages = None, None
            
            # Sample
            return final_messages

            sample[self.config.ans_key] = final_ans
            if self.config.save_message:
                sample[self.config.ans_key+"_message"] = final_messages
            torch.cuda.empty_cache()
            self.clean_messages()
            
            sample_no += 1
            if sample_no % self.config.save_freq == 0:
                path = dataset.dump_reults(samples)
                print(f"Save {sample_no} results to {path}.")
        path = dataset.dump_reults(samples)
        print(f"Save final results to {path}.")

hydra.initialize(config_path="config", version_base="1.2")

# Compose the configuration
cfg = hydra.compose(config_name="base")

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = cfg.mdoc_agent.cuda_visible_devices
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

cfg = hydra.compose(config_name="base")    
mdoc_agent = ReflectionAgent(cfg.reflection_agent)
mdoc_agent.predict_dataset(dataset)

AttributeError: 'str' object has no attribute 'class_name'