## Setup

In [None]:
import sys, os
# Add the parent directory to the module search path; presumably this is what
# lets the project modules imported below (util, agents, llm) resolve — verify
# against the repository layout. The path is relative to the current working
# directory.
sys.path.append('..')
# Base directory under which the log file and saved agents are written later.
root = '../root/'

# Create the base directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(root, exist_ok=True)

In [3]:
from util import summarize_trial, log_trial, save_agents
import joblib
from agents import CoTAgent, ReflexionStrategy
from llm import ClaudeLLM

## Initialize Claude LLM

In [None]:
# Provide the Anthropic API key through the environment.
# setdefault() only installs the "KEY" placeholder when the variable is not
# already set, so a real key exported in the shell is no longer overwritten.
# Either export ANTHROPIC_API_KEY before starting the notebook or replace the
# placeholder locally (and do not commit a real key).
os.environ.setdefault("ANTHROPIC_API_KEY", "KEY")

# Model identifier; reused later for the log file name and the final summary.
model_name = "claude-3-haiku-20240307"

# Single LLM client shared by every agent created below.
llm = ClaudeLLM(
    model_name=model_name,
    temperature=0.0,
    max_tokens=250,
)

print(f"✓ Using model: {model_name}")

ValueError: ANTHROPIC_API_KEY not found in environment. Set with: export ANTHROPIC_API_KEY=<your-key>

## Load the HotPotQA Sample

In [None]:
# Load the serialized HotPotQA sample and renumber its rows from zero.
# NOTE(review): joblib.load unpickles arbitrary objects — only point this at a
# trusted data file.
sample_path = '../data/hotpot-qa-distractor-sample.joblib'
hotpot = joblib.load(sample_path)
hotpot = hotpot.reset_index(drop=True)
print(f"Loaded {len(hotpot)} questions")

## Define the Reflexion Strategy

In [None]:
# Print the docstring attached to ReflexionStrategy (presumably it describes
# the available strategies — verify in the agents module).
print(ReflexionStrategy.__doc__)

In [None]:
# Strategy used for the whole experiment: it picks the agent prompt, is passed
# to every agent.run() call, and its .value names the output directory.
strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION
print(f"Using strategy: {strategy.value}")

## Initialize CoT Agents with Claude

In [None]:
from prompts import cot_simple_reflect_agent_prompt, cot_simple_reflect_prompt, cot_simple_agent_prompt
from fewshots import COTQA_SIMPLE6, COT_SIMPLE_REFLECTION

# The agent prompt depends only on the chosen strategy, so select it once
# instead of re-evaluating the condition for every question.
if strategy == ReflexionStrategy.NONE:
    selected_agent_prompt = cot_simple_agent_prompt
else:
    selected_agent_prompt = cot_simple_reflect_agent_prompt

# Build one CoT agent per question in the sample; each agent gets the question,
# its expected answer as the key, and an empty context.
agents = [
    CoTAgent(
        question=record['question'],
        context='',  # No context
        key=record['answer'],
        llm=llm,
        agent_prompt=selected_agent_prompt,
        cot_examples=COTQA_SIMPLE6,
        reflect_prompt=cot_simple_reflect_prompt,
        reflect_examples=COT_SIMPLE_REFLECTION,
    )
    for _, record in hotpot.iterrows()
]

print(f"Initialized {len(agents)} CoT agents with Claude")

## Run n Trials

In [None]:
# Bookkeeping shared with the trial loop:
#   n     - how many trials the loop runs
#   trial - count of trials completed so far
#   log   - accumulated log text across trials
n = 5  # Number of trials
trial, log = 0, ''

print(f"Running {n} trials...")
print("=" * 70)

In [None]:
# Run n trials. Each trial re-runs only the agents that are still wrong, then
# logs and summarises the state of *all* agents.
for i in range(n):
    print(f"\n--- Trial {i+1}/{n} ---")

    # Run only agents that haven't succeeded yet
    active_agents = [a for a in agents if not a.is_correct()]
    print(f"Active agents: {len(active_agents)}")

    # idx is the position within active_agents, not within the full agents list.
    for idx, agent in enumerate(active_agents):
        try:
            agent.run(reflexion_strategy=strategy)
        except Exception as e:
            # Best effort: one failing agent must not abort the whole trial.
            print(f"  Error on agent {idx}: {e}")

        # Kept outside the try so progress is still reported when the
        # agent at a multiple of 10 fails.
        if (idx + 1) % 10 == 0:
            print(f"  Processed {idx+1}/{len(active_agents)} agents...")

    # trial is cumulative across re-runs of this cell, unlike the loop index i.
    trial += 1
    log += log_trial(agents, trial)
    correct, incorrect = summarize_trial(agents)

    # Guard the percentage: with no agents, report 0.0% instead of raising
    # ZeroDivisionError.
    accuracy = len(correct) / len(agents) * 100 if agents else 0.0

    print(f"\nTrial {trial} Results:")
    print(f"  Correct: {len(correct)}")
    print(f"  Incorrect: {len(incorrect)}")
    print(f"  Accuracy: {accuracy:.1f}%")

print("\n" + "=" * 70)
print("Experiment complete!")

## Save Results

In [None]:
# Create output directory: <root>/CoT/claude/no_context/<strategy value>
output_dir = os.path.join(root, 'CoT', 'claude', 'no_context', strategy.value)
os.makedirs(output_dir, exist_ok=True)

# Save log file. The name encodes the model, the question count and the trial count.
# The encoding is pinned to UTF-8 because the log holds model-generated text
# that may contain non-ASCII characters; the locale default could otherwise
# raise UnicodeEncodeError.
log_file = os.path.join(output_dir, f'{model_name.replace("-", "_")}_{len(agents)}_questions_{trial}_trials.txt')
with open(log_file, 'w', encoding='utf-8') as f:
    f.write(log)
print(f"✓ Log saved to: {log_file}")

# Save agents in an 'agents' subdirectory of output_dir, derived from it so
# the two paths cannot drift apart.
agents_dir = os.path.join(output_dir, 'agents')
save_agents(agents, agents_dir)
print(f"✓ Agents saved to: {agents_dir}")

## Final Summary

In [None]:
# Summarise the final state of all agents after the last trial.
correct, incorrect = summarize_trial(agents)

# Compute the percentages once, guarding against an empty agent list so the
# summary prints 0.0% instead of raising ZeroDivisionError.
total_questions = len(agents)
correct_pct = len(correct) / total_questions * 100 if total_questions else 0.0
incorrect_pct = len(incorrect) / total_questions * 100 if total_questions else 0.0

print("FINAL RESULTS")
print("=" * 70)
print(f"Model: {model_name}")
print(f"Strategy: {strategy.value}")
print(f"Total Questions: {total_questions}")
print(f"Trials: {trial}")
print(f"\nCorrect: {len(correct)} ({correct_pct:.1f}%)")
print(f"Incorrect: {len(incorrect)} ({incorrect_pct:.1f}%)")