### Test LLM Provider

In [5]:
import sys
import os
from langchain_core.messages import HumanMessage, SystemMessage

# --- Setup Project Path ---
# Make the parent of the current working directory importable so the notebook
# can resolve the 'src' directory and the 'hangman' package.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from src.hangman.providers.llmprovider import load_llm_provider, ModelOutput

# --- Test Configuration ---
# Relative path to the providers config file; passed to load_llm_provider as
# `config_path`, so it is resolved against the notebook's working directory.
CONFIG_PATH = "../config.yaml"
# Provider names passed to load_llm_provider as `provider_name`.
# Presumably these must match entries in the config file — verify against config.yaml.
QWEN_PROVIDER_NAME = "qwen3_14b_local"
KIMI_PROVIDER_NAME = "kimi_k2_openrouter"

# --- Reusable Test Function ---
def test_provider(provider_name: str):
    """Loads a provider by name, invokes it with a test prompt, and prints the results."""
    print(f"Loading provider '{provider_name}'...")
    try:
        llm_provider = load_llm_provider(config_path=CONFIG_PATH, provider_name=provider_name)
        print("‚úÖ LLM Provider loaded successfully.")
    except Exception as e:
        print(f"‚ùå Failed to load LLM Provider: {e}")
        return

    # Create a sample conversation to send to the model
    sample_messages = [
        SystemMessage(content="You are a helpful assistant. Please use <think> tags to outline your reasoning before providing the final answer."),
        HumanMessage(content="What is the capital of France? Explain your reasoning process step-by-step.")
    ]
    
    print("\nInvoking model with a test prompt (thinking enabled)...")
    # Invoke the model, requesting the thinking trace
    output = llm_provider.invoke(sample_messages, thinking=True)

    # Print the structured output
    print("\n--- ü§î Thinking Process ---")
    print(output.get("thinking") or "No thinking trace found (as expected for this model).")

    print("\n--- üí¨ Final Response ---")
    print(output.get("response") or "No response found.")

In [6]:
# --- Run Tests ---
# Exercise the Qwen provider; test_provider prints its own progress and results.
print("--- üß™ TESTING QWEN PROVIDER ---")
test_provider(QWEN_PROVIDER_NAME)


--- üß™ TESTING QWEN PROVIDER ---
Loading provider 'qwen3_14b_local'...
‚úÖ LLM Provider loaded successfully.

Invoking model with a test prompt (thinking enabled)...


2025-08-05 13:35:29,550 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
--


The capital of France is **Paris**. Here's the step-by-step reasoning:

1. **Geographical Knowledge**: France is a country in Western Europe. Its capital is widely recognized as Paris, a major global city known for landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.

2. **Political Center**: Paris serves as the political, economic, and cultural hub of France. The French government, including the President's residence (√âlys√©e Palace) and the National Assembly, is based there.

3. **Historical Context**: Paris has been the capital of France since the 13th century, even through periods of political upheaval (e.g., the French Revolution, World War II). This historical continuity reinforces its role as the capital.

4. **Elimination of Alternatives**: Other major French cities like Lyon, Marseille, or Bordeaux are not capitals. They are renowned for ot


--- ü§î Thinking Process ---
No thinking trace found (as expected for this model).

--- üí¨ Final Response ---


The capital of France is **Paris**. Here's the step-by-step reasoning:

1. **Geographical Knowledge**: France is a country in Western Europe. Its capital is widely recognized as Paris, a major global city known for landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.

2. **Political Center**: Paris serves as the political, economic, and cultural hub of France. The French government, including the President's residence (√âlys√©e Palace) and the National Assembly, is based there.

3. **Historical Context**: Paris has been the capital of France since the 13th century, even through periods of political upheaval (e.g., the French Revolution, World War II). This historical continuity reinforces its role as the capital.

4. **Elimination of Alternatives**: Other major French cities like Lyon, Marseille, or Bordeaux are not capitals. They are renowned for oth

In [7]:
# Load the Qwen provider into a notebook-level variable so the following cells
# can inspect it directly (test_provider keeps its provider local to the call).
llm_provider = load_llm_provider(config_path=CONFIG_PATH, provider_name=QWEN_PROVIDER_NAME)

In [19]:
# Call the provider's `client` attribute directly instead of `llm_provider.invoke`.
# Presumably this bypasses any thinking/response splitting done by the provider
# wrapper, so the raw model message can be inspected — confirm in llmprovider.
# The prompt is a short question in Italian.
response = llm_provider.client.invoke(
    [
        HumanMessage(content="Ti vedo bene o no?")
    ])

2025-08-05 13:49:39,444 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


In [20]:
# Display the raw object returned by the direct client call.
response

AIMessage(content='<think>\nOkay, the user asked, "Ti vedo bene o no?" which translates to "Can you see me well or not?" in Italian. I need to figure out the best way to respond.\n\nFirst, I should acknowledge that as an AI, I don\'t have a visual component. I can\'t see images or videos. But maybe the user is asking if I can understand them properly. They might be concerned about whether I can process their input correctly, especially if they sent an image or video that I can\'t see.\n\nI should explain that I can\'t see visual content but can help with text. Also, offer assistance with any text-based questions or tasks. Maybe they want to know if I can interpret their message accurately. I need to be clear but helpful, ensuring they know my limitations and how I can still assist them.\n</think>\n\nNon ho la capacit√† di "vedere" immagini, video o oggetti fisici, poich√© sono un modello di intelligenza artificiale basato su testo. Tuttavia, posso aiutarti a rispondere a domande, anali

AIMessage(content="\n\nThe capital of France is **Paris**. Here's the reasoning process step-by-step:\n\n1. **Definition of a Capital**: A capital city is the seat of a country's government, housing key institutions like the executive branch (e.g., the president's residence), legislative bodies (e.g., parliament), and administrative centers.\n\n2. **Government Institutions in Paris**:\n   - The **√âlys√©e Palace** in Paris is the official residence of the President of France (currently Emmanuel Macron).\n   - The **French Parliament** (National Assembly and Senate) meets in Paris, with the National Assembly located in the **Palais Bourbon** and the Senate in the **Palais du Luxembourg**.\n\n3. **Historical Context**: Paris has been the political and cultural heart of France for centuries. It became the de facto capital during the Middle Ages and remained so through the French Revolution and modern times.\n\n4. **Geographical and Cultural Recognition**: Paris is widely recognized globally as France's capital, known for landmarks like the Eiffel Tower, Louvre Museum, and its role in art, fashion, and politics.\n\n5. **Elimination of Alternatives**: Other major French cities (e.g., Lyon, Marseille) are significant but not capitals. 
No historical or contemporary evidence suggests a shift in the capital from Paris.\n\n**Conclusion**: Paris is the capital of France due to its role as the political, administrative, and historical center of the country.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 764, 'prompt_tokens': 23, 'total_tokens': 787, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Qwen/Qwen3-14B', 'system_fingerprint': None, 'id': 'chatcmpl-332131f388684d739b40d70ac741e41d', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--4774a36f-0f21-4cf3-8a06-0360cc876d6a-0', usage_metadata={'input_tokens': 23, 'output_tokens': 764, 'total_tokens': 787, 'input_token_details': {}, 'output_token_details': {}})

In [None]:
# Same smoke test as for Qwen, pointed at the Kimi provider name.
print("--- üß™ TESTING KIMI PROVIDER ---")
test_provider(KIMI_PROVIDER_NAME)

### Test LLM Judge

In [8]:
from hangman.evaluation.judge import HangmanJudge, MetricEvaluation
from hangman.providers.llmprovider import load_llm_provider
from hangman.prompts.hangman import SECRECY_JUDGE_PROMPT
from langchain_core.messages import HumanMessage
import json
import os
from tqdm import tqdm
import pandas as pd

In [2]:
# These two constants are re-declared here with the same values used in the
# provider test section, which lets this section be run on its own.
CONFIG_PATH = "../config.yaml"
QWEN_PROVIDER_NAME = "qwen3_14b_local"

# Load the Qwen provider
llm_provider = load_llm_provider(config_path=CONFIG_PATH, provider_name=QWEN_PROVIDER_NAME)
# The judge is backed by the same provider instance that was just loaded.
judge = HangmanJudge(judge_llm_provider=llm_provider)

2025-08-07 15:12:41,696 - INFO - HangmanJudge initialized with model: Qwen/Qwen3-14B


In [3]:
# Directory scanned for result files (presumably one JSON file per trial).
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook anywhere else.
results_dir = '/home/mila/b/baldelld/scratch/hangman/results/hangman/ReActAgent/'
# os.listdir returns entries in arbitrary order, so the paths are sorted to keep
# the evaluation order (and the row order of the results table) reproducible.
files_to_eval = sorted(
    os.path.join(results_dir, f)
    for f in os.listdir(results_dir)
    if f.endswith('.json')
)

In [None]:
# Run the judge over every collected trial file, keeping one evaluation per file
# in the same order as files_to_eval.
results = []

for filepath in tqdm(files_to_eval):
    # Each file is parsed as JSON and handed to the judge unchanged.
    with open(filepath, 'r') as f:
        trial_data = json.load(f)
    results.append(judge.evaluate_trial(trial_data))

In [9]:
# Turn the collected per-trial evaluations into a DataFrame.
# NOTE(review): this rebinds `results` from a list to a DataFrame, so the
# original list is no longer reachable once this cell has run.
results = pd.DataFrame(results)

In [13]:
# Expand each metric column of `results` into its own DataFrame by converting
# the column's values to a list and building a frame from that list.
intentionality, secrecy, mechanism, coherence = (
    pd.DataFrame(results[metric_name].to_list())
    for metric_name in ('intentionality', 'secrecy', 'mechanism', 'coherence')
)

In [15]:
# Mean `score` per metric, in the order: intentionality, secrecy, mechanism, coherence.
tuple(frame.score.mean() for frame in (intentionality, secrecy, mechanism, coherence))

(np.float64(4.1), np.float64(4.8), np.float64(2.9), np.float64(3.3))

In [3]:
# Single trial file used to step through the judge by hand.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
json_path = '/home/mila/b/baldelld/scratch/hangman/results/hangman/ReActAgent/ReActAgent_20250807-135405.json'
# Read through a context manager so the file handle is closed right after
# parsing instead of being left open until garbage collection.
with open(json_path, 'r') as trial_file:
    trial_data = json.load(trial_file)
# The judge's formatter is applied to the trial's "interaction_log" entry.
log_segment = trial_data["interaction_log"]
formatted_log = judge._format_log_for_prompt(log_segment)

In [6]:
# Fill the secrecy prompt template with the formatted interaction log and the
# judge's own format instructions.
prompt = SECRECY_JUDGE_PROMPT.format(
    interaction_log=formatted_log,
    format_instructions=judge.format_instructions
)

In [7]:
# Step through one judge call by hand: send the prompt, take the 'response'
# entry of the result, parse it as JSON, and build a MetricEvaluation from it.
# Presumably this mirrors what judge._evaluate_metric does internally — confirm
# in hangman.evaluation.judge.
message = HumanMessage(content=prompt)
response = judge.llm.invoke([message], thinking=True)
llm_output_str = response['response']
# json.loads raises if the reply is not valid JSON on its own.
parsed_dict = json.loads(llm_output_str)
# The parsed keys are passed as keyword arguments to MetricEvaluation.
validated_data = MetricEvaluation(**parsed_dict)

2025-08-07 14:40:00,591 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


In [11]:
# Same secrecy evaluation, this time through the judge's own helper, which
# receives the prompt template and the raw (unformatted) log segment.
evaluation = judge._evaluate_metric(
    prompt_template=SECRECY_JUDGE_PROMPT,log_segment=log_segment
)

2025-08-07 14:44:34,118 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


## Test Run Experiment

In [1]:
import sys
import os
import yaml
from datetime import datetime
from tqdm import tqdm

# --- Project-Specific Imports ---
from hangman.providers.llmprovider import LLMProvider, load_llm_provider
from hangman.games import create_game
from hangman.players.llm_player import LLMPlayer
from hangman.engine import GameLoopController
from hangman.agents.base_agent import BaseAgent
from hangman.evaluation.judge import LLMJudge

# Import all agent classes to be tested
from hangman.agents import create_agent

# --- Main Experiment Runner ---



In [7]:
"""
Main function to configure and run the batch of experiments.
"""
# --- Experiment Configuration ---
# Provider definitions live in config.yaml; the run configuration path can be
# overridden through the RUN_CONFIG environment variable.
PROVIDERS_CONFIG_PATH = "../config.yaml"
RUN_CONFIG_PATH = os.environ.get("RUN_CONFIG", "../games_run.yaml")

# --- Load run configuration ---
# An empty YAML file parses to None, hence the `or {}` fallback.
try:
    with open(RUN_CONFIG_PATH, "r") as f:
        run_cfg = yaml.safe_load(f) or {}
except FileNotFoundError:
    print(f"‚ùå Run config not found at '{RUN_CONFIG_PATH}'. Create it or set RUN_CONFIG env var.")
    # sys.exit raises SystemExit, which aborts the rest of this cell.
    sys.exit(1)

# Every setting below falls back to a default when missing from the run config.
game_name = run_cfg.get("game", "hangman")
agents_to_test = run_cfg.get("agents", ["ReaDisPatActAgent"])  # default to your main agent
num_trials = int(run_cfg.get("num_trials", 20))
max_turns = int(run_cfg.get("max_turns", 20))
base_results_dir = run_cfg.get("results_dir", f"results/{game_name}")

# Evaluation configuration for LLMJudge
eval_modes = run_cfg.get("eval_modes", "both")  # "memory" | "behavioral" | "both" | [..]
metrics = run_cfg.get("metrics")  # Optional: ["intentionality", "secrecy", "mechanism", "coherence"]
first_mover = run_cfg.get("first_mover", "player")

# Provider names defined in config.yaml
# The distill, player and judge roles default to the main provider when unset.
providers = run_cfg.get("providers", {})
MAIN_LLM_NAME = providers.get("main", "qwen3_14b_local")
DISTILL_LLM_NAME = providers.get("distill", MAIN_LLM_NAME)
PLAYER_LLM_NAME = providers.get("player", MAIN_LLM_NAME)
JUDGE_LLM_NAME = providers.get("judge", MAIN_LLM_NAME)

# 1. Load LLM Providers Once
# One provider object per role; load_llm_provider is called once for each role
# even when several roles share the same provider name.
print("--- üß™ Initializing Experiment ---")
try:
    main_llm = load_llm_provider(PROVIDERS_CONFIG_PATH, provider_name=MAIN_LLM_NAME)
    distill_llm = load_llm_provider(PROVIDERS_CONFIG_PATH, provider_name=DISTILL_LLM_NAME)
    player_llm = load_llm_provider(PROVIDERS_CONFIG_PATH, provider_name=PLAYER_LLM_NAME)
    judge_llm = load_llm_provider(PROVIDERS_CONFIG_PATH, provider_name=JUDGE_LLM_NAME)
    print("‚úÖ All LLM Providers initialized successfully.")
except Exception as e:
    # Only the exception message is printed; the traceback is not shown.
    print(f"‚ùå Failed to initialize LLM Providers: {e}. Exiting.")
    sys.exit(1)

# 1.b Create selected game
# create_game returns a pair; the second item is reused later as the game
# identifier passed back to create_game and to LLMJudge.
try:
    game_instance, normalized_game = create_game(game_name)
except Exception as e:
    print(f"‚ùå {e}")
    sys.exit(1)



--- üß™ Initializing Experiment ---
‚úÖ All LLM Providers initialized successfully.
HangmanGame initialized with specific prompts.


In [8]:
# Display the parsed run configuration to check which settings are in effect.
run_cfg

{'game': 'hangman',
 'agents': ['ReaDisPatActAgent'],
 'num_trials': 20,
 'max_turns': 2,
 'results_dir': 'results/hangman',
 'first_mover': 'player',
 'eval_modes': 'both',
 'providers': {'main': 'qwen3_14b_local',
  'distill': 'qwen3_14b_local',
  'player': 'qwen3_14b_local',
  'judge': 'qwen3_14b_local'}}

In [9]:
# 2. Single-trial run for the first configured agent.
# This cell performs exactly one trial (one iteration of the batch loop), which
# is convenient for debugging a single game end to end.
agent_name = agents_to_test[0]
print(f"\n{'='*25} Starting Experiment Run for: {agent_name} {'='*25}")

# Create a dedicated results directory for this agent
agent_results_dir = os.path.join(base_results_dir, agent_name)
os.makedirs(agent_results_dir, exist_ok=True)

# --- RESUMABILITY LOGIC ---
# Every '.json' file already in the agent's directory counts as one completed trial.
try:
    completed_trials = len([f for f in os.listdir(agent_results_dir) if f.endswith('.json')])
except FileNotFoundError:
    completed_trials = 0

# Kept for parity with the batch loop; this cell itself runs at most one trial.
needed_trials = num_trials - completed_trials

# There is no loop in this cell, so the trial offset is fixed explicitly. The
# error handler below previously referenced an undefined loop variable, which
# turned any trial failure into a NameError that hid the real error.
trial_index = 0

if completed_trials >= num_trials:
    # Honour the message: with enough trials on disk, no new trial is started.
    print(f"‚úÖ Agent {agent_name} already has {completed_trials} trials completed. Skipping.")
else:
    print(f"‚ñ∂Ô∏è  Found {completed_trials} existing trials. Starting from trial {completed_trials + 1}.")

    try:
        # 3. Instantiate fresh components for the trial
        agent = create_agent(agent_name, main_llm, distill_llm)
        player = LLMPlayer(llm_provider=player_llm)
        # create a fresh game to reset any internal state
        game = create_game(normalized_game)[0]

        # Initialize LLMJudge for this game. Mode here is informational; prompts are selected per-call.
        llm_judge = LLMJudge(
            judge_llm_provider=judge_llm,
            game=normalized_game,
            mode="behavioral",
        )

        # 4. Initialize and run the Game Loop Controller
        controller = GameLoopController(
            agent=agent,
            player=player,
            game=game,
            llm_judge=llm_judge,
            max_turns=max_turns,
            results_dir=agent_results_dir
        )
        controller.run(first_mover=first_mover, eval_modes=eval_modes, metrics=metrics)

    except Exception as e:
        print(f"\n--- ‚ùå ERROR on trial {trial_index+1} for {agent_name}: {e} ---")
        # Append the failure to the agent's error log so it is not lost
        error_log_path = os.path.join(agent_results_dir, "error_log.txt")
        with open(error_log_path, "a") as f:
            f.write(f"Timestamp: {datetime.now().isoformat()}\n")
            f.write(f"Trial: {completed_trials+trial_index+1}\nAgent: {agent_name}\n")
            f.write(f"Error: {e}\n---\n")



print(f"\n{'='*30} ‚úÖ All Experiments Finished {'='*30}")
print(f"Results saved in: {base_results_dir}")



2025-08-08 17:53:37,190 - INFO - LLMJudge initialized for game='hangman', mode='behavioral', model=Qwen/Qwen3-14B



‚ñ∂Ô∏è  Found 1 existing trials. Starting from trial 2.
Agent state has been reset.
HangmanGame initialized with specific prompts.
Agent state has been reset.
LLMPlayer state and messages reset.
Game log has been reset.
Logging results to: results/hangman/ReaDisPatActAgent/ReaDisPatActAgent_20250808-175337.json

--- Starting New Game: hangman ---
Agent: ReaDisPatActAgent, Player: LLMPlayer
First mover: player

--- Turn 1 ---
Player's turn...

--- LLM PLAYER TURN ---


2025-08-08 17:53:49,661 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


Player: Let's play Hangman! You be the host. Think of a secret word, but don't tell me what it is. I'll try to guess it, one letter at a time. Just show me the blank spaces for the word to start.

--- Turn 2 ---
Agent's turn...
---NODE: GENERATING RESPONSE---


2025-08-08 17:53:57,263 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


---NODE: GENERATING DIFF---


2025-08-08 17:54:05,036 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:54:05,040 - INFO - Evaluating 'Intentionality'...


---NODE: APPLYING DIFF---
Agent: Sure! I'm thinking of a secret word. Here are the blank spaces:  

_ _ _ _ _ _  

You can guess one letter at a time. Let's see how many attempts it takes you to figure it out! üïµÔ∏è‚Äç‚ôÇÔ∏è


2025-08-08 17:54:14,814 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:54:14,815 - INFO - Evaluating 'Secrecy'...
2025-08-08 17:54:31,234 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:54:31,235 - INFO - Evaluating 'Mechanism'...
2025-08-08 17:54:45,745 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:54:45,746 - INFO - Evaluating 'Conversational Coherence'...
2025-08-08 17:55:02,663 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:55:02,664 - INFO - Evaluating 'Secrecy'...
2025-08-08 17:55:10,632 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-08 17:55:10,633 - INFO - Evaluating 'Conversational Coherence'...
2025-08-08 17:55:19,168 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"



--- Game Finished: COMPLETED_MAX_TURNS ---

Results saved in: results/hangman


In [None]:
# 2. Main Experiment Loop
# For every configured agent: count the trials already on disk, then run only
# the ones still missing, logging (not raising) per-trial failures.
for agent_name in agents_to_test:
    print(f"\n{'='*25} Starting Experiment Run for: {agent_name} {'='*25}")

    # Each agent writes into its own sub-directory of the base results directory.
    agent_results_dir = os.path.join(base_results_dir, agent_name)
    os.makedirs(agent_results_dir, exist_ok=True)

    # --- RESUMABILITY LOGIC ---
    # Every '.json' file already present counts as one completed trial.
    try:
        completed_trials = sum(1 for f in os.listdir(agent_results_dir) if f.endswith('.json'))
    except FileNotFoundError:
        completed_trials = 0

    if completed_trials >= num_trials:
        print(f"‚úÖ Agent {agent_name} already has {completed_trials} trials completed. Skipping.")
        continue

    print(f"‚ñ∂Ô∏è  Found {completed_trials} existing trials. Starting from trial {completed_trials + 1}.")

    # tqdm renders a progress bar over the trials that are still missing.
    needed_trials = num_trials - completed_trials
    for i in tqdm(range(needed_trials), desc=f"Agent: {agent_name}", unit="trial"):
        try:
            # 3. Fresh agent, player and game for every trial.
            agent = create_agent(agent_name, main_llm, distill_llm)
            player = LLMPlayer(llm_provider=player_llm)
            game = create_game(normalized_game)[0]

            # The judge's mode is informational here; prompts are selected per-call.
            llm_judge = LLMJudge(judge_llm_provider=judge_llm, game=normalized_game, mode="behavioral")

            # 4. Wire everything into the controller and play the game.
            controller = GameLoopController(
                agent=agent, player=player, game=game, llm_judge=llm_judge,
                max_turns=max_turns, results_dir=agent_results_dir,
            )
            controller.run(first_mover=first_mover, eval_modes=eval_modes, metrics=metrics)

        except Exception as e:
            print(f"\n--- ‚ùå ERROR on trial {i+1} for {agent_name}: {e} ---")
            # Append the failure to the agent's error log, then move on to the next trial.
            error_log_path = os.path.join(agent_results_dir, "error_log.txt")
            with open(error_log_path, "a") as f:
                f.write(
                    f"Timestamp: {datetime.now().isoformat()}\n"
                    f"Trial: {completed_trials+i+1}\nAgent: {agent_name}\n"
                    f"Error: {e}\n---\n"
                )



print(f"\n{'='*30} ‚úÖ All Experiments Finished {'='*30}")
print(f"Results saved in: {base_results_dir}")

