### Test LLM Provider

In [5]:
import sys
import os
from langchain_core.messages import HumanMessage, SystemMessage

# --- Make the repository importable ---
# The project root is taken to be the parent of the current working directory;
# putting it on sys.path lets the notebook resolve the 'src' directory and the
# 'hangman' package.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from src.hangman.providers.llmprovider import load_llm_provider, ModelOutput

# --- Test Configuration ---
# Relative path of the YAML config file, plus the provider names that the
# test cells hand to `load_llm_provider`.
CONFIG_PATH = "../config.yaml"
QWEN_PROVIDER_NAME, KIMI_PROVIDER_NAME = "qwen3_14b_local", "kimi_k2_openrouter"

# --- Reusable Test Function ---
def test_provider(provider_name: str):
    """Load a provider by name, send it a fixed test prompt, and print the result.

    Failures while loading the provider or while invoking the model are
    reported with a printed message and end the function early instead of
    raising, so one unreachable provider does not abort the notebook cell.

    Args:
        provider_name: Name handed to ``load_llm_provider`` together with the
            module-level ``CONFIG_PATH``.

    Returns:
        None. All results are written to stdout via ``print``.
    """
    print(f"Loading provider '{provider_name}'...")
    try:
        llm_provider = load_llm_provider(config_path=CONFIG_PATH, provider_name=provider_name)
        print("‚úÖ LLM Provider loaded successfully.")
    except Exception as e:
        print(f"‚ùå Failed to load LLM Provider: {e}")
        return

    # Create a sample conversation to send to the model
    sample_messages = [
        SystemMessage(content="You are a helpful assistant. Please use <think> tags to outline your reasoning before providing the final answer."),
        HumanMessage(content="What is the capital of France? Explain your reasoning process step-by-step.")
    ]

    print("\nInvoking model with a test prompt (thinking enabled)...")
    # Invoke the model, requesting the thinking trace. The call is guarded the
    # same way as loading: a failed request is reported and the test stops here.
    try:
        output = llm_provider.invoke(sample_messages, thinking=True)
    except Exception as e:
        print(f"Failed to invoke LLM Provider: {type(e).__name__}: {e}")
        return

    # Print the structured output; each section falls back to a placeholder
    # when its key is missing or empty.
    print("\n--- ü§î Thinking Process ---")
    print(output.get("thinking") or "No thinking trace found (as expected for this model).")

    print("\n--- üí¨ Final Response ---")
    print(output.get("response") or "No response found.")

In [6]:
# --- Run the provider test against the Qwen provider ---
print("--- üß™ TESTING QWEN PROVIDER ---")
test_provider(provider_name=QWEN_PROVIDER_NAME)


--- üß™ TESTING QWEN PROVIDER ---
Loading provider 'qwen3_14b_local'...
‚úÖ LLM Provider loaded successfully.

Invoking model with a test prompt (thinking enabled)...


2025-08-05 13:35:29,550 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
--


The capital of France is **Paris**. Here's the step-by-step reasoning:

1. **Geographical Knowledge**: France is a country in Western Europe. Its capital is widely recognized as Paris, a major global city known for landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.

2. **Political Center**: Paris serves as the political, economic, and cultural hub of France. The French government, including the President's residence (Élysée Palace) and the National Assembly, is based there.

3. **Historical Context**: Paris has been the capital of France since the 13th century, even through periods of political upheaval (e.g., the French Revolution, World War II). This historical continuity reinforces its role as the capital.

4. **Elimination of Alternatives**: Other major French cities like Lyon, Marseille, or Bordeaux are not capitals. They are renowned for ot


--- ü§î Thinking Process ---
No thinking trace found (as expected for this model).

--- üí¨ Final Response ---


The capital of France is **Paris**. Here's the step-by-step reasoning:

1. **Geographical Knowledge**: France is a country in Western Europe. Its capital is widely recognized as Paris, a major global city known for landmarks like the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.

2. **Political Center**: Paris serves as the political, economic, and cultural hub of France. The French government, including the President's residence (Élysée Palace) and the National Assembly, is based there.

3. **Historical Context**: Paris has been the capital of France since the 13th century, even through periods of political upheaval (e.g., the French Revolution, World War II). This historical continuity reinforces its role as the capital.

4. **Elimination of Alternatives**: Other major French cities like Lyon, Marseille, or Bordeaux are not capitals. They are renowned for oth

In [7]:
# Load the Qwen provider at notebook scope so it can be used outside test_provider.
llm_provider = load_llm_provider(
    provider_name=QWEN_PROVIDER_NAME,
    config_path=CONFIG_PATH,
)

In [19]:
# Call the provider's `client` attribute directly with a single human message
# and keep the returned object in `response` for inspection.
response = llm_provider.client.invoke([HumanMessage(content="Ti vedo bene o no?")])

2025-08-05 13:49:39,444 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


In [20]:
# Display the object currently stored in `response` (bare expression, so the
# notebook renders its repr as the cell output).
response

AIMessage(content='<think>\nOkay, the user asked, "Ti vedo bene o no?" which translates to "Can you see me well or not?" in Italian. I need to figure out the best way to respond.\n\nFirst, I should acknowledge that as an AI, I don\'t have a visual component. I can\'t see images or videos. But maybe the user is asking if I can understand them properly. They might be concerned about whether I can process their input correctly, especially if they sent an image or video that I can\'t see.\n\nI should explain that I can\'t see visual content but can help with text. Also, offer assistance with any text-based questions or tasks. Maybe they want to know if I can interpret their message accurately. I need to be clear but helpful, ensuring they know my limitations and how I can still assist them.\n</think>\n\nNon ho la capacit√† di "vedere" immagini, video o oggetti fisici, poich√© sono un modello di intelligenza artificiale basato su testo. Tuttavia, posso aiutarti a rispondere a domande, anali

AIMessage(content="\n\nThe capital of France is **Paris**. Here's the reasoning process step-by-step:\n\n1. **Definition of a Capital**: A capital city is the seat of a country's government, housing key institutions like the executive branch (e.g., the president's residence), legislative bodies (e.g., parliament), and administrative centers.\n\n2. **Government Institutions in Paris**:\n   - The **√âlys√©e Palace** in Paris is the official residence of the President of France (currently Emmanuel Macron).\n   - The **French Parliament** (National Assembly and Senate) meets in Paris, with the National Assembly located in the **Palais Bourbon** and the Senate in the **Palais du Luxembourg**.\n\n3. **Historical Context**: Paris has been the political and cultural heart of France for centuries. It became the de facto capital during the Middle Ages and remained so through the French Revolution and modern times.\n\n4. **Geographical and Cultural Recognition**: Paris is widely recognized globally as France's capital, known for landmarks like the Eiffel Tower, Louvre Museum, and its role in art, fashion, and politics.\n\n5. **Elimination of Alternatives**: Other major French cities (e.g., Lyon, Marseille) are significant but not capitals. 
No historical or contemporary evidence suggests a shift in the capital from Paris.\n\n**Conclusion**: Paris is the capital of France due to its role as the political, administrative, and historical center of the country.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 764, 'prompt_tokens': 23, 'total_tokens': 787, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'Qwen/Qwen3-14B', 'system_fingerprint': None, 'id': 'chatcmpl-332131f388684d739b40d70ac741e41d', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--4774a36f-0f21-4cf3-8a06-0360cc876d6a-0', usage_metadata={'input_tokens': 23, 'output_tokens': 764, 'total_tokens': 787, 'input_token_details': {}, 'output_token_details': {}})

In [None]:
# Run the provider test against the Kimi provider.
print("--- üß™ TESTING KIMI PROVIDER ---")
test_provider(provider_name=KIMI_PROVIDER_NAME)

### Test LLM Judge

In [8]:
from hangman.evaluation.judge import HangmanJudge, MetricEvaluation
from hangman.providers.llmprovider import load_llm_provider
from hangman.prompts.hangman import SECRECY_JUDGE_PROMPT
from langchain_core.messages import HumanMessage
import json
import os
from tqdm import tqdm
import pandas as pd

In [2]:
# Configuration for the judge section: config file path and provider name.
CONFIG_PATH = "../config.yaml"
QWEN_PROVIDER_NAME = "qwen3_14b_local"

# Load the Qwen provider and hand it to the judge as its LLM provider.
llm_provider = load_llm_provider(
    provider_name=QWEN_PROVIDER_NAME,
    config_path=CONFIG_PATH,
)
judge = HangmanJudge(judge_llm_provider=llm_provider)

2025-08-07 15:12:41,696 - INFO - HangmanJudge initialized with model: Qwen/Qwen3-14B


In [3]:
# Directory containing the trial result files to evaluate.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
results_dir = '/home/mila/b/baldelld/scratch/hangman/results/hangman/ReActAgent/'

# Full paths of every '.json' entry in results_dir. os.listdir returns entries
# in arbitrary order, so the list is sorted to make the evaluation order (and
# the row order of anything built from it) reproducible across runs.
files_to_eval = sorted(
    os.path.join(results_dir, f)
    for f in os.listdir(results_dir)
    if f.endswith('.json')
)

In [None]:
# Evaluate every trial file with the judge, collecting one result per file.
results = []

for filepath in tqdm(files_to_eval):
    # Parse the trial JSON first; the file is closed before the judge runs.
    with open(filepath, mode='r') as fh:
        trial_data = json.load(fh)
    results.append(judge.evaluate_trial(trial_data))

In [9]:
# Convert the collected evaluation results into a DataFrame
# (presumably one row per evaluated trial -- confirm against evaluate_trial).
results = pd.DataFrame(data=results)

In [13]:
# Expand each metric column of `results` into its own DataFrame.
intentionality, secrecy, mechanism, coherence = (
    pd.DataFrame(results[metric].to_list())
    for metric in ('intentionality', 'secrecy', 'mechanism', 'coherence')
)

In [15]:
# Mean of the `score` column per metric, in the order:
# intentionality, secrecy, mechanism, coherence.
tuple(
    frame.score.mean()
    for frame in (intentionality, secrecy, mechanism, coherence)
)

(np.float64(4.1), np.float64(4.8), np.float64(2.9), np.float64(3.3))

In [3]:
# Load a single trial and format its interaction log for the judge prompt.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
json_path = '/home/mila/b/baldelld/scratch/hangman/results/hangman/ReActAgent/ReActAgent_20250807-135405.json'

# Use a context manager so the file handle is closed as soon as the JSON is
# parsed, rather than being left open for the lifetime of the kernel.
with open(json_path, 'r') as trial_file:
    trial_data = json.load(trial_file)

log_segment = trial_data["interaction_log"]
formatted_log = judge._format_log_for_prompt(log_segment)

In [6]:
# Fill the secrecy judge prompt template with the formatted interaction log
# and the judge's format instructions.
prompt = SECRECY_JUDGE_PROMPT.format(
    format_instructions=judge.format_instructions,
    interaction_log=formatted_log,
)

In [7]:
# Send the prompt to the judge's LLM as a single human message.
message = HumanMessage(content=prompt)
response = judge.llm.invoke(
    [message],
    thinking=True,
)

# Take the "response" entry of the result, parse it as JSON, and build a
# MetricEvaluation from the parsed fields.
llm_output_str = response["response"]
parsed_dict = json.loads(llm_output_str)
validated_data = MetricEvaluation(**parsed_dict)

2025-08-07 14:40:00,591 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"


In [11]:
# Evaluate the secrecy metric for this log segment through the judge's
# `_evaluate_metric` helper.
evaluation = judge._evaluate_metric(
    log_segment=log_segment,
    prompt_template=SECRECY_JUDGE_PROMPT,
)

2025-08-07 14:44:34,118 - INFO - HTTP Request: POST http://localhost:8000/v1/chat/completions "HTTP/1.1 200 OK"
