In [1]:
# Build environment: wordle game
# An LLM (the oracle) sees a secret word but is not allowed to say it; an SLM (the training target) asks it questions in order to guess the word.
# We reward the SLM if it guesses correctly, but the training signal is applied to the questions it asks as well as to the guess
# it provides.

In [4]:
import anthropic
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Shared Anthropic API client; both Claude-backed helpers below call `client.messages.create`.
client = anthropic.Anthropic()  # uses ANTHROPIC_API_KEY env var

def oracle_fn_claude(
    system_prompt: str,
    question: str,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 64,
) -> str:
    """Ask Claude (acting as the oracle) a single question and return its text answer.

    Args:
        system_prompt: System prompt sent with the request.
        question: The learner's question, sent as the only user message.
        model: Anthropic model id to query. Keyword-only; defaults to the
            model this notebook was written against.
        max_tokens: Upper bound on the length of the reply. Keyword-only.

    Returns:
        The concatenated text of every text block in the reply, or an empty
        string when the reply contains no text block.
    """
    resp = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[{"role": "user", "content": question}],
    )
    # Do not index `content[0]` blindly: the list can be empty, and a block
    # is not guaranteed to be a text block with a `.text` attribute.
    return "".join(
        block.text for block in resp.content if getattr(block, "type", None) == "text"
    )

def learner_fn_claude(
    prompt: str,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 64,
) -> str:
    """Use Claude as the learner: send one user prompt and return the text reply.

    Args:
        prompt: Full prompt for the learner, sent as the only user message
            (no system prompt is set).
        model: Anthropic model id to query. Keyword-only; defaults to the
            model this notebook was written against.
        max_tokens: Upper bound on the length of the reply. Keyword-only.

    Returns:
        The concatenated text of every text block in the reply, or an empty
        string when the reply contains no text block.
    """
    resp = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    # Do not index `content[0]` blindly: the list can be empty, and a block
    # is not guaranteed to be a text block with a `.text` attribute.
    return "".join(
        block.text for block in resp.content if getattr(block, "type", None) == "text"
    )

# Local learner (the training target): tokenizer and causal LM loaded from the same checkpoint.
LEARNER_MODEL_ID = "Qwen/Qwen3-0.6B"

learner_tok = AutoTokenizer.from_pretrained(LEARNER_MODEL_ID)
# `dtype` replaces the deprecated `torch_dtype` keyword (transformers warns on the old name).
learner_llm = AutoModelForCausalLM.from_pretrained(
    LEARNER_MODEL_ID, dtype=torch.bfloat16, device_map="auto",
)

def learner_fn(
    prompt: str,
    *,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    enable_thinking: bool = True,
) -> str:
    """Sample one reply from the local learner model for a single-turn prompt.

    The prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template, and completed by sampling from `learner_llm`.
    With the default keyword values the call is identical to the original
    hard-coded one.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Generation budget. Any reasoning text the model emits
            counts against it, so raise it if replies are cut off.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        enable_thinking: When False, `enable_thinking=False` is forwarded to
            the chat template; Qwen3's template documents this switch for
            disabling the `<think>` block. When True nothing extra is passed,
            leaving the template's default behaviour.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens
        skipped).
    """
    msgs = [{"role": "user", "content": prompt}]
    # Only pass the flag when it deviates from the default, so the default
    # path renders the template exactly as before.
    template_kwargs = {} if enable_thinking else {"enable_thinking": False}
    text = learner_tok.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True, **template_kwargs
    )
    ids = learner_tok(text, return_tensors="pt").to(learner_llm.device)
    prompt_len = ids["input_ids"].shape[1]
    with torch.no_grad():
        out = learner_llm.generate(
            **ids, max_new_tokens=max_new_tokens, temperature=temperature, do_sample=True,
            top_p=top_p, top_k=top_k,
        )
    # `generate` output includes the prompt tokens; keep only what was newly generated.
    return learner_tok.decode(out[0][prompt_len:], skip_special_tokens=True)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/311 [00:00<?, ?it/s]



In [7]:
# --- Claude oracle vs local Qwen learner: full game ---
# NOTE(review): despite the `*_claude` variable names, the learner passed to `rollout`
# below is `learner_fn` (the local Qwen model), not `learner_fn_claude`.
from src.wordle_env import WordleEnv, EnvConfig

# Claude answers the learner's questions; presumably `max_questions` caps the questions per game — confirm in EnvConfig.
env_claude = WordleEnv(oracle_fn=oracle_fn_claude, config=EnvConfig(max_questions=5))
# Play one game with a fixed secret word; `verbose=True` appears to print the transcript — confirm in WordleEnv.rollout.
state_claude = env_claude.rollout(learner_fn=learner_fn, secret_word="piano", verbose=True)

[Secret word: piano]
--------------------------------------------------
Learner: <think>
Okay, let's see. I need to figure out the secret word based on the game rules. The player knows the secret word, and I have to ask them questions to narrow it down. The process is to ask a question, wait for their response, and repeat this until I guess it. There are five more messages to go through.

First, I should think about the possible strategies. Since the word is secret, maybe the key is to ask questions that can eliminate certain possibilities. Let's think of common word games. For example, if the word is a 5-letter word, maybe starting with the first letters, checking for common letters, or looking for an anagram.

Wait, the user hasn't provided the actual questions yet. So I need to come up with a question that can help narrow down the possibilities. Let's brainstorm some questions. For example, "What is the first letter?" If the player responds with a letter, that could help eliminate s