In [1]:
from Game import Game


In [5]:
# Three-handed table: every seat starts with the same 1000-chip stack.
players_data = [(name, 1000) for name in ('Alice', 'Bob', 'Charlie')]
game = Game(players_data)

# Play a single hand; each player's turn is printed in the cell output.
game.run_hand()

Charlie posts small blind: 5
Alice posts big blind: 10
Bob's turn.
Hand: [5♥, 2♠]
Stack: 1000
Amount to call: 10
Legal actions: ['fold', 'call', 'raise']
Bob calls.
Charlie's turn.
Hand: [5♣, 6♦]
Stack: 995
Amount to call: 10
Legal actions: ['fold', 'call', 'raise']
Charlie calls.
Alice's turn.
Hand: [T♣, 8♦]
Stack: 990
Amount to call: 10
Legal actions: ['check', 'bet']
Alice checks.
Community cards: [2♥, 8♣, 2♦]
Charlie's turn.
Hand: [5♣, 6♦]
Stack: 990
Amount to call: 0
Legal actions: ['check', 'bet']
Charlie checks.
Alice's turn.
Hand: [T♣, 8♦]
Stack: 990
Amount to call: 0
Legal actions: ['check', 'bet']
Alice checks.
Bob's turn.
Hand: [5♥, 2♠]
Stack: 990
Amount to call: 0
Legal actions: ['check', 'bet']
Bob checks.
Community cards: [2♥, 8♣, 2♦, 3♣]
Charlie's turn.
Hand: [5♣, 6♦]
Stack: 990
Amount to call: 0
Legal actions: ['check', 'bet']
Charlie checks.
Alice's turn.
Hand: [T♣, 8♦]
Stack: 990
Amount to call: 0
Legal actions: ['check', 'bet']
Alice checks.
Bob's turn.
Hand: [5♥, 2♠

In [3]:
import unittest
from Game import Game

class TestGame(unittest.TestCase):
    """Exercise ``Game`` construction, hand setup and scripted betting rounds.

    Defining the class does not execute it; in a notebook the tests still have
    to be started explicitly (for example ``unittest.main(argv=[''], exit=False)``).
    """

    # Seats 0, 1 and 2 of the three-handed table used by most tests.
    THREE_HANDED = (('Alice', 1000), ('Bob', 1000), ('Charlie', 1000))

    def _three_handed_game(self):
        """Return a three-player ``Game`` on which ``_setup_hand`` has been run."""
        table = Game(list(self.THREE_HANDED))
        table._setup_hand()
        return table

    @staticmethod
    def _script(table, seat, *actions):
        """Replace a seat's ``scripted_actions`` with the given (action, amount) pairs."""
        table.players[seat].scripted_actions = list(actions)

    def test_game_creation(self):
        # A heads-up game keeps the seat order and exposes the 5/10 blinds.
        table = Game([('Alice', 1000), ('Bob', 1000)])
        self.assertEqual(len(table.players), 2)
        self.assertEqual(table.players[0].name, 'Alice')
        self.assertEqual(table.small_blind, 5)
        self.assertEqual(table.big_blind, 10)

    def test_setup_hand(self):
        table = self._three_handed_game()

        # Both blinds are in the pot and no board cards are out yet.
        self.assertEqual(table.pot, 15)
        self.assertEqual(len(table.community_cards), 0)
        # Every seat holds two hole cards.
        for seat in range(3):
            self.assertEqual(len(table.players[seat].hand), 2)
        # Seat 2 posted the small blind, seat 0 the big blind.
        self.assertEqual(table.players[2].current_bet, 5)
        self.assertEqual(table.players[0].current_bet, 10)

    def test_pre_flop_betting_all_fold_to_bb(self):
        table = self._three_handed_game()

        # Scripted actions: Bob folds, Charlie folds
        self._script(table, 1, ('fold', 0))
        self._script(table, 2, ('fold', 0))

        table._run_betting_round()

        # Alice (BB) should win the pot: 1000 - 10 (bb) + 15 (pot) = 1005.
        # Bob never put chips in; Charlie loses only the small blind.
        for seat, expected_stack in enumerate((1005, 1000, 995)):
            self.assertEqual(table.players[seat].stack, expected_stack)

    def test_post_flop_betting_check_around(self):
        table = self._three_handed_game()

        # Pre-flop: Bob calls, Charlie calls, Alice checks
        self._script(table, 1, ('call', 0))
        self._script(table, 2, ('call', 0))
        self._script(table, 0, ('check', 0))
        table._run_betting_round()

        table._deal_community_cards(3)

        # Post-flop: all check
        for seat in (1, 2, 0):
            self._script(table, seat, ('check', 0))
        table._run_betting_round()

        # Nothing was added after the flop, so the pot is still 3 x 10.
        self.assertEqual(table.pot, 30)

    def test_bet_raise_and_call(self):
        table = self._three_handed_game()

        # Pre-flop actions: Bob calls, Charlie raises, Alice calls, and Bob's
        # second scripted call answers the raise.
        self._script(table, 1, ('call', 0), ('call', 0))
        self._script(table, 2, ('raise', 30))
        self._script(table, 0, ('call', 0))

        table._run_betting_round()

        # Every seat ends up with 30 in the pot.
        self.assertEqual(table.pot, 90)
        for seat in range(3):
            self.assertEqual(table.players[seat].stack, 970)

In [7]:
from pydantic import BaseModel
from ai_client import AIClient
from openai import OpenAI
import os

# 1. Define your Pydantic model
# Schema for the structured reply: this type is passed as `response_format`
# further down, and the parsed result is read back as an instance of it.
# NOTE(review): documented with `#` comments on purpose; presumably a class
# docstring would be picked up by pydantic as the schema description sent to
# the model (confirm before adding one).
class UserInfo(BaseModel):
    name: str  # the only extracted field; the demo prints it

# 2. Configure the client for OpenRouter's OpenAI-compatible endpoint.
# Fail fast when the key is missing: with api_key=None the SDK falls back to
# OPENAI_API_KEY, which would then be sent to a third-party host.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError("OPENROUTER_API_KEY is not set")

client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)

# 3. Request structured output
completion = client.beta.chat.completions.parse(
    model="openai/gpt-oss-120b",  # Ensure you use a model that supports structured outputs
    messages=[
        {"role": "system", "content": "Extract the user information."},
        {"role": "user", "content": "John Doe is 30 years old and lives in New York."}
    ],
    response_format=UserInfo,
)

# 4. Access the parsed, validated Pydantic object
message = completion.choices[0].message
user = message.parsed
if user is None:
    # `parsed` is empty when the model refused or produced nothing parseable;
    # surface that instead of failing with an AttributeError on `user.name`.
    raise RuntimeError(
        f"No structured output returned (refusal: {getattr(message, 'refusal', None)!r})"
    )
print(user.name) # "John Doe"


John Doe


In [26]:
import sqlite3
from collections import defaultdict

db_path = "benchmark/results.db"
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row

# Read every session once, then release the connection even if the query
# fails (e.g. the table is missing). The analysis below needs only the rows.
# ORDER BY id keeps the reported ID lists in a stable order between runs.
try:
    session_rows = conn.execute(
        "SELECT id, llm1_name, llm2_name FROM game_results ORDER BY id"
    ).fetchall()
finally:
    conn.close()

# Collect all unique LLM names and the session IDs of every unordered pair.
# Keys are name-sorted tuples, so (A, B) and (B, A) land in the same bucket.
llm_names = set()
pair_sessions = defaultdict(list)
for row in session_rows:
    l1, l2 = row["llm1_name"], row["llm2_name"]
    llm_names.update((l1, l2))
    pair_sessions[tuple(sorted((l1, l2)))].append(row["id"])
llms = sorted(llm_names)

print("All LLMs found in results:")
print(llms)
print("=" * 100)

missing = []
duplicates = []

print("Pairwise session check (should be exactly one session per unordered pair):")
print("-" * 100)
# `llms` is sorted, so (llm1, llm2) is ordered exactly like the dict keys.
# Rows where both names are equal are never visited here and go unreported.
for i, llm1 in enumerate(llms):
    for llm2 in llms[i+1:]:
        pair = (llm1, llm2)
        # .get() rather than indexing: avoids inserting empty defaultdict entries.
        session_ids = pair_sessions.get(pair, [])
        count = len(session_ids)
        print(f"{llm1} vs {llm2}: {count} session(s) (IDs: {session_ids})")
        if count == 0:
            missing.append(pair)
        elif count > 1:
            duplicates.append((pair, session_ids))

# Derived from the two lists so the flag cannot drift out of sync with them.
all_good = not missing and not duplicates

print("\n" + "=" * 100)

if all_good:
    print("✅ No duplicates or missing sessions! Exactly one session per unordered pair.")
else:
    if missing:
        print("❌ Missing sessions for pairs:")
        for pair in missing:
            print(f"  {pair[0]} vs {pair[1]}")
    if duplicates:
        print("\n❌ Duplicate sessions detected for pairs:")
        for pair, session_ids in duplicates:
            print(f"  {pair[0]} vs {pair[1]}: {len(session_ids)} sessions (IDs: {session_ids})")
    print("\n⚠️  Review the above to fix missing or duplicate sessions.")

All LLMs found in results:
['claude-opus-4.5', 'claude-sonnet-4.5', 'deepseek-v3.2', 'gemini-2.5-flash', 'gemini-2.5-flash-lite', 'gemini-3-flash-preview', 'gemini-3-pro-preview', 'gpt-4o-mini', 'gpt-5.2-pro', 'gpt-oss-120b', 'grok-4.1-fast', 'qwen3-235b-a22b-2507']
Pairwise session check (should be exactly one session per unordered pair):
----------------------------------------------------------------------------------------------------
claude-opus-4.5 vs claude-sonnet-4.5: 1 session(s) (IDs: [469])
claude-opus-4.5 vs deepseek-v3.2: 1 session(s) (IDs: [466])
claude-opus-4.5 vs gemini-2.5-flash: 1 session(s) (IDs: [461])
claude-opus-4.5 vs gemini-2.5-flash-lite: 1 session(s) (IDs: [460])
claude-opus-4.5 vs gemini-3-flash-preview: 1 session(s) (IDs: [462])
claude-opus-4.5 vs gemini-3-pro-preview: 1 session(s) (IDs: [541])
claude-opus-4.5 vs gpt-4o-mini: 1 session(s) (IDs: [465])
claude-opus-4.5 vs gpt-5.2-pro: 1 session(s) (IDs: [481])
claude-opus-4.5 vs gpt-oss-120b: 1 session(s) (IDs

In [27]:
import sqlite3

db_path = "benchmark/results.db"
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row

# Session to remove from game_results, together with any related hand_logs.
session_to_delete = 544
# Number of hand_logs rows every remaining session is expected to have.
EXPECTED_HANDS_PER_SESSION = 4

sessions_with_incorrect_hands = []

try:
    # `with conn` commits both deletes together on success and rolls them back
    # on error, instead of leaving a half-finished transaction open.
    with conn:
        # Delete from hand_logs
        conn.execute("DELETE FROM hand_logs WHERE session_id = ?", (session_to_delete,))
        # Delete from game_results
        conn.execute("DELETE FROM game_results WHERE id = ?", (session_to_delete,))

    # Get all session ids from game_results
    session_rows = conn.execute("SELECT id FROM game_results").fetchall()
    for row in session_rows:
        session_id = row["id"]
        hand_count = conn.execute(
            "SELECT COUNT(*) FROM hand_logs WHERE session_id = ?",
            (session_id,)
        ).fetchone()[0]
        if hand_count != EXPECTED_HANDS_PER_SESSION:
            sessions_with_incorrect_hands.append((session_id, hand_count))
finally:
    # Always release the database file, including when a statement fails.
    conn.close()

if sessions_with_incorrect_hands:
    print(f"Sessions with != {EXPECTED_HANDS_PER_SESSION} hands:")
    for sid, hcount in sessions_with_incorrect_hands:
        print(f"  Session {sid} has {hcount} hands.")
else:
    print(f"All sessions have exactly {EXPECTED_HANDS_PER_SESSION} hands!")


All sessions have exactly 4 hands!


In [None]:
import sqlite3

db_path = "benchmark/results.db"