# HyperToken Python Bridge Demo

This notebook demonstrates using HyperToken environments in Python through the WebSocket bridge.

## Prerequisites

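Start the HyperToken bridge server before running this notebook. The cells below connect to `ws://localhost:9999`, so the `--port` value must match that URL: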

```bash
npx tsx bridge/server.ts --env blackjack --port 9999 --verbose
```

## 1. Setup and Connection

In [None]:
# Install hypertoken if needed
# !pip install hypertoken

from hypertoken import HyperTokenAECEnv, HyperTokenClient
import numpy as np

# Open the AEC environment over the WebSocket bridge (the server must already be running)
env = HyperTokenAECEnv("ws://localhost:9999")
print("Connected to HyperToken!")

# List every agent, then show the spaces of the first one as a representative
agent_ids = env.possible_agents
print(f"Possible agents: {agent_ids}")

first_agent = agent_ids[0]
obs_space = env.observation_space(first_agent)
print(f"Observation space: {obs_space}")
act_space = env.action_space(first_agent)
print(f"Action space: {act_space}")

## 2. Server Latency Check

In [None]:
# Measure round-trip latency to the bridge with a separate, short-lived client
client = HyperTokenClient("ws://localhost:9999")
client.connect()

try:
    # Ten pings give a rough mean/min/max. The "ms" labels assume ping()
    # reports milliseconds -- TODO confirm against the client documentation.
    latencies = [client.ping() for _ in range(10)]
    print(f"Average latency: {np.mean(latencies):.2f}ms")
    print(f"Min latency: {np.min(latencies):.2f}ms")
    print(f"Max latency: {np.max(latencies):.2f}ms")
finally:
    # Always release the connection, even if a ping fails or the cell is interrupted
    client.disconnect()

## 3. Running a Single Episode

In [None]:
# Begin a fresh episode; the fixed seed is forwarded to the environment's reset
env.reset(seed=42)
print("Episode started!")

# Report the agents taking part and the one currently selected to act
active_agents = env.agents
print(f"Active agents: {active_agents}")
current_agent = env.agent_selection
print(f"Current agent: {current_agent}")

In [None]:
# Step through one episode, logging every transition.
# The label table is loop-invariant, so it is built once; list index == action id.
action_names = ['Hit', 'Stand', 'Double', 'Split', 'Insurance']
fallback_action = 1  # used when the mask allows nothing; same default run_episode uses
step_count = 0

for agent in env.agent_iter():
    obs, reward, terminated, truncated, info = env.last()

    print(f"\n--- Step {step_count} ---")
    print(f"Agent: {agent}")
    print(f"Observation: {obs}")
    print(f"Reward: {reward}")
    print(f"Terminated: {terminated}, Truncated: {truncated}")

    if terminated or truncated:
        # A finished agent is still stepped, but with no action
        print("Agent done, passing None action")
        action = None
    else:
        # Get valid actions
        mask = env.action_mask(agent)
        print(f"Action mask: {mask}")

        # Sample a random action; if the mask rejects it, redraw among the valid ones
        action = env.action_space(agent).sample()
        if mask is not None and not mask[action]:
            valid_actions = np.where(mask)[0]
            # np.random.choice raises ValueError on an empty array, so guard it
            if len(valid_actions) > 0:
                action = np.random.choice(valid_actions)
            else:
                action = fallback_action

        # Ids outside the label table get a placeholder instead of an IndexError
        if 0 <= action < len(action_names):
            action_label = action_names[action]
        else:
            action_label = 'Unknown'
        print(f"Taking action: {action} ({action_label})")

    env.step(action)
    step_count += 1

    if step_count > 20:  # Safety limit
        print("\nReached step limit, breaking...")
        break

print("\n" + "="*40)
print("Episode complete!")
print(f"Final rewards: {env.rewards()}")

## 4. Running Multiple Episodes

In [None]:
def run_episode(env, policy='random', seed=None):
    """Play one episode to completion and return the environment's rewards.

    Args:
        env: AEC-style environment exposing ``reset``, ``agent_iter``, ``last``,
            ``action_mask``, ``action_space``, ``step`` and ``rewards``.
        policy: ``'conservative'`` stands once the estimated hand value exceeds
            16 and hits otherwise. ``'random'`` (and any unrecognised name)
            samples from the agent's action space.
        seed: Optional seed forwarded to ``env.reset`` for reproducible
            episodes. When ``None`` (the default) ``env.reset()`` is called
            without arguments, exactly as before this parameter existed.

    Returns:
        Whatever ``env.rewards()`` returns once the agent iterator is exhausted.
    """
    hit, stand = 0, 1
    stand_above = 16
    # obs[0] is assumed to be the hand value divided by 30 -- TODO confirm
    # against the server's observation encoding.
    hand_value_scale = 30

    if seed is None:
        env.reset()
    else:
        env.reset(seed=seed)

    for agent in env.agent_iter():
        obs, _reward, term, trunc, _info = env.last()

        if term or trunc:
            # A finished agent is still stepped, but with no action
            env.step(None)
            continue

        mask = env.action_mask(agent)

        if policy == 'conservative':
            hand_value = obs[0] * hand_value_scale  # Denormalize
            action = stand if hand_value > stand_above else hit
        else:
            # 'random' and every unrecognised policy name sample uniformly
            action = env.action_space(agent).sample()

        # Replace an action the mask rejects; stand if nothing is allowed
        if mask is not None and not mask[action]:
            valid = np.where(mask)[0]
            action = np.random.choice(valid) if len(valid) > 0 else stand

        env.step(action)

    return env.rewards()

# Play a batch of conservative-policy episodes, recording each agent's reward per episode
num_episodes = 50
all_rewards = dict((name, []) for name in env.possible_agents)

for finished in range(1, num_episodes + 1):
    episode_result = run_episode(env, policy='conservative')
    for name, value in episode_result.items():
        all_rewards[name].append(value)

    # Progress line after every tenth episode
    if finished % 10 == 0:
        print(f"Completed {finished}/{num_episodes} episodes")

# Per-agent summary: mean and total reward across the batch
print("\nResults:")
for name in env.possible_agents:
    history = all_rewards[name]
    print(f"  {name}: avg={np.mean(history):.2f}, total={sum(history):.2f}")

## 5. Comparing Policies

In [None]:
# Compare policies on the first listed agent's per-episode reward.
# One constant drives both the loop and the report header so they cannot drift apart.
EPISODES_PER_POLICY = 30
policies = ['random', 'conservative']
results = {}

for policy in policies:
    rewards = []
    for _ in range(EPISODES_PER_POLICY):
        episode_rewards = run_episode(env, policy=policy)
        # Record only the first listed agent's reward for this episode
        rewards.append(episode_rewards[env.possible_agents[0]])

    results[policy] = {
        'mean': np.mean(rewards),
        'std': np.std(rewards),
        'total': sum(rewards)
    }

print(f"Policy Comparison ({EPISODES_PER_POLICY} episodes each):")
print("-" * 50)
for policy, stats in results.items():
    print(f"{policy:15} mean={stats['mean']:7.2f} std={stats['std']:6.2f}")

## 6. Cleanup

In [None]:
# Shut down the environment created in Section 1 once all experiments are finished.
# NOTE(review): presumably close() drops the WebSocket connection, so re-run the
# connection cell before using `env` again -- confirm against the hypertoken docs.
env.close()
print("Environment closed!")