# Battleship Gameplay Manual Test

This notebook allows you to explore the new Battleship gameplay environment. 
The goal is to predict the next shot (r, c) given a partial board state.

In [1]:
import sys
import os
# Make the current working directory importable so the local `environments`
# package can be found (presumably the notebook is started from the project
# root -- confirm).
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Force-reload key modules so edits made after the kernel started are picked up
import importlib
import environments.battleship_logic
importlib.reload(environments.battleship_logic)
import environments.battleship_env
importlib.reload(environments.battleship_env)

import numpy as np
# Imported after the reloads above so the name binds to the reloaded class.
from environments.battleship_env import BattleshipEnvironment
import re
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Settings for the Battleship environment and its generated dataset.
config = dict(
    environment=dict(
        name='battleship',
        min_grid_size=6,
        max_grid_size=8,
        size=100,  # Generate more samples to ensure we find all edge cases
        seed=42,
    )
)

## Initialize and Generate


In [3]:
# Build the environment from `config`, then ask it for the dataset using the
# same `config`. `env` and `dataset` are reused by later cells.
env = BattleshipEnvironment(config)
dataset = env.get_dataset(config)
print(f"Generated {len(dataset)} gameplay examples.")

Generating Battleship dataset with config: BattleshipConfig(min_grid_size=6, max_grid_size=8, seed=42, size=100, fleet_spec={4: 1, 3: 2, 2: 3, 1: 4})


Map: 100%|██████████| 100/100 [00:00<00:00, 3325.33 examples/s]

Generated 100 gameplay examples.





## Visualization Helper


In [4]:
def _ship_status(ship, ship_id_grid, shots_grid):
    """Return 'SUNK', 'ALIVE' or '?' for one ship record without modifying it.

    '?' is returned when the ship has no positive 'id' or when that id does
    not appear anywhere in ``ship_id_grid`` (e.g. legacy data without IDs).
    """
    ship_id = ship.get('id', 0)
    if ship_id > 0:
        cells = (ship_id_grid == ship_id)
        if np.any(cells):  # Only judge ships whose ID actually exists on the board
            return 'SUNK' if np.all(shots_grid[cells] == 2) else 'ALIVE'
    return '?'


def print_game_state(example):
    """Print the user prompt of ``example`` followed by a debug view of its hidden board.

    Args:
        example: Mapping with
            - 'prompt': sequence whose element 1 has a 'content' entry (printed as-is),
            - 'answer': reference answer, printed at the end,
            - 'metadata': mapping with 'width', 'height', 'ship_board' and
              'shots_grid', plus optional 'ship_id_grid', 'ships' and
              'valid_targets'.

    In 'shots_grid' a value of 2 is rendered as a hit and 1 as a miss; any
    other value is treated as not yet shot. In 'ship_board' a value of 1 marks
    a ship cell.

    The function only prints. The per-ship SUNK/ALIVE status is computed
    locally and is never written back into ``example['metadata']['ships']``,
    so calling this helper leaves the example unchanged.

    Returns:
        None.
    """
    print("--- User Prompt ---")
    print(example['prompt'][1]['content'])

    # Access hidden state for debugging
    meta = example['metadata']

    print("\n--- True Board State (Cheat Sheet) ---")

    w, h = meta['width'], meta['height']
    ship_board = np.array(meta['ship_board'])
    shots_grid = np.array(meta['shots_grid'])

    # Robust handling for ship IDs (legacy data support)
    if 'ship_id_grid' in meta:
        ship_id_grid = np.array(meta['ship_id_grid'], dtype=int)
    else:
        ship_id_grid = np.zeros((h, w), dtype=int)
        print("Warning: 'ship_id_grid' not found in metadata. Using legacy visualization (IDs will be 0).")

    # List ships grouped by size; each entry pairs the untouched ship record
    # with its freshly computed status.
    print("Ships Placed:")
    ships_by_size = {}
    for ship in meta.get('ships', []):
        status = _ship_status(ship, ship_id_grid, shots_grid)
        ships_by_size.setdefault(ship['size'], []).append((ship, status))

    for size in sorted(ships_by_size, reverse=True):
        group = ships_by_size[size]
        print(f"  Size {size}: {len(group)} ships")
        for ship, status in group:
            ship_id = ship.get('id', '?')
            print(f"    - ID {ship_id} [{status}] {ship['orientation']} at ({ship['row']}, {ship['col']})")

    print("\nVisual Board (S=Hidden Ship, H=Hit Ship, M=Miss, .=Water, Number=ShipID):")
    print("    " + " ".join(f"{c:2}" for c in range(w)))
    print("   " + "-" * (w * 3))

    for r in range(h):
        row_str = ""
        for c in range(w):
            if shots_grid[r, c] == 2:  # Hit
                char = " H"
            elif shots_grid[r, c] == 1:  # Miss
                char = " M"
            elif ship_board[r, c] == 1:
                # Unshot ship cell: show its ID for debugging, or 'S' if IDs are missing
                sid = ship_id_grid[r, c]
                char = " S" if sid == 0 else f"{sid:2}"
            else:
                char = " ."
            row_str += char + " "
        print(f"{r:2} |{row_str}")

    print("\nReference Answer (One Example):", example['answer'])
    valid_targets = meta.get('valid_targets', [])
    print(f"All Valid Targets ({len(valid_targets)}): {valid_targets}")

In [5]:
# Visualize a single example (fixed index 0, not a random pick) to check the prompt text
idx = 0
example = dataset[idx]
print_game_state(example)

--- User Prompt ---
Play Battleship. Grid Size: 8x8
Board State (M=Miss, H=Hit, .=Unknown):
  0 1 2 3 4 5 6 7
0 H H . . M M . . 
1 M . M H . . . H 
2 M M M H M M M H 
3 . . M . M M M . 
4 . . . . H . . M 
5 M . M M M M . H 
6 M . . . . M . . 
7 M M M M . M . . 

Fleet sizes: 2x size 3, 3x size 2, 4x size 1
Output the next shot coordinate as a tuple (row, col).


--- True Board State (Cheat Sheet) ---
Ships Placed:
  Size 3: 2 ships
    - ID 1 [ALIVE] V at (1, 7)
    - ID 2 [ALIVE] H at (4, 3)
  Size 2: 3 ships
    - ID 3 [ALIVE] V at (5, 1)
    - ID 4 [SUNK] H at (0, 0)
    - ID 5 [SUNK] V at (1, 3)
  Size 1: 4 ships
    - ID 6 [ALIVE] H at (7, 4)
    - ID 7 [ALIVE] V at (3, 0)
    - ID 8 [ALIVE] H at (1, 5)
    - ID 9 [SUNK] H at (5, 7)

Visual Board (S=Hidden Ship, H=Hit Ship, M=Miss, .=Water, Number=ShipID):
     0  1  2  3  4  5  6  7
   ------------------------
 0 | H  H  .  .  M  M  .  . 
 1 | M  .  M  H  .  8  .  H 
 2 | M  M  M  H  M  M  M  H 
 3 | 7  .  M  .  M  M  M  1 
 4 | 

## Verifying All Reward Scenarios
Since scenarios (like a partial hit or a win) depend on the random board state, we will scan the dataset to find valid test cases for each reward type.

In [11]:
# NOTE(review): in the visible notebook, `check_move` and `meta` are only defined
# by the reward-scan cell (In [6]) that appears further down, so this cell fails
# on a fresh kernel unless that cell has been run first.
check_move(1, 2, meta)

-1.0

In [12]:
# Debug inspection: shows the full `meta` dict, presumably the one left bound by
# the reward-scan loop (In [6]) for the last sample it visited -- confirm.
meta

{'difficulty': {'grid_size': [8, 8]},
 'fleet': {'1': 4, '2': 3, '3': 2},
 'height': 8,
 'ship_board': [[0, 0, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 1, 0, 1],
  [0, 1, 0, 1, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 1, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 1],
  [1, 0, 0, 1, 1, 0, 0, 1]],
 'ship_id_grid': [[0, 0, 0, 0, 0, 0, 0, 0],
  [0, 5, 0, 0, 0, 6, 0, 8],
  [0, 5, 0, 9, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [2, 0, 0, 0, 7, 0, 0, 0],
  [2, 0, 0, 0, 0, 0, 0, 4],
  [2, 0, 0, 3, 3, 0, 0, 4]],
 'ships': [{'col': 5, 'id': 1, 'orientation': 'H', 'row': 3, 'size': 3},
  {'col': 0, 'id': 2, 'orientation': 'V', 'row': 5, 'size': 3},
  {'col': 3, 'id': 3, 'orientation': 'H', 'row': 7, 'size': 2},
  {'col': 7, 'id': 4, 'orientation': 'V', 'row': 6, 'size': 2},
  {'col': 1, 'id': 5, 'orientation': 'V', 'row': 1, 'size': 2},
  {'col': 5, 'id': 6, 'orientation': 'V', 'row': 1, 'size': 1},
  {'col': 4, 'id': 7, 'orientation': 'V

In [6]:
def check_move(r, c, meta):
    """Score the shot ``(r, c)`` against ``meta`` using the notebook-level ``env``.

    The move is wrapped in a one-message completion of the form
    ``<think>Try (r, c)</think><answer>(r, c)</answer>`` and passed, together
    with ``meta``, to ``env.completeness_reward`` as a batch of size one.

    Returns:
        The first element of what ``env.completeness_reward`` returns.
    """
    shot = f"({r}, {c})"
    message = {"content": f"<think>Try {shot}</think><answer>{shot}</answer>"}
    return env.completeness_reward([[message]], metadata=[meta])[0]

# Track which scenarios we've verified
scenarios_found = {
    'New Hit (+1.0)': False,
    'Partial Hit (+2.0)': False,
    'Win (+10.0)': False,
    'Miss (0.0)': False,
    'Invalid (-1.0)': False
}

# Short name printed in the "[FOUND] ..." line for each scenario key.
_scenario_labels = {
    'New Hit (+1.0)': 'New Hit',
    'Partial Hit (+2.0)': 'Partial Hit',
    'Win (+10.0)': 'Win',
    'Miss (0.0)': 'Miss',
    'Invalid (-1.0)': 'Invalid/Repeat',
}


def _classify_move(ships, shots, ship_ids, r, c):
    """Return the ``scenarios_found`` key that a shot at (r, c) falls under, or None.

    Args:
        ships: 2-D array; 1 marks a ship cell, 0 marks water.
        shots: 2-D array; 0 means not yet shot, 2 marks an earlier hit.
        ship_ids: 2-D int array; positive values identify the ship on a cell,
            0 means no ID is known for that cell.
        r, c: Row and column of the candidate shot.

    Classification order:
        1. Cell already shot               -> 'Invalid (-1.0)'
        2. Unshot water cell               -> 'Miss (0.0)'
        3. Unshot ship cell with a ship ID:
           - every ID-bearing cell is hit after this shot -> 'Win (+10.0)'
           - the same ship already has a hit              -> 'Partial Hit (+2.0)'
           - otherwise                                    -> 'New Hit (+1.0)'
        Ship cells without a positive ID are skipped (None).

    The win check runs for every fresh ship cell, not only for ships that were
    already hit, so finishing the game on an untouched ship (e.g. a size-1
    ship) is reported as a win instead of a new hit.
    NOTE(review): assumes the environment pays the win reward in that case --
    confirm against ``env.completeness_reward``.
    """
    if shots[r, c] != 0:
        return 'Invalid (-1.0)'
    if ships[r, c] == 0:
        return 'Miss (0.0)'
    if ships[r, c] != 1:
        return None

    s_id = ship_ids[r, c]
    if s_id <= 0:
        return None

    # Simulate the hit and see whether every ship cell is now hit.
    temp_shots = shots.copy()
    temp_shots[r, c] = 2
    if np.all(temp_shots[ship_ids != 0] == 2):
        return 'Win (+10.0)'

    # Not a win: distinguish a follow-up hit on a damaged ship from a first hit.
    was_hit = np.any((shots == 2) & (ship_ids == s_id))
    return 'Partial Hit (+2.0)' if was_hit else 'New Hit (+1.0)'


print("Scanning dataset for test cases...\n")

# NOTE(review): rewards are only printed here, not asserted against the values
# named in the scenario labels; compare them by eye or add assertions.
# `meta` stays bound to the last visited sample after the loop; the ad-hoc
# `check_move(1, 2, meta)` and `meta` cells elsewhere in this notebook use it.
for i, example in enumerate(dataset):
    meta = example['metadata']
    ships = np.array(meta['ship_board'])
    shots = np.array(meta['shots_grid'])
    if 'ship_id_grid' in meta:
        ship_ids = np.array(meta['ship_id_grid'], dtype=int)
    else:
        ship_ids = np.zeros_like(ships, dtype=int)

    h, w = meta['height'], meta['width']

    # Try every cell of this board, reporting only the first hit of each scenario
    for r in range(h):
        for c in range(w):
            scenario = _classify_move(ships, shots, ship_ids, r, c)
            if scenario is None or scenarios_found[scenario]:
                continue
            rew = check_move(r, c, meta)
            print(f"[FOUND] {_scenario_labels[scenario]} at sample {i}, ({r},{c}) -> Reward: {rew}")
            scenarios_found[scenario] = True

    if all(scenarios_found.values()):
        break

print("\nSummary:")
for k, v in scenarios_found.items():
    print(f"{k}: {'✅' if v else '❌'}")

Scanning dataset for test cases...

[FOUND] Invalid/Repeat at sample 0, (0,0) -> Reward: -1.0
[FOUND] Miss at sample 0, (0,2) -> Reward: 0.0
[FOUND] New Hit at sample 0, (1,5) -> Reward: 1.0
[FOUND] Partial Hit at sample 0, (3,7) -> Reward: 2.0
[FOUND] Win at sample 11, (7,3) -> Reward: 10.0

Summary:
New Hit (+1.0): ✅
Partial Hit (+2.0): ✅
Win (+10.0): ✅
Miss (0.0): ✅
Invalid (-1.0): ✅


In [7]:
# Re-display the scenario flags filled in by the scan cell; a value of False
# means no example of that scenario was found in the dataset.
scenarios_found

{'New Hit (+1.0)': True,
 'Partial Hit (+2.0)': True,
 'Win (+10.0)': True,
 'Miss (0.0)': True,
 'Invalid (-1.0)': True}