In [1]:
import re

def keep_sgf_file(sgf: str) -> bool:
    """
    Decide whether an SGF record should be kept.

    A record is kept only when both conditions hold:
    - the first SZ[<digits>] property found in the text is exactly SZ[19];
    - there is no HA[<digits>] property, or the first one found is HA[0].

    Returns True to keep the record, False to discard it.
    """
    # Board size: a missing SZ tag is treated the same as a non-19 board.
    board_size = re.search(r'SZ\[(\d+)\]', sgf)
    if board_size is None or board_size.group(1) != '19':
        return False

    # Handicap: absent is fine; otherwise the value must be the literal "0".
    handicap = re.search(r'HA\[(\d+)\]', sgf)
    return handicap is None or handicap.group(1) == '0'

In [3]:
# One SGF property value: "[" ... "]". A backslash escapes the next character,
# so "\]" does not close the value, and a bare "[" inside it is ordinary text.
_SGF_VALUE = r'\[(?:\\.|[^\\\]])*\]'

# Properties that clean_sgf drops entirely.
_SGF_REMOVE_PROPS = ('AP', 'GN', 'DT', 'WR', 'BR', 'CA', 'RU', 'C', 'HA', 'T', 'TT', 'TTT')

# The lookbehind anchors each name at the start of a property identifier.
# Without it, "C[" also matches the tail of "PC[...]" and "T[" the tail of
# "WT[...]", leaving a stray "P" / "W" behind in the output.
_SGF_REMOVE_RE = re.compile(
    r'(?<![A-Za-z])(?:' + '|'.join(_SGF_REMOVE_PROPS) + r')' + _SGF_VALUE
)
_SGF_WHITE_RE = re.compile(r'(?<![A-Za-z])PW' + _SGF_VALUE)
_SGF_BLACK_RE = re.compile(r'(?<![A-Za-z])PB' + _SGF_VALUE)


def clean_sgf(sgf: str) -> str:
    """
    Normalize an SGF string by stripping metadata and anonymizing players.

    Steps, in order:
    1. Remove every whitespace character (including inside property values).
    2. Delete the properties listed in _SGF_REMOVE_PROPS together with their
       value. Only whole property identifiers are matched, so e.g. PC[...]
       and WT[...] are left intact.
    3. Replace the PW / PB values with "White" / "Black".
    4. Collapse runs of ";" into one and strip empty "[]" pairs.

    Args:
        sgf: Raw SGF text.

    Returns:
        The cleaned SGF text.
    """
    # Remove all whitespace characters
    sgf = re.sub(r'\s+', '', sgf)

    # Drop unwanted properties (value pattern honours "\]" escapes)
    sgf = _SGF_REMOVE_RE.sub('', sgf)

    # Replace player names with standardized names
    sgf = _SGF_WHITE_RE.sub('PW[White]', sgf)
    sgf = _SGF_BLACK_RE.sub('PB[Black]', sgf)

    # Clean up multiple semicolons or empty brackets
    sgf = re.sub(r';{2,}', ';', sgf)
    # NOTE(review): this also turns a pass move ";B[]" into ";B". Kept as-is
    # because downstream consumers may rely on it; confirm whether pass moves
    # should be preserved.
    sgf = re.sub(r'\[\]', '', sgf)

    return sgf

In [18]:
import os
import random
from pathlib import Path

def process_sgfs(input_folder: str, output_base: str = ".", seed=None,
                 train_fraction: float = 0.98):
    """
    Filter, clean and split a tree of SGF files into train/ and val/ folders.

    Every "*.sgf" file under input_folder (searched recursively) is read,
    kept only if keep_sgf_file() accepts it, cleaned with clean_sgf(), and
    written to <output_base>/train/... or <output_base>/val/... at the same
    relative path it had under input_folder.

    Args:
        input_folder: Root directory searched recursively for "*.sgf" files.
        output_base: Directory under which "train" and "val" are created.
        seed: Optional seed for the shuffle. When None, the module-level
            random state is used (the original behaviour). When given, the
            shuffle is reproducible because the file list is sorted first.
        train_fraction: Share of kept files assigned to the train split;
            the remainder goes to val. Must be within [0, 1].

    Raises:
        ValueError: If train_fraction is outside [0, 1].

    Prints the number of files found, the number kept, and a final summary.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # Step 1: Find all SGF files recursively. Sorted so that a seeded shuffle
    # does not depend on the order the filesystem lists entries in.
    sgf_files = sorted(Path(input_folder).rglob("*.sgf"))
    print(len(sgf_files))

    # Step 2: Filter to only keep desired SGFs.
    # latin1 maps every byte to one code point, so decoding cannot fail.
    keep_files = []
    for fp in sgf_files:
        with open(fp, "r", encoding="latin1") as f:
            data = f.read()
        if keep_sgf_file(data):
            keep_files.append((fp, data))
    print(len(keep_files))

    # Step 3: Shuffle and split
    if seed is None:
        random.shuffle(keep_files)
    else:
        random.Random(seed).shuffle(keep_files)
    split_idx = int(len(keep_files) * train_fraction)
    train_files = keep_files[:split_idx]
    val_files = keep_files[split_idx:]

    # Step 4: Write to train/ and val/
    for subset_name, subset in [('train', train_files), ('val', val_files)]:
        for original_path, sgf_data in subset:
            cleaned = clean_sgf(sgf_data)
            rel_path = Path(original_path).relative_to(input_folder)
            new_path = Path(output_base) / subset_name / rel_path
            os.makedirs(new_path.parent, exist_ok=True)
            # Write with the same codec used for reading so non-ASCII bytes
            # round-trip unchanged instead of being re-encoded with the
            # platform default (which can also raise on some platforms).
            with open(new_path, "w", encoding="latin1") as f:
                f.write(cleaned)

    print(f"Processed {len(train_files)} train and {len(val_files)} val SGFs.")


In [19]:
# Run the pipeline on ./allsgfs; output_base defaults to ".", so the train/
# and val/ folders are created under the current working directory.
process_sgfs("./allsgfs")

231973
221213
Processed 216788 train and 4425 val SGFs.


In [None]:
!