In [None]:
import chess
import chess.engine
import chess.pgn
import glob
import csv
from time import time
import numpy as np
import math
import pandas as pd

# Input PGN files and write-batch size.
# glob returns paths in arbitrary order; sorting keeps the file -> index
# mapping (and therefore the processed_<index>.csv names) stable across runs.
pgn_files = sorted(glob.glob("./database/pgns/*.pgn"))
# Number of rows buffered in memory before they are appended to the CSV.
BATCH_SIZE = 100000

In [None]:
def pgn_to_csv(file, index):
    """Convert one PGN file into a CSV of (fen, move, value) rows.

    Every mainline move of every game in ``file`` becomes one row holding the
    position before the move (FEN), the move in UCI notation, and a value
    derived from the game's ``Result`` header: 1.0 for "1-0", -1.0 for "0-1"
    and 0.0 for anything else (including a missing header).  Rows are buffered
    and appended to ``./database/processed_{index}.csv`` whenever the buffer
    reaches ``BATCH_SIZE`` rows, and once more at the end of the file.

    Args:
        file: Path of the PGN file to read.
        index: Number used in the output file name and as the log prefix.

    Returns:
        The number of games read from ``file``.  Errors are reported on
        stdout and not re-raised, so after a failure the count covers only
        the games read up to that point (0 if the file could not be opened).
    """
    import os  # local import: only needed for the header check in _flush

    csv_file = f"./database/processed_{index}.csv"
    result_values = {"1-0": 1.0, "0-1": -1.0}
    start_time = time()
    game_count = 0
    batch = []

    def _flush():
        """Append the buffered rows to the CSV and empty the buffer."""
        # The file is opened in append mode, so the header row is written
        # only when the file does not exist yet.
        write_header = not os.path.exists(csv_file)
        pd.DataFrame(batch).to_csv(csv_file, mode='a', index=False, header=write_header)
        batch.clear()

    print(f"[{index:3}] Processing {file} to {csv_file}")

    try:
        with open(file, 'r') as pgn_file:
            while True:
                game = chess.pgn.read_game(pgn_file)
                if game is None:
                    # read_game returned None: no more games in the file.
                    break
                game_count += 1
                value = result_values.get(game.headers.get("Result"), 0.0)

                board = game.board()
                move_count = 0
                for move in game.mainline_moves():
                    # Record the position *before* the move is played.
                    batch.append({
                        "fen": board.fen(),
                        "move": move.uci(),
                        "value": value
                    })
                    board.push(move)
                    move_count += 1

                    if len(batch) >= BATCH_SIZE:
                        print(f"[{index:3}] Writing batch of {len(batch)} to {csv_file}")
                        _flush()

                # Progress is reported once per game rather than once per
                # move: per-move printing (with an extra FEN computation)
                # dominates the runtime on large files.
                print(f"[{index:3}] {game_count:5} {move_count:5} {value:.2f}", end='\r')

        # Write whatever is left over after the last full batch.
        if batch:
            _flush()

        elapsed = time() - start_time
        print(f"Processed {file} to {csv_file} in {elapsed:.2f} seconds")
    except Exception as e:
        # Per-file boundary: report the failure and let the caller continue
        # with the remaining files instead of aborting the whole run.
        print(f"Error processing {file}: {e}")

    return game_count


In [None]:
# Convert every PGN file, one after another (no process pool is involved).
start_time = time()

# Per-file game counts returned by pgn_to_csv.
results = []
for number, file in enumerate(pgn_files, start=1):
    print(f"[{number:3} of {len(pgn_files)}] Scheduling {file} ...")
    results.append(pgn_to_csv(file, number))

# `time` is the function imported with `from time import time`, so it is
# called directly; `time.time()` would raise AttributeError.
print(f"All files processed in {time() - start_time:.2f} seconds.")
# Sum over all files (not just the last one); also valid when no PGN files
# were found.  The figure is the total number of games read.
print(f'{sum(results)} entries processed.')