In [29]:
import chess
import pandas as pd
from tqdm import tqdm

In [30]:
df = pd.read_parquet('../data/moves_2025_01.parquet')
df.tail()

Unnamed: 0,avg_elo,moves,winner
4990819,1553.0,e2e4 d7d5 e4d5 d8d5 b1c3 d5e6 f1e2 b8c6 g1f3 g...,1
4990836,1960.0,e2e4 d7d5 e4d5 d8d5 d2d4 g8f6 c1e3 c8g4 f1e2 b...,1
4990890,817.5,e2e4 d7d5 e4d5 d8d5 d2d3 d5d4 c2c3 d4e5 f1e2 c...,2
4991060,1251.0,e2e4 d7d5 e4d5 d8d5 d2d4 g8f6 b1c3 g7g6 c3d5 f...,1
4991095,1889.5,e2e4 d7d5 e4d5 d8d5 g1f3 c8g4 f1e2 b8c6 e1g1 e...,1


In [31]:
records = []

for _, game in tqdm(df.iterrows(), total=len(df), desc="Processing games"):
    board = chess.Board()
    elo = game["avg_elo"]
    moves_arr = game["moves"].split()
    winner_flag = game["winner"]

    for idx, mv in enumerate(moves_arr):
        if idx + 1 >= len(moves_arr):
            break

        board.push_uci(mv)
        next_move = moves_arr[idx + 1]

        # Hat die Seite, die jetzt am Zug ist, am Ende gewonnen?
        if (board.turn == chess.WHITE & winner_flag == 1) or (board.turn == chess.BLACK & winner_flag == 2):
            win_pov = True
        else:
            win_pov = False

        records.append({
            "fen": board.fen(),
            "next_move": next_move,
            "played_by": elo,
            "win_pov": win_pov
        })

positions_df = pd.DataFrame.from_records(records)
positions_df.head()

Processing games: 100%|██████████| 61296/61296 [02:04<00:00, 492.39it/s]


Unnamed: 0,fen,next_move,played_by,win_pov
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1927.5,False
1,rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...,e4d5,1927.5,True
2,rnbqkbnr/ppp1pppp/8/3P4/8/8/PPPP1PPP/RNBQKBNR ...,d8d5,1927.5,False
3,rnb1kbnr/ppp1pppp/8/3q4/8/8/PPPP1PPP/RNBQKBNR ...,g1f3,1927.5,True
4,rnb1kbnr/ppp1pppp/8/3q4/8/5N2/PPPP1PPP/RNBQKB1...,d5f3,1927.5,False


In [32]:
positions_df["pair_freq"] = (
    positions_df
    .groupby(["fen", "next_move"])["fen"]
    .transform("size")
)

positions_df = positions_df[positions_df["pair_freq"] >= 100]

print(f"In {len(df)} games there are {len(positions_df)} unique position-move pairs that were played more than 100 times.")

In 61296 games there are 433375 unique position-move pairs that were played more than 100 times.


In [33]:
positions_df.head()

Unnamed: 0,fen,next_move,played_by,win_pov,pair_freq
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1927.5,False,61296
1,rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...,e4d5,1927.5,True,61296
2,rnbqkbnr/ppp1pppp/8/3P4/8/8/PPPP1PPP/RNBQKBNR ...,d8d5,1927.5,False,61296
3,rnb1kbnr/ppp1pppp/8/3q4/8/8/PPPP1PPP/RNBQKBNR ...,g1f3,1927.5,True,17865
34,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1341.5,False,61296


In [34]:
positions_df.to_parquet("../data/positions_2025_01.parquet")
print("✅ Saved position data to 'data/positions_2025_01.parquet'")

✅ Saved position data to 'data/positions_2025_01.parquet'
