In [25]:
import chess
import pandas as pd
from tqdm import tqdm

In [26]:
df = pd.read_parquet('../data/data_2025_01.parquet')
df.tail()

Unnamed: 0,game_id,event,white_elo,black_elo,opening,winner,moves
4991159,4999996,Rated Rapid game,1746,1736,Caro-Kann Defense,2,e2e4 c7c6 g1f3 d7d5 e4d5 c6d5 d2d4 c8g4 b1c3 g...
4991160,4999997,Rated Rapid game,1498,1476,King's Pawn Game: McConnell Defense,2,e2e4 e7e5 g1f3 d8f6 f1b5 f8c5 c2c3 c7c6 b5a4 b...
4991161,4999998,Rated Rapid game,2187,2128,Sicilian Defense: Modern Variations,2,e2e4 c7c5 g1f3 d7d6 f1c4 e7e6 e1g1 d8c7 c2c3 a...
4991162,4999999,Rated Rapid game,1659,1696,"Queen's Gambit Accepted: Central Variation, Al...",1,d2d4 d7d5 c2c4 d5c4 e2e4 g8f6 b1c3 e7e6 a2a3 f...
4991163,5000000,Rated Rapid game,1734,1706,Caro-Kann Defense: Main Line,1,d2d4 c7c6 e2e4 d7d5 b1c3 d5e4 c3e4 g8f6 e4c5 c...


In [27]:
df["event"].value_counts()

event
Rated Blitz game                                                  2112717
Rated Bullet game                                                 1673664
Rated Rapid game                                                   679202
Rated Classical game                                                28840
Rated UltraBullet game                                              22866
                                                                   ...   
Bullet swiss https://lichess.org/swiss/xOqHMgIz                         1
Rated Blitz tournament https://lichess.org/tournament/hdiBmgBr          1
Blitz swiss https://lichess.org/swiss/5HMPMS5d                          1
Rapid swiss https://lichess.org/swiss/sDVxlEAo                          1
Rated Rapid tournament https://lichess.org/tournament/MyE7RM5x          1
Name: count, Length: 2345, dtype: int64

In [28]:
events_to_keep = ["Rated Blitz game", "Rated Rapid game", "Rated Classical game"]
df = df[df["event"].isin(events_to_keep)]

df = df[df["opening"] == "Scandinavian Defense: Mieses-Kotroc Variation"]
df.head()

Unnamed: 0,game_id,event,white_elo,black_elo,opening,winner,moves
283,285,Rated Blitz game,1714,1705,Scandinavian Defense: Mieses-Kotroc Variation,2,e2e4 d7d5 e4d5 d8d5 d2d4 d5d6 b1c3 c8f5 f1c4 b...
402,404,Rated Blitz game,1333,1335,Scandinavian Defense: Mieses-Kotroc Variation,2,e2e4 d7d5 e4d5 d8d5 b1c3 d5e5 g1e2 c8g4 d2d4 e...
569,571,Rated Blitz game,1231,1211,Scandinavian Defense: Mieses-Kotroc Variation,1,e2e4 d7d5 e4d5 d8d5 b1c3 d5e6 f1e2 g8f6 g1f3 g...
601,603,Rated Blitz game,1244,1153,Scandinavian Defense: Mieses-Kotroc Variation,0,e2e4 d7d5 e4d5 d8d5 b1c3 d5e6 d1e2 e6e2 f1e2 b...
898,900,Rated Blitz game,1452,1470,Scandinavian Defense: Mieses-Kotroc Variation,1,e2e4 d7d5 e4d5 d8d5 g1f3 c8g4 f1e2 g8f6 c2c4 d...


In [29]:
records = []

for _, game in tqdm(df.iterrows(), total=len(df), desc="Processing games"):
    board = chess.Board()
    moves_arr = game["moves"].split()
    winner_flag = game["winner"]

    for idx, mv in enumerate(moves_arr):
        if idx >= 15 or idx + 1 >= len(moves_arr):
            break

        board.push_uci(mv)
        next_move = moves_arr[idx + 1]

        if (board.turn == chess.WHITE and winner_flag == 1) or (board.turn == chess.BLACK and winner_flag == 2):
            win_pov = True
        else:
            win_pov = False

        played_by = game["white_elo"] if board.turn else game["black_elo"]

        records.append({
            "fen": board.fen(),
            "next_move": next_move,
            "played_by_elo": played_by,
            "win_pov": win_pov
        })

positions_df = pd.DataFrame.from_records(records)
positions_df.head()

Processing games: 100%|██████████| 35816/35816 [00:20<00:00, 1736.14it/s]


Unnamed: 0,fen,next_move,played_by_elo,win_pov
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1705,True
1,rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...,e4d5,1714,False
2,rnbqkbnr/ppp1pppp/8/3P4/8/8/PPPP1PPP/RNBQKBNR ...,d8d5,1705,True
3,rnb1kbnr/ppp1pppp/8/3q4/8/8/PPPP1PPP/RNBQKBNR ...,d2d4,1714,False
4,rnb1kbnr/ppp1pppp/8/3q4/3P4/8/PPP2PPP/RNBQKBNR...,d5d6,1705,True


In [30]:
positions_df["pair_freq"] = (
    positions_df
    .groupby(["fen", "next_move"])["fen"]
    .transform("size")
)

positions_df = positions_df[positions_df["pair_freq"] >= 100]

print(f"In {len(df)} games there are {len(positions_df)} unique position-move pairs that were played more than 100 times.")

In 35816 games there are 242512 unique position-move pairs that were played more than 100 times.


In [31]:
positions_df.head()

Unnamed: 0,fen,next_move,played_by_elo,win_pov,pair_freq
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1705,True,35816
1,rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBN...,e4d5,1714,False,35816
2,rnbqkbnr/ppp1pppp/8/3P4/8/8/PPPP1PPP/RNBQKBNR ...,d8d5,1705,True,35816
3,rnb1kbnr/ppp1pppp/8/3q4/8/8/PPPP1PPP/RNBQKBNR ...,d2d4,1714,False,4856
15,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,d7d5,1335,True,35816


In [32]:
positions_df.to_parquet("../data/positions_2025_01.parquet")
print("✅ Saved position data to 'data/positions_2025_01.parquet'")

✅ Saved position data to 'data/positions_2025_01.parquet'
