In [18]:
import chess
import pandas as pd
from tqdm import tqdm

In [19]:
df = pd.read_parquet('../data/data_2025_01.parquet')
df.tail()

Unnamed: 0,game_id,event,white_elo,black_elo,opening,winner,moves
4991159,4999996,Rated Rapid game,1746,1736,Caro-Kann Defense,2,e2e4 c7c6 g1f3 d7d5 e4d5 c6d5 d2d4 c8g4 b1c3 g...
4991160,4999997,Rated Rapid game,1498,1476,King's Pawn Game: McConnell Defense,2,e2e4 e7e5 g1f3 d8f6 f1b5 f8c5 c2c3 c7c6 b5a4 b...
4991161,4999998,Rated Rapid game,2187,2128,Sicilian Defense: Modern Variations,2,e2e4 c7c5 g1f3 d7d6 f1c4 e7e6 e1g1 d8c7 c2c3 a...
4991162,4999999,Rated Rapid game,1659,1696,"Queen's Gambit Accepted: Central Variation, Al...",1,d2d4 d7d5 c2c4 d5c4 e2e4 g8f6 b1c3 e7e6 a2a3 f...
4991163,5000000,Rated Rapid game,1734,1706,Caro-Kann Defense: Main Line,1,d2d4 c7c6 e2e4 d7d5 b1c3 d5e4 c3e4 g8f6 e4c5 c...


In [20]:
df["event"].value_counts()

event
Rated Blitz game                                                  2112717
Rated Bullet game                                                 1673664
Rated Rapid game                                                   679202
Rated Classical game                                                28840
Rated UltraBullet game                                              22866
                                                                   ...   
Bullet swiss https://lichess.org/swiss/xOqHMgIz                         1
Rated Blitz tournament https://lichess.org/tournament/hdiBmgBr          1
Blitz swiss https://lichess.org/swiss/5HMPMS5d                          1
Rapid swiss https://lichess.org/swiss/sDVxlEAo                          1
Rated Rapid tournament https://lichess.org/tournament/MyE7RM5x          1
Name: count, Length: 2345, dtype: int64

In [21]:
events_to_keep = ["Rated Blitz game", "Rated Rapid game", "Rated Classical game"]
df = df[df["event"].isin(events_to_keep)]
df.head()

Unnamed: 0,game_id,event,white_elo,black_elo,opening,winner,moves
0,2,Rated Blitz game,1247,1218,Vienna Game: Anderssen Defense,1,b1c3 e7e5 e2e4 f8c5 d1h5 g8f6 h5e5 c5e7 d2d3 d...
1,3,Rated Blitz game,1577,1593,Caro-Kann Defense: Masi Variation,2,d2d4 c7c6 e2e4 g8f6 e4e5 f6g8 g1f3 d7d5 b1c3 c...
2,4,Rated Blitz game,1043,1000,Queen's Pawn Game,1,d2d4 d7d5 c2c3 b8c6 g1f3 c8g4 h2h3 g4f3 e2f3 e...
3,5,Rated Blitz game,2015,2028,Caro-Kann Defense: Exchange Variation,1,e2e4 c7c6 d2d4 d7d5 e4d5 c6d5 f1d3 g8f6 h2h3 b...
4,6,Rated Blitz game,2139,2145,Caro-Kann Defense: Endgame Variation,2,e2e4 c7c6 d2d3 d7d5 g1f3 d5e4 d3e4 d8d1 e1d1 g...


In [22]:
records = []

for _, game in tqdm(df.iterrows(), total=len(df), desc="Processing games"):
    board = chess.Board()
    moves_arr = game["moves"].split()
    winner_flag = game["winner"]

    for idx, mv in enumerate(moves_arr):
        if idx + 1 >= len(moves_arr):
            break

        board.push_uci(mv)
        next_move = moves_arr[idx + 1]

        if (board.turn & winner_flag == 1) or (not board.turn & winner_flag == 2):
            win_pov = True
        else:
            win_pov = False

        played_by = game["white_elo"] if board.turn else game["black_elo"]

        records.append({
            "fen": board.fen(),
            "next_move": next_move,
            "played_by_elo": played_by,
            "win_pov": win_pov
        })

positions_df = pd.DataFrame.from_records(records)
positions_df.head()

Processing games: 100%|██████████| 2820759/2820759 [1:53:14<00:00, 415.13it/s]  


Unnamed: 0,fen,next_move,played_by_elo,win_pov
0,rnbqkbnr/pppppppp/8/8/8/2N5/PPPPPPPP/R1BQKBNR ...,e7e5,1218,True
1,rnbqkbnr/pppp1ppp/8/4p3/8/2N5/PPPPPPPP/R1BQKBN...,e2e4,1247,True
2,rnbqkbnr/pppp1ppp/8/4p3/4P3/2N5/PPPP1PPP/R1BQK...,f8c5,1218,True
3,rnbqk1nr/pppp1ppp/8/2b1p3/4P3/2N5/PPPP1PPP/R1B...,d1h5,1247,True
4,rnbqk1nr/pppp1ppp/8/2b1p2Q/4P3/2N5/PPPP1PPP/R1...,g8f6,1218,True


In [26]:
positions_df["pair_freq"] = (
    positions_df
    .groupby(["fen", "next_move"])["fen"]
    .transform("size")
)

positions_df = positions_df[positions_df["pair_freq"] >= 100]

print(f"In {len(df)} games there are {len(positions_df)} unique position-move pairs that were played more than 100 times.")

In 2820759 games there are 17813540 unique position-move pairs that were played more than 100 times.


In [24]:
positions_df.head()

Unnamed: 0,fen,next_move,played_by_elo,win_pov,pair_freq
0,rnbqkbnr/pppppppp/8/8/8/2N5/PPPPPPPP/R1BQKBNR ...,e7e5,1218,True,3701
1,rnbqkbnr/pppp1ppp/8/4p3/8/2N5/PPPPPPPP/R1BQKBN...,e2e4,1247,True,1333
2,rnbqkbnr/pppp1ppp/8/4p3/4P3/2N5/PPPP1PPP/R1BQK...,f8c5,1218,True,149
28,rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR ...,c7c6,1593,True,23871
29,rnbqkbnr/pp1ppppp/2p5/8/3P4/8/PPP1PPPP/RNBQKBN...,e2e4,1577,True,3934


In [25]:
positions_df.to_parquet("../data/positions_2025_01.parquet")
print("✅ Saved position data to 'data/positions_2025_01.parquet'")

✅ Saved position data to 'data/positions_2025_01.parquet'
