In [20]:
import sys
import pandas
import numpy as np
import matplotlib.pyplot as plt
import chess
import chess.svg

# Load the tab-separated position table produced by the lichess-api-queries step.
data_import = pandas.read_csv(
    "~/Box/repos/Lichess_Trap_Finder/lichess-api-queries/data_from_queries/position_data_white_attacking.txt",
    sep="\t",
)

In [2]:
# Row filters: both probability columns above their thresholds, white winning
# more than 55% of the games, and more than 500 recorded games in total.
frequent_enough = (data_import["prob_trimmed"] > 0.1) & (data_import["prob"] > 0.01)
white_scores_well = data_import["white_win_prob"] > 0.55
game_totals = data_import["white_wins"] + data_import["draws"] + data_import["black_wins"]
well_sampled = game_totals > 500

# Two passes of a stable sort (mergesort), least significant key first: the
# final order is by white_win_prob, with ties kept in descending prob order.
filtered_data = (
    data_import[frequent_enough & white_scores_well & well_sampled]
    .sort_values("prob", ascending=False, kind="mergesort")
    .sort_values("white_win_prob", ascending=False, kind="mergesort")
)
filtered_data.head()

Unnamed: 0,Opening,UCI_moves,move_index,prob,prob_trimmed,white_wins,draws,black_wins,white_win_prob,draw_prob,black_win_prob
64424,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6",6,0.010222,0.193307,9336,189,957,0.89067,0.018031,0.091299
61266,A40: Horwitz Defense,"d2d4,e7e6,c1g5,d7d5",4,0.029784,0.303493,127756,3045,20669,0.843441,0.020103,0.136456
61287,A40: Horwitz Defense,"d2d4,e7e6,c1g5,d7d5,g5d8,e8d8,b1d2",7,0.027599,0.281228,827,28,150,0.822886,0.027861,0.149254
64425,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6,h6g7",7,0.010222,0.193307,4953,187,937,0.81504,0.030772,0.154188
37088,B10: Caro-Kann Defense: Breyer Variation,"e2e4,c7c6,d2d3,d7d5,g1f3,d5e4,f3g5,e4d3,f1d3,g...",12,0.012235,0.14824,10969,362,2250,0.807672,0.026655,0.165673


In [3]:
# Replay every UCI move list on a fresh board to derive two new columns:
#   san - the whole line in standard algebraic notation
#   fen - the FEN of the position reached after the last move
# A row whose replay raises ValueError gets "" in both columns and is dropped below.
san_lines = []
fen_strings = []
for i, uci_line in enumerate(filtered_data["UCI_moves"]):
    board = chess.Board()
    moves = uci_line.split(",")
    error_occurred = False
    for move in moves:
        try:
            board.push_uci(move)
        except ValueError:
            error_occurred = True
            print("Caught a ValueError (likely illegal move) for index " + str(i) + ", position " + uci_line)
            break
    if error_occurred:
        san_lines.append("")
        fen_strings.append("")
    else:
        # variation_san needs the starting position, hence the fresh board
        san_lines.append(chess.Board().variation_san(board.move_stack))
        fen_strings.append(board.fen())

# Build the columns in one step instead of writing cell by cell with iloc
filtered_data = filtered_data.assign(san=san_lines, fen=fen_strings)

# Remove rows with an illegal move. The explicit .copy() makes the result an
# independent frame, so adding columns to it in later cells does not raise
# SettingWithCopyWarning when at least one row was dropped here.
filtered_data = filtered_data[filtered_data.san != ""].copy()

filtered_data.head()

Unnamed: 0,Opening,UCI_moves,move_index,prob,prob_trimmed,white_wins,draws,black_wins,white_win_prob,draw_prob,black_win_prob,san,fen
64424,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6",6,0.010222,0.193307,9336,189,957,0.89067,0.018031,0.091299,1. d4 g6 2. e4 Bg7 3. Bh6 d6,rnbqk1nr/ppp1ppbp/3p2pB/8/3PP3/8/PPP2PPP/RN1QK...
61266,A40: Horwitz Defense,"d2d4,e7e6,c1g5,d7d5",4,0.029784,0.303493,127756,3045,20669,0.843441,0.020103,0.136456,1. d4 e6 2. Bg5 d5,rnbqkbnr/ppp2ppp/4p3/3p2B1/3P4/8/PPP1PPPP/RN1Q...
61287,A40: Horwitz Defense,"d2d4,e7e6,c1g5,d7d5,g5d8,e8d8,b1d2",7,0.027599,0.281228,827,28,150,0.822886,0.027861,0.149254,1. d4 e6 2. Bg5 d5 3. Bxd8 Kxd8 4. Nd2,rnbk1bnr/ppp2ppp/4p3/3p4/3P4/8/PPPNPPPP/R2QKBN...
64425,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6,h6g7",7,0.010222,0.193307,4953,187,937,0.81504,0.030772,0.154188,1. d4 g6 2. e4 Bg7 3. Bh6 d6 4. Bxg7,rnbqk1nr/ppp1ppBp/3p2p1/8/3PP3/8/PPP2PPP/RN1QK...
37088,B10: Caro-Kann Defense: Breyer Variation,"e2e4,c7c6,d2d3,d7d5,g1f3,d5e4,f3g5,e4d3,f1d3,g...",12,0.012235,0.14824,10969,362,2250,0.807672,0.026655,0.165673,1. e4 c6 2. d3 d5 3. Nf3 dxe4 4. Ng5 exd3 5. B...,rnbq1b1r/pp2pkpp/2p2n2/8/8/3B4/PPP2PPP/RNBQK2R...


In [4]:
# Cluster data
#
# Every row starts in its own cluster ("C1", "C2", ... in current row order).
# The labels are kept in a plain list while merging and written to the
# "cluster" column once at the end.
row_count = filtered_data.shape[0]
uci_lines = filtered_data["UCI_moves"].tolist()
san_lines = filtered_data["san"].tolist()
# Ignore the halfmove clock / fullmove number in the FEN
position_keys = [" ".join(fen.split(" ")[0:4]) for fen in filtered_data["fen"]]

cluster_labels = ["C" + str(row_pos + 1) for row_pos in range(row_count)]

# Merge if one move sequence is a subset of another, or if positions are identical
# NOTE(review): `in` on the comma-joined strings is a substring test, so a line
# also matches when it occurs in the middle of a longer line, not only as its
# opening moves -- confirm that this is intended.
for i in range(row_count):
    for j in range(i + 1, row_count):
        related = (
            (uci_lines[i] in uci_lines[j])
            or (uci_lines[j] in uci_lines[i])
            or (position_keys[i] == position_keys[j])
        )
        if not related:
            continue
        absorbed_label = cluster_labels[i]
        surviving_label = cluster_labels[j]
        if absorbed_label == surviving_label:
            continue
        # Relabel EVERY member of row i's cluster, including rows located after
        # j. Restricting the relabel to rows before j would strand members that
        # joined the cluster through an earlier merge with a later row.
        cluster_labels = [
            surviving_label if label == absorbed_label else label
            for label in cluster_labels
        ]

# Make any white Bh6 move the same cluster
# ("C" + str(row_count + 1) is not among the initial labels C1..C<row_count>)
bh6_label = "C" + str(row_count + 1)
cluster_labels = [
    bh6_label if ". Bh6" in san_line else label
    for san_line, label in zip(san_lines, cluster_labels)
]

filtered_data = filtered_data.assign(cluster=cluster_labels)

In [5]:
# Rename clusters based on first appearance: the first label met while walking
# the rows top to bottom becomes 1, the next unseen label becomes 2, and so on.
cluster_map = {}
old_labels = filtered_data["cluster"].tolist()
for old_label in old_labels:
    cluster_map.setdefault(old_label, len(cluster_map) + 1)
# Next free index, kept under the same name the notebook has always exposed
new_cluster_idx = len(cluster_map) + 1

filtered_data["cluster_idx"] = np.fromiter(
    (cluster_map[old_label] for old_label in old_labels),
    dtype=np.int64,
    count=len(old_labels),
)

# The string labels are no longer needed once the integer index exists
filtered_data = filtered_data.drop(columns="cluster")

In [6]:
# Bring the rows of each cluster next to each other. mergesort is stable, so
# inside a cluster the earlier ordering (by white win prob, then prob) survives.
filtered_data = filtered_data.sort_values(by="cluster_idx", kind="mergesort")

In [7]:
# Preview the first rows, restricted to the columns that summarise each line
filtered_data.head()[["cluster_idx", "Opening", "prob", "prob_trimmed", "white_win_prob", "san"]]

Unnamed: 0,cluster_idx,Opening,prob,prob_trimmed,white_win_prob,san
64424,1,B06: Modern Defense,0.010222,0.193307,0.89067,1. d4 g6 2. e4 Bg7 3. Bh6 d6
64425,1,B06: Modern Defense,0.010222,0.193307,0.81504,1. d4 g6 2. e4 Bg7 3. Bh6 d6 4. Bxg7
175973,1,A00: Mieses Opening,0.035984,0.27443,0.781116,1. d3 Nf6 2. g3 g6 3. Bh6 Bg7
127907,1,A00: Hungarian Opening: Indian Defense,0.029282,0.274399,0.781116,1. g3 Nf6 2. d3 g6 3. Bh6 Bg7
90319,1,A04: Zukertort Opening: Kingside Fianchetto,0.010225,0.152385,0.767442,1. Nf3 g6 2. d4 Bg7 3. Bh6 Nf6 4. Bxg7 Rg8 5. ...


In [8]:
# Preview the first five rows with every column
filtered_data.head(n=5)

Unnamed: 0,Opening,UCI_moves,move_index,prob,prob_trimmed,white_wins,draws,black_wins,white_win_prob,draw_prob,black_win_prob,san,fen,cluster_idx
64424,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6",6,0.010222,0.193307,9336,189,957,0.89067,0.018031,0.091299,1. d4 g6 2. e4 Bg7 3. Bh6 d6,rnbqk1nr/ppp1ppbp/3p2pB/8/3PP3/8/PPP2PPP/RN1QK...,1
64425,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6,h6g7",7,0.010222,0.193307,4953,187,937,0.81504,0.030772,0.154188,1. d4 g6 2. e4 Bg7 3. Bh6 d6 4. Bxg7,rnbqk1nr/ppp1ppBp/3p2p1/8/3PP3/8/PPP2PPP/RN1QK...,1
175973,A00: Mieses Opening,"d2d3,g8f6,g2g3,g7g6,c1h6,f8g7",6,0.035984,0.27443,728,30,174,0.781116,0.032189,0.186695,1. d3 Nf6 2. g3 g6 3. Bh6 Bg7,rnbqk2r/ppppppbp/5npB/8/8/3P2P1/PPP1PP1P/RN1QK...,1
127907,A00: Hungarian Opening: Indian Defense,"g2g3,g8f6,d2d3,g7g6,c1h6,f8g7",6,0.029282,0.274399,728,30,174,0.781116,0.032189,0.186695,1. g3 Nf6 2. d3 g6 3. Bh6 Bg7,rnbqk2r/ppppppbp/5npB/8/8/3P2P1/PPP1PP1P/RN1QK...,1
90319,A04: Zukertort Opening: Kingside Fianchetto,"g1f3,g7g6,d2d4,f8g7,c1h6,g8f6,h6g7,h8g8,g7f6,e...",11,0.010225,0.152385,627,36,154,0.767442,0.044064,0.188494,1. Nf3 g6 2. d4 Bg7 3. Bh6 Nf6 4. Bxg7 Rg8 5. ...,rnbqk1r1/pppp1p1p/5pp1/8/3P4/5NP1/PPP1PP1P/RN1...,1


Unnamed: 0,Opening,UCI_moves,move_index,prob,prob_trimmed,white_wins,draws,black_wins,white_win_prob,draw_prob,black_win_prob,san,fen,cluster_idx
64424,B06: Modern Defense,"d2d4,g7g6,e2e4,f8g7,c1h6,d7d6",6,0.010222,0.193307,9336,189,957,0.890670,0.018031,0.091299,1. d4 g6 2. e4 Bg7 3. Bh6 d6,rnbqk1nr/ppp1ppbp/3p2pB/8/3PP3/8/PPP2PPP/RN1QK...,1
61266,A40: Horwitz Defense,"d2d4,e7e6,c1g5,d7d5",4,0.029784,0.303493,127756,3045,20669,0.843441,0.020103,0.136456,1. d4 e6 2. Bg5 d5,rnbqkbnr/ppp2ppp/4p3/3p2B1/3P4/8/PPP1PPPP/RN1Q...,2
37088,B10: Caro-Kann Defense: Breyer Variation,"e2e4,c7c6,d2d3,d7d5,g1f3,d5e4,f3g5,e4d3,f1d3,g...",12,0.012235,0.148240,10969,362,2250,0.807672,0.026655,0.165673,1. e4 c6 2. d3 d5 3. Nf3 dxe4 4. Ng5 exd3 5. B...,rnbq1b1r/pp2pkpp/2p2n2/8/8/3B4/PPP2PPP/RNBQK2R...,3
66647,A06: Zukertort Opening: Tennison Gambit,"g1f3,d7d5,e2e4,d5e4,f3e5,g8f6,d1h5,g7g6,f1c4",9,0.032757,0.117933,797,8,279,0.735240,0.007380,0.257380,1. Nf3 d5 2. e4 dxe4 3. Ne5 Nf6 4. Qh5 g6 5. Bc4,rnbqkb1r/ppp1pp1p/5np1/4N2Q/2B1p3/8/PPPP1PPP/R...,4
49153,A45: Indian Defense,"d2d4,g8f6,c1f4,g7g6,f4c7,f8g7,c7d8,e8d8,e2e3",9,0.026948,0.106724,374,33,102,0.734774,0.064833,0.200393,1. d4 Nf6 2. Bf4 g6 3. Bxc7 Bg7 4. Bxd8 Kxd8 5...,rnbk3r/pp1pppbp/5np1/8/3P4/4P3/PPP2PPP/RN1QKBN...,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87647,A04: Zukertort Opening: Queen's Gambit Invitation,"g1f3,e7e6,b2b3,d7d5,c1b2,g8f6,b2f6",7,0.033190,0.364442,5973,450,4430,0.550355,0.041463,0.408182,1. Nf3 e6 2. b3 d5 3. Bb2 Nf6 4. Bxf6,rnbqkb1r/ppp2ppp/4pB2/3p4/8/1P3N2/P1PPPPPP/RN1...,374
29900,"B18: Caro-Kann Defense: Classical Variation, M...","e2e4,c7c6,d2d4,d7d5,b1d2,d5e4,d2e4,c8f5,e4g3,f...",13,0.023215,0.281264,7628,725,5509,0.550281,0.052301,0.397417,1. e4 c6 2. d4 d5 3. Nd2 dxe4 4. Nxe4 Bf5 5. N...,rn1qkbnr/pp2ppp1/2p3bp/8/3P3P/6N1/PPP1NPP1/R1B...,375
62453,A43: Benoni Defense: Benoni Gambit Accepted,"d2d4,c7c5,d4c5,d8a5,b1c3,a5c5,e2e4",7,0.011052,0.186448,11680,786,8763,0.550191,0.037025,0.412784,1. d4 c5 2. dxc5 Qa5+ 3. Nc3 Qxc5 4. e4,rnb1kbnr/pp1ppppp/8/2q5/4P3/2N5/PPP2PPP/R1BQKB...,376
96961,"A16: English Opening: Anglo-Indian Defense, Qu...","c2c4,g8f6,b1c3,g7g6,e2e4,d7d6,d2d3,f8g7,f1e2,e...",11,0.026676,0.133161,1472,117,1087,0.550075,0.043722,0.406203,1. c4 Nf6 2. Nc3 g6 3. e4 d6 4. d3 Bg7 5. Be2 ...,rnbq1rk1/ppp1ppbp/3p1np1/8/2P1P2P/2NP4/PP2BPP1...,377


In [9]:
# Get a dataframe containing the first element of each cluster
cluster_representatives = filtered_data[~filtered_data.duplicated(subset=['cluster_idx'])]

board_colors = chess.svg.DEFAULT_COLORS.copy()
board_colors["square light"] = "#f0d9b5"
board_colors["square dark"] = "#b58863"
board_colors["square light lastmove"] = "#cdd26a"
board_colors["square dark lastmove"] = "#aaa23b"

# Render images from the first position of each cluster
for i in range(cluster_representatives.shape[0]):
    board = chess.Board()
    moves = cluster_representatives.iloc[i, cluster_representatives.columns.get_loc("UCI_moves")].split(",")
    error_occurred = False
    last_move = ""
    for move in moves:
        board.push_uci(move)
        last_move = move
    boardsvg = chess.svg.board(board, orientation=chess.WHITE, lastmove=chess.Move.from_uci(last_move), colors=board_colors)
    outputfile = open("../docs/assets/svg_boards/board_" + str(cluster_representatives.iloc[i, cluster_representatives.columns.get_loc("cluster_idx")]) + ".svg", "w")
    outputfile.write(boardsvg)
    outputfile.close()

In [103]:
# For each cluster representative, draw one stacked horizontal bar with the
# white-win / draw / black-win shares in percent and save it as an SVG file.
for row_pos in range(len(cluster_representatives)):
    representative = cluster_representatives.iloc[[row_pos]]
    num_games = (representative["white_wins"] + representative["draws"] + representative["black_wins"]).iloc[0]
    outcome_pct = representative[["white_win_prob", "draw_prob", "black_win_prob"]] * 100

    ax = outcome_pct.plot(
        kind="barh",
        stacked=True,
        legend=None,
        xlim=(0, 100),
        color=["white", "gray", "black"],
        edgecolor="black",
        figsize=(5, 0.75),
    )
    ax.axis("off")

    # Each label sits at the horizontal centre of its segment:
    # the segment's right edge (cumulative sum) minus half its width.
    shares = outcome_pct.values.flatten()
    right_edges = outcome_pct.cumsum(1).values.flatten()
    centers = right_edges - shares / 2

    ax.text(50, 0.5, "Win Proportions, Lichess Games", va="center", ha="center", fontsize=12)
    ax.text(50, -0.5, "Number of games: " + str(num_games), va="center", ha="center")
    # Black text on the white and gray segments, white text on the black one;
    # segments of 15% or less get no label.
    for share, center, text_color in zip(shares, centers, ("black", "black", "white")):
        if share > 15:
            ax.text(center, 0, str(np.round(share, 1)) + "%", va="center", ha="center", color=text_color)

    cluster_idx = cluster_representatives.iloc[row_pos, cluster_representatives.columns.get_loc("cluster_idx")]
    ax.figure.savefig("../docs/assets/svg_boards/barplot_" + str(cluster_idx) + ".svg", format="svg", bbox_inches="tight")
    plt.close(ax.figure)

In [10]:
# Write the clustered table as tab-separated text, without the index column
filtered_data.to_csv(
    "processed_data/filtered_data_san_fen_cluster.tsv",
    sep="\t",
    index=False,
)