In [136]:
import zstandard as zstd
import chess.pgn

import pandas as pd

import random

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

pd.set_option('display.max_colwidth', None)

## Decompress .zst files

In [2]:
# Decompress input file and write to output file
def decompress_zst(input_file, output_file):
    """Stream-decompress a Zstandard file to disk.

    Args:
        input_file: Path of the compressed ``.zst`` file to read.
        output_file: Path the decompressed bytes are written to; the file is
            opened in ``'wb'`` mode, so existing content is overwritten.
    """
    decompressor = zstd.ZstdDecompressor()
    # copy_stream moves data between the two file objects, so the whole
    # payload is never passed around as a single bytes object here
    with open(input_file, 'rb') as src, open(output_file, 'wb') as dst:
        decompressor.copy_stream(src, dst)

In [4]:
# Compressed input and decompressed output paths for the
# lichess_db_standard_rated_2013-01 game file
input_zst_file = '../ChessStreamComputerVision/lichess_zst_files/lichess_db_standard_rated_2013-01.pgn.zst'
output_pgn_file = '../ChessStreamComputerVision/lichess_pgn_files/lichess_db_standard_rated_2013-01.pgn'
# Write the decompressed PGN to output_pgn_file
decompress_zst(input_zst_file, output_pgn_file)

## Parse .pgn files

In [6]:
# Extracts all fens from a single pgn file
def extract_fens_from_pgn(pgn_file):
    """Collect the FEN reached after each mainline move of every game in a PGN file.

    Args:
        pgn_file: Path to the PGN file to read.

    Returns:
        A flat list of FEN strings, in file order and move order. The position
        before the first move of a game is not included.
    """
    collected = []
    with open(pgn_file) as handle:
        # read_game returns None once no further game can be read
        while (game := chess.pgn.read_game(handle)) is not None:
            position = game.board()
            # Replay the main line, recording the position after each move
            for move in game.mainline_moves():
                position.push(move)
                collected.append(position.fen())
    return collected

In [11]:
# PGN file used to try out the FEN extraction
test_pgn_file = '../ChessStreamComputerVision/lichess_pgn_files/test.pgn'
# fens holds one FEN per mainline move across all games in the file
fens = extract_fens_from_pgn(test_pgn_file)

## Process lichess puzzles

In [16]:
# Decompress the lichess puzzle file (.csv.zst) into a plain CSV
puzzle_zst_file = '../ChessStreamComputerVision/lichess_zst_files/lichess_db_puzzle.csv.zst'
puzzle_csv_file = '../ChessStreamComputerVision/lichess_db_puzzle.csv'
# puzzle_csv_file is read back with pandas in a later cell
decompress_zst(puzzle_zst_file, puzzle_csv_file)

In [17]:
# Load the decompressed puzzle CSV into a DataFrame (default read_csv settings)
puzzle_df = pd.read_csv(puzzle_csv_file)

In [62]:
# Pull the FEN column of the puzzle DataFrame into a plain Python list
# so it can be passed to random.sample
fen_list = list(puzzle_df.FEN)

In [127]:
# Draw a reproducible random sample of 20000 puzzle FENs (fixed seed)
random.seed(42)
sampled_fen_list = random.sample(fen_list, 20000)

# Partition the sample across sites: lichess/chess.com/365chess/chess42
# lichess - 40%, chess.com - 40%, everything else - the remaining 20%
total = len(sampled_fen_list)
num_lichess = int(total * 0.4)
num_chess_com = int(total * 0.4)
num_rest = total - num_lichess - num_chess_com

# The three partitions are consecutive slices of the sample
lichess_end = num_lichess
chess_com_end = lichess_end + num_chess_com
lichess_games = sampled_fen_list[:lichess_end]
chess_com_games = sampled_fen_list[lichess_end:chess_com_end]
rest_games = sampled_fen_list[chess_com_end:]

# Report the size of each partition
print(f"lichess: {len(lichess_games)}")
print(f"chess.com: {len(chess_com_games)}")
print(f"rest: {len(rest_games)}")

lichess: 8000
chess.com: 8000
rest: 4000


## Scrape lichess

### 2d board

In [129]:
# Create an empty dataframe to store metadata; capture_2d_chessboard takes it
# as an argument and returns a copy with one row appended per captured board
metadata_df = pd.DataFrame()

In [130]:
# Every option below is addressed by its 1-based button index inside the
# dasher panel, so the XPaths are generated from one template
_DASHER_BUTTON_XPATH = '//*[@id="dasher_app"]/div/div/button[{}]'

# 2d board themes, listed by button index (first name -> button[1])
_BOARD_THEME_2D_NAMES = (
    'blue1', 'blue2', 'blue3', 'blue_marble', 'canvas', 'wood', 'wood2',
    'wood3', 'wood4', 'maple', 'maple2', 'brown', 'leather', 'green',
    'marble', 'green_plastic', 'grey', 'metal', 'olive', 'newspaper',
    'purple', 'purple_diag', 'pink', 'ic', 'horsey',
)

# 2d board theme dictionary: theme name -> XPath of its button
board_theme_2d_dict = {
    name: _DASHER_BUTTON_XPATH.format(position)
    for position, name in enumerate(_BOARD_THEME_2D_NAMES, start=1)
}

# Individual theme variables, bound to the same XPath strings as the dictionary
(blue1, blue2, blue3, blue_marble, canvas, wood, wood2, wood3, wood4, maple,
 maple2, brown, leather, green, marble, green_plastic, grey, metal, olive,
 newspaper, purple, purple_diag, pink, ic, horsey) = board_theme_2d_dict.values()

# 2d piece sets, listed by button index (first name -> button[1])
_PIECE_SET_2D_NAMES = (
    'cburnett', 'merida', 'alpha', 'pirouetti', 'chessnut', 'chess7',
    'reillycraig', 'companion', 'riohacha', 'kosal', 'leipzig', 'fantasy',
    'spatial', 'celtic', 'california', 'caliente', 'pixel', 'maestro',
    'fresca', 'cardinal', 'gioco', 'tatiana', 'staunty', 'governor',
    'dubrovny', 'icpieces', 'mpchess',
)

# 2d piece set dictionary: piece set name -> XPath of its button
piece_set_2d_dict = {
    name: _DASHER_BUTTON_XPATH.format(position)
    for position, name in enumerate(_PIECE_SET_2D_NAMES, start=1)
}

# Individual piece set variables, bound to the same XPath strings as the dictionary
(cburnett, merida, alpha, pirouetti, chessnut, chess7, reillycraig, companion,
 riohacha, kosal, leipzig, fantasy, spatial, celtic, california, caliente,
 pixel, maestro, fresca, cardinal, gioco, tatiana, staunty, governor,
 dubrovny, icpieces, mpchess) = piece_set_2d_dict.values()

In [134]:
def capture_2d_chessboard(fen, board_id, metadata_df):
    """Screenshot a lichess analysis board with a random 2D board theme and piece set.

    Opens the lichess analysis page for ``fen`` in a new Chrome session, picks
    a random entry from ``board_theme_2d_dict`` and ``piece_set_2d_dict`` via
    the preferences panel, and saves a screenshot of the ``cg-board`` element.

    Args:
        fen: FEN string of the position; appended to the analysis URL as-is.
        board_id: Identifier used in the output file name and metadata row.
        metadata_df: DataFrame of metadata collected so far. It is not
            modified; a new DataFrame is returned.

    Returns:
        A new DataFrame equal to ``metadata_df`` plus one row with the columns
        ``board_id``, ``fen``, ``path``, ``board_theme``, ``piece_set`` and
        ``is_3d`` (always ``False`` here).

    Raises:
        selenium.common.exceptions.TimeoutException: If any awaited element
            does not become available within 10 seconds. The browser is
            closed before the exception propagates.
    """
    # List of user agents
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.3',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    ]
    chrome_options = Options()
    chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")

    # Create lichess analysis url from fen
    url = f'https://lichess.org/analysis/standard/{fen}'

    # Create output path
    output_path = f'../ChessStreamComputerVision/digital_board_images/digital_board_{board_id}.png'

    # Start driver; the options must be passed in, otherwise the randomly
    # chosen user agent above is never applied
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        # Use explicit waits to ensure each element is ready before accessing it
        wait = WebDriverWait(driver, 10)

        # Click on preferences
        preferences_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="top"]/div[2]/div[3]/button')))
        preferences_button.click()

        # Click on board theme
        board_theme_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="dasher_app"]/div/div/button[5]')))
        board_theme_button.click()

        # Select board theme
        board_theme_key = random.choice(list(board_theme_2d_dict.keys()))
        board_theme = board_theme_2d_dict[board_theme_key]
        selected_theme_button = wait.until(EC.element_to_be_clickable((By.XPATH, board_theme)))
        selected_theme_button.click()

        # Go back to preference from board theme selection
        back_board_theme_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="dasher_app"]/div/button')))
        back_board_theme_button.click()

        # Click on piece set
        piece_set_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="dasher_app"]/div/div/button[6]')))
        piece_set_button.click()

        # Select piece set
        piece_set_key = random.choice(list(piece_set_2d_dict.keys()))
        piece_set = piece_set_2d_dict[piece_set_key]
        selected_piece_button = wait.until(EC.element_to_be_clickable((By.XPATH, piece_set)))
        selected_piece_button.click()

        # Take a screenshot of just the chessboard
        chessboard = wait.until(EC.presence_of_element_located((By.XPATH, '//cg-board')))
        chessboard.screenshot(output_path)
    finally:
        # Always close the browser, even when a wait times out, so a failed
        # capture does not leave a Chrome process running
        driver.quit()

    # New metadata
    new_df = pd.DataFrame({
        'board_id': [board_id],
        'fen': [fen],
        'path': [output_path],
        'board_theme': [board_theme_key],
        'piece_set': [piece_set_key],
        'is_3d': [False]
    })
    metadata_df = pd.concat([metadata_df, new_df], ignore_index=True)

    return metadata_df

### TODO: scale captured images to 256x256

In [137]:
# Capture lichess digital boards; the position's index in lichess_games
# doubles as its board_id
for board_id, fen in enumerate(lichess_games):
    metadata_df = capture_2d_chessboard(fen, board_id, metadata_df)

TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x00000001011ee65c chromedriver + 4318812
1   chromedriver                        0x00000001011e6d00 chromedriver + 4287744
2   chromedriver                        0x0000000100e187ec chromedriver + 296940
3   chromedriver                        0x0000000100e56048 chromedriver + 548936
4   chromedriver                        0x0000000100e8ed28 chromedriver + 781608
5   chromedriver                        0x0000000100e4a178 chromedriver + 500088
6   chromedriver                        0x0000000100e4afc0 chromedriver + 503744
7   chromedriver                        0x00000001011aec40 chromedriver + 4058176
8   chromedriver                        0x00000001011b3160 chromedriver + 4075872
9   chromedriver                        0x0000000101176e68 chromedriver + 3829352
10  chromedriver                        0x00000001011b3c4c chromedriver + 4078668
11  chromedriver                        0x000000010118bf08 chromedriver + 3915528
12  chromedriver                        0x00000001011d0140 chromedriver + 4194624
13  chromedriver                        0x00000001011d02c4 chromedriver + 4195012
14  chromedriver                        0x00000001011e04d0 chromedriver + 4261072
15  libsystem_pthread.dylib             0x0000000188803fa8 _pthread_start + 148
16  libsystem_pthread.dylib             0x00000001887feda0 thread_start + 8


In [139]:
# Checkpoint metadata
# metadata_df.to_csv('../ChessStreamComputerVision/digital_board_metadata.csv', index=False)

### 3d board

In [None]:
def capture_3d_chessboard(url, output_file):
    """Switch a lichess board to 3D geometry and screenshot it.

    Args:
        url: Page to open (a page exposing the lichess preferences button
            and a ``cg-board`` element).
        output_file: Path the board screenshot is written to.

    Raises:
        selenium.common.exceptions.TimeoutException: If an awaited element or
            the page replacement does not happen in time. The browser is
            closed before the exception propagates.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Use explicit waits to ensure each element is ready before accessing it
        wait = WebDriverWait(driver, 10)

        # Click on preferences
        preferences_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="top"]/div[2]/div[3]/button')))
        preferences_button.click()

        # Click on board geometry
        geometry_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="dasher_app"]/div/div/button[4]')))
        geometry_button.click()

        # Change to 3D
        geo_3d_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="dasher_app"]/div/div[1]/button[2]')))
        geo_3d_button.click()
        wait = WebDriverWait(driver, 30)

        # Wait for an element from the pre-switch page to become stale.
        # preferences_button was located before the 3D click (same XPath the
        # element was previously re-looked-up with), so this cannot
        # accidentally bind to the already-replaced page and wait forever.
        # NOTE(review): assumes switching geometry replaces the page - confirm
        wait.until(EC.staleness_of(preferences_button))

        # Take a screenshot of just the chessboard
        chessboard = wait.until(EC.presence_of_element_located((By.XPATH, '//cg-board')))
        chessboard.screenshot(output_file)
    finally:
        # Always close the browser, even when a wait times out
        driver.quit()