# 1. 爬取比賽頁面的連結

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

def fetch_all_links(base_url, total_pages):
    """
    抓取所有比賽連結。
    :param base_url: 棋手首頁 URL
    :param total_pages: 總頁數
    :return: 去重後的比賽連結集合
    """
    driver = webdriver.Chrome()  # 或者 Firefox
    all_links = set()

    try:
        for page in range(total_pages):
            # 計算頁面的 URL
            offset = page * 40
            page_url = f"{base_url}/?p=1&start={offset}" if page > 0 else base_url
            print(f"正在抓取頁面：{page_url}")

            # 打開頁面並等待載入完成
            driver.get(page_url)
            time.sleep(3)  # 避免過快被網站封鎖
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//img[@alt="View Game"]/parent::a'))
            )

            # 抓取比賽連結
            elements = driver.find_elements(By.XPATH, '//img[@alt="View Game"]/parent::a')
            for elem in elements:
                link = elem.get_attribute("href")
                if link:  # 確保連結非空
                    all_links.add(link)
            print(f"頁面 {page + 1} 抓取到 {len(elements)} 個連結")

        print(f"去重後共抓取到 {len(all_links)} 個連結")
    except Exception as e:
        print(f"發生錯誤：{e}")
    finally:
        driver.quit()

    return all_links

# 儲存連結到 JSON
def save_links_to_json(links, filename="game_links.json"):
    """
    儲存連結至 JSON 檔案。
    """
    with open(filename, "w") as f:
        json.dump(list(links), f, indent=4, ensure_ascii=False)
    print(f"連結已保存到 {filename}")

# 執行程式
if __name__ == "__main__":
    BASE_URL = "https://www.365chess.com/players/Robert_James_Fischer"
    TOTAL_PAGES = 24

    # 抓取連結
    all_game_links = fetch_all_links(BASE_URL, TOTAL_PAGES)

    # 保存到 JSON
    if all_game_links:
        save_links_to_json(all_game_links)
    else:
        print("未抓取到任何連結，請檢查程式或網路環境。")

# 2.抓取每個比賽連結內的數據

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
import time
import json

def fetch_game_moves(url):
    """
    獨立函數，每個線程抓取一個比賽頁面。
    """
    driver = webdriver.Chrome()
    try:
        print(f"正在訪問比賽頁面：{url}")
        driver.get(url)

        # 等待 GameTextLayer 出現
        wait = WebDriverWait(driver, 30)
        game_text_div = wait.until(EC.presence_of_element_located((By.ID, "GameTextLayer")))

        # 提取棋譜數據
        game_moves = game_text_div.text
        print(f"成功抓取棋譜數據：{url}")
        return {"url": url, "moves": game_moves}

    except Exception as e:
        print(f"抓取失敗：{url}，錯誤：{e}")
        return {"url": url, "moves": None}

    finally:
        driver.quit()

if __name__ == "__main__":
    # 載入連結
    with open("game_links.json", "r") as f:
        game_links = json.load(f)

    # 使用 ThreadPoolExecutor 並行處理
    batch_results = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # 同時啟用 5 個瀏覽器
        results = list(executor.map(fetch_game_moves, game_links))
        batch_results.extend(results)

    # 保存結果到 JSON
    with open("game_data_final.json", "w") as f:
        json.dump(batch_results, f, indent=4, ensure_ascii=False)
    print("所有數據抓取完成，保存至 game_data_final.json。")

# 3.檢查和驗證抓取的棋譜數據

In [8]:
import json

def print_chess_moves_from_json(json_file, max_games=5):
    """
    從 JSON 檔案讀取並打印棋譜數據。
    :param json_file: JSON 檔案名稱
    :param max_games: 要打印的比賽數量
    """
    try:
        # 讀取 JSON 檔案
        with open(json_file, "r", encoding="utf-8") as file:
            data = json.load(file)
        
        print(f"總共抓取到 {len(data)} 場比賽棋譜數據。\n")

        # 打印指定數量的比賽數據
        for idx, game in enumerate(data[:max_games], 1):
            print(f"第 {idx} 場比賽 URL: {game['url']}")
            print("棋譜數據：")
            print(game["moves"])
            print("-" * 40)
    
    except FileNotFoundError:
        print(f"檔案 {json_file} 未找到，請確認檔案路徑是否正確。")
    except json.JSONDecodeError:
        print(f"檔案 {json_file} 格式錯誤，無法解析為 JSON。")
    except Exception as e:
        print(f"發生錯誤：{e}")

# 測試程式
if __name__ == "__main__":
    json_file = "game_data_final.json"  # JSON 檔案名稱
    print_chess_moves_from_json(json_file, max_games=5)  # 預覽 5 場比賽棋譜

總共抓取到 978 場比賽棋譜數據。

第 1 場比賽 URL: https://www.365chess.com/game.php?gid=2536745
棋譜數據：
1.Nf3 Nf6 2.g3 g6 3.Bg2 Bg7 4.O-O O-O 5.c4 d6 6.Nc3 e5 7.d4 Nbd7 8.h3 Re8 9.Qc2 exd4 10.Nxd4 Nb6 11.b3 c5 12.Ndb5 a6 13.Na3 Bf5 14.Qd2 d5 15.g4 Bxg4 16.hxg4 Nxg4 17.Bh3 Qh4 18.Kg2 d4 19.Qg5 Qxg5 20.Bxg5 f5 21.Bxg4 fxg4 22.Nd5 Nxd5 23.cxd5 d3 24.exd3 Bxa1 25.Rxa1 Re5 ½-½
----------------------------------------
第 2 場比賽 URL: https://www.365chess.com/game.php?gid=2575364
棋譜數據：
1.e4 e5 2.Nf3 Nc6 3.Bb5 a6 4.Ba4 Nf6 5.O-O Be7 6.Re1 b5 7.Bb3 d6 8.c3 O-O 9.h3 Na5 10.Bc2 c5 11.d4 cxd4 12.cxd4 Bb7 13.d5 Bc8 14.Nbd2 g6 15.b4 Nb7 16.a4 Bd7 17.axb5 axb5 18.Rxa8 Qxa8 19.Re3 Qc8 20.Ra3 Qc7 21.Nb3 Nh5 22.Bd3 Rc8 23.Qf1 Nf6 24.Bg5 Rb8 25.Ra7 Qd8 26.Qa1 Qe8 27.Qa6 Qc8 28.Nxe5 dxe5 29.Bxf6 Bxf6 30.Qxf6 Qc3 31.Nc5 Nxc5 32.bxc5 Be8 33.Bf1 Qxc5 34.Re7 b4 35.d6 Qb6 36.Bc4 1-0
----------------------------------------
第 3 場比賽 URL: https://www.365chess.com/game.php?gid=2577326
棋譜數據：
1.e4 e5 2.Nf3 d6 3.d4 Nc6 4.Bb5 Bd7 5.Nc3 Be7

In [None]:
import chess
import chess.pgn
import json
from collections import defaultdict

def calculate_move_probabilities(input_file, output_file):
    """
    計算每個棋盤狀態下，下一步棋的概率分布
    """
    fen_move_counts = defaultdict(lambda: defaultdict(int))  # FEN 對應的下一步棋統計

    with open(input_file, "r") as file:
        games = json.load(file)

    for game in games:
        moves = game.get("moves", "")
        if not moves:
            continue
        
        board = chess.Board()
        move_list = moves.split()  # 拆分棋譜

        for move_index, move in enumerate(move_list):
            if move.isdigit():  # 跳過步數標記（如 "1.", "2."）
                continue
            
            # 當前棋盤狀態
            fen = board.fen()

            try:
                chess_move = board.parse_san(move)
                fen_move_counts[fen][move] += 1  # 統計下一步棋的次數
                board.push(chess_move)  # 更新棋盤狀態
            except ValueError:
                print(f"無效的棋步: {move}，跳過該步。")
                break

    # 計算機率
    probabilities = {}
    for fen, moves in fen_move_counts.items():
        total_moves = sum(moves.values())
        probabilities[fen] = {move: count / total_moves for move, count in moves.items()}

    # 保存結果
    with open(output_file, "w") as file:
        json.dump(probabilities, file, indent=4)

    print(f"棋步機率結果已保存到 {output_file}")


# 執行腳本
input_file = "./game_data_final.json"  # 比賽數據
output_file = "./fen_probabilities.json"  # 保存的結果
calculate_move_probabilities(input_file, output_file)

In [6]:
import json
from collections import defaultdict
import chess

def process_chess_games(input_file, output_file):
    """
    處理比賽數據，統計每一步棋下一步的機率。
    """
    # 初始化數據結構
    fen_next_move_counts = defaultdict(lambda: defaultdict(int))  # {FEN: {move: count}}

    # 讀取比賽數據
    with open(input_file, "r") as file:
        games = json.load(file)

    # 遍歷每場比賽
    for game in games:
        moves = game.get("moves", "")
        if not moves:
            continue
        
        # 初始化棋盤
        board = chess.Board()

        # 分割棋步
        move_list = moves.split()
        for move in move_list:
            try:
                # 當前棋盤狀態
                fen = board.fen()
                
                # 解析 SAN 棋步並計算下一步
                chess_move = board.parse_san(move)
                fen_next_move_counts[fen][move] += 1  # 記錄該 FEN 對應的棋步次數
                board.push(chess_move)  # 應用該棋步
            except ValueError:
                print(f"跳過無效棋步: {move}")
                break

    # 計算機率
    fen_probabilities = {}
    for fen, move_counts in fen_next_move_counts.items():
        total_moves = sum(move_counts.values())
        fen_probabilities[fen] = {
            move: count / total_moves for move, count in move_counts.items()
        }

    # 保存結果
    with open(output_file, "w") as file:
        json.dump(fen_probabilities, file, indent=4)

    print(f"統計結果已保存至 {output_file}")

# 執行程式
input_file = "./game_data_final.json"  # 原始比賽數據
output_file = "./fen_move_probabilities.json"  # 統計結果輸出
process_chess_games(input_file, output_file)

跳過無效棋步: 1.Nf3
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.Nf3
跳過無效棋步: 1.c4
跳過無效棋步: 1.g3
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.c4
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.d4
跳過無效棋步: 1.d4
跳過無效棋步: 1.d4
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.c4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.Nf3
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.f4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.Nf3
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.Nf3
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.d4
跳過無效棋步: 1.d4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.c4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.Nf3
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步: 1.e4
跳過無效棋步