## 1. Script Description and Imports

In [22]:

# chess_scraper.py
# Description: This script scrapes chess games from the chess.com game archive page.
# Author: Edward
# Date: 2024-08-16

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import time
import re
import csv
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials

## 2. Constants and Setup

In [23]:
USERNAME = "jlee2327"

def setup_driver():
    """Sets up the Selenium WebDriver with Chrome options."""
    chrome_options = Options()
    chrome_options.add_argument("--disable-javascript")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=chrome_options)


## 3. Game Data Extraction

In [24]:

def extract_game_info(row):
    """Extracts relevant game information from a table row."""
    columns = row.find_all('td')
    if len(columns) <= 1:
        return None

    #white_info = columns[0].text.strip() ## This gives you the time control
    names_and_ratings = columns[1].text.strip()
    player_name, player_rating, opponent_name, opponent_rating, player_color, opponent_color = parse_player_info(names_and_ratings)

    accuracies = columns[3].text.strip().split()
    player_accuracy, opponent_accuracy = extract_accuracies(accuracies, player_color)

    result_text = columns[2].text.strip()
#     print(columns[2].text.strip())
    game_result = determine_game_result(result_text, player_color)

    game_link = extract_game_link(row)
    #games_moves = scrape_games_from_game_links(driver, game_link)
    '''print(f"Debugging: Printing Game MOves: {games_moves}")'''

    if not game_link:
        return None  # Skip rows without a game link

    player_country, opponent_country = extract_countries(row)
    

    game_data = {
        'Game_Type': columns[0].text.strip(),
        'Result': game_result,
        'Accuracies': columns[3].text.strip(),
        'Total_Moves': columns[4].text.strip(),
        'Date': columns[5].text.strip(),
        'Link': game_link,
        'Player_Details': {
            player_name: {
                'currentRating': player_rating,
                'country': player_country,
                'username': USERNAME,
                'color': player_color,
                'accuracy': player_accuracy,
                #'moves' : games_moves, 
            },
            opponent_name: {
                'currentRating': opponent_rating,
                'country': opponent_country,
                'username': opponent_name,
                'color': opponent_color,
                'accuracy': opponent_accuracy,
            }
        }
    }
    
    return game_data
    

import re

def parse_player_info(names_and_ratings):
    """Parses the player and opponent information from a text string."""
    
    # Clean up the player info for both white and black
    split = ' '.join(names_and_ratings.split())
    '''
    print(f"Debugging names_and_ratings: {names_and_ratings}")
    print(f"Debugging split: {split}")
    '''

    split = split.split()

    white_name = split[0]
    white_rating = split[1].strip('()')
    black_name = split[2]
    black_rating = split[3].strip('()')
    '''
    print(f"Debugging white_name: {white_name}")
    print(f"Debugging white_rating: {white_rating}")
    print(f"Debugging black_name: {black_name}")
    print(f"Debugging black_rating: {black_rating}")
    '''

    
    # Determine the player's color
    if USERNAME in white_name:
        player_name, player_rating = white_name, white_rating
        opponent_name, opponent_rating = black_name, black_rating
        player_color, opponent_color = 'White', 'Black'
    elif USERNAME in black_name:
        player_name, player_rating = black_name, black_rating
        opponent_name, opponent_rating = white_name, white_rating
        player_color, opponent_color = 'Black', 'White'
    else:
        print("There is a mistake I THINK:")
        player_name, player_rating, opponent_name, opponent_rating = 'Me', 'N/A', 'Opponent', 'N/A'
        player_color, opponent_color = 'Unknown', 'Unknown'

    return player_name, player_rating, opponent_name, opponent_rating, player_color, opponent_color



def extract_accuracies(accuracies, player_color):
    """Extracts accuracies for the player and opponent based on player color."""
    if len(accuracies) == 2:
        return (accuracies[0], accuracies[1]) if player_color == 'White' else (accuracies[1], accuracies[0])
    return 'N/A', 'N/A'

def determine_game_result(result_text, player_color):
    """
    Determines the game result based on the result text and player color.

    Args:
        result_text (str): The result of the game, which may include line breaks (e.g., '1\n0', '0\n1', '½\n½').
        player_color (str): The color of the player ('White' or 'Black').

    Returns:
        str: 'Win', 'Loss', or 'Draw' based on the result.
    """
    # Debugging: Print the raw result_text
#     print(f"Debugging Result_Text: {result_text}")
    
    # Clean the result_text by removing any whitespace and newlines
    cleaned_result = result_text.strip().replace('\n', '')
    
    # Debugging: Print the cleaned result_text
    #print(f"Cleaned Result_Text: {cleaned_result} {player_color}")
    
    if cleaned_result == '½½':
        return 'Draw'
    elif (cleaned_result == '10' and player_color == 'White') or (cleaned_result == '01' and player_color == 'Black'):
        return 'Win'
    else:
        return 'Loss'
    
def extract_game_link(row):
    """Extracts the game link from a table row."""
    game_link_element = row.find('a', {'class': 'archive-games-background-link'})
    return game_link_element['href'] if game_link_element else 'N/A'

def extract_countries(row):
    """Extracts the countries for the player and opponent."""
    country_divs = row.find_all('div', {'data-cy': 'user-country-flag'})
    if len(country_divs) >= 2:
        player_country = country_divs[0].get('v-tooltip', 'Unknown')
        opponent_country = country_divs[1].get('v-tooltip', 'Unknown')
    else:
        player_country = opponent_country = 'Unknown'
    return player_country, opponent_country




In [25]:
def scrape_games_from_page(driver, page_url, scraped_game_links):
    """Scrapes all games from a given page URL."""
    driver.get(page_url)
    time.sleep(1)  # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    games_table = soup.find('table', {'class': 'archive-games-table'})
    games = []

    if games_table:
        tbody = games_table.find('tbody')
        rows = tbody.find_all('tr')

        for row in rows:
            game_data = extract_game_info(row)
            if game_data:
                if game_data['Link'] not in scraped_game_links:
                    games.append(game_data)
                    scraped_game_links.add(game_data['Link'])
                    
                    
    
    return games




In [26]:
def scrape_chess_games():
    """Main function to scrape chess games and save them as JSON and CSV."""
    driver = setup_driver()
    base_url = f"https://www.chess.com/games/archive/{USERNAME}?page="
    all_games = []
    scraped_game_links = set()

    for page_num in range(1, 10):  # Adjust range as needed for the number of pages
        page_url = base_url + str(page_num)
        print(f"Scraping page {page_num}: {page_url}")
        games = scrape_games_from_page(driver, page_url, scraped_game_links)
        if not games:
            print("No more new games found. Stopping.")
            break  # Stop if no new games are found (end of pages or duplicates)
        all_games.extend(games)
        time.sleep(1)  # Avoid hammering the server too hard

    with open(f'{USERNAME}_chess_games.json', 'w') as f:
        json.dump(all_games, f, indent=4)
    
    #save_games_to_csv(all_games)
    driver.quit()
    print(f"Scraping completed. Total unique games scraped: {len(all_games)}")

    
    return all_games


In [27]:
'''
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
import json

# Path to your service account JSON key file
SERVICE_ACCOUNT_FILE = 'path/to/your/service-account-key.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# Spreadsheet ID and range
SPREADSHEET_ID = 'your_spreadsheet_id'
RANGE_NAME = 'Sheet1!A1'

def save_json_to_google_sheet(json_data):
    """
    Saves JSON data to a Google Spreadsheet.
    """
    # Authenticate with the Google Sheets API
    creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('sheets', 'v4', credentials=creds)

    # Prepare the data for insertion
    headers = [
        'Game_Type', 'Result', 'Accuracies', 'Total_Moves', 'Date', 'Link',
        'Player1_Name', 'Player1_Rating', 'Player1_Country', 'Player1_Username', 'Player1_Color', 'Player1_Accuracy',
        'Player2_Name', 'Player2_Rating', 'Player2_Country', 'Player2_Username', 'Player2_Color', 'Player2_Accuracy'
    ]
    
    values = [headers]  # Start with headers
    
    for game in json_data:
        player1, player2 = list(game['Player_Details'].keys())
        row = [
            game['Game_Type'], game['Result'], game['Accuracies'], game['Total_Moves'], game['Date'], game['Link'],
            player1, game['Player_Details'][player1]['currentRating'], game['Player_Details'][player1]['country'], game['Player_Details'][player1]['username'], game['Player_Details'][player1]['color'], game['Player_Details'][player1]['accuracy'],
            player2, game['Player_Details'][player2]['currentRating'], game['Player_Details'][player2]['country'], game['Player_Details'][player2]['username'], game['Player_Details'][player2]['color'], game['Player_Details'][player2]['accuracy']
        ]
        values.append(row)

    # Write data to Google Sheet
    body = {'values': values}
    service.spreadsheets().values().update(
        spreadsheetId=SPREADSHEET_ID,
        range=RANGE_NAME,
        valueInputOption='RAW',
        body=body
    ).execute()

    print("Data successfully saved to Google Spreadsheet.")
'''
import pandas as pd


if __name__ == "__main__":
    all_games = scrape_chess_games()
    if all_games:
        df = pd.DataFrame(all_games)
        print(df.head())  # Print the DataFrame
    else:
        print("No games found to display.")
    # Load the JSON data from file
    #with open(f'{USERNAME}_chess_games.json', 'r') as f:
        #json_data = json.load(f)
    #save_json_to_google_sheet(json_data)  # Save to Google Spreadsheet


Scraping page 1: https://www.chess.com/games/archive/jlee2327?page=1
Scraping page 2: https://www.chess.com/games/archive/jlee2327?page=2
Scraping page 3: https://www.chess.com/games/archive/jlee2327?page=3
Scraping page 4: https://www.chess.com/games/archive/jlee2327?page=4
No more new games found. Stopping.
Scraping completed. Total unique games scraped: 148
  Game_Type Result Accuracies Total_Moves         Date  \
0     5 min   Loss     Review          30  Dec 4, 2024   
1     5 min    Win     Review          30  Dec 3, 2024   
2     5 min   Loss     Review          25  Dec 2, 2024   
3     5 min    Win     Review          30  Dec 2, 2024   
4     5 min    Win     Review          27  Dec 2, 2024   

                                                Link  \
0  https://www.chess.com/game/live/121261386014?u...   
1  https://www.chess.com/game/live/121257886466?u...   
2  https://www.chess.com/game/live/121240012722?u...   
3  https://www.chess.com/game/live/121239910764?u...   
4  https

In [28]:
#print(df)
player_details = df['Player_Details']
#print(player_details)
player = player_details.iloc[0]
#print(player)
#print(df)


In [29]:
the_links = df['Link']
#print(the_links)

In [30]:
### Ed's Notes
### So the webscraping is taking a long long time
### I guess I can just extract the game ID in one function and then at the end
### Make a separate dataframe that includes game_moves_white and game_moves_black for a given game id
### then I could conjoin those dataframes, so different functions

In [31]:
# I Forgot to put that on the CSV, like the game moves lol
#the_links

In [40]:
import pandas as pd
import time
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def scrape_games_from_game_links(driver, individual_game_url):
    driver.get(individual_game_url)
    
    # Use WebDriverWait to wait until the necessary element is loaded instead of sleep
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'node.white-move.main-line-ply')))
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find both white and black moves
    white_moves = soup.find_all('div', {'class': 'node white-move main-line-ply'})
    black_moves = soup.find_all('div', {'class': 'node black-move main-line-ply'})
    
    # Collect the moves for both white and black
    game_moves = {
        'white_moves': [
            move.find('span', {'class': 'node-highlight-content offset-for-annotation-icon'}).text.strip()
            for move in white_moves if move.find('span', {'class': 'node-highlight-content offset-for-annotation-icon'})
        ],
        'black_moves': [
            move.find('span', {'class': 'node-highlight-content offset-for-annotation-icon'}).text.strip()
            for move in black_moves if move.find('span', {'class': 'node-highlight-content offset-for-annotation-icon'})
        ]
    }
    
    # Check if the number of white and black moves match the number of moves found
    if len(game_moves['white_moves']) != len(white_moves) or len(game_moves['black_moves']) != len(black_moves):
        print("Warning: Some moves did not contain the expected span.")
    
    return game_moves


def scrape_chess_games_links(game_links):
    """Main function to scrape chess games and save them as JSON and CSV."""
    driver = setup_driver()
    
    # Use implicit waits for more general waiting
    driver.implicitly_wait(10)
    
    all_games = []

    try:
        for game_link in game_links:
            moves = scrape_games_from_game_links(driver, game_link)
            print(f"Scraped {len(moves['white_moves'])} white moves and {len(moves['black_moves'])} black moves from game: {game_link}")
            all_games.append({
                'game_url': game_link,
                'white_moves': moves['white_moves'],  # Store white moves directly
                'black_moves': moves['black_moves']   # Store black moves directly
            })
        
        # Convert to DataFrame
        df = pd.DataFrame(all_games)

        # Save to JSON
        #json_file = f'{USERNAME}_chess_games.json'
        #df.to_json(json_file, orient='records', indent=4)
        #print(f"Games saved to JSON file: {json_file}")

        # Save to CSV
        #csv_file = f'{USERNAME}_chess_gameslinks.csv'
        # Convert moves to a string for CSV compatibility
        #df['white_moves'] = df['white_moves'].apply(lambda x: ' '.join(x))
        #df['black_moves'] = df['black_moves'].apply(lambda x: ' '.join(x))
        #df.to_csv(csv_file, index=False)
        #print(f"Games saved to CSV file: {csv_file}")
    
    except Exception as e:
        print(f"Error during scraping: {e}")

    finally:
        driver.quit()

    print(f"Scraping completed. Total unique games scraped: {len(all_games)}")
    return df


In [41]:
start_time = time.time()
info_df = scrape_chess_games_links(the_links)
print(f"Execution time: {time.time() - start_time} seconds")

Scraped 29 white moves and 29 black moves from game: https://www.chess.com/game/live/121261386014?username=jlee2327
Scraped 29 white moves and 29 black moves from game: https://www.chess.com/game/live/121257886466?username=jlee2327
Scraped 24 white moves and 24 black moves from game: https://www.chess.com/game/live/121240012722?username=jlee2327
Scraped 29 white moves and 29 black moves from game: https://www.chess.com/game/live/121239910764?username=jlee2327
Scraped 27 white moves and 26 black moves from game: https://www.chess.com/game/live/121239809642?username=jlee2327
Scraped 40 white moves and 39 black moves from game: https://www.chess.com/game/live/121203593482?username=jlee2327
Scraped 19 white moves and 18 black moves from game: https://www.chess.com/game/live/121203554718?username=jlee2327
Scraped 40 white moves and 39 black moves from game: https://www.chess.com/game/live/121163455136?username=jlee2327
Scraped 38 white moves and 37 black moves from game: https://www.chess.c

In [45]:
#print(info_df)
merged_df = pd.merge(df, info_df, left_on='Link', right_on = 'game_url', how='inner')
merged_dict = merged_df.to_dict('records')

In [43]:
'''
print(merged_df)
csv_file2 = f'{USERNAME}_combined.csv'
merged_df.to_csv(csv_file2, index=False)
'''

"\nprint(merged_df)\ncsv_file2 = f'{USERNAME}_combined.csv'\nmerged_df.to_csv(csv_file2, index=False)\n"

In [47]:

def save_games_to_csv(games, filename='chess_games.csv'):
    """Save the scraped game data to a CSV file."""
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        headers = ['Game_Type', 'Result', 'Accuracies', 'Total_Moves', 'Date', 'Link', 
                   'Player1', 'Player1_Rating', 'Player1_Country', 'Player1_Username', 
                   'Player1_Color', 'Player1_Accuracy', 'Player1_Moves', 'Player2', 'Player2_Rating', 
                   'Player2_Country', 'Player2_Username', 'Player2_Color', 'Player2_Accuracy', 'Player2_Moves']
        writer.writerow(headers)

        for game in games:
            if game.get('Player_Details') is None:
                print("BUGGGG")
                continue  # Skip this game if player details are missing or None

            player1, player2 = game['Player_Details'].keys()

            # Handle None for accuracy values
            player1_accuracy = game['Player_Details'][player1].get('accuracy', 'N/A')
            player2_accuracy = game['Player_Details'][player2].get('accuracy', 'N/A')

            if game['Player_Details'][player1]['color'] == 'white':
                player1_moves = white_moves
                player2_moves = black_moves
            else:
                player1_moves = black_moves
                player2_moves = white_moves

            row = [
                game['Game_Type'], game['Result'], game['Accuracies'], game['Total_Moves'], 
                game['Date'], game['Link'],
                player1, game['Player_Details'][player1]['currentRating'], 
                game['Player_Details'][player1]['country'], game['Player_Details'][player1]['username'], 
                game['Player_Details'][player1]['color'], player1_accuracy, player1_moves,
                player2, game['Player_Details'][player2]['currentRating'], 
                game['Player_Details'][player2]['country'], game['Player_Details'][player2]['username'], 
                game['Player_Details'][player2]['color'], player2_accuracy, player2_moves
            ]
            writer.writerow(row)

In [48]:
save_games_to_csv(merged_dict)

NameError: name 'white' is not defined