In [1]:
import requests
import pandas as pd
from datetime import datetime

def get_chess_archives(username, email, timeout=30):
    """Fetch the game-archive listing for one Chess.com player.

    Args:
        username: Player name; interpolated into the request URL and the
            User-Agent header.
        email: Contact address sent in the User-Agent header.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {'User-Agent': f'username: {username}, email: {email}'}
    url = f'https://api.chess.com/pub/player/{username}/games/archives'

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail loudly on an error status: a caller that reads the result with
    # .get("archives", []) would otherwise treat an error body as "no archives".
    response.raise_for_status()

    return response.json()

def _flatten_game(game, archive_url, username, load_timestamp):
    """Flatten one game record from an archive payload into a single-level dict."""
    # `or {}` also covers a key that is present but null, not just a missing key.
    white = game.get("white") or {}
    black = game.get("black") or {}

    # NOTE(review): fromtimestamp() renders the epoch value in the machine's
    # local timezone, so the stored string depends on where the notebook runs.
    # Kept as-is so existing loads stay comparable; consider switching to UTC.
    end_time = datetime.fromtimestamp(game.get("end_time") or 0).strftime("%Y-%m-%d %H:%M:%S")

    return {
        # Audit information
        "archive_url": archive_url,
        "username": username,
        "bq_load_date": load_timestamp,
        # General game information
        "url": game.get("url", ""),
        "pgn": game.get("pgn", ""),
        "time_control": game.get("time_control", ""),
        "end_time": end_time,
        "rated": game.get("rated", False),
        "accuracies": game.get("accuracies", {}),
        "tcn": game.get("tcn", ""),
        "game_uuid": game.get("uuid", ""),
        "initial_setup": game.get("initial_setup", ""),
        "fen": game.get("fen", ""),
        "time_class": game.get("time_class", ""),
        "rules": game.get("rules", ""),
        # 'white' subfields
        "white_username": white.get("username", ""),
        "white_rating": white.get("rating", ""),
        "white_result": white.get("result", ""),
        "white_id": white.get("@id", ""),
        "white_uuid": white.get("uuid", ""),
        # 'black' subfields
        "black_username": black.get("username", ""),
        "black_rating": black.get("rating", ""),
        "black_result": black.get("result", ""),
        "black_id": black.get("@id", ""),
        "black_uuid": black.get("uuid", ""),
    }


def fetch_and_union_game_data(usernames, email, timeout=30):
    """Download every archived game for the given players into one DataFrame.

    For each username the archive listing is fetched with
    ``get_chess_archives``; every URL under its "archives" key is then
    downloaded and each entry under "games" becomes one flattened row.

    Args:
        usernames: Iterable of player names to load.
        email: Contact address sent in the User-Agent header.
        timeout: Seconds to wait on each archive request before giving up.

    Returns:
        pandas.DataFrame with one row per game. Every row carries the same
        "bq_load_date" (local time at the start of the call), plus the
        archive URL and username it was fetched for. The frame is empty
        (no columns) when no games were found.

    Raises:
        requests.HTTPError: If an archive request returns a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    all_game_data = []
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for username in usernames:
        # Built once per user instead of once per archive request.
        headers = {'User-Agent': f'username: {username}, email: {email}'}
        archives = get_chess_archives(username, email)

        for archive_url in archives.get("archives", []):
            response = requests.get(archive_url, headers=headers, timeout=timeout)
            # Abort on an error status rather than silently skipping the
            # archive: a partial result must not be mistaken for a full load.
            response.raise_for_status()
            archive_data = response.json()

            all_game_data.extend(
                _flatten_game(game, archive_url, username, current_timestamp)
                for game in archive_data.get("games", [])
            )

    return pd.DataFrame(all_game_data)

# Example run: collect every archived game of the players listed below into a
# single DataFrame. `email` is only used as contact info in the User-Agent header.
usernames = [
    'piwi100',
    'Zundorn',
    'leprechess',
]
email = 'cassado01@gmail.com'
games = fetch_and_union_game_data(usernames=usernames, email=email)

Note: a full load of the data (DROP & CREATE) is sustainable only while the data volume is small and the update frequency is low.
An incremental load that scans only the new games could be implemented instead.

In [2]:
import os
import pandas as pd
from pandas_gbq import to_gbq
from google.cloud import bigquery

# Point the Google client libraries at the key file, then create a client.
# NOTE(review): `client` is not used by the to_gbq call below; it is kept in
# case later cells rely on it.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../keyfile.json"
client = bigquery.Client()

# Destination table, written as <project>.<dataset>.<table>
dataset_id = 'chesscom-451104.staging'
table_id = dataset_id + '.games'

# Full load: if_exists='replace' drops and recreates the table on every run.
to_gbq(
    games,
    table_id,
    project_id='chesscom-451104',
    if_exists='replace',
)
print("Data loaded into BigQuery successfully!")

100%|██████████| 1/1 [00:00<?, ?it/s]

Data loaded into BigQuery successfully!



