In [86]:
import os
import pandas as pd
from pandas_gbq import read_gbq
from google.cloud import bigquery

# Where the staged game tables live: "<project>.<dataset>" plus the shared
# table-name prefix that the wildcard query below expands with `*`.
dataset_id = 'chesscom-451104.staging'
table_games_prefix = 'games_infos_'

# Point the Google client libraries at the service-account key file, then
# create the BigQuery client (the env var must be set before the client is built).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../keyfile.json"
client = bigquery.Client()

def table_with_prefix_exists(client, dataset_id, prefix):
    """Report whether the dataset holds at least one table named `prefix*`.

    Args:
        client: Object exposing `list_tables(dataset_id)`; each listed item
            must have a `table_id` string attribute.
        dataset_id: Dataset identifier forwarded unchanged to `list_tables`.
        prefix: Leading text a table id must start with to count as a match.

    Returns:
        True as soon as one matching table is seen, otherwise False.
    """
    for candidate in client.list_tables(dataset_id):
        if candidate.table_id.startswith(prefix):
            return True
    return False

# Build `username_import`: one row per username with the incremental-load
# watermark taken from every previously loaded `games_infos_*` table
# (the greatest archive_url and the greatest end_time already stored).
watermark_columns = ['username', 'latest_url', 'latest_end_time']

if table_with_prefix_exists(client, dataset_id, table_games_prefix):
    query = f"""
    SELECT  
        username, 
        MAX(archive_url) AS latest_url,
        MAX(end_time) AS latest_end_time
    FROM `{dataset_id}.{table_games_prefix}*`
    GROUP BY 1
    """
    # dataset_id is "<project>.<dataset>"; reuse its project part instead of
    # repeating the project id as a second hard-coded literal.
    username_import = read_gbq(query, project_id=dataset_id.split('.')[0], dialect='standard')
    print("Query executed successfully!")
else:
    print(f"No tables with the prefix '{table_games_prefix}' found.")
    # First run: nothing has been loaded yet. Keep the frame empty but give it
    # the same columns as the query result, so later lookups such as
    # username_import['username'] do not raise KeyError.
    username_import = pd.DataFrame(columns=watermark_columns)

Downloading: 100%|[32m██████████[0m|
Query executed successfully!


In [84]:
import requests
import pandas as pd
from datetime import datetime

def get_filtered_archives(username, email):
    """Fetch a player's monthly archive list, keeping only months still to load.

    Requests `https://api.chess.com/pub/player/<username>/games/archives`
    with a User-Agent header built from `username` and `email`, then filters
    the returned `"archives"` URLs against the module-level `username_import`
    frame (columns `username`, `latest_url`).

    Args:
        username: Player name used in the request URL and in the lookup
            against `username_import['username']` (exact match).
        email: Contact address placed in the User-Agent header.

    Returns:
        The decoded JSON response (a dict). When `username_import` is empty
        it is returned untouched. Otherwise its `"archives"` entry is replaced
        by the list of URLs that compare `>=` the stored `latest_url` for this
        user, or by all URLs when the user has no stored, non-null `latest_url`.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    headers = {'User-Agent': f'username: {username}, email: {email}'}
    URL = f'https://api.chess.com/pub/player/{username}/games/archives'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(URL, headers=headers, timeout=30)
    # Fail loudly rather than treating an error payload as an archive list.
    response.raise_for_status()
    archives = response.json()

    if username_import.empty:
        # Nothing loaded yet for anyone: every archive is still to be fetched.
        return archives

    # Look up the last archive URL already loaded for this user, if any.
    user_row = username_import[username_import['username'] == username]
    latest_url = user_row.iloc[0]['latest_url'] if not user_row.empty else ""
    if pd.isna(latest_url):
        # A null watermark cannot be compared with a string; treat it as "no watermark".
        latest_url = ""

    # String comparison of the URLs; with an empty watermark every URL passes.
    # The latest month itself is kept (>=) so it is re-read on the next run.
    archives['archives'] = [
        archive_url for archive_url in archives.get("archives", [])
        if archive_url >= latest_url
    ]

    return archives

def _game_to_row(game, archive_url, username, load_timestamp):
    """Flatten one game dict from an archive response into a flat row dict."""
    white = game.get("white", {})
    black = game.get("black", {})
    return {
        # Audit information
        "archive_url": archive_url,
        "username": username,
        "bq_load_date": load_timestamp,
        # General game information
        "url": game.get("url", ""),
        "pgn": game.get("pgn", ""),
        "time_control": game.get("time_control", ""),
        # Epoch seconds rendered as a local-time string (datetime.fromtimestamp).
        "end_time": datetime.fromtimestamp(game.get("end_time", 0)).strftime("%Y-%m-%d %H:%M:%S"),
        "rated": game.get("rated", False),
        "tcn": game.get("tcn", ""),
        "game_uuid": game.get("uuid", ""),
        "initial_setup": game.get("initial_setup", ""),
        "fen": game.get("fen", ""),
        "time_class": game.get("time_class", ""),
        "rules": game.get("rules", ""),
        # 'white' subfields
        "white_username": white.get("username", ""),
        "white_rating": white.get("rating", ""),
        "white_result": white.get("result", ""),
        "white_id": white.get("@id", ""),
        "white_uuid": white.get("uuid", ""),
        # 'black' subfields
        "black_username": black.get("username", ""),
        "black_rating": black.get("rating", ""),
        "black_result": black.get("result", ""),
        "black_id": black.get("@id", ""),
        "black_uuid": black.get("uuid", ""),
    }


def fetch_and_union_game_data(usernames, email):
    """Download the not-yet-loaded games of several players into one DataFrame.

    For each username the archive URLs come from `get_filtered_archives`, and
    each archive is downloaded with `requests.get`. Only games whose
    `end_time` (epoch seconds) is strictly greater than the user's stored
    `latest_end_time` in the module-level `username_import` frame are kept.
    Users without a stored watermark get every game.

    Args:
        usernames: Iterable of player names to process, in order.
        email: Contact address placed in the User-Agent header of every request.

    Returns:
        pandas.DataFrame with one row per kept game (audit columns, general
        game fields, and flattened `white_*` / `black_*` fields). Empty when
        no game qualifies.

    Raises:
        requests.HTTPError: If an archive request answers with a 4xx/5xx
            status. Skipping a failed month silently would let the watermark
            move past it on the next load and lose those games.
        requests.Timeout: If an archive request does not answer in time.
    """
    all_game_data = []
    # One load timestamp shared by every row produced by this call.
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for username in usernames:
        headers = {'User-Agent': f'username: {username}, email: {email}'}
        archives = get_filtered_archives(username, email)

        # Watermark: end time (epoch seconds) of the newest game already loaded.
        # The `.empty` guard matters on a first run, when username_import may be
        # a column-less frame and indexing ['username'] would raise KeyError.
        latest_end_time = 0
        if not username_import.empty:
            user_row = username_import[username_import['username'] == username]
            if not user_row.empty:
                stored_end_time = user_row.iloc[0]['latest_end_time']
                if not pd.isna(stored_end_time):
                    # Parsed as naive local time, mirroring how end_time is
                    # written by _game_to_row, so the round trip is consistent.
                    parsed = datetime.strptime(stored_end_time, "%Y-%m-%d %H:%M:%S")
                    latest_end_time = int(parsed.timestamp())

        for archive_url in archives.get("archives", []):
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(archive_url, headers=headers, timeout=30)
            response.raise_for_status()
            archive_data = response.json()

            all_game_data.extend(
                _game_to_row(game, archive_url, username, current_timestamp)
                for game in archive_data.get("games", [])
                if game.get("end_time", 0) > latest_end_time
            )

    return pd.DataFrame(all_game_data)

# Run the extraction for the tracked players. `email` is the contact address
# sent in the User-Agent header of each request; the resulting `games` frame
# is what the BigQuery load cell writes out.
usernames = ['piwi100', 'Zundorn', 'leprechess', 'bgdu33']
email = 'cassado01@gmail.com'
games = fetch_and_union_game_data(usernames=usernames, email=email)

In [None]:
import os
import pandas as pd
from pandas_gbq import to_gbq
from google.cloud import bigquery

# `datetime` builds the table suffix below. Import it in this cell like the
# cell's other dependencies instead of relying on an earlier cell's import.
from datetime import datetime

# Point the Google client libraries at the service-account key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../keyfile.json"
client = bigquery.Client()

# Target dataset as "<project>.<dataset>"; the project id is derived from it
# so it is not repeated as a second hard-coded literal.
dataset_id = 'chesscom-451104.staging'
project_id = dataset_id.split('.')[0]

# Each run writes to its own table, suffixed with the current local date,
# hour and minute (e.g. games_infos_20240131_0930).
date_suffix = datetime.now().strftime('%Y%m%d_%H%M')
table_id = f'{dataset_id}.games_infos_{date_suffix}'

if not games.empty:
    # Load the DataFrame with pandas_gbq. if_exists='replace' drops and
    # recreates the table, which only matters if this cell is re-run within
    # the same minute (same table name).
    to_gbq(games, table_id, project_id=project_id, if_exists='replace')
    print(f"Data loaded into BigQuery table: {table_id}")
else:
    print("The games DataFrame is empty. No data loaded into BigQuery.")

100%|██████████| 1/1 [00:00<?, ?it/s]

Data loaded into BigQuery successfully!



