In [1]:
import json as json
import numpy as np
import pandas as pd
import urllib
import requests
import re
import datetime
import os
import glob

Data Collection from Chess.com API

This workbook:

1. Converts user JSON file to df
2. Retrieves raw game data from user supplied min/max user-indices. Store game data in raw_game_data subfolder
3. Applies a filter to the games_df to keep only live chess games. Do not include dailies/Chess960/bughouse, etc.
3. Extracts pgn game data from the pgn Series in the new games_df and clean using string operations. Store pgn data in raw_pgn_data subfolder
4. Analysis is performed in Chess_Game_Analysis.ipynb

To-do:

1. Schedule/Automate API data collection process locally using Task Scheduler or use a cloud service

In [2]:
#Manually downloaded CanadaUsers.json beforehand from Chess.com
def json_to_df(filename):
    # Opening JSON file 
    f = open(filename)
    
    #json.load returns a dictionary shaped object
    data = json.load(f)
    return pd.DataFrame(data)

In [17]:
def get_game_data(user_index_min, user_index_max, players_df):
    
    #Loops through usernames using provided user index min/max
    #Uses the Chess.com API to retrieve the most recent month of game data for that user
    #Puts that data into a dataframe, saves as a csv file
    
    all_games = pd.DataFrame()
    headers = {
    'User-Agent': 'cnothing01',
    'From': 'cnorth01@gmail.com'
}

    #Select user range
    for i in range(user_index_min, user_index_max):
        user = players_df.iloc[i]['players']
        
        #Current month and year to start
        #Will this initiation work if we are in a month with only 1 digit?
        month = str(datetime.datetime.today()).split()[0][5:7].zfill(2)
        year = str(datetime.datetime.today()).split()[0][0:4]

        #start with empty games dictionary for while loop
        games_dict = {'games': []}

        #Get Game Data
        #If no games found this month, check older records back to January 2020, else skip 
        while len(games_dict['games']) == 0:
            #print(f"checking month {month} for user {user}")
            url = f"https://api.chess.com/pub/player/{user}/games/{year}/{month}"
            req = requests.get(url=url, headers=headers)
            games_dict = req.json()
            df = pd.DataFrame([games_dict])

            #print(len(games_dict['games']),' game(s) found')
            month = int(month)
            
            try:
                if len(games_dict['games']) > 0:
                    all_games = all_games.append(df.iloc[0][0], ignore_index=True)
                    #print(f"games for {user} recorded")
            except:
                break
                
            if month > 1:
                month -= 1
                month = str(month).zfill(2)
                
            else:
                #print(f"All months searched for {user} and no game data was retrieved")
                break
    
    #Re-initialize month/year for file saving
    month = str(datetime.datetime.today()).split()[0][5:7]
    year = str(datetime.datetime.today()).split()[0][0:4]
    
    #Convert to df and save csv
    all_games = pd.DataFrame(all_games)  
    all_games_len = len(all_games)        
    all_games.to_csv(f'raw_game_data/CA_game_data_{month}_{year}_{user_index_min}-{user_index_max}.csv', index = False)
    print(f"{all_games_len} Games retrieved from Chess.com and saved as csv")
    return pd.DataFrame(all_games)


In [19]:
def live_chess_only(games_df):

#Filters to keep only live, standard rules chess matches
#Must be run before get_pgn_data to avoid errors
    games_df.reset_index(drop=True, inplace=True)
    live_game_filter = games_df['url'].str.contains("live") 
    rules_filter = games_df['rules'].str.contains("chess")
    games_df = games_df[live_game_filter][rules_filter]

    games_df.reset_index(drop=True, inplace=True)
    print("Removed non-live chess matches")
    return games_df

In [20]:
def get_pgn(user_min, user_max, games_df):
    
"""
Each game contains a PGN, which is a single string.
In each PGN, there are 22-25 items if split by \n. 
The last entry is always movetext
There is also a row with an empty string
Every other index is a pair of strings with the first string in the pair being the name of the data in the second string
"""

    print("Extracting pgn data...")    
    pgn_df = pd.DataFrame()
    month = str(datetime.datetime.today()).split()[0][5:7]
    year = str(datetime.datetime.today()).split()[0][0:4]

    for i in range(0, len(games_df)):
        
        #Transform pgn to a list and split by \n
        pgn_dict = {}
        
        try:
            game_pgn = games_df.iloc[i]['pgn'].split('\n')
        except AttributeError:
            print(f"Unable to extract pgn data from index {i} of games_df, this index was skipped")
            pass
        
        #Split & Strip PGN list, transform to dictionary and return it as a dataframe
        for count, entry in enumerate(game_pgn):
            if count == len(game_pgn)-1:
                pgn_dict["MoveText"] = entry.strip()
                pgn_df = pgn_df.append(pd.DataFrame([pgn_dict]), ignore_index=True)
                pass  
            
            try:
                pgn_dict[entry.strip('[]').split(" ", 1)[0]] = entry.strip('[]').split(" ", 1)[1][1:-1] #string slice to remove quotes
            except IndexError:
                pass
            
    pgn_df.to_csv(f'raw_pgn_data/pgn_data_{month}_{year}_{user_min}-{user_max}.csv', index=False)
    print("PGN data extracted and saved to csv")

    return pgn_df


In [21]:
def main(user_min, user_max, user_json='CanadaUsers.json'):
    canada_users = json_to_df(user_json)
    games_df = get_game_data(user_min, user_max, canada_users)
    games_df = live_chess_only(games_df)
    pgn_df = get_pgn(user_min, user_max, games_df)
    return pgn_df

In [26]:
%%time
#Takes between 6-11 minutes for 200 users
pgn_df = main(5200, 5400)

10577 Games retrieved from Chess.com and saved as csv
Removed non-live chess matches
Extracting pgn data...
PGN data extracted and saved to csv
Wall time: 6min 16s


In [27]:
pgn_df

Unnamed: 0,Event,Site,Date,Round,White,Black,Result,CurrentPosition,Timezone,ECO,...,StartTime,EndDate,EndTime,Link,MoveText,SetUp,FEN,Variant,Tournament,Match
0,Live Chess,Chess.com,2020.12.04,-,Gurjas_Gs0511,amphidryon,1-0,8/8/p7/Pp3p1p/1P2pPk1/4K1P1/8/8 b - f3,UTC,A10,...,08:17:26,2020.12.04,08:36:36,https://www.chess.com/live/game/5883366316,1. c4 {[%clk 0:09:59.5]} 1... b6 {[%clk 0:09:5...,,,,,
1,Live Chess,Chess.com,2020.12.04,-,amphidryon,Ayedramadeen,1-0,7k/6p1/6Q1/pp1BP3/2p5/3P4/NPP5/2K4R b - -,UTC,A00,...,08:37:02,2020.12.04,08:49:51,https://www.chess.com/live/game/5883448066,1. g4 {[%clk 0:09:59.9]} 1... e5 {[%clk 0:09:5...,,,,,
2,Live Chess,Chess.com,2020.12.04,-,Ayedramadeen,amphidryon,1-0,6k1/4Qp2/4p1pp/8/2P1qN1P/5PP1/6K1/r7 b - -,UTC,B00,...,08:50:23,2020.12.04,09:09:20,https://www.chess.com/live/game/5883503945,1. e4 {[%clk 0:09:59.9]} 1... b6 {[%clk 0:09:5...,,,,,
3,Live Chess,Chess.com,2020.12.04,-,amphidryon,Sportsfan5414,0-1,8/8/1p2k3/4P1P1/1P3K2/8/7p/3b4 b - -,UTC,A00,...,09:09:20,2020.12.04,09:26:03,https://www.chess.com/live/game/5883587832,1. g4 {[%clk 0:09:59.9]} 1... d5 {[%clk 0:09:5...,,,,,
4,Live Chess,Chess.com,2020.12.04,-,namakdoon,amphidryon,0-1,8/8/8/8/4p3/1p4k1/8/r5K1 w - -,UTC,B00,...,09:26:31,2020.12.04,09:43:34,https://www.chess.com/live/game/5883661498,1. e4 {[%clk 0:09:48.7]} 1... b6 {[%clk 0:09:5...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378,Live Chess,Chess.com,2020.12.10,-,ergl1771,andreslara5439,1-0,6k1/5r1p/3p2p1/p1pBp3/P1P1P3/1n6/3K4/5R2 w - -,UTC,B20,...,22:23:54,2020.12.10,22:53:38,https://www.chess.com/live/game/5936103604,1. e4 {[%clk 0:29:42.3]} 1... c5 {[%clk 0:29:5...,,,,,
10379,Live Chess,Chess.com,2020.12.10,-,andreslara5439,shjain,1-0,r3k3/1p2rpRN/p7/4p3/P3Q3/8/1P5P/7K b - -,UTC,B86,...,22:58:57,2020.12.10,23:38:31,https://www.chess.com/live/game/5936322929,1. e4 {[%clk 0:30:00]} 1... c5 {[%clk 0:29:55....,,,,,
10380,Live Chess,Chess.com,2020.12.11,-,Hlerma,andreslara5439,0-1,1q6/5Qn1/6Pk/4Pp2/p2p4/7K/P3b3/8 w - -,UTC,B30,...,02:55:05,2020.12.11,03:55:04,https://www.chess.com/live/game/5937461905,1. e4 {[%clk 0:29:56.3]} 1... c5 {[%clk 0:29:5...,,,,,
10381,Live Chess,Chess.com,2020.12.12,-,andreslara5439,Johnblewis,1-0,8/1pp4P/k2p4/1pn5/8/2P1Q3/PP4K1/8 b - -,UTC,C50,...,22:26:44,2020.12.12,22:46:10,https://www.chess.com/live/game/5951877391,1. e4 {[%clk 0:30:00]} 1... e5 {[%clk 0:29:57....,,,,,
