In [None]:
import pprint
import chessdotcom as cdc
import requests
import pandas as pd
from time import sleep
import sys
import warnings
warnings.filterwarnings("ignore")

# Objective: 

1) API web scraping - use the chess.com API to gather data on a user, tracking their past chess games and their opponents' ratings and locations

2) Data engineering - clean the data and ensure there are no duplicates, missing values or incorrect data types for data exploration and analysis

In [None]:
def get_data(username,months):
    """Download a player's most recent monthly archives and tabulate the PGN header tags.

    Parameters
    ----------
    username : str
        Player whose game archives are requested through ``cdc``.
    months : int
        Number of most recent monthly archives to read. If fewer archives
        exist, all of them are read; zero or a negative value reads none.

    Returns
    -------
    pandas.DataFrame
        One row per game that carries a PGN, one column per header tag.
        Columns are the first 21 distinct tag names encountered, in order of
        first appearance. A tag missing from a game is left empty (NaN) for
        that row.
    """
    archives = cdc.get_player_game_archives(username).json["archives"]
    #from exploration of the pgn text, we only want the first 21 header tags
    col_size = 21
    col_names = []
    rows = []

    #most recent month first; slicing clamps when fewer than `months` archives exist
    for url in archives[::-1][:max(months, 0)]:
        monthly = requests.get(url).json()

        for game in monthly["games"]:
            pgn = game.get("pgn")
            if not pgn:
                #no pgn text for this game, nothing to parse
                continue

            #header lines look like [Tag "value"]; read them by tag name so that
            #games with missing, extra or reordered tags stay aligned with the columns
            headers = {}
            for line in str(pgn).split("\n"):
                line = line.strip()
                if not line.startswith("["):
                    if headers:
                        #first non-tag line after the header section: stop before the moves
                        break
                    continue
                parts = line.split('"')
                if len(parts) < 3:
                    #malformed tag line without a quoted value
                    continue
                tag = parts[0].lstrip("[").strip()
                #we want the data between the first pair of ""
                headers.setdefault(tag, parts[1])

            if not headers:
                continue

            for tag in headers:
                if len(col_names) < col_size and tag not in col_names:
                    col_names.append(tag)
            rows.append(headers)

    dataset = pd.DataFrame(rows, columns=col_names)
    return dataset

In [None]:
def get_location(username):
    """Return the country name attached to a player's profile.

    The profile fetched through ``cdc`` exposes a URL under
    ``["player"]["country"]``; that URL is requested and the ``"name"`` field
    of its JSON payload is returned.
    """
    player = cdc.get_player_profile(username).json["player"]
    country = requests.get(player["country"]).json()
    return country["name"]

In [None]:
#we wish to examine the 24 most recent monthly archives (2 years) of chess data
months=24
username="VenDez"
games=get_data(username,months)
games.head()

In [None]:
#combining the UTC date and time strings into a single column
games["Datetime"] = games["UTCDate"] + " " + games["UTCTime"]

#simplifying the termination status to its last word; done column-wise because
#element-wise chained assignment (games["Termination"][i] = ...) is not guaranteed
#to write back to the frame
games["Termination"] = games["Termination"].str.split().str[-1]

games.head()

In [None]:
#listing every column currently held in the games frame
column_names = games.columns.tolist()
print(column_names)

In [None]:
#keeping only the columns used in the analysis; .copy() makes games an independent
#frame so that later column assignments do not write into a slice of the original
features=["White","Black","WhiteElo","BlackElo","Result","Termination","Datetime"]
games=games[features].copy()
games.head()

In [None]:
#casting each column to its proper dtype: timestamp, numeric ratings, categorical labels
games = games.assign(
    Datetime=pd.to_datetime(games["Datetime"]),
    WhiteElo=pd.to_numeric(games["WhiteElo"]),
    BlackElo=pd.to_numeric(games["BlackElo"]),
).astype({"Result": "category", "Termination": "category"})

In [None]:
#summary of the games frame: columns, non-null counts and dtypes
games.info()

In [None]:
#separating the games played as white from the games played as black.
#reset_index without inplace returns new frames with a fresh 0..n-1 index, so the
#location analysis can add a column without writing into a slice of games
white = games.loc[games["White"] == username].reset_index(drop=True)
black = games.loc[games["Black"] == username].reset_index(drop=True)

In [None]:
def location_analysis(data, player=None):
    """Add a ``Location`` column holding each opponent's country, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Games frame with ``White`` and ``Black`` columns; it is modified in place.
    player : str, optional
        The player whose opponents are looked up. Defaults to the
        notebook-level ``username``.

    Returns
    -------
    None
        The result is written to ``data["Location"]``.
    """
    if player is None:
        player = username

    total = data.shape[0]
    #opponents often repeat; remembering their location avoids duplicate API requests
    known = {}
    locations = []

    #iterating over the values (not index labels) works whatever the frame's index is,
    #and the opponent is decided per row rather than from the first row only
    for done, (white_name, black_name) in enumerate(zip(data["White"], data["Black"]), start=1):
        opponent = black_name if white_name == player else white_name

        if opponent not in known:
            known[opponent] = get_location(opponent)
            #pause only after a real request
            sleep(0.25)
        locations.append(known[opponent])

        #just to let me know how long is left
        sys.stdout.write('\r')
        sys.stdout.write("progress = " + str(round(done*100/total,2)) + "%")
        sys.stdout.flush()

    #single column assignment instead of element-wise chained assignment
    data["Location"] = locations
    return

In [None]:
#optional, as the API requests take a long time: adds the opponents' locations to the games played as white
location_analysis(white)

In [None]:
#adds the opponents' locations to the games played as black
location_analysis(black)

In [None]:
#tidying the white games: drop the name columns, relabel ratings from the player's
#point of view and express the result as W / D / L
white = (
    white
    .drop(columns=["White", "Black"])
    .rename(columns={'WhiteElo':'Rating','BlackElo':'Opponent_Rating'})
)
white["Result"] = white["Result"].cat.rename_categories({"1-0":"W", "1/2-1/2": "D","0-1":"L"})
white.head()

In [None]:
#newest games first, then write the white games to disk
white = white.sort_values(by=['Datetime'], ascending=False)
white.to_csv("white_games.csv", index=False)

In [None]:
#cleaning up dataset for black
black=black.drop(["White","Black"], axis=1)
black.rename(columns={'BlackElo':'Rating','WhiteElo':'Opponent_Rating'}, inplace = True)
black["Result"]=black["Result"].cat.rename_categories({"1-0":"L", "1/2-1/2": "D","0-1":"W"})
#put the player's own rating first. The selection was previously built and discarded,
#so it had no effect; it is now assigned back. Only columns that exist are selected,
#because Location is missing when the optional location analysis is skipped
column_order=["Rating", "Opponent_Rating","Result","Termination","Datetime","Location"]
black=black[[name for name in column_order if name in black.columns]]
black.head()

In [None]:
#newest games first, then write the black games to disk
black = black.sort_values(by=['Datetime'], ascending=False)
black.to_csv("black_games.csv", index=False)