## 📝 NB01 - Data Collection

In [18]:
# Importing necessary packages
import os
import json
import requests
import pandas as pd
import numpy as np
import subprocess
import time 

from serpapi import GoogleSearch

from dotenv import load_dotenv

from functions import *

from tqdm.notebook import tqdm
tqdm.pandas()

In [35]:
# Top ten chess players in the world as of December 16, 2024.
# Each entry is a dict holding the player's name (underscore-separated), FIDE
# number, Chess.com username, two-letter country code and country name.
# The order of the entries in the list does not matter.
top_players = [
    {'name': 'Magnus_Carlsen', 'fide_number': '1503014', 'chess_com_username': 'magnuscarlsen', 'country': 'NO', 'country_name': 'Norway'},
    {'name': 'Fabiano_Caruana', 'fide_number': '2020009', 'chess_com_username': 'fabianocaruana', 'country': 'US', 'country_name': 'United States of America'},
    {'name': 'Hikaru_Nakamura', 'fide_number': '2016192', 'chess_com_username': 'hikaru', 'country': 'US', 'country_name': 'United States of America'},
    {'name': 'Arjun_Erigaisi', 'fide_number': '35009192', 'chess_com_username': 'ghandeevam2003', 'country': 'IN', 'country_name': 'India'},
    {'name': 'Gukesh_Dommaraju', 'fide_number': '46616543', 'chess_com_username': 'gukeshdommaraju', 'country': 'IN', 'country_name': 'India'},
    {'name': 'Nodirbek_Abdusattorov', 'fide_number': '14204118', 'chess_com_username': 'chesswarrior7197', 'country': 'UZ', 'country_name': 'Uzbekistan'},
    {'name': 'Alireza_Firouzja', 'fide_number': '12573981', 'chess_com_username': 'firouzja2003', 'country': 'FR', 'country_name': 'France'},
    {'name': 'Ian_Nepomniachtchi', 'fide_number': '4168119', 'chess_com_username': 'lachesisq', 'country': 'RU', 'country_name': 'Russia'},
    {'name': 'Yi_Wei', 'fide_number': '8603405', 'chess_com_username': 'wei-yi', 'country': 'CN', 'country_name': 'China'},
    {'name': 'Viswanathan_Anand', 'fide_number': '5000017', 'chess_com_username': 'thevish', 'country': 'IN', 'country_name': 'India'},
]

### Step 1: Fetching FIDE Data

In [19]:
# Testing the fetch_fide_data function with Magnus Carlsen's fide number
# (fetch_fide_data is not defined in this notebook — presumably it is provided
# by `from functions import *`).
# NOTE(review): the ID is passed as an int here, while `top_players` stores
# FIDE numbers as strings — confirm the helper accepts both.
# NOTE(review): the recorded output of this cell shows empty strings for
# standard_elo / rapid_elo / blitz_elo — confirm the helper still extracts ratings.
fide_number = 1503014
fide_data = fetch_fide_data(fide_number)
print(fide_data)

{'name': 'Carlsen, Magnus ', 'federation': 'Norway', 'birth_year': 1990, 'sex': 'Male', 'title': 'Grandmaster', 'standard_elo': '', 'rapid_elo': '', 'blitz_elo': '', 'world_rank_all_players': 1, 'world_rank_active_players': 1, 'national_rank_all_players': 1, 'national_rank_active_players': 1, 'continental_rank_all_players': 1, 'continental_rank_active_players': 1}


In [20]:
# Collect the FIDE ID of every entry in top_players
top_10_fide_numbers = [entry['fide_number'] for entry in top_players]

# Call fetch_fide_data_with_history once per ID and gather everything it
# returns into a single flat list of records
all_data = []
for fide_number in top_10_fide_numbers:
    all_data += fetch_fide_data_with_history(fide_number)

# Build one pandas dataframe out of the gathered records
df_combined = pd.DataFrame(all_data)

# Show the dataframe as the cell output
df_combined


Unnamed: 0,fide_id,name,federation,world_rank_active_players,date,standard,rapid,blitz
0,1503014,"Carlsen, Magnus",Norway,1,2025-Feb,2833,2819,2883
1,1503014,"Carlsen, Magnus",Norway,1,2025-Jan,2831,2819,2889
2,1503014,"Carlsen, Magnus",Norway,1,2024-Dec,2831,2838,2890
3,1503014,"Carlsen, Magnus",Norway,1,2024-Nov,2831,2825,2893
4,1503014,"Carlsen, Magnus",Norway,1,2024-Oct,2831,2834,2888
...,...,...,...,...,...,...,...,...
1789,5000017,"Anand, Viswanathan",India,10,2001-Apr,2794,,
1790,5000017,"Anand, Viswanathan",India,10,2001-Jan,2790,,
1791,5000017,"Anand, Viswanathan",India,10,2000-Oct,2774,,
1792,5000017,"Anand, Viswanathan",India,10,2000-Jul,2762,,


In [58]:
# Normalises df_combined in place for easy foreign key integration in chess.db.
# Every step below is written so that re-running this cell on an already
# normalised frame leaves it unchanged instead of raising or producing NaN.

# Name: "Carlsen, Magnus" -> "Magnus Carlsen".
# A name without a comma has a single part and therefore stays as it is.
df_combined['name'] = df_combined['name'].apply(
    lambda full_name: ' '.join(part.strip() for part in full_name.split(',')[::-1])
)

# Date: "2000-Jan" -> "Jan 2000".
# Rows that already parse as "%b %Y" are skipped (this is what makes a second
# run safe); every remaining row must match "%Y-%b", otherwise to_datetime
# raises just like the original per-row conversion did.
already_formatted = pd.to_datetime(
    df_combined['date'], format='%b %Y', errors='coerce'
).notna()
pending = ~already_formatted
if pending.any():
    df_combined.loc[pending, 'date'] = (
        pd.to_datetime(df_combined.loc[pending, 'date'], format='%Y-%b')
        .dt.strftime('%b %Y')
    )

# Create a mapping dictionary from country names to country codes
country_mapping = {player['country_name']: player['country'] for player in top_players}

# Replace the values in the federation column using the mapping.
# Series.map yields NaN for every value that is not a key of the mapping, so
# fall back to the existing value: federations missing from top_players and
# codes produced by an earlier run are kept rather than silently nulled.
mapped_federation = df_combined['federation'].map(country_mapping)
df_combined['federation'] = mapped_federation.fillna(df_combined['federation'])

In [59]:
# Inspect the normalised frame. A bare last expression is rendered with the
# notebook's rich DataFrame display, as in the other display cells, instead
# of the plain-text dump produced by print().
df_combined

      fide_id               name federation  world_rank_active_players  \
0     1503014     Magnus Carlsen         NO                          1   
1     1503014     Magnus Carlsen         NO                          1   
2     1503014     Magnus Carlsen         NO                          1   
3     1503014     Magnus Carlsen         NO                          1   
4     1503014     Magnus Carlsen         NO                          1   
...       ...                ...        ...                        ...   
1789  5000017  Viswanathan Anand         IN                         10   
1790  5000017  Viswanathan Anand         IN                         10   
1791  5000017  Viswanathan Anand         IN                         10   
1792  5000017  Viswanathan Anand         IN                         10   
1793  5000017  Viswanathan Anand         IN                         10   

          date standard rapid blitz  
0     Feb 2025     2833  2819  2883  
1     Jan 2025     2831  2819  2889

In [60]:
# Rating columns that may contain blank strings instead of a value
rating_columns = ['standard', 'rapid', 'blitz']

# Turn empty / whitespace-only rating strings into NaN so that they count as missing
df_combined[rating_columns] = df_combined[rating_columns].replace(r'^\s*$', np.nan, regex=True)

# Rows with at least one missing rating. The original expression was evaluated
# and discarded (it was not the last line of the cell), so keep the result in a
# variable and report how many rows are affected.
rows_missing_rating = df_combined[df_combined[rating_columns].isnull().any(axis=1)]
print(f"{len(rows_missing_rating)} of {len(df_combined)} rows have at least one missing rating")

# Converting the dataframe into a CSV.
# The target folder is created first so the export does not fail on a fresh checkout.
# NOTE(review): the index is written as an unnamed first column (pandas default);
# left unchanged because readers of this file may rely on that layout.
os.makedirs("../data/Fide", exist_ok=True)
df_combined.to_csv("../data/Fide/fide_data.csv")

### Step 2: Google Trends Data via SERPAPI

In [None]:
# We will now retrieve the Google Trends data for the keyword search "Chess"
# within the country of each of the top ten players.

# Retrieves the country Google Trends data (GTrends_Country)
destination = "GTrends_Country"
keyword = "Chess"

# The fetch is switched off by default; set this flag to True to run the loop.
# (Previously the `for` line had to be un-commented by hand, and while it was
# commented out the indented loop body made the whole cell a syntax error.)
RUN_COUNTRY_TRENDS_FETCH = False

# NOTE(review): `player_countries` and `SERPAPI_KEY` are not assigned anywhere in
# the visible part of this notebook — presumably they come from
# `from functions import *`; confirm before enabling the flag.
if RUN_COUNTRY_TRENDS_FETCH:
    # The loop variable is named `country_code` (it was `countries`) so that it
    # cannot overwrite the `countries` name that a later cell iterates over.
    for country_code in player_countries:
        # Call the fetch_google_trends function
        fetch_google_trends(country_code, keyword, destination, SERPAPI_KEY)
    
## Get the Google Trends data for players

In [None]:
# Creating pandas dataframe and CSV for players

# Create the output folder before gtrends_players is called for anyone, so a
# missing directory cannot make the final to_csv fail after all the data has
# already been gathered.
os.makedirs("../data/GTrends_Player", exist_ok=True)

all_player_gtrends = []
for player in top_players:
    # Names are stored with underscores ("Magnus_Carlsen"); pass them with spaces
    name = player["name"].replace("_", " ")
    country = player["country"]
    # gtrends_players returns an iterable of records; append them all to the flat list
    all_player_gtrends.extend(gtrends_players(name, country))

final_player_gtrends_df = pd.DataFrame(all_player_gtrends)
final_player_gtrends_df.to_csv("../data/GTrends_Player/players_gtrends_data.csv")

In [None]:
# Creating pandas dataframe and CSV for countries

# Make sure the output folder exists before any data is gathered
os.makedirs("../data/GTrends_Country", exist_ok=True)

# Unique country codes of the top players, in first-seen order.
# The original loop iterated over `countries`, which is never assigned in the
# visible part of this notebook; if it was left over from the country-fetch
# loop it would be a single code string and the loop would walk its characters.
# NOTE(review): assumes gtrends_country expects the same two-letter codes that
# are stored under top_players[...]['country'] — confirm against functions.py.
country_codes = list(dict.fromkeys(player["country"] for player in top_players))

all_country_gtrends = []
for country in country_codes:
    all_country_gtrends.extend(gtrends_country(country))

final_country_gtrends_df = pd.DataFrame(all_country_gtrends)
final_country_gtrends_df.to_csv("../data/GTrends_Country/country_gtrends_data.csv")

### Step 3: Fetching Chess.com Data

In [3]:
# Chess.com public API: URL template of the "stats" endpoint. The "{}"
# placeholder is the slot for a player's username (fill it with str.format).
base_url = "https://api.chess.com/pub/player/{}/stats"

# Header dict carrying the User-Agent string for these requests
headers = {"User-Agent": "Python script for educational use"}

In [3]:
# Fetching stats for all players and storing them in a pandas dataframe.
# fetch_all_players_stats is not defined in this notebook — presumably it is
# provided by `from functions import *`; the recorded output shows it logging
# one "Fetching data for ..." line per entry of top_players.
df_players_stats = fetch_all_players_stats(top_players)

# Displaying the dataframe
df_players_stats

Fetching data for Magnus_Carlsen (magnuscarlsen)...
Fetching data for Fabiano_Caruana (fabianocaruana)...
Fetching data for Hikaru_Nakamura (hikaru)...
Fetching data for Arjun_Erigaisi (ghandeevam2003)...
Fetching data for Gukesh_Dommaraju (gukeshdommaraju)...
Fetching data for Nodirbek_Abdusattorov (chesswarrior7197)...
Fetching data for Alireza_Firouzja (firouzja2003)...
Fetching data for Ian_Nepomniachtchi (lachesisq)...
Fetching data for Yi_Wei (wei-yi)...
Fetching data for Viswanathan_Anand (thevish)...


Unnamed: 0,name,fide_number,chess_com_username,country,current_classic,current_blitz,current_rapid
0,Magnus_Carlsen,1503014,magnuscarlsen,NO,,3235.0,2906
1,Fabiano_Caruana,2020009,fabianocaruana,US,,3097.0,2823
2,Hikaru_Nakamura,2016192,hikaru,US,2239.0,3300.0,2769
3,Arjun_Erigaisi,35009192,ghandeevam2003,IN,1187.0,3084.0,2719
4,Gukesh_Dommaraju,46616543,gukeshdommaraju,IN,,3023.0,2729
5,Nodirbek_Abdusattorov,14204118,chesswarrior7197,UZ,1439.0,3139.0,2759
6,Alireza_Firouzja,12573981,firouzja2003,FR,,3201.0,2870
7,Ian_Nepomniachtchi,4168119,lachesisq,RU,1008.0,3122.0,2827
8,Yi_Wei,8603405,wei-yi,CN,,,1457
9,Viswanathan_Anand,5000017,thevish,IN,,2620.0,2731


In [None]:
# Converting the pandas dataframe into a CSV file.
# The target folder is created first so the export does not fail when
# ../data/Chess_com does not exist yet (e.g. on a fresh checkout).
os.makedirs("../data/Chess_com", exist_ok=True)
df_players_stats.to_csv("../data/Chess_com/chesscom.csv")