## 📝 NB01 - Data Collection

In [None]:
# Importing necessary packages
import os
import json
import requests
import pandas as pd
import numpy as np
import subprocess
import time 

from serpapi import GoogleSearch

from dotenv import load_dotenv

from functions import *

from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
# Top chess players in the world as of December 16, 2024.
# top_players is a list with one dict per player; the order of the entries does not matter.
# Every row below supplies, in order: name, FIDE number, Chess.com username,
# two-letter country code, and full country name.
top_players = [
    dict(zip(('name', 'fide_number', 'chess_com_username', 'country', 'country_name'), row))
    for row in (
        ('Magnus_Carlsen', '1503014', 'magnuscarlsen', 'NO', 'Norway'),
        ('Fabiano_Caruana', '2020009', 'fabianocaruana', 'US', 'United States of America'),
        ('Hikaru_Nakamura', '2016192', 'hikaru', 'US', 'United States of America'),
        ('Arjun_Erigaisi', '35009192', 'ghandeevam2003', 'IN', 'India'),
        ('Gukesh_Dommaraju', '46616543', 'gukeshdommaraju', 'IN', 'India'),
        ('Nodirbek_Abdusattorov', '14204118', 'chesswarrior7197', 'UZ', 'Uzbekistan'),
        ('Alireza_Firouzja', '12573981', 'firouzja2003', 'FR', 'France'),
        ('Ian_Nepomniachtchi', '4168119', 'lachesisq', 'RU', 'Russia'),
        ('Yi_Wei', '8603405', 'wei-yi', 'CN', 'China'),
        ('Viswanathan_Anand', '5000017', 'thevish', 'IN', 'India'),
    )
]

### Step 1: Fetching FIDE Data

To understand how these dataframes are generated, see the functions.py file inside ../notebooks.

Below is an example that pulls the FIDE information for Magnus Carlsen.

In [None]:
# Testing the fetch_fide_data function with Magnus Carlsen's FIDE number.
# fetch_fide_data is not defined in this notebook; presumably it comes from `from functions import *`.
# NOTE(review): the ID is an int here but a str in top_players — confirm the helper accepts both.
fide_number = 1503014
fide_data = fetch_fide_data(fide_number)
print(fide_data)

Next, we will loop over the top_players list to retrieve every player's FIDE data.

Additionally, this code block will combine all of the players' data into a single dataframe.

In [None]:
# Pull the FIDE ID out of every entry in top_players
top_10_fide_numbers = [entry['fide_number'] for entry in top_players]

# Query the IDs one by one, pooling whatever the helper returns into a single flat list
all_data = []
for fide_number in top_10_fide_numbers:
    player_data = fetch_fide_data_with_history(fide_number)
    all_data += player_data

# Build one pandas dataframe out of the pooled records
df_combined = pd.DataFrame(all_data)


In this section we will manipulate how the data is stored in the dataframe. 

This is done to keep the data organized and to make it convenient to load into our database.

In [None]:
def _given_name_first(raw_name):
    """Reverse the comma-separated parts of a name and join them with spaces,
    e.g. "Carlsen, Magnus" -> "Magnus Carlsen"."""
    pieces = raw_name.split(',')
    pieces.reverse()
    return ' '.join(piece.strip() for piece in pieces)


def _month_before_year(raw_date):
    """Re-render a "2000-Jan" style label as "Jan 2000"."""
    parsed = pd.to_datetime(raw_date, format='%Y-%b')
    return parsed.strftime('%b %Y')


# Reorder the name column for easy foreign key integration in chess.db
df_combined['name'] = df_combined['name'].apply(_given_name_first)

# Store dates as "Jan 2000" instead of "2000-Jan"
df_combined['date'] = df_combined['date'].apply(_month_before_year)

# Look-up table built from top_players: full country name -> two-letter country code
country_mapping = dict((entry['country_name'], entry['country']) for entry in top_players)

# Swap every federation value for its code through the look-up table
df_combined['federation'] = df_combined['federation'].map(country_mapping)

Finally, we insert "NaN" for missing values and then push the dataframe into a csv

In [None]:
# The three rating columns that get cleaned and inspected in this cell
rating_columns = ['standard', 'rapid', 'blitz']

# Turn empty / whitespace-only cells in the rating columns into NaN so they count as missing
df_combined[rating_columns] = df_combined[rating_columns].replace(r'^\s*$', np.nan, regex=True)

# Converting the dataframe into a CSV (row index left out)
df_combined.to_csv("../data/fide_data.csv", index=False)

# Show the rows with at least one missing rating.
# This has to be the last expression of the cell: Jupyter only renders a bare
# expression when it is the final statement; placed mid-cell it is silently discarded.
df_combined[df_combined[rating_columns].isnull().any(axis=1)]

### Step 2: Google Trends Data via SERPAPI

We will now retrieve the Google Trends data for the keyword search "Chess" inside each of the top ten players' countries. 

The code below collects the country Google Trends data by iterating through the top_players list above and pulling each country code. This is necessary because the API request must specify the region to search in. 

In [None]:
# Retrieves the Country Google Trends data (GTrends_Country)
destination = "GTrends_Country"
keyword = "Chess"

# Several players share a country (US and IN appear more than once in top_players).
# Collapse to the distinct country codes, keeping first-seen order, so that each
# country is requested exactly once instead of once per player.
country_codes = list(dict.fromkeys(player["country"] for player in top_players))

# NOTE(review): SERPAPI_KEY is not assigned in any cell of this notebook; presumably it is
# supplied by `from functions import *` — confirm before running on a fresh kernel.
for country_code in country_codes:
    # Call the fetch_google_trends function for this country
    fetch_google_trends(country_code, keyword, destination, SERPAPI_KEY)

Next we use the gtrends_country function to iterate through all countries, collecting all the data into a dataframe, then saving it to a CSV.

In [None]:
# Creating pandas dataframe and CSV for countries

# Distinct country codes found in top_players, in first-seen order. Defined here so the
# cell does not depend on a `countries` variable that no cell of this notebook assigns.
# NOTE(review): assumes gtrends_country expects the same two-letter codes that were passed
# to fetch_google_trends — confirm against functions.py.
countries = list(dict.fromkeys(player["country"] for player in top_players))

# Pool the records returned for every country into one flat list
all_country_gtrends = []
for country in countries:
    all_country_gtrends.extend(gtrends_country(country))

final_country_gtrends_df = pd.DataFrame(all_country_gtrends)

# The row index is written to the file as well (no index=False), unlike the FIDE and
# Chess.com exports. NOTE(review): left unchanged in case downstream notebooks rely on it.
final_country_gtrends_df.to_csv("../data/GTrends_Country/country_gtrends_data.csv")

Now we perform the same action to collect the player Google Trends data. In this case we also modify the player name so that the search uses " " in place of "_". This is necessary since people generally don't use "_" for spaces in their search queries.

At the end of the block, we combine all of the data requests into a dataframe then convert the dataframe to a csv file.

In [12]:
# Gather the Google Trends records of every player into one flat list
all_player_gtrends = []
for player in top_players:
    # Search with spaces rather than underscores ("Magnus Carlsen", not "Magnus_Carlsen")
    search_name = player["name"].replace("_", " ")
    all_player_gtrends += gtrends_players(search_name, player["country"])

# One dataframe for all players, then saved as CSV (row index included)
final_player_gtrends_df = pd.DataFrame(all_player_gtrends)
final_player_gtrends_df.to_csv("../data/GTrends_Player/players_gtrends_data.csv")

### Step 3: Fetching Chess.com Data

Now we will request data from the Chess.com API.

In the block below we are defining the base URL for the API, using the endpoint "stats"


In [None]:
# Chess.com public API, "stats" endpoint; the "{}" placeholder sits where the player's username goes
base_url = "https://api.chess.com/pub/player/{}/stats"

# Request headers carrying a User-Agent string that identifies this script
headers = {"User-Agent": "Python script for educational use"}

See functions.py to understand fetch_all_players_stats. 

In essence, it takes the top_players list as input and outputs a combined dataframe of all ten players' stats.

In [None]:
# Fetching stats for all players in top_players and storing the result in df_players_stats
# (fetch_all_players_stats is not defined in this notebook; presumably it comes from functions.py)
df_players_stats = fetch_all_players_stats(top_players)

Finally, we move the dataframe into a csv.

In [None]:
# Writing the stats dataframe to ../data/chesscom.csv; index=False leaves the row index out of the file
df_players_stats.to_csv("../data/chesscom.csv", index=False)

Click [here](../README.md#order-of-notebooks) to navigate back to the Order of Notebooks table!