# 01b. RG Tournament Page Scraper

Notebook will contain codes and functions for scraping the ATP tournament page for a specific year.
Scraping functions will return datasets containing tournament and match information, as well as the URL for each match's match centre page. 

## 1. Imports and Setup

In [1]:
# Standard math libraries
import numpy as np
import scipy as sp
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Web-scraping utitilies
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'} 
import re
import json

import sys
from time import sleep

## 2. Scrape Calendar Page Data for Tournament Page URLs

In [187]:
# Selected tournament year
year = 2019
# Create a list of date strings for all dates from the above year's Roland Garros edition
#dates = pd.date_range("2022-05-16", "2022-06-05", freq='d')
#dates = pd.date_range("2021-05-24", "2021-06-13", freq='d')
#dates = pd.date_range("2020-09-27", "2020-10-11", freq='d')
dates = pd.date_range("2019-05-26", "2019-06-09", freq='d')
dates = [str(d).split(" ")[0] for d in dates]

# Initialise a list to contain all the results data
df_results_list = []

for date in dates:

    link = f'https://www.rolandgarros.com/api/en-us/order-of-play/{date}/{year}'
    # Get request and content from the given link and parse into HTML
    pageTree = requests.get(link, headers=headers)
    #pageSoup = BeautifulSoup(pageTree.content, 'html.parser') 

    results_json = json.loads(pageTree.content)
    # Get all results info from the available court types at Roland Garros
    df_courts = [ pd.DataFrame(court['matchSchedulers'][0]['matches']) for court in results_json['principalCourts'] + results_json['annexeCourts'] ]
    if df_courts == []:
        continue
    else:
        df_results_d = pd.concat(df_courts)

        df_results_list.append(df_results_d)

    sleeptime = np.random.uniform(1, 5)
    sleep(sleeptime)

In [176]:
df_results.to_csv("../data/RG_results-all_raw_2020.csv", index=False)

### Creating a Dataframe/CSV of all player's information

In [177]:
players_teamA = pd.concat(list(df_results.teamA.apply(lambda x : pd.DataFrame(x['players'])))).drop_duplicates(subset=['id'])
players_teamB = pd.concat(list(df_results.teamB.apply(lambda x : pd.DataFrame(x['players'])))).drop_duplicates(subset=['id'])

In [179]:
df_players = pd.concat([players_teamA, players_teamB]).drop_duplicates(subset=['id']).sort_values("firstName").reset_index(drop=True)

In [181]:
df_players.to_csv(f"../data/RG_players_{year}.csv", index=False)

### Raw Data Processing

Add columns for the match round name (e.g. Final), player/team names and respective uuids.

In [182]:
round_names = df_results.matchData.apply(lambda x : x['roundLabel'])
df_results.insert(3, "round", round_names) 

player1_name = df_results.teamA.apply(lambda x : ", ".join([p['firstName']+ " "+ p['lastName'].title() for p in x['players']]))
player2_name = df_results.teamB.apply(lambda x : ", ".join([p['firstName']+ " "+ p['lastName'].title() for p in x['players']]))
df_results.insert(6, "player2_name", player2_name) 
df_results.insert(5, "player1_name", player1_name) 

teamA_player_uuid = df_results.teamA.apply(lambda x : [p['id'] for p in x['players']])
teamB_player_uuid = df_results.teamB.apply(lambda x : [p['id'] for p in x['players']])
df_results.insert(8, "team2_player_uuid", teamB_player_uuid) 
df_results.insert(6, "team1_player_uuid", teamA_player_uuid) 

In [183]:
df_results = df_results.rename(columns={"teamA": "team1", "teamB": "team2"})

In [184]:
df_results.tail(2)

Unnamed: 0,id,url,matchData,round,team1,player1_name,team1_player_uuid,team2,player2_name,team2_player_uuid,umpire,showUmpire
591,DD001,/en-us/matches/2020/DD001,"{'type': 'DD', 'typeLabel': 'Women’s Doubles',...",Final,"{'players': [{'id': 30554, 'firstName': 'Alexa...","Alexa Guarachi, Desirae Krawczyk","[30554, 25892]","{'players': [{'id': 21766, 'firstName': 'Timea...","Timea Babos, Kristina Mladenovic","[21766, 19921]","{'firstName': None, 'lastName': None, 'shortNa...",False
592,SM001,/en-us/matches/2020/SM001,"{'type': 'SM', 'typeLabel': 'Men’s Singles', '...",Final,"{'players': [{'id': 9801, 'firstName': 'Novak'...",Novak Djokovic,[9801],"{'players': [{'id': 7792, 'firstName': 'Rafael...",Rafael Nadal,[7792],"{'firstName': None, 'lastName': None, 'shortNa...",False


In [186]:
df_results.to_csv(f"../data/RG_results-all_processed_{year}.csv", index=False)