# 01. ATP Tournament Page Scraper

This notebook contains code and functions for scraping the ATP tournament pages for a specific year.
The scraping functions return datasets containing tournament and match information, as well as the URL of each match's match centre page.

## 1. Imports and Setup

In [2]:
# Standard math libraries
import numpy as np
import scipy as sp
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Web-scraping utilities
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'} 
import re
import json

import sys
from time import sleep

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft  import EdgeChromiumDriverManager
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options
from selenium.common.exceptions import NoSuchElementException

In [28]:
# Selenium options shared by every Edge session started in this notebook
options = Options()
# The browser runs with a visible window, which is the Options default, so no
# headless flag is set here. The `headless` attribute is deprecated in recent
# Selenium 4 releases; to run headless, add the "--headless" argument instead.
# Drop the "enable-automation" switch and the automation extension
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# Initial window size (the scraping functions maximise the window after loading)
options.add_argument("--window-size=1200,800")

## 2. Scrape Calendar Page Data for Tournament Page URLs

In [4]:
# Start the shared Edge session used by the exploratory cells further down;
# the driver binary location is resolved by webdriver_manager.
service = EdgeService(
    executable_path=EdgeChromiumDriverManager().install(),
)
driver = webdriver.Edge(options=options, service=service)

In [5]:
def scrape_ATP_calendar(year):
    """
    Scrape the ATP results-archive page for one calendar year/season.

    A fresh Edge session is started with the module-level ``options``, the
    archive page for ``year`` is loaded, and one record is read from every
    row of the results table. The browser is always closed, even when a row
    fails to parse.

    Parameters
    ----------
    year : int or str
        Season to scrape. Interpolated into the archive URL and stored
        unchanged in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per tournament with the columns ``Year``, ``Tournament``,
        ``Category``, ``Location``, ``Date_Start``, ``Draw``, ``Surface``,
        ``Finance``, ``Winner`` and ``URL``. ``URL`` is an empty string when
        the last cell of the row does not read 'RESULTS'.

    Raises
    ------
    ValueError
        If the third cell of a row does not split into exactly three lines
        (name, location, start date).
    selenium.common.exceptions.NoSuchElementException
        If a row has no ``img`` element for the category stamp.
    """
    # File name of the category stamp image -> readable tournament category
    stamp_categories = {
        'categorystamps_grandslam.png': "Grand Slam",
        'categorystamps_1000.png': "ATP Masters 1000",
        'categorystamps_500.png': "ATP 500",
        'categorystamps_250.png': "ATP 250",
    }
    columns = ["Year", "Tournament", "Category", "Location", "Date_Start",
               "Draw", "Surface", "Finance", "Winner", "URL"]

    # ATP Tournament Archive Page URL
    archive_url = f"https://www.atptour.com/en/scores/results-archive?year={year}"

    # Start a fresh browser session for this scrape
    service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
    driver = webdriver.Edge(service=service, options=options)

    rows = []
    try:
        driver.get(archive_url)

        # Maximise the browser window
        driver.maximize_window()

        # One <tr> per tournament
        for row in driver.find_elements(By.CSS_SELECTOR, "#scoresResultsArchive > table > tbody > tr"):
            cells = row.find_elements(By.TAG_NAME, "td")

            # The tournament level is only exposed through the stamp image's file name
            stamp = row.find_element(By.TAG_NAME, "img").get_attribute("src").split('/')[-1]
            category = stamp_categories.get(stamp, "Other")

            # Tournament name, location and start date share one cell, one per line
            name, location, date = cells[2].text.split('\n')

            # Presence of the 'RESULTS' text indicates a results page is available
            results_cell = cells[7]
            if results_cell.text == 'RESULTS':
                results_url = results_cell.find_element(By.TAG_NAME, "a").get_attribute("href")
            else:
                results_url = ""

            rows.append({
                "Year": year,
                "Tournament": name,
                "Category": category,
                "Location": location,
                "Date_Start": date,
                # Draw size for singles and doubles
                "Draw": cells[3].text,
                "Surface": cells[4].text,
                # Total financial commitment
                "Finance": cells[5].text,
                # Winners for all categories, flattened onto one line
                "Winner": cells[6].text.replace('\n', ', '),
                "URL": results_url,
            })
    finally:
        # Always release the browser, including when a row fails to parse
        driver.quit()

    # `columns` fixes the column order and keeps the headers when no rows were found
    return pd.DataFrame(rows, columns=columns)

In [58]:
# Scrape the 2023 season calendar into a DataFrame of tournaments
df_tourns = scrape_ATP_calendar(2023)

In [59]:
df_tourns

Unnamed: 0,Year,Tournament,Category,Location,Date_Start,Draw,Surface,Finance,Winner,URL
0,2023,United Cup,Other,"Brisbane-Perth-Sydney, Australia",2022.12.29,SGL 18 DBL 18,Indoor Hard,"$15,000,000",Team: United States,https://www.atptour.com/en/scores/archive/bris...
1,2023,Adelaide International 1,ATP 250,"Adelaide, Australia",2023.01.01,SGL 32 DBL 24,Outdoor Hard,"$642,735","SGL: Novak Djokovic, DBL: Lloyd Glasspool Harr...",https://www.atptour.com/en/scores/archive/adel...
2,2023,Tata Open Maharashtra,ATP 250,"Pune, India",2023.01.02,SGL 28 DBL 16,Outdoor Hard,"$713,495","SGL: Tallon Griekspoor, DBL: Sander Gille Jora...",https://www.atptour.com/en/scores/archive/pune...
3,2023,ASB Classic,ATP 250,"Auckland, New Zealand",2023.01.09,SGL 28 DBL 16,Outdoor Hard,"$713,495",,https://www.atptour.com/en/scores/current/auck...
4,2023,Adelaide International 2,ATP 250,"Adelaide, Australia",2023.01.09,SGL 28 DBL 24,Outdoor Hard,"$642,735",,https://www.atptour.com/en/scores/current/adel...
...,...,...,...,...,...,...,...,...,...,...
62,2023,Rolex Paris Masters,ATP Masters 1000,"Paris, France",2023.10.30,SGL 56 DBL 24,Indoor Hard,"€6,748,815",,
63,2023,Stockholm Open,ATP 250,"Stockholm, Sweden",2023.11.05,SGL 28 DBL 16,Indoor Hard,"€750,950",,
64,2023,Moselle Open,ATP 250,"Metz, France",2023.11.05,SGL 28 DBL 16,Indoor Hard,"€630,705",,
65,2023,Next Gen ATP Finals,Other,TBD,2023.11.12,SGL 8 DBL 0,Indoor Hard,,,


In [60]:
# Save the scraped calendar to CSV in the working directory, without the index column
df_tourns.to_csv("tournaments_ATP_2023.csv", index=False)

## 3. Scrape Tournament Page Data for Match URLs

In [5]:
# Example tournament results page (Indian Wells, 2022) used for interactive exploration
url_tourn = "https://www.atptour.com/en/scores/archive/indian-wells/404/2022/results"

In [6]:
# Load the example results page in the module-level browser session
driver.get(url_tourn)

In [7]:
# Maximise the browser window
driver.maximize_window()

In [6]:
def _format_score(raw_score):
    """
    Wrap the tie-break points of each set score in parentheses.

    Set scores are space-separated tokens. An all-digit token longer than
    two characters is read as a two-character set score followed by the
    tie-break points, e.g. ``"763"`` -> ``"76(3)"`` and ``"7610"`` ->
    ``"76(10)"``. Every other token is passed through unchanged, so
    non-numeric markers in the score text are not garbled.
    """
    return " ".join(
        f"{token[:2]}({token[2:]})" if len(token) > 2 and token.isdigit() else token
        for token in raw_score.split(' ')
    )


def scrape_ATP_tournament(url, tournament, year):
    """
    Scrape the match results listed on one ATP tournament results page.

    A fresh Edge session is started with the module-level ``options`` and
    ``url`` is loaded. Every ``.day-table > thead`` supplies a round name and
    the ``tbody`` at the same position supplies that round's matches, one per
    ``tr``. The browser is always closed, even when a match row fails to parse.

    Parameters
    ----------
    url : str
        Address of the tournament results page.
    tournament : str
        Tournament name, stored unchanged in the ``Tournament`` column.
    year : int or str
        Season, stored unchanged in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Year``, ``Tournament``,
        ``Round``, ``Player1``, ``Player1_Seed``, ``Player1_Nation``,
        ``Player2``, ``Player2_Seed``, ``Player2_Nation``, ``Score`` and
        ``URL``. Seeds have their parentheses stripped; a missing flag image
        or match link yields an empty string. When the page contains no
        rounds, an empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If a match row does not contain exactly two seed, flag or name cells.
    selenium.common.exceptions.NoSuchElementException
        If a match row has no score cell.
    """
    columns = ["Year", "Tournament", "Round", "Player1", "Player1_Seed", "Player1_Nation",
               "Player2", "Player2_Seed", "Player2_Nation", "Score", "URL"]

    # Start a fresh browser session for this scrape
    service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
    driver = webdriver.Edge(service=service, options=options)

    rows = []
    try:
        driver.get(url)

        # Maximise the browser window
        driver.maximize_window()

        # Round headers (i.e. final, semi-final, etc.) and the table bodies
        # holding their matches; both are looked up once and paired by position
        round_names = [head.text for head in driver.find_elements(By.CSS_SELECTOR, ".day-table > thead")]
        round_bodies = driver.find_elements(By.CSS_SELECTOR, ".day-table > tbody")

        for round_name, round_body in zip(round_names, round_bodies):
            for match in round_body.find_elements(By.XPATH, "tr"):
                # Player seedings, shown on the page wrapped in parentheses
                seed_1, seed_2 = [
                    seed_cell.text.replace('(', '').replace(')', '')
                    for seed_cell in match.find_elements(By.XPATH, "td[@class='day-table-seed']")
                ]

                # Player flags (nationalities); a flag cell may have no image
                flags = []
                for flag_cell in match.find_elements(By.XPATH, "td[@class='day-table-flag']"):
                    try:
                        flags.append(flag_cell.find_element(By.TAG_NAME, "img").get_attribute("alt"))
                    except NoSuchElementException:
                        flags.append("")
                flag_1, flag_2 = flags

                # Player names
                player_1, player_2 = [
                    name_cell.text
                    for name_cell in match.find_elements(By.XPATH, "td[@class='day-table-name']")
                ]

                # Score text, plus the match page link when the score cell has one
                score_cell = match.find_element(By.XPATH, "td[@class='day-table-score']")
                score = _format_score(score_cell.text)
                try:
                    match_url = score_cell.find_element(By.TAG_NAME, "a").get_attribute("href")
                except NoSuchElementException:
                    match_url = ""

                rows.append({
                    "Year": year,
                    "Tournament": tournament,
                    "Round": round_name,
                    "Player1": player_1,
                    "Player1_Seed": seed_1,
                    "Player1_Nation": flag_1,
                    "Player2": player_2,
                    "Player2_Seed": seed_2,
                    "Player2_Nation": flag_2,
                    "Score": score,
                    "URL": match_url,
                })
    finally:
        # Always release the browser, including when a match row fails to parse
        driver.quit()

    # `columns` fixes the column order and keeps the headers when no matches were found
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Load the previously saved 2022 tournament calendar
# NOTE(review): this reads from "data/", while the calendar cell earlier in the
# notebook writes its CSV to the working directory — confirm the file location
df_tourns = pd.read_csv("data/tournaments_ATP_2022.csv")

In [46]:
# Scrape ATP Masters 1000 events from the loaded calendar and write one CSV of
# matches per tournament into the working directory
year = "2022"
df_tourns_1000 = df_tourns[df_tourns.Category=="ATP Masters 1000"]
# The first three events (positions 0-2) are skipped
# NOTE(review): presumably they were scraped in an earlier run — confirm
df_remaining = df_tourns_1000.iloc[3:]
for tournament, url in zip(df_remaining.Tournament, df_remaining.URL):
    # Progress trace: the page about to be scraped
    print(url)

    df_tourn_matches = scrape_ATP_tournament(url, tournament, year)

    # Spaces in the tournament name become hyphens in the file name
    df_tourn_matches.to_csv(f"matches_{tournament.replace(' ','-')}_{year}.csv", index=False)

https://www.atptour.com/en/scores/archive/madrid/1536/2022/results
https://www.atptour.com/en/scores/archive/rome/416/2022/results
https://www.atptour.com/en/scores/archive/montreal/421/2022/results
https://www.atptour.com/en/scores/archive/cincinnati/422/2022/results
https://www.atptour.com/en/scores/archive/paris/352/2022/results


In [48]:
# Scrape a single named tournament and write its matches to CSV
year = "2022"
tournament = "Nitto ATP Finals"
# Results URL of the first calendar row whose name matches
name_matches = df_tourns.Tournament == tournament
url = df_tourns.loc[name_matches, "URL"].iloc[0]

df_tourn_matches = scrape_ATP_tournament(url, tournament, year)
# Spaces in the tournament name become hyphens in the file name
df_tourn_matches.to_csv(f"matches_{tournament.replace(' ','-')}_{year}.csv", index=False)

In [77]:
# Show the matches in which Novak Djokovic is listed as Player1
df_tourn_matches[df_tourn_matches.Player1 == "Novak Djokovic"]

Unnamed: 0,Year,Tournament,Round,Player1,Player1_Seed,Player1_Nation,Player2,Player2_Seed,Player2_Nation,Score,URL
0,2023,Adelaide International 1,Final,Novak Djokovic,1,SRB,Sebastian Korda,,USA,67(8) 76(3) 64,https://www.atptour.com/en/scores/stats-centre...
1,2023,Adelaide International 1,Semifinals,Novak Djokovic,1,SRB,Daniil Medvedev,3.0,RUS,63 64,https://www.atptour.com/en/scores/stats-centre...
3,2023,Adelaide International 1,Quarterfinals,Novak Djokovic,1,SRB,Denis Shapovalov,7.0,CAN,63 64,https://www.atptour.com/en/scores/stats-centre...
7,2023,Adelaide International 1,Round Of 16,Novak Djokovic,1,SRB,Quentin Halys,,FRA,76(3) 76(5),https://www.atptour.com/en/scores/stats-centre...
15,2023,Adelaide International 1,Round Of 32,Novak Djokovic,1,SRB,Constant Lestienne,,FRA,63 62,https://www.atptour.com/en/scores/stats-centre...
