# 01. ATP Tournament Page Scraper

This notebook contains code and functions for scraping the ATP tournament pages for a specific year.
The scraping functions return datasets containing tournament and match information, as well as the URL of each match's match centre page.

## 1. Imports and Setup

In [1]:
# Standard math libraries
import numpy as np
import scipy as sp
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Web-scraping utilities
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header dict, presumably meant to be passed to `requests` calls.
# NOTE(review): nothing in the visible part of this notebook uses it — confirm it is still needed.
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'} 
import re
import json

import sys
from time import sleep

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft  import EdgeChromiumDriverManager
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options
from selenium.common.exceptions import NoSuchElementException

In [2]:
# Selenium Edge options shared by every webdriver instance created in this notebook
options = Options()
# Keep the browser window visible (not headless).
# NOTE(review): the `headless` setter is reported as deprecated by recent Selenium 4
# releases — confirm against the installed version.
options.headless = False
# Optional tweaks kept for reference (currently disabled):
# options.add_experimental_option("excludeSwitches", ["enable-automation"])
# options.add_experimental_option('useAutomationExtension', False)
# #options.add_argument("--window-size=1920,1080")
# options.add_argument("--window-size=1200,800")

  options.headless = False


## 2. Scrape Calendar Page Data for Tournament Page URLs

In [4]:
# Start a module-level Edge session for the exploratory cells in this notebook.
# The value returned by EdgeChromiumDriverManager().install() is used as the driver executable path.
service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
driver = webdriver.Edge(service=service, options=options)

In [109]:
def scrape_ATP_calendar(year):
    """
    Scrapes ATP tournament info for a given calendar year/season.

    Opens the ATP results-archive page for `year` in a fresh Edge session and
    reads one table row per tournament. The browser is always closed before
    returning, including when scraping raises.

    Parameters
    ----------
    year : int or str
        Season to scrape; interpolated into the archive URL and copied into
        the "Year" column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per tournament with the columns Year, Tournament, Category,
        Location, Date_Start, Draw, Surface, Finance, Winner and URL.
        URL is the link found in the row's 'RESULTS' cell, or "" when the
        cell does not read 'RESULTS'.
    """
    columns = ["Year", "Tournament", "Category", "Location", "Date_Start",
               "Draw", "Surface", "Finance", "Winner", "URL"]

    # File name of the category stamp image -> tournament level
    stamp_categories = {
        'categorystamps_grandslam.png': "Grand Slam",
        'categorystamps_1000.png': "ATP Masters 1000",
        'categorystamps_500.png': "ATP 500",
        'categorystamps_250.png': "ATP 250",
    }

    # ATP Tournament Archive Page URL
    archive_url = f"https://www.atptour.com/en/scores/results-archive?year={year}"

    # Start a fresh browser session for this scrape
    service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
    driver = webdriver.Edge(service=service, options=options)

    records = []
    try:
        driver.get(archive_url)

        # Maximise the browser window
        driver.maximize_window()

        # One table row per tournament
        tournament_rows = driver.find_elements(By.CSS_SELECTOR, "#scoresResultsArchive > table > tbody > tr")

        for elem in tournament_rows:
            row_elems = elem.find_elements(By.TAG_NAME, "td")

            # Tournament level/category from the stamp image, e.g. ATP 500.
            # Rows without a stamp image (or without a src) are labelled "Other".
            try:
                stamp_src = elem.find_element(By.TAG_NAME, "img").get_attribute("src") or ""
            except NoSuchElementException:
                stamp_src = ""
            category = stamp_categories.get(stamp_src.split('/')[-1], "Other")

            # Tourn name, loc and start date share one cell, one per line
            name, location, date = row_elems[2].text.split('\n')

            # Presence of the 'RESULTS' text on this cell indicates a result page is available
            results_cell = row_elems[7]
            if results_cell.text == 'RESULTS':
                results_url = results_cell.find_element(By.TAG_NAME, "a").get_attribute("href")
            else:
                results_url = ""

            records.append({
                "Year": year,
                "Tournament": name,
                "Category": category,
                "Location": location,
                "Date_Start": date,
                # Draw size for singles and doubles
                "Draw": row_elems[3].text,
                "Surface": row_elems[4].text,
                # Total financial commitment
                "Finance": row_elems[5].text,
                # Winners for all categories, joined onto one line
                "Winner": ", ".join(row_elems[6].text.split('\n')),
                "URL": results_url,
            })
    finally:
        # Always release the browser, even if scraping failed part-way
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [114]:
df_tourns = scrape_ATP_calendar(2020)
# Add a Tournament ID column, taken from the third-from-last URL path segment
# (e.g. ".../archive/delray-beach/499/2021/results" -> "499").
# Tournaments without a results page have an empty URL; they get an empty ID
# instead of raising IndexError on the missing path segment.
df_tourns.insert(2, "Tournament_id", df_tourns.URL.apply(lambda x: x.split("/")[-3] if x else ""))

In [115]:
# Save the scraped 2020 calendar without the DataFrame index
df_tourns.to_csv("../data/tournaments_ATP_2020.csv", index=False)

## 3. Scrape Tournament Page Data for Match URLs

In [5]:
# Example tournament results page (Indian Wells, 2022) for interactive exploration
url_tourn = "https://www.atptour.com/en/scores/archive/indian-wells/404/2022/results"

In [6]:
# Load the example tournament page in the module-level browser session
driver.get(url_tourn)

In [7]:
# Maximise the browser window
driver.maximize_window()

In [127]:
def _format_score(raw_score):
    """Rewrites tiebreak sets such as "764" as "76(4)"; all other tokens are kept unchanged."""
    return " ".join(
        f"{token[:2]}({token[2:]})" if token.isdigit() and len(token) > 2 else token
        for token in raw_score.split(' ')
    )


def _scrape_match_row(match):
    """Extracts the player, score and link fields from one match row element as a dict."""
    # Player Seedings (brackets stripped)
    seed_1, seed_2 = [
        seed_elem.text.replace('(', '').replace(')', '')
        for seed_elem in match.find_elements(By.XPATH, "td[@class='day-table-seed']")
    ]

    # Player Flags (Nationalities); a flag cell without an image yields ""
    flags = []
    for flag_elem in match.find_elements(By.XPATH, "td[@class='day-table-flag']"):
        try:
            flags.append(flag_elem.find_element(By.TAG_NAME, "img").get_attribute("alt"))
        except NoSuchElementException:
            flags.append("")
    flag_1, flag_2 = flags

    # Player Names and IDs (the ID is the second-from-last segment of the profile link)
    player_elems = match.find_elements(By.XPATH, "td[@class='day-table-name']")
    player_1, player_2 = [player_elem.text for player_elem in player_elems]
    player_1_id = player_elems[0].find_element(By.TAG_NAME, "a").get_attribute("href").split('/')[-2].upper()
    if player_2 != "Bye":
        player_2_id = player_elems[1].find_element(By.TAG_NAME, "a").get_attribute("href").split('/')[-2].upper()
    else:
        player_2_id = ""

    # Score
    score_cell = match.find_element(By.XPATH, "td[@class='day-table-score']")
    score = _format_score(score_cell.text)

    # Match Page URL; Court Vision is only checked for matches that have one
    try:
        match_url = score_cell.find_element(By.TAG_NAME, "a").get_attribute("href")
    except NoSuchElementException:
        match_url = ""
        cv_avail = 0
    else:
        # If 2nd Screen / Court Vision is available, its button is the last
        # day-table-button element (after "H2H"). A missing button only means
        # no Court Vision; the match URL found above is kept.
        buttons = match.find_elements(By.XPATH, "td[@class='day-table-button']")
        try:
            last_label = buttons[-1].find_element(By.TAG_NAME, "a").text
        except (IndexError, NoSuchElementException):
            last_label = ""
        cv_avail = 1 if last_label == "2ND" else 0

    return {
        "player1_name": player_1, "player1_id": player_1_id,
        "player1_seed": seed_1, "player1_nation": flag_1,
        "player2_name": player_2, "player2_id": player_2_id,
        "player2_seed": seed_2, "player2_nation": flag_2,
        "score": score, "url": match_url, "court_vision": cv_avail,
    }


def scrape_ATP_tournament(url, tournament, tournament_id, year):
    """
    Scrapes the match results listed on a single ATP tournament results page.

    Opens `url` in a fresh Edge session and reads every match of every round
    table on the page. The browser is always closed before returning,
    including on the early return and when scraping raises.

    Parameters
    ----------
    url : str
        URL of the tournament results page.
    tournament, tournament_id, year
        Copied unchanged into the "tournament", "tournament_id" and "year"
        columns of every row.

    Returns
    -------
    pandas.DataFrame or None
        One row per match with the columns year, tournament, tournament_id,
        round, player1_name, player1_id, player1_seed, player1_nation,
        player2_name, player2_id, player2_seed, player2_nation, score, url
        and court_vision (1 when the last button of the row reads "2ND",
        else 0). None when the page lists no rounds.
    """
    match_columns = ["round", "player1_name", "player1_id", "player1_seed", "player1_nation",
                     "player2_name", "player2_id", "player2_seed", "player2_nation", "score",
                     "url", "court_vision"]

    # Start a fresh browser session for this scrape
    service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
    driver = webdriver.Edge(service=service, options=options)

    match_rows = []
    try:
        driver.get(url)

        # Maximise the browser window
        driver.maximize_window()

        # Round names (i.e. final, semi-final, etc.) come from the table headers
        round_names = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, ".day-table > thead")]
        if not round_names:
            return None

        # The i-th table body holds the matches of the i-th round; queried once
        round_bodies = driver.find_elements(By.CSS_SELECTOR, ".day-table > tbody")

        for round_name, round_body in zip(round_names, round_bodies):
            for match in round_body.find_elements(By.XPATH, "tr"):
                match_rows.append({"round": round_name, **_scrape_match_row(match)})
    finally:
        # Quit the browser on every exit path
        driver.quit()

    df_tourn_matches = pd.DataFrame(match_rows, columns=match_columns)
    # Add columns for year, tournament name and tournament id
    df_tourn_matches.insert(0, "year", [year]*len(df_tourn_matches))
    df_tourn_matches.insert(1, "tournament", [tournament]*len(df_tourn_matches))
    df_tourn_matches.insert(2, "tournament_id", [tournament_id]*len(df_tourn_matches))

    return df_tourn_matches

In [116]:
# Load the saved 2021 tournament calendar (rebinds `df_tourns`);
# presumably the output of scrape_ATP_calendar(2021) with a Tournament_id column added — confirm.
df_tourns = pd.read_csv("../data/tournaments_ATP_2021.csv")

In [None]:
# Add a Tournament ID column
#df_tourns.insert(2, "Tournament_id", df_tourns.URL.apply(lambda x: x.split("/")[-3]))

In [26]:
#df_tourns.to_csv("../data/tournaments_ATP_2022.csv", index=False)

In [53]:
# List the distinct tournament categories present in the calendar
df_tourns["Category"].unique()

array(['Other', 'ATP 250', 'Grand Slam', 'ATP 500', 'ATP Masters 1000'],
      dtype=object)

In [135]:
# Check that the last scrape produced a DataFrame (scrape_ATP_tournament can also return None)
isinstance(df_tourn_matches, pd.DataFrame)

True

In [137]:
# Scrape the match results of every ATP 250 tournament in the loaded calendar
# and save one CSV per tournament.
year = "2021"
df_tourns_scrape = df_tourns[df_tourns.Category=="ATP 250"]
for tournament, tournament_id, url in zip(
    df_tourns_scrape.Tournament, df_tourns_scrape.Tournament_id, df_tourns_scrape.URL
):
    # A tournament without a results page has no URL (an empty cell is read
    # back from the CSV as NaN), so there is nothing to open in the browser.
    if not isinstance(url, str) or not url:
        continue
    print(url)

    df_tourn_matches = scrape_ATP_tournament(url, tournament, tournament_id, year)
    # scrape_ATP_tournament returns None when the page lists no rounds
    if not isinstance(df_tourn_matches, pd.DataFrame):
        continue

    df_tourn_matches.to_csv(f"../data/atp-tournament-matches/matches_{tournament.replace(' /','').replace(' ','-')}_{year}.csv", index=False)

https://www.atptour.com/en/scores/archive/delray-beach/499/2021/results
https://www.atptour.com/en/scores/archive/antalya/9426/2021/results
https://www.atptour.com/en/scores/archive/auckland/301/2021/results
https://www.atptour.com/en/scores/archive/melbourne/8998/2021/results
https://www.atptour.com/en/scores/archive/melbourne/9428/2021/results
https://www.atptour.com/en/scores/archive/pune/891/2021/results
https://www.atptour.com/en/scores/archive/new-york/424/2021/results
https://www.atptour.com/en/scores/archive/singapore/9460/2021/results
https://www.atptour.com/en/scores/archive/cordoba/9158/2021/results
https://www.atptour.com/en/scores/archive/montpellier/375/2021/results
https://www.atptour.com/en/scores/archive/buenos-aires/506/2021/results
https://www.atptour.com/en/scores/archive/doha/451/2021/results
https://www.atptour.com/en/scores/archive/marseille/496/2021/results
https://www.atptour.com/en/scores/archive/santiago/8996/2021/results
https://www.atptour.com/en/scores/arc

## 4. Combine All Match Results Together

In [143]:
import glob
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

In [176]:
# Per-tournament result files to combine.
# The combined output (match_results_all_2021.csv) is written to this same folder,
# so it is excluded here; otherwise a re-run would read it back in and duplicate matches.
# glob returns paths in arbitrary order, so they are sorted to keep runs reproducible.
file_list = sorted(
    path
    for path in glob.glob("../data/atp-tournament-matches/2021/*.csv*")
    if not path.endswith("match_results_all_2021.csv")
)

In [177]:
def read_file(file):
    """
    Loads one ATP tournament results csv file and flips its row order.

    The rows are returned last-to-first relative to the file, with a fresh
    0..n-1 index.
    NOTE(review): presumably the scraped files list the latest round first,
    so the flip yields ascending match order — confirm against the data.
    """
    frame = pd.read_csv(file)
    flipped = frame[::-1]
    return flipped.reset_index(drop=True)

In [178]:
# Read the per-tournament files on 8 worker threads and stack them into one frame
with ThreadPoolExecutor(max_workers=8) as pool:
    df_results_all = pd.concat(list(pool.map(read_file, file_list)))

In [179]:
# Count matches with (1) and without (0) the court_vision flag
df_results_all["court_vision"].value_counts()

0    2062
1    1186
Name: court_vision, dtype: int64

In [180]:
# Reload the 2021 tournament calendar; its Date_Start column is used below to order
# the rows of df_results_all by tournament start date
df_tourns = pd.read_csv("../data/tournaments_ATP_2021.csv")

In [181]:
# Attach each tournament's start date to its match rows (left join on the tournament name)
# so the combined results can be sorted by the chronological occurrence of tournaments
# throughout the year
df_results_all2 = df_results_all.merge(
    df_tourns[['Tournament', 'Date_Start']].rename(columns={"Tournament": "tournament"}),
    on='tournament',
    how='left',
)

In [183]:
# Name the row index "match_index" so it can serve as the tie-breaker, order by tournament
# start date and then by original row position, and drop the last column again
df_results_year_sorted = (
    df_results_all2
    .rename_axis("match_index")
    .sort_values(["Date_Start", "match_index"])
    .reset_index(drop=True)
    .iloc[:, :-1]
)
df_results_year_sorted.to_csv("../data/atp-tournament-matches/2021/match_results_all_2021.csv", index=False)

In [184]:
# Display the combined, sorted results
df_results_year_sorted

Unnamed: 0,year,tournament,tournament_id,round,player1_name,player1_id,player1_seed,player1_nation,player2_name,player2_id,player2_seed,player2_nation,score,url,court_vision
0,2021,Antalya,9426,1st Round Qualifying,Alex Molcan,MV14,12,SVK,Andrey Golubev,G806,,KAZ,62 75,,0
1,2021,Antalya,9426,1st Round Qualifying,Michael Vrbensky,V0B2,11,CZE,Vitaliy Sachko,SS25,,UKR,75 62,,0
2,2021,Antalya,9426,1st Round Qualifying,Lucas Miedler,MO55,10,AUT,Alexandar Lazarov,LI39,,BUL,16 63 63,,0
3,2021,Antalya,9426,1st Round Qualifying,Harri Heliovaara,H808,,FIN,Mukund Sasikumar,MP13,9,IND,64 75,,0
4,2021,Antalya,9426,1st Round Qualifying,Dimitar Kuzmanov,KC33,8,BUL,Mirza Basic,BE67,,BIH,63 61,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,2021,Nitto ATP Finals,605,Round Robin,Casper Ruud,RH16,8,NOR,Andrey Rublev,RE44,5,RUS,26 75 76(5),https://www.atptour.com/en/scores/stats-centre...,1
3244,2021,Nitto ATP Finals,605,Round Robin,Novak Djokovic,D643,1,SRB,Cameron Norrie,N771,10 AL,GBR,62 61,https://www.atptour.com/en/scores/stats-centre...,1
3245,2021,Nitto ATP Finals,605,Semi-Finals,Daniil Medvedev,MM58,2,RUS,Casper Ruud,RH16,8,NOR,64 62,https://www.atptour.com/en/scores/stats-centre...,1
3246,2021,Nitto ATP Finals,605,Semi-Finals,Alexander Zverev,Z355,3,GER,Novak Djokovic,D643,1,SRB,76(4) 46 63,https://www.atptour.com/en/scores/stats-centre...,1
