# 02. ATP Match Centre 2D Court Vision Scraper

This notebook contains code and functions for scraping almost all serve data from an ATP Court Vision page (where available) for a specific match.
It also includes a rough pipeline to enable scraping at a much larger scale, e.g. all matches of each valid tournament.

## 1. Imports and Setup

In [3]:
# Standard math libraries
import numpy as np
import scipy as sp
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

# Web-scraping utilities
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# headers = {'User-Agent': 
#            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'} 
import re
import json

import sys
from time import sleep


# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft  import EdgeChromiumDriverManager
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

In [4]:
from selenium.webdriver.edge.options import Options

# Setting selenium options
options = Options()
# Run Edge without a visible window. The `Options.headless` setter was deprecated in
# Selenium 4.8 and removed in later 4.x releases, where assigning it no longer has any
# effect; passing the browser switch directly works across Selenium 4 versions.
options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument("--window-size=1920,1080")
options.add_argument("--window-size=1200,800")
#options.add_argument("--user-data-dir=C:\\Users\\lgjg1\\AppData\\Local\\Microsoft\\Edge\\User Data")
#options.add_argument("--profile-directory=Default")
#options.add_argument("--remote-debugging-port=0")
 

In [5]:
# Download (or reuse) the Edge driver binary, then start Edge with the options set above
edge_driver_path = EdgeChromiumDriverManager().install()
service = EdgeService(executable_path=edge_driver_path)
driver = webdriver.Edge(options=options, service=service)

In [4]:
# Match centre URL of the match to scrape (this one is hosted on the Australian Open site)
url = "https://ausopen.com/match/2023-elena-rybakina-vs-danielle-collins-ws302#!infosys-3"

In [9]:
# Close the browser and end the current WebDriver session
driver.quit()

In [12]:
# Load the match page in the browser
driver.get(url)
# Maximise the browser window (disabled; a fixed window size is set in the options instead)
#driver.maximize_window()

In [46]:
# Save a screenshot of the current page to a PNG file (the call returns True on success)
driver.get_screenshot_as_file("screenshot_test_window16x9_error3.png")

True

In [14]:
# Dismiss the cookies (GDPR) popup by clicking its button
gdpr_button = driver.find_element(By.CSS_SELECTOR, "body > div.gdpr-popup > div > button")
gdpr_button.click()

In [72]:
# Read the two player names from the players header (one name per line of its text)
players_text = driver.find_element(By.XPATH, "//div[@class='playersDiv']").text
player1, player2 = players_text.split('\n')
print(player1, player2)

I. Swiatek E. Rybakina


In [15]:
# Scroll the last dropdown entry into view so that "Compare both players" becomes clickable
dropdown_entries = driver.find_elements(By.CSS_SELECTOR, "#AODropDown > div.dropdown-container > div")
dropdown_entries[-1].location_once_scrolled_into_view

{'x': 776, 'y': 0}

In [16]:
# Open the "compare players" view via its button inside the Court Vision widget
compare_button_css = "#CourtVision > div.CourtVisionWrapper > div.timeline-court-wrapper > div.filter-description-wrapper > div.compare-zoom-wrap > div.compare-wrap > button"
driver.find_element(By.CSS_SELECTOR, compare_button_css).click()

In [17]:
# Close the "Compare Players" view: scroll its close (cross) control into view, then click it
close_cross_css = "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.both-court-top > div.both-court-cross"
driver.find_element(By.CSS_SELECTOR, close_cross_css).location_once_scrolled_into_view
driver.find_element(By.CSS_SELECTOR, close_cross_css).click()

In [7]:
def select_shot_type(shot_type):
    """
    Switch the Match Centre webapp to the requested shot type via its dropdown.

    Operates on the module-level `driver`. Only "First Serve", "Second Serve" and
    "Double Faults" are accepted; any other value ends execution through sys.exit.
    """
    allowed_types = ("First Serve", "Second Serve", "Double Faults")
    if shot_type not in allowed_types:
        sys.exit('Please provide a valid shot type selection')

    wait = WebDriverWait(driver, 5)
    link_locator = (By.XPATH, "//div[@class='dropdown-link']")

    # Open the shot-type dropdown: it is the fifth "select-arrow" element on the page
    arrows = driver.find_elements(By.XPATH, "//div[@class='select-arrow']")
    arrows[4].click()

    # Wait for the dropdown entries, then pick the one whose label equals shot_type.
    # Matching on the collected labels (rather than an XPath text search) also works
    # for the non-serve entries such as double faults.
    wait.until(EC.element_to_be_clickable(link_locator))
    links = driver.find_elements(*link_locator)
    labels = [link.text for link in links]
    target_link = links[labels.index(shot_type)]
    wait.until(EC.element_to_be_clickable(target_link)).click()

In [180]:
# Service Box Dims: parse the width/height attribute values out of the #serviceBox markup
servebox_ele = driver.find_elements(By.CSS_SELECTOR, "#serviceBox")[0]
servebox_html = servebox_ele.get_attribute('outerHTML')
box_x, box_y = (
    float(servebox_html.split(' width="')[1].split('" height="')[0]),
    float(servebox_html.split(' height="')[1].split('" stroke="')[0]),
)

## 3. Function to loop over every point and gather data

In [181]:
# Shot type to scrape in the walkthrough cells below.
# Alternatives accepted by select_shot_type: "Second Serve", "Double Faults"
shot_type = "First Serve"
# Set to True only when the player 1/2 order on the stats page contradicts Court Vision
swap_players = False

In [6]:
# Toggle the webapp to the shot type chosen above
select_shot_type(shot_type)

NameError: name 'select_shot_type' is not defined

In [9]:
def _float_or_text(raw):
    """Return `raw` converted to float, or `raw` unchanged when it is not numeric."""
    try:
        return float(raw)
    except ValueError:
        return raw


def scrape_serve_data(driver, player1, player2, shot_type, box_x, box_y, swap_players=False):
    """
    Scrape serve data for a single serve type from the Court Vision 2D "compare" view.

    Every clickable point circle on the timeline is clicked in turn. For each point the
    highlighted ball is clicked to open the stats widget, and the ball position, score
    context and shot details are read. When any of that fails (some points have no
    Court Vision data) the outcome and game score are read from the point label instead
    and the remaining context is inferred from the previously recorded point.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session with the compare view open and the shot type already selected.
    player1, player2 : str
        Player names written to the output table.
    shot_type : str
        "First Serve", "Second Serve" or "Double Faults"; decides which highlighted
        ball is used and which stats fields are read.
    box_x, box_y : float
        Width and height of the service box as drawn on the page; used to rescale the
        ball coordinates.
    swap_players : bool, optional
        If True, player 1 and player 2 names and scores are exchanged in the output
        (for when the name order contradicts between the stats page and Court Vision).

    Returns
    -------
    pandas.DataFrame or None
        One row per scraped point, or None when the timeline has no clickable points.
    """
    timeline_css = "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.timeline-container > div"
    selected_ball_css = "g[id='plottedBallsSelected'] > g[class^='court-ball-']"
    label_outcome_css = "svg[class^='selected-ball'] > text:nth-child(3) "
    label_score_css = "svg[class^='selected-ball'] > text:nth-child(4) "

    # Select just the timeline within the current "compare" popup
    timeline = driver.find_element(By.CSS_SELECTOR, timeline_css)

    # All the "point" elements i.e. the dot-separators and the clickable point circle elements
    all_points = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point'], #dot-separator")

    # Same as above + the set-score elements
    all_time_eles = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point'], #dot-separator, #team1Score")

    # Only the clickable point circle elements
    point_circs = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point']")

    if not point_circs:
        return None

    # Initialise all data lists
    x_list = []  # x-coords
    y_list = []  # y-coords
    outcomes_list = []  # point outcome

    p1_score_G_list = []
    p2_score_G_list = []
    p1_setswon_list = []  # no. of sets won so far in match
    p2_setswon_list = []
    p1_setscore_list = []  # no. of games won so far in set
    p2_setscore_list = []
    set_N_list = []  # contains the set of associated point
    point_list = []  # contains point numbers

    speed_list = []  # speeds list
    type_list = []  # type of serve
    ral_len_list = []  # len of rally
    height_grd_list = []  # "height above ground"
    height_bounce_list = []  # "bounce height"
    outcome_type_list = []
    player_serving_list = []

    # Loop through every point on the timeline, click and grab info
    for point in point_circs:

        # Being nice and randomly sleeping
        sleep(np.random.uniform(0, 1))

        # Point number = 1-based position among all point elements (separators included)
        point_n = all_points.index(point) + 1
        point_list.append(point_n)

        # Clicking on the point circle highlights its ball on the court, allowing it to be
        # clicked with precedence over other overlapping balls
        point.location_once_scrolled_into_view
        point.click()

        try:
            # Get the selected highlighted ball element
            balls = driver.find_elements(By.CSS_SELECTOR, selected_ball_css)
            if shot_type == "First Serve":
                selected_ball = balls[-1]
            elif shot_type == "Second Serve" or shot_type == "Double Faults":
                # Prefer the second highlighted ball, fall back to the last one
                selected_ball = balls[1] if len(balls) > 1 else balls[-1]

            # Click to bring up the mini stats window (retry once after a pause)
            try:
                selected_ball.click()
            except Exception:
                sleep(3)
                selected_ball.click()

            ball_html = selected_ball.get_attribute('outerHTML')
            # Get ball's "x" (depth) coord
            x = float(ball_html.split('<use x="')[1].split('" y="')[0])
            # Get ball's "y" coord
            y = float(ball_html.split('y="')[1].split("\"")[0])
            # Get the outcome of this point (Won/Lost/Ace)
            outcome = ball_html.split('href="#')[-1].split("\"")[0].split('Selected')[-1]

            # Current game score, from the stats widget or else from the point label
            try:
                p1_score_G, p2_score_G = driver.find_element(By.CSS_SELECTOR, "div[class^='game-scores']").text.split('\n')
            except Exception:
                driver.find_element(By.CSS_SELECTOR, timeline_css).location_once_scrolled_into_view
                p1_score_G, p2_score_G = driver.find_element(By.CSS_SELECTOR, label_score_css).text.replace('(', '').replace(')', '').split('-')

            # Get number of sets won per player and current set score
            p1_setswon = 0
            p2_setswon = 0

            # Set score elements; their count is the current set number
            sets_eles = driver.find_elements(By.CSS_SELECTOR, "div[class^='set-scores-']")
            set_N = len(sets_eles)

            # Get the played sets' scores and simplify to just sets won
            for set_ele in sets_eles[:-1]:
                try:  # Need this to handle exception data where the set score is missing
                    p1_setscore, p2_setscore = [int(v.text) for v in set_ele.find_elements(By.TAG_NAME, "span[class^='set-value']")]
                    if p1_setscore > p2_setscore:
                        p1_setswon += 1
                    else:
                        p2_setswon += 1
                except Exception:
                    p1_setswon = p1_setswon_list[-1]
                    p2_setswon = p2_setswon_list[-1]

            # Current set's score; if unreadable, continue from the previous point
            try:
                p1_setscore, p2_setscore = [int(v.text) for v in sets_eles[-1].find_elements(By.TAG_NAME, "span[class^='set-value']")]
            except Exception:
                p1_setscore = p1_setscore_list[-1]
                p2_setscore = p2_setscore_list[-1]
                if p1_score_G == "G":
                    p1_setscore += 1
                elif p2_score_G == "G":
                    p2_setscore += 1

            # Now get the actual shot info (speed, subsequent rally length)
            # Speed in KMH
            speed = int(driver.find_elements(By.XPATH, ".//div[@class='speed-data speed-kmh']")[0].text.split('\n')[0])

            # Serve Type
            if shot_type != "Double Faults":
                serve_type = driver.find_elements(By.XPATH, ".//div[@class='md-block md-serve-type']")[0].text.split('\n')[-1]
            else:
                serve_type = "-"

            # Rally Length (+ the two height values, which are only read for aces)
            if outcome == "Ace":
                ral_len = 1
                height_grd = _float_or_text(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-height-above-ground > div.value").text.split(' ')[0])
                height_bounce = _float_or_text(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-bounce-height > div.value").text.split(' ')[0])
            else:
                if shot_type == "Double Faults":
                    ral_len = 0
                else:
                    ral_len = int(driver.find_elements(By.XPATH, ".//div[@class='md-block md-rally-length']")[0].text.split('\n')[-1].split(' Shot')[0])
                height_grd = '-'
                height_bounce = '-'

            # Rally Outcome Type
            try:
                outcome_type = driver.find_elements(By.XPATH, ".//div[@class='shot-description']")[0].text.split("'S ")[1].title()
            except Exception:
                outcome_type = "-"

            # Get the side of the court where the ball is plotted
            court_side = selected_ball.find_element(By.XPATH, "../../../../../../..").get_attribute("class")
            if court_side == 'both-court-1':
                player_serving = 1
            else:
                player_serving = 2

        ###################################################
        # Some points may have missing court vision data
        except Exception:
            outcome = driver.find_element(By.CSS_SELECTOR, label_outcome_css).text
            if outcome == "W":
                outcome = "Winner"
            elif outcome == "A":
                outcome = "Ace"
            else:
                outcome = "Lost"
            # Move back to where timeline is visible first
            driver.find_element(By.CSS_SELECTOR, timeline_css).location_once_scrolled_into_view
            p1_score_G, p2_score_G = driver.find_element(By.CSS_SELECTOR, label_score_css).text.replace('(', '').replace(')', '').split('-')
            # Other data has no choice but to be missing ('-' / NaN)
            x = y = np.nan
            speed = serve_type = ral_len = height_grd = height_bounce = outcome_type = "-"

            # Fill other data based on current context (check if previous point was from
            # the same game or a new game/set)
            point_idx = all_time_eles.index(point)  # index of the current point in all_time_eles
            # Previous-point context is only usable when this is not the first point and
            # at least one earlier point has actually been recorded
            if point_n != 1 and player_serving_list:
                point_prev_idx = all_time_eles.index(all_points[point_n - 2])  # index of the previous (-1) point in all_time_eles
                if point_idx - point_prev_idx == 1:  # point occurred in same game
                    # Set game info to be the same as the previous point
                    player_serving = player_serving_list[-1]
                    p1_setswon = p1_setswon_list[-1]
                    p2_setswon = p2_setswon_list[-1]
                    p1_setscore = p1_setscore_list[-1]
                    p2_setscore = p2_setscore_list[-1]
                    set_N = set_N_list[-1]

                # Problem...this happens in a new game
                else:
                    if player_serving_list[-1] == 1:
                        player_serving = 2
                    else:
                        player_serving = 1

                    # First set the no. of sets won to be same as the prev point
                    p1_setswon = p1_setswon_list[-1]
                    p2_setswon = p2_setswon_list[-1]
                    # NOTE(review): a new set is only detected when the previous games
                    # count was 7 -- confirm how sets ending e.g. 6-4 should be handled
                    if p1_setscore_list[-1] == 7 or p2_setscore_list[-1] == 7:  # i.e. the previous point finished a set, so start a new one
                        p1_setscore = p2_setscore = 0
                        set_N = set_N_list[-1] + 1
                        # Add a set if point was set-winning
                        if p1_score_G == "G":
                            p1_setswon += 1
                        elif p2_score_G == "G":
                            p2_setswon += 1
                    else:
                        # Previous point completed a game, not a set
                        p1_setscore = p1_setscore_list[-1]
                        p2_setscore = p2_setscore_list[-1]
                        set_N = set_N_list[-1]
                        # Add a game if point was game-winning
                        if p1_score_G == "G":
                            p1_setscore += 1
                        elif p2_score_G == "G":
                            p2_setscore += 1
            else:  # There is no previous point data to reference!
                # player_serving is set to 0 as a placeholder
                p1_setswon = p2_setswon = p1_setscore = p2_setscore = set_N = player_serving = 0

            # Swap the outcome if the point-winner is player 2 instead of 1
            if player_serving == 2:
                if outcome == "Winner":
                    outcome = "Lost"
                else:
                    outcome = "Winner"

        # Append data to the lists
        x_list.append(x)
        y_list.append(y)
        outcomes_list.append(outcome)

        p1_score_G_list.append(p1_score_G)
        p2_score_G_list.append(p2_score_G)
        p1_setswon_list.append(p1_setswon)
        p2_setswon_list.append(p2_setswon)
        p1_setscore_list.append(p1_setscore)
        p2_setscore_list.append(p2_setscore)
        set_N_list.append(set_N)

        speed_list.append(speed)
        type_list.append(serve_type)
        ral_len_list.append(ral_len)
        height_grd_list.append(height_grd)
        height_bounce_list.append(height_bounce)
        outcome_type_list.append(outcome_type)
        player_serving_list.append(player_serving)

    n_rows = len(x_list)

    # Transform the x,y data to real court dims
    # This is now the same for both players as in the "compare" view data is plotted on the bottom court half
    x_trans = (np.array(y_list) * -1 / box_y) * 8.23 / 2
    y_trans = (np.array(x_list) * 1 / box_x) * 6.4

    if swap_players:
        # The player 1 and 2 name orders contradict between the stats page and court
        # vision: exchange names and per-player scores.
        # NOTE(review): the "Serving" column is not swapped here -- confirm intended.
        player1, player2 = player2, player1
        p1_setswon_list, p2_setswon_list = p2_setswon_list, p1_setswon_list
        p1_setscore_list, p2_setscore_list = p2_setscore_list, p1_setscore_list
        p1_score_G_list, p2_score_G_list = p2_score_G_list, p1_score_G_list

    df = pd.DataFrame({
        "Player1": [player1] * n_rows,
        "Player2": [player2] * n_rows,
        "Set": set_N_list,
        "Point": point_list,
        "Player1_Sets": p1_setswon_list,
        "Player2_Sets": p2_setswon_list,
        "Player1_Game": p1_setscore_list,
        "Player2_Game": p2_setscore_list,
        "Player1_Score": p1_score_G_list,
        "Player2_Score": p2_score_G_list,
        "Serving": player_serving_list,
        "Shot_Type": [shot_type] * n_rows,
        "X": x_trans,
        "Y": y_trans,
        "Speed_kmh": speed_list,
        "Serve_Type": type_list,
        "Rally_Length": ral_len_list,
        "Height_Above_Ground": height_grd_list,
        "Bounce_Height": height_bounce_list,
        "Outcome": outcomes_list,
        "Outcome_Type": outcome_type_list,
    })

    return df

## 4. Function to scrape all serve types for a match

In [10]:
def scrape_match_data(driver, url, tournament, year, round_n, uuid, player1, player2, swap_players=False):
    """
    Full function for scraping all service points in a given match.

    Loads the match page, then for each of "First Serve", "Second Serve" and
    "Double Faults" opens the compare view, selects the shot type, scrapes every
    point with scrape_serve_data and closes the compare view again. The per-type
    tables are combined, sorted by point number and prefixed with match metadata.

    NOTE: select_shot_type works on the module-level `driver`, so the `driver`
    passed in here must be that same browser session.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used for scraping.
    url : str
        Match page URL.
    tournament, year, round_n, uuid
        Metadata values copied into the "Tournament", "Year", "Round" and "uuid" columns.
    player1, player2 : str
        Player names written to the output table.
    swap_players : bool, optional
        Forwarded to scrape_serve_data (default False).

    Returns
    -------
    pandas.DataFrame
        All scraped serve points of the match, one row per point.

    Raises
    ------
    ValueError
        If none of the shot types yielded any points.
    """
    compare_button_css = "#CourtVision > div.CourtVisionWrapper > div.timeline-court-wrapper > div.filter-description-wrapper > div.compare-zoom-wrap > div.compare-wrap > button"
    close_cross_css = "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.both-court-top > div.both-court-cross"

    # Get URL
    driver.get(url)

    # Click the cookies popup. It is not shown again once accepted in this session
    # (e.g. when scraping several matches with one driver), so its absence is fine.
    try:
        driver.find_element(By.CSS_SELECTOR, "body > div.gdpr-popup > div > button").click()
    except NoSuchElementException:
        pass

    sleep(2)
    # Service Box Dims
    servebox_html = driver.find_elements(By.CSS_SELECTOR, "#serviceBox")[0].get_attribute('outerHTML')
    box_x = float(servebox_html.split(' width="')[1].split('" height="')[0])
    box_y = float(servebox_html.split(' height="')[1].split('" stroke="')[0])

    df_serves_list = []

    for shot_type in ["First Serve", "Second Serve", "Double Faults"]:
        # Click on the compare players button...
        driver.find_element(By.CSS_SELECTOR, compare_button_css).click()
        select_shot_type(shot_type)

        df_serves = scrape_serve_data(driver, player1, player2, shot_type, box_x, box_y, swap_players=swap_players)
        # scrape_serve_data returns None when the timeline has no points of this type
        if df_serves is not None:
            df_serves_list.append(df_serves)

        # Close "Compare Players"
        driver.find_element(By.CSS_SELECTOR, close_cross_css).location_once_scrolled_into_view
        driver.find_element(By.CSS_SELECTOR, close_cross_css).click()

        sleep(3)

    if not df_serves_list:
        raise ValueError(f"No serve points could be scraped from {url}")

    # Combine all dataframes into one
    df_serves_all = pd.concat(df_serves_list).sort_values('Point')
    df_serves_all = df_serves_all.reset_index(drop=True)

    # Add additional metadata columns
    n_rows = len(df_serves_all)
    # Add a column for Year
    df_serves_all.insert(0, "Year", [year] * n_rows)
    # Add a column for tournament name
    df_serves_all.insert(1, "Tournament", [tournament] * n_rows)
    # Add a column for tournament round
    df_serves_all.insert(2, "Round", [round_n] * n_rows)
    # Add a column for event uuid (inserted at position 2, i.e. ahead of "Round")
    df_serves_all.insert(2, "uuid", [uuid] * n_rows)

    return df_serves_all
    

In [56]:
url = "https://ausopen.com/match/2023-elena-rybakina-vs-aryna-sabalenka-ws701#!infosys-3"
tournament = "Australian Open"
year = 2023
round = "Final"
player1 = "Elena Rybakina"
player2 = "Aryna Sabalenka"
# scrape_match_data takes an event uuid between the round and the player names;
# leaving it out raises a TypeError.
# NOTE(review): no uuid is known for this match here -- replace None with the real value.
match_uuid = None
# Resets Driver
service = EdgeService(executable_path=EdgeChromiumDriverManager().install())
driver = webdriver.Edge(service=service, options=options)

try:
    df_serves_all = scrape_match_data(driver, url, tournament, year, round, match_uuid, player1, player2)
finally:
    # Quit Browser, even when scraping fails part-way, so no Edge process is left behind
    driver.quit()

In [57]:
# Preview the last rows of the combined serve table
df_serves_all.tail()

Unnamed: 0,Year,Tournament,Round,Player1,Player2,Set,Point,Player1_Sets,Player2_Sets,Player1_Game,...,Shot_Type,X,Y,Speed_kmh,Serve_Type,Rally_Length,Height_Above_Ground,Bounce_Height,Outcome,Outcome_Type
208,2023,Australian Open,Final,Elena Rybakina,Aryna Sabalenka,3,209,1,1,4,...,First Serve,1.851518,2.819024,192,Pronated,1,-,-,Winner,Forced Error
209,2023,Australian Open,Final,Elena Rybakina,Aryna Sabalenka,3,210,1,1,4,...,Second Serve,-1.706464,2.94439,168,Slice,5,-,-,Winner,Unforced Error
210,2023,Australian Open,Final,Elena Rybakina,Aryna Sabalenka,3,211,1,1,4,...,Second Serve,0.312087,2.766829,166,Slice,2,-,-,Lost,Unforced Error
211,2023,Australian Open,Final,Elena Rybakina,Aryna Sabalenka,3,212,1,1,4,...,First Serve,-0.165567,2.76878,187,Flat,3,-,-,Winner,Winner
212,2023,Australian Open,Final,Elena Rybakina,Aryna Sabalenka,3,213,1,1,4,...,First Serve,0.255432,3.089268,184,Flat,5,-,-,Winner,Unforced Error


In [58]:
# Write the combined table to CSV; spaces in the player names become hyphens in the file name
player1_slug = player1.replace(" ", "-")
player2_slug = player2.replace(" ", "-")
csv_name = f'AO_F_{player1_slug}_{player2_slug}_2023.csv'
df_serves_all.to_csv(csv_name, index=False)

In [23]:
####################################################################################
def scrape_shot_data(driver, player1, player2, shot_type, swap_players=False):
    """
    Scrape other shots data (Winners) data from a single serve type from Court Vision 2D
    
    """

    # Select just the timeline within the current "compare" popup
    timeline = driver.find_element(By.CSS_SELECTOR, "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.timeline-container > div")

    # Get all the "point" elements i.e. the dot-separators and the clickable point circle elements
    all_points = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point'], #dot-separator")

    # Gets above + the set-score elements
    all_time_eles = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point'], #dot-separator, #team1Score")

    # Get all the clickable point circle elements
    point_circs = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point']")


    # Initialise all data lists
    x_list = [] # x-coords
    y_list = [] # y-coords
    outcomes_list = [] # point outcome

    p1_score_G_list = []
    p2_score_G_list = []
    p1_setswon_list = [] # no. of sets won so far in match
    p2_setswon_list = []
    p1_setscore_list = [] # no. of games won so far in set
    p2_setscore_list = [] 
    set_N_list = [] # contains the set of associated point
    point_list = [] # contains point numbers


    speed_list = [] # speeds list
    type_list = [] # type of serve
    ral_len_list = [] # len of rally
    height_grd_list = [] # "height above ground"
    height_bounce_list = [] # "bounce height"
    stroke_type_list = []
    stroke_hand_list = []
    spin_list = []
    outcome_type_list = []
    player_serving_list= []

    if len(point_circs) == 0:
        return None

    # Loop through every point on the timeline, click and grab info
    for i in range(len(point_circs)):
        
        # Being nice and randomly sleeping
        sleeptime = np.random.uniform(0, 1)
        sleep(sleeptime)

        # Get the current point number
        #point_circs = timeline.find_elements(By.CSS_SELECTOR, "svg[id^='point']")
        point = point_circs[i] 
        point_n = all_points.index(point) + 1
        point_list.append(point_n)

        # This scrolls back to where the point-bar is viewable (using the actual element itself gets it blocked by the navbar sadly)
        #driver.find_elements(By.CSS_SELECTOR, "#MatchBeats > div > div.bottom > div.select-shots")[0].location_once_scrolled_into_view 
        # Clicking on the point circle highlights a point/scatter on the point, allowing it to be clicked with
        # precedence over other overlapping points
        point.location_once_scrolled_into_view 
        point.click()

        # Get the selected highlighted ball element
        #selected_ball = driver.find_element(By.CSS_SELECTOR, "g[id='plottedBallsSelected']")
        try:
            if shot_type == "Winners":
                selected_ball = driver.find_elements(By.CSS_SELECTOR, "g[id='plottedBallsSelected'] > g[class^='court-ball-']")[-1]
            elif shot_type == "Second Serve" or shot_type == "Double Faults":
                try:
                    selected_ball = driver.find_elements(By.CSS_SELECTOR, "g[id='plottedBallsSelected'] > g[class^='court-ball-']")[1]
                except IndexError:
                    selected_ball = driver.find_elements(By.CSS_SELECTOR, "g[id='plottedBallsSelected'] > g[class^='court-ball-']")[-1]
            # Click to bring up the mini stats window
            try:
                selected_ball.click()
            except:
                sleep(3)
                selected_ball.click()


            # Get ball's "x" (depth) coord
            x = float(selected_ball.get_attribute('outerHTML').split('<use x="')[1].split('" y="')[0])
            # Get ball's "y" coord
            y = float(selected_ball.get_attribute('outerHTML').split('y="')[1].split("\"")[0])
            # Get the outcome of this point (Won/Lost/Ace)
            outcome = selected_ball.get_attribute('outerHTML').split('href="#')[-1].split("\"")[0].split('Selected')[-1]


            # Find the needed elements on the stats widget and assign to variables
            # Point context/metadata
            # Current game score
            try:
                p1_score_G, p2_score_G = driver.find_element(By.CSS_SELECTOR, "div[class^='game-scores']").text.split('\n')
            except:
                driver.find_element(By.CSS_SELECTOR, "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.timeline-container > div").location_once_scrolled_into_view
                p1_score_G, p2_score_G = driver.find_element(By.CSS_SELECTOR, "svg[class^='selected-ball'] > text:nth-child(4) ").text.replace('(','').replace(')','').split('-')

            # Get number of sets won per player and current set score
            p1_setswon = 0
            p2_setswon = 0

            # Set score elements
            sets_eles = driver.find_elements(By.CSS_SELECTOR, "div[class^='set-scores-']")

            # Current set number
            set_N = len(sets_eles)

            # Get the played sets' scores and simplify to just sets won
            if len(sets_eles) > 1:
                for set in sets_eles[:-1]:
                    try: # Need this to handle exception data where the set score is missing
                        p1_setscore, p2_setscore = [ int(x.text) for x in set.find_elements(By.TAG_NAME, "span[class^='set-value']") ]
                        if p1_setscore > p2_setscore:
                            p1_setswon += 1
                        else:
                            p2_setswon += 1
                    except:
                        p1_setswon = p1_setswon_list[-1]
                        p2_setswon = p2_setswon_list[-1]

            # Current current set's score
            try:
                p1_setscore, p2_setscore = [ int(x.text) for x in sets_eles[-1].find_elements(By.TAG_NAME, "span[class^='set-value']") ]
            except:
                p1_setscore = p1_setscore_list[-1]
                p2_setscore = p2_setscore_list[-1]
                if p1_score_G == "G":
                    p1_setscore += 1
                elif p2_score_G == "G":
                    p2_setscore += 1

            # Now get the actual shot info (speed, subsequent rally length)
            # Speed in KMH
            speed = int(driver.find_elements(By.XPATH, ".//div[@class='speed-data speed-kmh']")[0].text.split('\n')[0])

            # Rally Length
            if outcome == "Ace":
                ral_len = 1
                serve_type = driver.find_elements(By.XPATH, ".//div[@class='md-block md-serve-type']")[0].text.split('\n')[-1]
                try:
                    height_grd = float(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-height-above-ground > div.value").text.split(' ')[0])
                except:
                    height_grd = driver.find_element(By.CSS_SELECTOR, "div.md-block.md-height-above-ground > div.value").text.split(' ')[0]
                try:
                    height_bounce = float(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-bounce-height > div.value").text.split(' ')[0])
                except ValueError:
                    height_bounce = driver.find_element(By.CSS_SELECTOR, "div.md-block.md-bounce-height > div.value").text.split(' ')[0]
                stroke_type = "Serve"
                stroke_hand = "-"
                spin = "-"

            else:
                if shot_type == "Double Faults":
                    ral_len = 0
                else:
                    ral_len = int(driver.find_elements(By.XPATH, ".//div[@class='md-block md-rally-length']")[0].text.split('\n')[-1].split(' Shot')[0])
                try:
                    height_grd = float(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-height-above-ground > div.value").text.split(' ')[0])
                except:
                    height_grd = driver.find_element(By.CSS_SELECTOR, "div.md-block.md-height-above-ground > div.value").text.split(' ')[0]
                height_bounce = '-'
                serve_type = "-"
                stroke_type = driver.find_elements(By.CSS_SELECTOR, "div.md-block.md-stroke > div.value")[0].text
                stroke_hand = driver.find_elements(By.CSS_SELECTOR, "div.md-block.md-hand > div.value")[0].text
                try:
                    spin = float(driver.find_element(By.CSS_SELECTOR, "div.md-block.md-spin > div.value").text.split(' ')[0])
                except ValueError:
                    spin = driver.find_element(By.CSS_SELECTOR, "div.md-block.md-spin > div.value").text.split(' ')[0]

            # Rally Outcome Type
            try:
                outcome_type = driver.find_elements(By.XPATH, ".//div[@class='shot-description']")[0].text.split("'S ")[1].title()
            except:
                outcome_type = "-"

            # Get the side of the court where the ball is pointed
            court_side = selected_ball.find_element(By.XPATH, "../../../../../../..").get_attribute("class")
            if court_side == 'both-court-1':
                player_serving = 1
            else:
                player_serving = 2
            
        ###################################################
        #Some points may have missing court vision data
        except: 
            outcome = driver.find_element(By.CSS_SELECTOR, "svg[class^='selected-ball'] > text:nth-child(3) ").text
            if outcome == "W":
                outcome = "Winner"
            elif outcome == "A":
                outcome = "Ace"
            else:
                outcome = "Lost"
            # Move back to where timeline is visible first
            driver.find_element(By.CSS_SELECTOR, "#CourtVision > div.CourtVisionWrapper > div.both-court-wrap > div.timeline-container > div").location_once_scrolled_into_view
            p1_score_G,p2_score_G = driver.find_element(By.CSS_SELECTOR, "svg[class^='selected-ball'] > text:nth-child(4) ").text.replace('(','').replace(')','').split('-')
            # Other data has no choice but to be '-'
            x = y = np.nan
            speed = serve_type = ral_len = height_bounce = height_bounce = outcome_type = stroke_type = stroke_hand = spin = "-"
            # Fill other data based on current context (check if previous point was from the same game or a new game/set)
            # Find out if current point happened in the current game or a new one
            point_idx = all_time_eles.index(point) #index of the current point in all_time_eles
            # Check if the current point is not the first point
            if point_n != 1:
                point_prev_idx = all_time_eles.index(all_points[point_n-2]) #index of the previous (-1) point in all_time_eles
                if point_idx - point_prev_idx == 1: # point occured in same game
                    # Set game info to be the same as the previous point
                    player_serving = player_serving_list[-1]
                    p1_setswon = p1_setswon_list[-1]
                    p2_setswon = p2_setswon_list[-1]
                    p1_setscore = p1_setscore_list[-1]
                    p2_setscore = p2_setscore_list[-1]
                    set_N = set_N_list[-1]
                    
                # Problem...this happens in a new game
                else:
                    if player_serving_list[-1] == 1:
                        player_serving = 2
                    else:
                        player_serving = 1

                    # First set the no. of sets won to be same as the prev point
                    p1_setswon = p1_setswon_list[-1]
                    p2_setswon = p2_setswon_list[-1]
                    if p1_setscore_list[-1] == 7 or p2_setscore_list[-1] == 7: # i.e. the previous point finished a set, so start a new one
                        p1_setscore = p2_setscore = 0
                        set_N = set_N_list[-1] + 1
                        # Add a set if point was set-winning
                        if p1_score_G == "G":
                            p1_setswon += 1
                        elif p2_score_G == "G":
                            p2_setswon += 1
                    else:
                        # Previous point completed a game, not a set
                        p1_setscore = p1_setscore_list[-1]
                        p2_setscore = p2_setscore_list[-1]
                        set_N = set_N_list[-1]
                        # Add a game if the point was game-winning
                        if p1_score_G == "G":
                            p1_setscore += 1
                        elif p2_score_G == "G":
                            p2_setscore += 1
            else: # There is no previous point data to reference!
                p1_setswon = p2_setswon = p1_setscore = p2_setscore = set_N = player_serving = 0
                # Set player_serving to 0 temporarily

            # Swap the outcome if the point-winner is player 2 instead of 1
            if player_serving == 2:
                if outcome == "Winner":
                    outcome = "Lost"
                else:
                    outcome = "Winner"


        # Append data to the lists
        x_list.append(x)
        y_list.append(y)
        outcomes_list.append(outcome)

        p1_score_G_list.append(p1_score_G)
        p2_score_G_list.append(p2_score_G)
        p1_setswon_list.append(p1_setswon)
        p2_setswon_list.append(p2_setswon)
        p1_setscore_list.append(p1_setscore)
        p2_setscore_list.append(p2_setscore)
        set_N_list.append(set_N)

        speed_list.append(speed)
        type_list.append(serve_type)
        ral_len_list.append(ral_len)
        height_grd_list.append(height_grd)
        height_bounce_list.append(height_bounce)
        stroke_type_list.append(stroke_type)
        stroke_hand_list.append(stroke_hand)
        spin_list.append(spin)
        outcome_type_list.append(outcome_type)
        player_serving_list.append(player_serving)

    shot_type_list = [shot_type]*len(x_list)

    # Transform the x,y data to real court dims
    # This is now the same for both players as in the "compare" view data is plotted on the bottom court half
    x_trans = (np.array(y_list)*-1/box_y)*8.23/2
    y_trans = (np.array(x_list)*1/box_x)*6.4

        
    if swap_players == False:
        df = pd.DataFrame({
            "Player1": [player1]*len(x_list), "Player2":[player2]*len(x_list), "Set": set_N_list, "Point": point_list, "Player1_Sets":p1_setswon_list, "Player2_Sets":p2_setswon_list, \
            "Player1_Game": p1_setscore_list, "Player2_Game": p2_setscore_list, "Player1_Score": p1_score_G_list , "Player2_Score": p2_score_G_list, \
            "Striker":player_serving_list, "Shot_Type": shot_type_list, \
            "X":x_trans, "Y": y_trans, "Speed_kmh": speed_list, "Serve_Type": type_list, "Rally_Length":ral_len_list, "Height_Above_Ground":height_grd_list, "Bounce_Height": height_bounce_list, \
            "Stroke_Type": stroke_type_list, "Stroke_Hand": stroke_hand_list, "Spin": spin_list, \
            "Outcome": outcomes_list, "Outcome_Type":outcome_type_list,})

    else: # i.e. the player 1 and 2 name orders contradict between the stats page and court vision
        df = pd.DataFrame({
            "Player1": [player2]*len(x_list), "Player2":[player1]*len(x_list), "Set": set_N_list, "Point": point_list, "Player1_Sets":p2_setswon_list, "Player2_Sets":p1_setswon_list, \
            "Player1_Game": p2_setscore_list, "Playe`r2_Game": p1_setscore_list, "Player1_Score": p2_score_G_list , "Player2_Score": p1_score_G_list, \
            "Striker":player_serving_list, "Shot_Type": shot_type_list, \
            "X":x_trans, "Y": y_trans, "Speed_kmh": speed_list, "Serve_Type": type_list, "Rally_Length":ral_len_list, "Height_Above_Ground":height_grd_list, "Bounce_Height": height_bounce_list, \
            "Stroke_Type": stroke_type_list, "Stroke_Hand": stroke_hand_list, "Spin": spin_list, \
            "Outcome": outcomes_list, "Outcome_Type":outcome_type_list,})


    return df

In [26]:
# Scrape the "Winners" shot data for the current match, keeping the player order as given
df_w = scrape_shot_data(
    driver,
    player1,
    player2,
    "Winners",
    swap_players=False,
)

In [24]:
# Write the winners data to CSV; spaces in the player names become hyphens in the filename
df_w.to_csv(
    f'AO_R2_{player1.replace(" ","-")}_{player2.replace(" ","-")}_2023_winners.csv',
    index=False,
)

## 5. Tournament Scraping

In [11]:
from ast import literal_eval

In [12]:
# Load the processed results table; both player-uuid columns are parsed
# with ast.literal_eval while reading
df_results = pd.read_csv(
    "data/AO_results-all_processed_2023.csv",
    converters={
        "team1_player_uuid": literal_eval,
        "team2_player_uuid": literal_eval,
    },
)

In [13]:
# Keep only the Men's Singles matches, i.e. rows whose match_id contains "MS"
df_results_MS = df_results.loc[df_results["match_id"].str.contains("MS")]
# Show the last few rows as a quick sanity check
df_results_MS.tail()

Unnamed: 0,date,actual_start_time,match_centre_link,uuid,match_id,team_substituted_footnote,team_substituted,id,order,promoted,...,player1_name,team2_player_uuid,player2_name,event_uuid,court_id,session,session_order,restricted_start_time,restricted_start_time_timestamp,activity_order
541,2023-01-25,04:22,https://ausopen.com/match/2023-ben-shelton-vs-...,225521,MS504,,False,225521,4,True,...,Ben Shelton,[718a4d5f-cb5f-48bb-ba45-aea2697293fc],Tommy Paul,675d28ec-b177-46b2-959c-595f3ca862a0,12,Day session,0,14:30,1674617000.0,2
542,2023-01-25,08:47,https://ausopen.com/match/2023-andrey-rublev-v...,225316,MS503,,False,225316,3,False,...,Andrey Rublev,[7b04e056-88b3-4e0a-9431-84f2fe039993],Novak Djokovic,675d28ec-b177-46b2-959c-595f3ca862a0,12,Night session,1,,,0
639,2023-01-27,03:43,https://ausopen.com/match/2023-karen-khachanov...,225516,MS601,,False,225516,1,True,...,Karen Khachanov,[205ed9a3-96ca-4d75-b222-a705d572952a],Stefanos Tsitsipas,675d28ec-b177-46b2-959c-595f3ca862a0,12,Day session,0,14:30,1674790000.0,1
640,2023-01-27,08:45,https://ausopen.com/match/2023-novak-djokovic-...,225741,MS602,,False,225741,2,True,...,Novak Djokovic,[718a4d5f-cb5f-48bb-ba45-aea2697293fc],Tommy Paul,675d28ec-b177-46b2-959c-595f3ca862a0,12,Night session,1,,,0
661,2023-01-29,08:45,https://ausopen.com/match/2023-stefanos-tsitsi...,225626,MS701,,False,225626,1,True,...,Stefanos Tsitsipas,[7b04e056-88b3-4e0a-9431-84f2fe039993],Novak Djokovic,675d28ec-b177-46b2-959c-595f3ca862a0,12,Twilight session,2,19:30,1674981000.0,1


In [299]:
# Resets Driver: build a new Edge service and WebDriver using the shared options
edge_driver_path = EdgeChromiumDriverManager().install()
service = EdgeService(executable_path=edge_driver_path)
driver = webdriver.Edge(service=service, options=options)

In [302]:
# Scrape a single match using the values currently held in the notebook globals.
# NOTE(review): `round` must have been assigned in an earlier cell, otherwise this
# passes the builtin function — confirm (the tournament loop uses `round_n` instead).
df_serves_all = scrape_match_data(
    driver,
    url,
    tournament,
    year,
    round,
    uuid,
    player1,
    player2,
)

In [20]:
# Preview the output CSV path built from the current round_short, player1 and player2
f'data/court-vision/AO_{round_short}_{player1.replace(" ","-")}_{player2.replace(" ","-")}_2023.csv'

'data/court-vision/AO_1R_Kyle-Edmund_Jannik-Sinner_2023.csv'

In [19]:
# Scrape the court-vision data for the first few Men's Singles matches and
# write one CSV per match. A failure on one match is reported (with its cause)
# and skipped so that the remaining matches are still attempted.
# Drop the 5-match cap (use len(df_results_MS) alone) to scrape every match.
for i in range(min(5, len(df_results_MS))):
    row = df_results_MS.iloc[i]

    url = row.match_centre_link + "#!infosys-3"
    round_n = row['round']
    player1 = row['player1_name']
    player2 = row['player2_name']
    uuid = row['uuid']

    # Short round code used in the output filename
    if "Round" in round_n:
        # Initial of every word in the round label
        round_short = "".join(word[0] for word in round_n.split())
    elif round_n == "Quarterfinals":
        round_short = "QF"
    elif round_n == "Semifinals":
        round_short = "SF"
    elif round_n == "Final":
        round_short = "F"
    else:
        # Unknown label: fall back to the label itself so the filename never
        # reuses a stale code from a previous iteration (or hits a NameError)
        round_short = round_n.replace(" ", "-")

    try:
        df_serves_all = scrape_match_data(driver, url, tournament, year, round_n, uuid, player1, player2)

        df_serves_all.to_csv(f'data/court-vision/AO_{round_short}_{player1.replace(" ","-")}_{player2.replace(" ","-")}_2023.csv', index=False)
        print(f"Done {player1} vs {player2}")
    except Exception as exc:
        # Best-effort scraping: say why this match failed, then move on.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
        print(f"Failed {player1} vs {player2}: {type(exc).__name__}: {exc}")

    # Pause for a random 0-5 seconds before the next match
    sleeptime = np.random.uniform(0, 5)
    sleep(sleeptime)

# # Quit Browser
# driver.quit()

Failed Rafael Nadal vs Jack Draper
Failed Marcos Giron vs Daniil Medvedev
Failed Hubert Hurkacz vs Pedro Martinez
Failed Stefanos Tsitsipas vs Quentin Halys
Failed Kyle Edmund vs Jannik Sinner


In [21]:
# Display the current value of `url`
url

'https://ausopen.com/match/2023-kyle-edmund-vs-jannik-sinner-ms124#!infosys-3'

In [287]:
# Open the current `url` in the browser session for manual inspection
driver.get(url)

In [288]:
# Quit the WebDriver session
driver.quit()