# Introduction

- Webscrape https://www.tennislive.net for pro-level point-by-point (PBP) match data

### Import packages

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Load the data
Enter the URL of the match to scrape.

In [3]:
# Match page to scrape; the value is handed to extract_point_by_point below.
url = (
    'https://www.tennislive.net/atp/match/'
    'borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/'
)

In [4]:
def extract_point_by_point(url, timeout=30):
    """Scrape a match page into a point-by-point DataFrame.

    Parameters
    ----------
    url : str
        Address of the match page to download.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per listed point with the columns ``pointScore``,
        ``serverName``, ``gameScore``, ``setNum`` and ``isBreakPoint``.
        ``pointScore`` and ``gameScore`` are reversed on every row whose
        server is the second listed player.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain two player names or any point data.
    """
    # get HTML from url and convert to BeautifulSoup
    page = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to parse an error page
    page.raise_for_status()
    # 'html' lets BeautifulSoup pick whichever HTML parser is installed
    soup = BeautifulSoup(page.text, 'html')

    # find player names: the first two `w130` cells on the page
    name_cells = soup.find_all('td', class_='w130')
    if len(name_cells) < 2:
        raise ValueError(f'Could not find two player names at {url}')
    player1 = name_cells[0].text.strip()
    player2 = name_cells[1].text.strip()

    # get all set data; the first `table_stats_match` table is skipped
    content = soup.find_all('table', class_='table_stats_match')[1:]

    columns = ['pointScore', 'serverName', 'gameScore', 'setNum', 'isBreakPoint']

    # one DataFrame per set table
    set_dfs = []

    for set_index, set_table in enumerate(content):
        # get all table rows (those without a class) for the current set
        current_set = set_table.find_all('tr', class_=None)
        if len(current_set) < 2:
            # nothing to pair up in this table
            continue

        # determine the starting row by how the table is formatted
        # NOTE(review): presumably a leading row without an `mp_15` cell
        # after it is a header row to skip -- confirm against the site markup
        start = 0
        if not current_set[1].find('td', class_='mp_15'):
            start = 1

        records = []

        # step by 2 since each server row is paired with the PBP row after it;
        # stopping one short of the end guarantees row i + 1 always exists
        for i in range(start, len(current_set) - 1, 2):
            header_row = current_set[i]
            points_row = current_set[i + 1]

            # the image alt text minus its last word is used as the server name
            server_name = ' '.join(header_row.find('img')['alt'].split()[0:-1])
            game_label = header_row.find('td', class_='mp_info_txt').text.strip()
            points = points_row.find('td').text.split(', ')

            # create a new row for each point
            for point in points:
                is_break_point = '[BP]' in point
                if is_break_point:
                    point = point.replace('[BP]', '').strip()

                records.append(
                    (point, server_name, game_label, set_index + 1, is_break_point)
                )

        if not records:
            # a table without points would break the `.iloc[-1]` lookups below
            continue

        # construct the dataframe
        df = pd.DataFrame(records, columns=columns)

        last_game_score = df['gameScore'].iloc[-1]

        # A final game label containing a 6 that is not 6-6, 6-5 or 5-6 is
        # moved to the next set number and relabelled 0-0.
        # NOTE(review): presumably this is a deciding match tiebreak listed
        # under the previous set's table -- confirm against the site.
        if '6' in last_game_score and last_game_score not in ('6-6', '6-5', '5-6'):
            final_game = df['gameScore'] == last_game_score
            df.loc[final_game, 'setNum'] += 1
            df.loc[final_game, 'gameScore'] = '0-0'

        # switch server names when in tiebreaker
        # NOTE(review): this tests the label captured *before* the relabel
        # above, so rows relabelled to 0-0 there are not processed here.
        if '6-6' in last_game_score or '0-0' in last_game_score:
            tiebreaker_df = df[df['gameScore'] == last_game_score]
            tiebreaker_rows = df.shape[0] - tiebreaker_df.shape[0]
            first_server = tiebreaker_df['serverName'].iloc[0]

            # Every tiebreak row starts out credited to the first server, so
            # the rows handed over must go to the *other* player. The previous
            # code always used player1, which changed nothing when player1
            # served first.
            other_server = player2 if first_server == player1 else player1

            # rows 1-2, 5-6, 9-10, ... of the tiebreak go to the other player
            for i in range(1, len(tiebreaker_df), 4):
                df.loc[tiebreaker_rows + i:tiebreaker_rows + i + 1, 'serverName'] = other_server

        set_dfs.append(df)

    if not set_dfs:
        raise ValueError(f'No point-by-point data found at {url}')

    df = pd.concat(set_dfs).reset_index(drop=True)

    # flip point_score and game_score on the rows served by player2
    served_by_player2 = df['serverName'] == player2
    for column in ('pointScore', 'gameScore'):
        df.loc[served_by_player2, column] = df.loc[served_by_player2, column].map(
            lambda score: '-'.join(score.split('-')[::-1])
        )

    return df

In [5]:
# Scrape the match at `url` and preview the first 20 rows of the result.
df = extract_point_by_point(url)
df.head(20)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Borna Gojo,0-0,1,False
1,15-0,Borna Gojo,0-0,1,False
2,15-15,Borna Gojo,0-0,1,False
3,30-15,Borna Gojo,0-0,1,False
4,30-30,Borna Gojo,0-0,1,False
5,30-40,Borna Gojo,0-0,1,True
6,0-0,Rudy Quan,1-0,1,False
7,15-0,Rudy Quan,1-0,1,False
8,15-15,Rudy Quan,1-0,1,False
9,30-15,Rudy Quan,1-0,1,False


## Format to UCLA Tennis Consulting Standard

### Add pointNumber column

In [6]:
# Number the rows 1..N in their current order and put the counter in the first column.
df.insert(loc=0, column='pointNumber', value=range(1, df.shape[0] + 1))

### Add gameNumber column

In [7]:
def game_score(game):
    """Return the 1-based number of the game being played.

    Parameters
    ----------
    game : str
        Game score formatted as ``'<games>-<games>'``, e.g. ``'3-2'``.

    Returns
    -------
    int
        The sum of both game counts plus one.
    """
    # Add the two counts as whole numbers; summing individual digits would
    # turn a score such as '10-8' into 1 + 0 + 8 instead of 10 + 8.
    return sum(int(games) for games in game.split('-')) + 1

# Derive each row's game number from its game score.
df['gameNumber'] = df['gameScore'].map(game_score)

### Add player1Name and player2Name columns

In [8]:
# Distinct server names, in order of first appearance in the frame.
serverNames = df['serverName'].unique()

# The first name to appear becomes player 1, the second player 2.
player1, player2 = serverNames[0], serverNames[1]

# Broadcast both names to every row.
df['player1Name'], df['player2Name'] = player1, player2

### Add returnerName column

In [9]:
def switch_names(name):
    """Return the opposite player for `name`.

    Gives the module-level `player2` when `name` equals `player1`, and
    `player1` for any other value.
    """
    return player2 if name == player1 else player1

# The returner on each row is whichever player is not serving.
df['returnerName'] = df['serverName'].map(switch_names)

### Add tiebreakScore column

In [10]:
# Move the point scores of a set's final game into tiebreakScore when that
# game is labelled '6-6' or '0-0'.
if 'tiebreakScore' not in df.columns:
    # Object dtype from the start: the column receives score strings, and
    # writing strings into a float NaN column is a deprecated upcast.
    df['tiebreakScore'] = pd.Series(np.nan, index=df.index, dtype=object)

for i in df['setNum'].unique():
    current_set = df[df['setNum'] == i]

    last_game_score = current_set['gameScore'].iloc[-1]

    if '6-6' in last_game_score or '0-0' in last_game_score:
        # Only copy rows that still hold a point score, so re-running this
        # cell does not overwrite stored tiebreak scores with NaN.
        tiebreak_rows = (
            (df['setNum'] == i)
            & (df['gameScore'] == last_game_score)
            & df['pointScore'].notna()
        )
        df.loc[tiebreak_rows, 'tiebreakScore'] = df.loc[tiebreak_rows, 'pointScore']

# Set the pointScore to NaN where tiebreakScore is not NaN
df.loc[df['tiebreakScore'].notna(), 'pointScore'] = np.nan

### Add setScore column
#### WARNING: Manually input the setScores for desired player

In [11]:
# Set score in effect during each set, entered by hand: entry k is used for set k + 1.
set_list = ['0-0', '1-0']


def set_setScores(x):
    """Look up the set score for 1-based set number `x`.

    Returns the matching entry of `set_list`, or None when `x` falls outside
    1..len(set_list).
    """
    if not (1 <= x <= len(set_list)):
        return None
    return set_list[x - 1]
        
# Attach the manually entered set score to every row via its set number.
df['setScore'] = df['setNum'].map(set_setScores)

In [12]:
# Display the frame built so far (a notebook renders the value of a cell's last expression).
df

Unnamed: 0,pointNumber,pointScore,serverName,gameScore,setNum,isBreakPoint,gameNumber,player1Name,player2Name,returnerName,tiebreakScore,setScore
0,1,0-0,Borna Gojo,0-0,1,False,1,Borna Gojo,Rudy Quan,Rudy Quan,,0-0
1,2,15-0,Borna Gojo,0-0,1,False,1,Borna Gojo,Rudy Quan,Rudy Quan,,0-0
2,3,15-15,Borna Gojo,0-0,1,False,1,Borna Gojo,Rudy Quan,Rudy Quan,,0-0
3,4,30-15,Borna Gojo,0-0,1,False,1,Borna Gojo,Rudy Quan,Rudy Quan,,0-0
4,5,30-30,Borna Gojo,0-0,1,False,1,Borna Gojo,Rudy Quan,Rudy Quan,,0-0
...,...,...,...,...,...,...,...,...,...,...,...,...
132,133,0-0,Rudy Quan,1-5,3,False,7,Borna Gojo,Rudy Quan,Borna Gojo,,
133,134,0-15,Rudy Quan,1-5,3,False,7,Borna Gojo,Rudy Quan,Borna Gojo,,
134,135,0-30,Rudy Quan,1-5,3,False,7,Borna Gojo,Rudy Quan,Borna Gojo,,
135,136,0-40,Rudy Quan,1-5,3,True,7,Borna Gojo,Rudy Quan,Borna Gojo,,


### Add Timestamp columns

In [13]:
# Placeholder columns, created empty in this order: Position, pointEndPosition, Duration.
df['Position'] = df['pointEndPosition'] = df['Duration'] = ''

### Add Team columns
#### WARNING: Manually set Team Names

In [14]:
# Team labels are constant for the whole match; the opponent is left blank.
df['clientTeam'], df['opponentTeam'] = 'UCLA', ''

### Add name column

In [15]:
def _point_name(row):
    """Build the display label for one row, preferring tiebreakScore when it is set."""
    if pd.notna(row['tiebreakScore']):
        score = row['tiebreakScore']
    else:
        score = row['pointScore']
    return f"Set {row['setNum']}: {row['gameScore']}, {score} {row['serverName']} Serving"


df['Name'] = df.apply(_point_name, axis=1)

## Specify Order

In [16]:
# Final column layout; df is rebound to a frame holding exactly these columns, in this order.
column_order = [
    'Name',
    'pointNumber',
    'setNum',
    'gameNumber',
    'player1Name',
    'player2Name',
    'pointScore',
    'gameScore',
    'setScore',
    'tiebreakScore',
    'serverName',
    'returnerName',
    'Position',
    'pointEndPosition',
    'Duration',
    'isBreakPoint',
    'clientTeam',
    'opponentTeam',
]
df = df[column_order]


In [17]:
# Preview the first five rows of the reordered frame (head() defaults to five).
df.head()

Unnamed: 0,Name,pointNumber,setNum,gameNumber,player1Name,player2Name,pointScore,gameScore,setScore,tiebreakScore,serverName,returnerName,Position,pointEndPosition,Duration,isBreakPoint,clientTeam,opponentTeam
0,"Set 1: 0-0, 0-0 Borna Gojo Serving",1,1,1,Borna Gojo,Rudy Quan,0-0,0-0,0-0,,Borna Gojo,Rudy Quan,,,,False,UCLA,
1,"Set 1: 0-0, 15-0 Borna Gojo Serving",2,1,1,Borna Gojo,Rudy Quan,15-0,0-0,0-0,,Borna Gojo,Rudy Quan,,,,False,UCLA,
2,"Set 1: 0-0, 15-15 Borna Gojo Serving",3,1,1,Borna Gojo,Rudy Quan,15-15,0-0,0-0,,Borna Gojo,Rudy Quan,,,,False,UCLA,
3,"Set 1: 0-0, 30-15 Borna Gojo Serving",4,1,1,Borna Gojo,Rudy Quan,30-15,0-0,0-0,,Borna Gojo,Rudy Quan,,,,False,UCLA,
4,"Set 1: 0-0, 30-30 Borna Gojo Serving",5,1,1,Borna Gojo,Rudy Quan,30-30,0-0,0-0,,Borna Gojo,Rudy Quan,,,,False,UCLA,


# Add in Timestamps

In [18]:
# # Put timestamps file here
# timestamp = pd.read_csv("timestamps_9uYpCdMy4eA.csv")

In [19]:
# if timestamp.shape[0] != df.shape[0]:
#     raise ValueError("Error: The number of rows in timestamp and df are not the same.")
# else:
#     # Assign values to df
#     df['Position'] = timestamp['pointStartTime'].values
#     df['pointEndPosition'] = timestamp['endStartTime'].values
#     print("\u2713 Check passed")

# Output the Point Visuals csv

In [20]:
# Write df to a CSV file named after both players.

# Take each player's name from the first row and drop the spaces so it is
# safe to embed in a file name.
player1NameNoSpace, player2NameNoSpace = (
    df[column].iloc[0].replace(" ", "")
    for column in ('player1Name', 'player2Name')
)

# Save without the index column.
output_path = f'Shortened_Point_Visuals_{player1NameNoSpace}_{player2NameNoSpace}.csv'
df.to_csv(output_path, index=False)