# Introduction

- Webscrape https://www.tennislive.net for pro-level match data
- Output two .csv files 
    - Single row dataframe for summary statistics dashboard
    - point by point (Pbp) level csv file for Match Viewer website => to attach timestamps using website tagger

need to write some documentation so

1. People reviewing your GitHub can understand what’s it’s doing. (Make a read me file)

2. We can use this in our products for tennis consulting

# Install Packages

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [365]:
page = requests.get('https://www.tennislive.net/atp/match/bernard-tomic-VS-govind-nanda/m25-tulsa-2024/')

In [366]:
soup = BeautifulSoup(page.text, 'html')

### Check Status Code
- 200 = success
- 404 = failed

In [367]:
page.status_code

200

# Single Row For Summary Statistics Dashboard

### Match Info

In [369]:
table = soup.find('table', class_ = 'table_pmatches')

date = soup.find('td', class_ = 'w50').text.strip().split(' ')[0]
round_info = soup.find_all('td', class_='w50')[1].text.strip()
player1 = soup.find_all('td', class_='w130')[0].text.strip()
player2 = soup.find_all('td', class_='w130')[1].text.strip()
score = soup.find('span', id='score').text.strip()
tournament = soup.find('td', class_='w200').find('a').text.strip()

# Create a DataFrame
data = {
    'Date': [date],
    'Round': [round_info],
    'Player 1': [player1],
    'Player 2': [player2],
    'Score': [score],
    'Tournament': [tournament]
}

df_match_info = pd.DataFrame(data)

In [370]:
df_match_info

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament
0,21.06.24,1/4,Bernard Tomic,Govind Nanda,"6-3, 6-1",M25 Tulsa


### Match Statistics

In [372]:
# Function to extract data for a given statistic
def extract_statistic(statistic):
    row = soup.find('td', string=statistic).parent
    player1_stat = row.find_all('td')[1].text.strip().split(' ')[0]
    player2_stat = row.find_all('td')[2].text.strip().split(' ')[0]
    return player1_stat, player2_stat

statistics = [
    '1st SERVE %',
    '1st SERVE POINTS WON',
    '2nd SERVE POINTS WON',
    'TOTAL RETURN POINTS WON',
    'TOTAL POINTS WON',
    'DOUBLE FAULTS',
    'ACES'
]

# List of statistics to extract 
stats_length = len(soup.find_all('td', class_='info_txt'))

if stats_length == 8:
    statistics.insert(3, 'BREAK POINTS WON')  # Insert 'BREAK POINTS WON' at the correct position

# Dictionary to hold the statistics
data = {}

# Extract and store statistics for both players
for stat in statistics:
    player1_stat, player2_stat = extract_statistic(stat)
    stat_name = stat.lower().replace(' ', '_').replace('%', 'percentage').replace('/', '_').replace('(','').replace(')','')
    data[f'{stat_name}_player1'] = player1_stat
    data[f'{stat_name}_player2'] = player2_stat

# Convert the dictionary to a DataFrame
df_stats = pd.DataFrame([data])

In [373]:
df_stats

Unnamed: 0,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,2nd_serve_points_won_player1,2nd_serve_points_won_player2,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,33/70,28/48,27/33,16/28,16/37,8/20,5/7,1/10,24/48,27/70,67/118,51/118,2,1,7,0


### Combine the DataFrames by column binding them


In [374]:
# Combine the DataFrames by column binding them
df_combined = pd.concat([df_match_info, df_stats], axis=1)

df_combined

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,21.06.24,1/4,Bernard Tomic,Govind Nanda,"6-3, 6-1",M25 Tulsa,33/70,28/48,27/33,16/28,...,5/7,1/10,24/48,27/70,67/118,51/118,2,1,7,0


In [375]:

# # Extract match information
# date = soup.find('td', class_='w50').text.strip().split(' ')[0]
# round_info = soup.find_all('td', class_='w50')[1].text.strip()
# player1 = soup.find_all('td', class_='w130')[0].text.strip()
# player2 = soup.find_all('td', class_='w130')[1].text.strip()
# score = soup.find('span', id='score').text.strip()
# tournament = soup.find('td', class_='w200').find('a').text.strip()

# # Create a DataFrame for match info
# match_data = {
#     'Date': [date],
#     'Round': [round_info],
#     'Player 1': [player1],
#     'Player 2': [player2],
#     'Score': [score],
#     'Tournament': [tournament]
# }
# df_match_info = pd.DataFrame(match_data)

# # Function to extract data for a given statistic
# def extract_statistic(statistic):
#     row = soup.find('td', string=statistic).parent
#     player1_stat = row.find_all('td')[1].text.strip().split(' ')[0]
#     player2_stat = row.find_all('td')[2].text.strip().split(' ')[0]
#     return player1_stat, player2_stat

# # List of statistics to extract
# statistics = [
#     '1st SERVE %',
#     '1st SERVE POINTS WON',
#     '2nd SERVE POINTS WON',
#     'TOTAL RETURN POINTS WON',
#     'TOTAL POINTS WON',
#     'DOUBLE FAULTS',
#     'ACES'
# ]

# # List of statistics to extract 
# stats_length = len(soup.find_all('td', class_='info_txt'))

# if stats_length == 8:
#     statistics.insert(3, 'BREAK POINTS WON')  # Insert 'BREAK POINTS WON' at the correct position



# # Dictionary to hold the statistics
# stat_data = {}

# # Extract and store statistics for both players
# for stat in statistics:
#     player1_stat, player2_stat = extract_statistic(stat)
#     stat_name = stat.lower().replace(' ', '_').replace('%', 'percentage').replace('/', '_').replace('(','').replace(')','')
#     stat_data[f'{stat_name}_player1'] = player1_stat
#     stat_data[f'{stat_name}_player2'] = player2_stat

# # Convert the dictionary to a DataFrame
# df_stats = pd.DataFrame([stat_data])

# # Combine the DataFrames by column binding them
# df_combined = pd.concat([df_match_info, df_stats], axis=1)

# df_combined

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,21.06.24,1/4,Bernard Tomic,Govind Nanda,"6-3, 6-1",M25 Tulsa,33/70,28/48,27/33,16/28,...,5/7,1/10,24/48,27/70,67/118,51/118,2,1,7,0


# Extract Data from Multiple Links 

In [346]:
import pandas as pd

import requests

from bs4 import BeautifulSoup

In [376]:

def extract_match_data(urls):
    data_list = []
    
    for url in urls:
        # Fetch HTML content from the URL
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract match information
        date = soup.find('td', class_='w50').text.strip().split(' ')[0]
        round_info = soup.find_all('td', class_='w50')[1].text.strip()
        player1 = soup.find_all('td', class_='w130')[0].text.strip()
        player2 = soup.find_all('td', class_='w130')[1].text.strip()
        score = soup.find('span', id='score').text.strip()
        tournament = soup.find('td', class_='w200').find('a').text.strip()
        
        # Extract statistics function
        def extract_statistic(statistic):
            row = soup.find('td', string=statistic).parent
            player1_stat = row.find_all('td')[1].text.strip().split(' ')[0]
            player2_stat = row.find_all('td')[2].text.strip().split(' ')[0]
            return player1_stat, player2_stat
        
            # List of statistics to extract
            statistics = [
                '1st SERVE %',
                '1st SERVE POINTS WON',
                '2nd SERVE POINTS WON',
                'TOTAL RETURN POINTS WON',
                'TOTAL POINTS WON',
                'DOUBLE FAULTS',
                'ACES'
            ]

            # List of statistics to extract 
            stats_length = len(soup.find_all('td', class_='info_txt'))

            if stats_length == 8:
                statistics.insert(3, 'BREAK POINTS WON')  # Insert 'BREAK POINTS WON' at the correct position

        
        # Extract statistics for original players
        
        stat_data = {}
        
        for stat in statistics:
            player1_stat, player2_stat = extract_statistic(stat)
            stat_name = stat.lower().replace(' ', '_').replace('%', 'percentage').replace('/', '_').replace('(','').replace(')','')
            stat_data[f'{stat_name}_player1'] = player1_stat
            stat_data[f'{stat_name}_player2'] = player2_stat
        
        # Append match data to the list
        match_data = {
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data
        }
        data_list.append(match_data)
    
    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data_list)
    
    # Sort DataFrame by Date in descending order
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
    
    df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)
    
    return df


In [377]:
extract_match_data(['https://www.tennislive.net/atp/match/bernard-tomic-VS-govind-nanda/m25-tulsa-2024/'])

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-06-21,1/4,Bernard Tomic,Govind Nanda,"6-3, 6-1",M25 Tulsa,33/70,28/48,27/33,16/28,...,5/7,1/10,24/48,27/70,67/118,51/118,2,1,7,0


### Accomodating for the player profile link

In [396]:
response = requests.get('https://www.tennislive.net/atp/match/rudy-quan-VS-shintaro-imai/little-rock-challenger-2024/')
soup = BeautifulSoup(response.content, 'html.parser')

### Modify Function
- https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/
- ^modify function for profiles with no statistics

In [397]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_match_data(urls):
    data_list = []
    
    for url in urls:
        # Fetch HTML content from the URL
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract match information
        date = soup.find('td', class_='w50').text.strip().split(' ')[0]
        round_info = soup.find_all('td', class_='w50')[1].text.strip()
        player1 = soup.find_all('td', class_='w130')[0].text.strip()
        player2 = soup.find_all('td', class_='w130')[1].text.strip()
        score = soup.find('span', id='score').text.strip()
        tournament = soup.find('td', class_='w200').find('a').text.strip()
        
        # Extract statistics function
        def extract_statistic(statistic):
            row = soup.find('td', string=statistic).parent
            player1_stat = row.find_all('td')[1].text.strip().split(' ')[0]
            player2_stat = row.find_all('td')[2].text.strip().split(' ')[0]
            return player1_stat, player2_stat
        
        # List of statistics to extract
        statistics = [
            '1st SERVE %',
            '1st SERVE POINTS WON',
            '2nd SERVE POINTS WON',
            'TOTAL RETURN POINTS WON',
            'TOTAL POINTS WON',
            'DOUBLE FAULTS',
            'ACES'
        ]

        # List of statistics to extract 
        stats_length = len(soup.find_all('td', class_='info_txt'))

        if stats_length == 8:
            statistics.insert(3, 'BREAK POINTS WON')  # Insert 'BREAK POINTS WON' at the correct position
        
        # Extract statistics for original players
        stat_data = {}
        table = soup.find_all('table', class_ = 'table_stats_match')

        if len(table) > 0 and table[0].find('td', class_='info_txt'):  # Process only if the table has data
            for stat in statistics:
                player1_stat, player2_stat = extract_statistic(stat)  # Extract stats for each player
                stat_name = (stat.lower()
                                .replace(' ', '_')
                                .replace('%', 'percentage')
                                .replace('/', '_')
                                .replace('(', '')
                                .replace(')', ''))

                # Store stats in dictionary
                stat_data[f'{stat_name}_player1'] = player1_stat
                stat_data[f'{stat_name}_player2'] = player2_stat
        else:
            # If the table is empty, set all stats to None
            for stat in statistics:
                stat_name = (stat.lower()
                                .replace(' ', '_')
                                .replace('%', 'percentage')
                                .replace('/', '_')
                                .replace('(', '')
                                .replace(')', ''))

                stat_data[f'{stat_name}_player1'] = None
                stat_data[f'{stat_name}_player2'] = None

        # Append match data to the list
        match_data = {
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data
        }
        data_list.append(match_data)
    
    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data_list)
    
    # Sort DataFrame by Date in descending order
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
    
    df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)
    
    return df

In [398]:
# Example usage
urls = ['https://www.tennislive.net/atp/match/rudy-quan-VS-viktor-frydrych/us-open-juniors-2024/#google_vignette',
       'https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/',
       'https://www.tennislive.net/atp/match/rudy-quan-VS-viktor-frydrych/us-open-juniors-2024/']
df = extract_match_data(urls)
df

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,2nd_serve_points_won_player1,2nd_serve_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-05-02,2nd round,Rudy Quan,Jonas Pelle Hartenstein,- walk over,M15 Orange,,,,,,,,,,,,,,
1,2024-09-01,1st round,Rudy Quan,Viktor Frydrych,"7-5, 6-1",U.S. Open,,,,,,,,,,,,,,
2,2024-09-01,1st round,Rudy Quan,Viktor Frydrych,"7-5, 6-1",U.S. Open,,,,,,,,,,,,,,


In [382]:
extract_match_data(['https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/'])

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,2nd_serve_points_won_player1,2nd_serve_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-05-02,2nd round,Rudy Quan,Jonas Pelle Hartenstein,- walk over,M15 Orange,,,,,,,,,,,,,,


In [383]:
extract_match_data(['https://www.tennislive.net/atp/match/rudy-quan-VS-shintaro-imai/little-rock-challenger-2024/'])

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-05-26,q 1,Rudy Quan,Shintaro Imai,"4-6, 6-4, 6-2",Little Rock,55/76,54/118,36/55,34/54,...,6/19,4/6,55/118,30/76,101/194,93/194,2,9,0,15


In [384]:
extract_match_data(['https://www.tennislive.net/atp/match/rudy-quan-VS-viktor-frydrych/us-open-juniors-2024/'])

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,2nd_serve_points_won_player1,2nd_serve_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-09-01,1st round,Rudy Quan,Viktor Frydrych,"7-5, 6-1",U.S. Open,,,,,,,,,,,,,,


### INPUT (playerName) and (links) HERE

In [385]:
# Example usage:
# player_name = "Mikel Lopez Hernaez"

urls = ['https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/',
        'https://www.tennislive.net/atp/match/rudy-quan-VS-shintaro-imai/little-rock-challenger-2024/',
        'https://www.tennislive.net/atp/match/rudy-quan-VS-filip-peliwo/little-rock-challenger-2024/',
        'https://www.tennislive.net/atp/match/rudy-quan-VS-stefan-kozlov/little-rock-challenger-2024/',
        'https://www.tennislive.net/atp/match/rudy-quan-VS-andres-andrade/little-rock-challenger-2024/',
        'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/'
]

# extract_match_data(player_name, urls)
extract_match_data(urls)

# page.status_code

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2024-05-02,2nd round,Rudy Quan,Jonas Pelle Hartenstein,- walk over,M15 Orange,,,,,...,,,,,,,,,,
1,2024-05-26,q 1,Rudy Quan,Shintaro Imai,"4-6, 6-4, 6-2",Little Rock,55/76,54/118,36/55,34/54,...,55/118,30/76,101/194,93/194,2.0,9.0,0.0,15.0,6/19,4/6
2,2024-05-27,qual.,Rudy Quan,Filip Peliwo,"6-4, 6-2",Little Rock,36/51,23/49,20/36,12/23,...,32/49,25/51,58/100,42/100,1.0,4.0,0.0,1.0,7/7,4/8
3,2024-05-28,1st round,Rudy Quan,Stefan Kozlov,"6-1, 7-5",Little Rock,42/62,41/58,25/42,19/41,...,34/58,27/62,69/120,51/120,1.0,3.0,0.0,0.0,6/11,3/6
4,2024-05-30,2nd round,Rudy Quan,Andres Andrade,"6-3, 3-6, 6-3",Little Rock,59/84,44/89,32/59,25/44,...,48/89,42/84,90/173,83/173,7.0,10.0,1.0,3.0,8/17,6/9
5,2024-05-31,1/4,Yuta Shimizu,Rudy Quan,"6-4, 6-3",Little Rock,37/55,64/80,21/37,29/64,...,44/80,25/55,74/135,61/135,1.0,2.0,4.0,0.0,7/16,4/7


# Accomodate for Player Profile Page

### Cj Notes

In [400]:
page = requests.get('https://www.tennislive.net/atp/ethan-quinn/')

In [401]:
soup = BeautifulSoup(page.text, 'html')

In [402]:
table = soup.find_all('table', class_ = 'table_pmatches')[1]

In [403]:
rows = table.find_all('td', class_='w50')

### Grab the Link
##### WARNING: This also grabs the links of the player profiles, we only want the match links
- use the fact that the match links are every 3rd index as per the previous cells

In [404]:
match_links = []

for a in table.find_all('a', href=True):
    if "https://www.tennislive.net/atp/match/" in a['href']:
        match_links.append(a['href'])
        
        
match_links

['https://www.tennislive.net/atp/match/ethan-quinn-VS-enzo-couacaud/charlottesville-challenger-2024/',
 'https://www.tennislive.net/atp/match/patrick-kypson-VS-ethan-quinn/sioux-falls-challenger-2024/',
 'https://www.tennislive.net/atp/match/tristan-schoolkate-VS-ethan-quinn/fairfield-challenger-2024/',
 'https://www.tennislive.net/atp/match/ethan-quinn-VS-rudy-quan/fairfield-challenger-2024/',
 'https://www.tennislive.net/atp/match/ethan-quinn-VS-carl-emil-overbeck/fairfield-challenger-2024/',
 'https://www.tennislive.net/atp/match/alexis-galarneau-VS-ethan-quinn/tiburon-challenger-2024/',
 'https://www.tennislive.net/atp/match/nishesh-basavareddy-VS-ethan-quinn/charleston-challenger-2024/',
 'https://www.tennislive.net/atp/match/ethan-quinn-VS-brandon-holt/charleston-challenger-2024/',
 'https://www.tennislive.net/atp/match/nishesh-basavareddy-VS-ethan-quinn/columbus-challenger-2024/',
 'https://www.tennislive.net/atp/match/ethan-quinn-VS-patrick-kypson/columbus-challenger-2024/',
 '

In [391]:
extract_match_data(match_links)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-02-07,1st round,Rinky Hijikata,Ethan Quinn,"6-4, 6-4",Dallas,35/66,45/72,23/35,28/45,...,3/10,1/7,32/72,23/66,75/138,63/138,1,2,4,5
1,2024-02-10,q 1,Zachary Svajda,Ethan Quinn,"6-3, 6-3",Delray Beach,33/53,29/60,26/33,22/29,...,4/8,1/4,29/60,16/53,66/113,47/113,0,3,1,3
2,2024-03-04,q 1,Ethan Quinn,Quentin Halys,"6-1, 7-5",Indian Wells,34/64,26/58,24/34,20/26,...,5/6,2/4,28/58,24/64,68/122,54/122,2,5,4,6
3,2024-03-05,qual.,Ethan Quinn,Andrea Vavassori,"4-6, 6-1, 6-2",Indian Wells,48/85,46/79,37/48,31/46,...,4/12,1/3,33/79,29/85,89/164,75/164,4,6,4,5
4,2024-03-07,1st round,Patrick Kypson,Ethan Quinn,"6-1, 7-61",Indian Wells,47/63,37/63,38/47,30/37,...,2/5,0/2,26/63,15/63,74/126,52/126,0,4,10,5
5,2024-03-13,1st round,Timo Legout,Ethan Quinn,"6-4, 5-7, 6-0",M25 Bakersfield,54/85,50/76,40/54,31/50,...,5/9,2/5,33/76,29/85,89/161,72/161,2,2,2,0
6,2024-03-19,1st round,Ethan Quinn,Dominique Rolland,"3-6, 6-3, 6-2",M25 Calabasas,57/80,46/77,39/57,28/46,...,6/9,4/7,36/77,32/80,84/157,73/157,7,5,4,3
7,2024-03-21,2nd round,Kyle Kang,Ethan Quinn,"7-5, 6-3",M25 Calabasas,42/73,33/60,30/42,21/33,...,3/6,1/4,24/60,25/73,72/133,61/133,0,6,3,0
8,2024-03-30,q 1,Valentin Vacherot,Ethan Quinn,"3-6, 6-3, 6-4",Houston,59/87,57/84,39/59,38/57,...,3/5,2/10,29/84,30/87,86/171,85/171,1,2,4,4
9,2024-04-08,1st round,Ethan Quinn,Clement Tabur,"6-4, 7-62",Sarasota Challenger,37/65,51/80,29/37,35/51,...,2/8,1/1,31/80,17/65,79/145,66/145,2,7,5,5


In [393]:
def player_profile(url):
    
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html')
    
    table = soup.find_all('table', class_ = 'table_pmatches')[1]
    rows = table.find_all('td', class_='w50')
    
    
    match_links = []

    for a in table.find_all('a', href=True):
        if "https://www.tennislive.net/atp/match/" in a['href']:
            match_links.append(a['href'])
            
    return extract_match_data(match_links)


# (edit this function to accomodate for missing stats error)
#     - so copy and paste the function to edit it before this cell
    
#     output dataframe

In [394]:
url = 'https://www.tennislive.net/atp/govind-nanda/'
player_profile(url)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2022-10-25,1st round,Govind Nanda,Nicolas Mejia,"7-61, 6-4",Las Vegas,41/69,42/77,31/41,26/42,...,3/8,2/8,33/77,24/69,78/146,68/146,1.0,3.0,1.0,4.0
1,2022-10-27,2nd round,Alexis Galarneau,Govind Nanda,"6-1, 2-1, - retired",Las Vegas,14/28,26/34,10/14,10/26,...,4/8,1/2,20/34,10/28,38/62,24/62,1.0,0.0,1.0,2.0
2,2023-01-06,1st round,Learner Tien,Govind Nanda,"6-1, 6-75, 11-9",M25 Malibu,,,,,...,,,,,,,,,,
3,2023-01-25,1st round,Govind Nanda,Huzaifa Abdul Rehman,"6-4, 6-4",M25 Wesley,53/82,52/82,33/53,31/52,...,4/15,2/14,38/82,33/82,87/164,77/164,1.0,3.0,1.0,3.0
4,2023-01-26,2nd round,Matthew Segura,Govind Nanda,"2-6, 7-63, 6-1",M25 Wesley,57/90,56/90,36/57,31/56,...,6/11,5/11,45/90,40/90,95/180,85/180,6.0,5.0,0.0,2.0
5,2023-04-17,q 1,Govind Nanda,Aidan Kim,"6-4, 6-2",Tallahassee Challenger,43/73,37/83,33/43,23/37,...,3/6,0/7,39/83,25/73,87/156,69/156,1.0,9.0,2.0,0.0
6,2023-04-17,qual.,Federico Agustin Gomez,Govind Nanda,"6-1, 6-4",Tallahassee Challenger,29/62,34/48,26/29,20/34,...,4/8,1/4,24/48,22/62,64/110,46/110,6.0,1.0,4.0,0.0
7,2023-04-23,q 1,Govind Nanda,Isaiah Strode,"2-6, 7-62, 6-3",Savannah Challenger,64/100,43/88,39/64,30/43,...,4/9,5/12,36/88,41/100,95/188,93/188,0.0,5.0,1.0,4.0
8,2023-04-24,qual.,Kyrian Jacquet,Govind Nanda,"7-5, 6-1",Savannah Challenger,30/59,30/53,19/30,16/30,...,5/7,2/4,27/53,24/59,62/112,50/112,5.0,2.0,1.0,2.0
9,2023-05-02,1st round,Govind Nanda,Thomas Brown,"7-64, 2-6, 6-3",M15 Orange,61/100,64/84,41/61,45/64,...,2/3,3/8,28/84,37/100,91/184,93/184,0.0,4.0,10.0,1.0


In [395]:
url = 'https://www.tennislive.net/atp/rudy-quan/'
player_profile(url)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2023-03-24,2nd round,Andrew Rogers,Rudy Quan,"6-3, 6-3",M25 Calabasas,,,,,...,,,,,,,,,,
1,2023-05-29,q 1,Rudy Quan,William Chase Thompson,"6-3, 1-6, 10-5",M15 Rancho,,,,,...,,,,,,,,,,
2,2023-05-30,qual.,Rudy Quan,Piotr Galus,"3-6, 6-4, 10-4",M15 Rancho,,,,,...,,,,,,,,,,
3,2023-05-31,1st round,Connor Farren,Rudy Quan,"7-64, 6-3",M15 Rancho,39/66,31/64,31/39,22/31,...,3/4,2/5,27/64,20/66,73/130,57/130,5.0,4.0,7.0,1.0
4,2023-06-05,q 1,Rohan Murali,Rudy Quan,"6-1, 7-60",M15 San,,,,,...,,,,,,,,,,
5,2023-06-12,q 1,Rudy Quan,James Watt,"7-63, 7-66",M15 San,,,,,...,,,,,,,,,,
6,2023-06-13,qual.,Ryan Seggerman,Rudy Quan,"6-3, 6-71, 10-4",M15 San,,,,,...,,,,,,,,,,
7,2023-06-19,q 1,Rudy Quan,Marco Alvarez,"6-0, 6-3",M15 Los,,,,,...,,,,,,,,,,
8,2023-06-20,qual.,Daniel De Jonge,Rudy Quan,"6-4, 6-2",M15 Los,,,,,...,,,,,,,,,,
9,2023-06-26,q 1,Rudy Quan,Neel Joshi,"6-2, 6-2",M15 Irvine,,,,,...,,,,,,,,,,


player_data.to_csv(f'GovindNanda_profile.csv', index=False)

# Point by Point (PBP) Dataframe

In [3]:
page = requests.get('https://www.tennislive.net/atp/match/borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/')

In [4]:
soup = BeautifulSoup(page.text, 'html')

### Check Status Code
- 200 = success
- 404 = failed

In [5]:
page.status_code

200

### Find HTML Tags that Have Point By Point (PBP) Data

The `table` tag with the class `table_stats_match` contains all the info needed for the point by point data.

In [6]:
table = soup.find_all('table', class_ = 'table_stats_match')
# table[1]

### First Set HTML

In [7]:
set_1 = soup.find_all('table', class_='table_stats_match')[1].find_all('tr', class_=None)
set_1[0]

<tr><td class="mp_serve" width="40%">Borna Gojo <img alt="Borna Gojo serve" height="8" src="https://www.tennislive.net/styles/images/tennis_ball.gif" title="Borna Gojo serve" width="8"/></td><td class="mp_info_txt" width="18%">0-0</td><td class="mp_serve" width="40%"></td></tr>

### Example of Getting Server of First Game

In [8]:
' '.join(set_1[0].find('img')['alt'].split()[0:2])

'Borna Gojo'

### Example of Extracting PBP From First Game

In [9]:
set_1[1].find('td')

<td class="mp_15" colspan="3" width="99%">0-0, 15-0, 15-15, 30-15, 30-30, 30-40<span title="Break point">[BP]</span></td>

Here we replace `[BP]` with signifies a "break-point" in the game with empty strings to keep the point data consistent. Then we can simply split the string into the points.

In [10]:
set_1[1].find('td').text.replace('[BP]', '').split(', ')

['0-0', '15-0', '15-15', '30-15', '30-30', '30-40']

### Example of Creating a PBP DataFrame from Set 1

In [11]:
# initialize lists to hold each of the designated values
server_names = []
point_scores = []
game_scores = []

# iterate over all the `tr` tags in set_1
# step over by 2 since each server and PBP data is paired together
for i in range(0, len(set_1), 2):
    # extract the data from the tr
    server_name = ' '.join(set_1[i].find('img')['alt'].split()[0:2])
    game_score = set_1[i].find('td', class_='mp_info_txt').text.strip()
    points = set_1[i + 1].find('td').text.replace('[BP]', '').split(', ')
    
    # create a row for each point in the game
    for point in points:
        server_names.append(server_name)
        point_scores.append(point)
        game_scores.append(game_score)


In [12]:
# construct the dataframe 
df = pd.DataFrame({
    'point_score': point_scores,
    'server_name': server_names,
    'game_score': game_scores
})

# replace the last known score with 0-0 since the score
# doesn't update between sets
df = df.replace(df['game_score'].iloc[-1], '0-0')
df

Unnamed: 0,point_score,server_name,game_score
0,0-0,Borna Gojo,0-0
1,15-0,Borna Gojo,0-0
2,15-15,Borna Gojo,0-0
3,30-15,Borna Gojo,0-0
4,30-30,Borna Gojo,0-0
5,30-40,Borna Gojo,0-0
6,0-0,Rudy Quan,0-1
7,0-15,Rudy Quan,0-1
8,15-15,Rudy Quan,0-1
9,15-30,Rudy Quan,0-1


### Find and Denote Each Player to Correctly Swap the Scores

We do this because the PBP is always in the order of `player1`-`player2`. To make the dataframe easier to understand, we swap the point and game scores when the server is `player2` as the server's points are read first.

In [13]:
table = soup.find('table', class_ = 'table_pmatches')

player1 = soup.find_all('td', class_='w130')[0].text.strip()
player2 = soup.find_all('td', class_='w130')[1].text.strip()

player1, player2

('Borna Gojo', 'Rudy Quan')

In [14]:
df.iloc[3]

point_score         30-15
server_name    Borna Gojo
game_score            0-0
Name: 3, dtype: object

In [15]:
'-'.join(df['point_score'][3].split('-')[::-1])

'15-30'

In [16]:
df['point_score'] = df.apply(lambda x : '-'.join(x['point_score'].split('-')[::-1]) if x['server_name'] == player2 else x['point_score'], axis=1)
df['game_score'] = df.apply(lambda x : '-'.join(x['game_score'].split('-')[::-1]) if x['server_name'] == player2 else x['game_score'], axis=1)

In [17]:
df

Unnamed: 0,point_score,server_name,game_score
0,0-0,Borna Gojo,0-0
1,15-0,Borna Gojo,0-0
2,15-15,Borna Gojo,0-0
3,30-15,Borna Gojo,0-0
4,30-30,Borna Gojo,0-0
5,30-40,Borna Gojo,0-0
6,0-0,Rudy Quan,1-0
7,15-0,Rudy Quan,1-0
8,15-15,Rudy Quan,1-0
9,30-15,Rudy Quan,1-0


### Generalize the Scraping to Apply to All Sets

We'll add a new column to keep track of the set number as well to help with readability in the dataframe.

In [18]:
# 2 set example
page = requests.get('https://www.tennislive.net/atp/match/borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/')

In [19]:
soup = BeautifulSoup(page.text, 'html')

In [20]:
def extract_point_by_point(url):
    # get HTML from url and convert to BeautifulSoup
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html')
    
    # find player names
    table = soup.find('table', class_ = 'table_pmatches')
    player1 = soup.find_all('td', class_='w130')[0].text.strip()
    player2 = soup.find_all('td', class_='w130')[1].text.strip()
    
    # get all set data
    content = soup.find_all('table', class_='table_stats_match')[1:]

    # initialize list to hold data for all sets
    set_dfs = []

#     for j in range(len(content)-1): #### maybe this???
    for j in range(len(content)):
        # initialize lists to hold each of the designated values
        server_names = []
        point_scores = []
        game_scores = []
        set_num = []
        is_break_points = []

        # get all table data for the current set
        current_set = content[j].find_all('tr', class_=None)

        # determine starting tag by how the table is formatted
        start = 0
        if not current_set[1].find('td', class_='mp_15'):
            start = 1

        # iterate over all the `tr` tags in set_1
        # step over by 2 since each server and PBP data is paired together
        for i in range(start, len(current_set), 2):
            # extract the data from the tr
            server_name = ' '.join(current_set[i].find('img')['alt'].split()[0:-1])
            game_score = current_set[i].find('td', class_='mp_info_txt').text.strip()
            points = current_set[i + 1].find('td').text.split(', ')

            # create a new row for each point
            for point in points:
                
                if '[BP]' in point:
                    is_break_point = True
                    point = point.replace('[BP]', '').strip()
                else:
                    is_break_point = False

                server_names.append(server_name)
                point_scores.append(point)
                game_scores.append(game_score)
                set_num.append(j + 1)
                is_break_points.append(is_break_point)

        # construct the dataframe
        df = pd.DataFrame({
            'pointScore': point_scores,
            'serverName': server_names,
            'gameScore': game_scores,
            'setNum': set_num,
            'isBreakPoint': is_break_points
        })

        last_game_score = df['gameScore'].iloc[-1]

        if '6' in last_game_score and (last_game_score != '6-6' and last_game_score != '6-5' and last_game_score != '5-6'):
            df.loc[df['gameScore'] == df['gameScore'].iloc[-1], 'setNum'] += 1
            df.loc[df['gameScore'] == df['gameScore'].iloc[-1], 'gameScore'] = '0-0' 

            # switch server names when in tiebreaker        
        if '6-6' in last_game_score or '0-0' in last_game_score:
            tiebreaker_df = df[df['gameScore'] == last_game_score]
            tiebreaker_rows = df.shape[0] - tiebreaker_df.shape[0]
            server = tiebreaker_df['serverName'].iloc[0]

            if player1 != server:
                server = player1

            for i in range(1, len(tiebreaker_df), 4):
                df.loc[tiebreaker_rows + i:tiebreaker_rows + i + 1, 'serverName'] = server
                
        set_dfs.append(df)
        
    df = pd.concat(set_dfs).reset_index(drop=True)    

    # flip point_score and game_score of player2
    df['pointScore'] = df.apply(lambda x : '-'.join(x['pointScore'].split('-')[::-1]) if x['serverName'] == player2 else x['pointScore'], axis=1)
    df['gameScore'] = df.apply(lambda x : '-'.join(x['gameScore'].split('-')[::-1]) if x['serverName'] == player2 else x['gameScore'], axis=1)
    
    return df

In [21]:
url = 'https://www.tennislive.net/atp/match/borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/'
extract_point_by_point(url)

IndexError: list index out of range

In [22]:
url = 'https://www.tennislive.net/atp/match/govind-nanda-VS-karue-sell/m15-lakewood-2024/'
extract_point_by_point(url)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Govind Nanda,0-0,1,False
1,0-15,Govind Nanda,0-0,1,False
2,0-30,Govind Nanda,0-0,1,False
3,0-40,Govind Nanda,0-0,1,True
4,0-0,Karue Sell,1-0,1,False
...,...,...,...,...,...
110,A-40,Govind Nanda,5-1,2,False
111,40-40,Govind Nanda,5-1,2,False
112,A-40,Govind Nanda,5-1,2,False
113,40-40,Govind Nanda,5-1,2,False


In [23]:
url = 'https://www.tennislive.net/atp/match/borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/'
# url = 'https://www.tennislive.net/atp/match/govind-nanda-VS-karue-sell/m15-lakewood-2024/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
    
# find player names
table = soup.find('table', class_ = 'table_pmatches')
player1 = soup.find_all('td', class_='w130')[0].text.strip()
player2 = soup.find_all('td', class_='w130')[1].text.strip()

# get all set data
content = soup.find_all('table', class_='table_stats_match')[1:]

# initialize list to hold data for all sets
set_dfs = []

content
# for j in range(len(content)):
#     # initialize lists to hold each of the designated values
#     server_names = []
#     point_scores = []
#     game_scores = []
#     set_num = []
#     is_break_points = []

#     # get all table data for the current set
#     current_set = content[j].find_all('tr', class_=None)

#     # determine starting tag by how the table is formatted
#     start = 0
#     if not current_set[1].find('td', class_='mp_15'):
#         start = 1
        

#     # iterate over all the `tr` tags in set_1
#     # step over by 2 since each server and PBP data is paired together
#     for i in range(start, len(current_set), 2):
#         # extract the data from the tr
#         server_name = ' '.join(current_set[i].find('img')['alt'].split()[0:-1])
#         game_score = current_set[i].find('td', class_='mp_info_txt').text.strip()
#         points = current_set[i + 1].find('td').text.split(', ')
        
# len(points)

#######

#         # create a new row for each point
#         for point in points:

#             if '[BP]' in point:
#                 is_break_point = True
#                 point = point.replace('[BP]', '').strip()
#             else:
#                 is_break_point = False

#             server_names.append(server_name)
#             point_scores.append(point)
#             game_scores.append(game_score)
#             set_num.append(j + 1)
#             is_break_points.append(is_break_point)

#     # construct the dataframe
#     df = pd.DataFrame({
#         'pointScore': point_scores,
#         'serverName': server_names,
#         'gameScore': game_scores,
#         'setNum': set_num,
#         'isBreakPoint': is_break_points
#     })

#     last_game_score = df['gameScore'].iloc[-1]

#     if '6' in last_game_score and (last_game_score != '6-6' and last_game_score != '6-5' and last_game_score != '5-6'):
#         df.loc[df['gameScore'] == df['gameScore'].iloc[-1], 'setNum'] += 1
#         df.loc[df['gameScore'] == df['gameScore'].iloc[-1], 'gameScore'] = '0-0' 

#         # switch server names when in tiebreaker        
#     if '6-6' in last_game_score or '0-0' in last_game_score:
#         tiebreaker_df = df[df['gameScore'] == last_game_score]
#         tiebreaker_rows = df.shape[0] - tiebreaker_df.shape[0]
#         server = tiebreaker_df['serverName'].iloc[0]

#         if player1 != server:
#             server = player1

#         for i in range(1, len(tiebreaker_df), 4):
#             df.loc[tiebreaker_rows + i:tiebreaker_rows + i + 1, 'serverName'] = server

#     set_dfs.append(df)

# df = pd.concat(set_dfs).reset_index(drop=True)    

# # flip point_score and game_score of player2
# df['pointScore'] = df.apply(lambda x : '-'.join(x['pointScore'].split('-')[::-1]) if x['serverName'] == player2 else x['pointScore'], axis=1)
# df['gameScore'] = df.apply(lambda x : '-'.join(x['gameScore'].split('-')[::-1]) if x['serverName'] == player2 else x['gameScore'], axis=1)

[<table class="table_stats_match"><tr class="mp_tour_head"><td colspan="3" width="100%">1</td></tr><tr><td class="mp_serve" width="40%">Borna Gojo <img alt="Borna Gojo serve" height="8" src="https://www.tennislive.net/styles/images/tennis_ball.gif" title="Borna Gojo serve" width="8"/></td><td class="mp_info_txt" width="18%">0-0</td><td class="mp_serve" width="40%"></td></tr><tr><td class="mp_15" colspan="3" width="99%">0-0, 15-0, 15-15, 30-15, 30-30, 30-40<span title="Break point">[BP]</span></td></tr><tr><td class="mp_serve" width="40%"></td><td class="mp_info_txt" width="18%">0-1</td><td class="mp_serve" width="40%">Rudy Quan <img alt="Rudy Quan serve" height="8" src="https://www.tennislive.net/styles/images/tennis_ball.gif" title="Rudy Quan serve" width="8"/></td></tr><tr><td class="mp_15" colspan="3" width="99%">0-0, 0-15, 15-15, 15-30, 15-40</td></tr><tr><td class="mp_serve" width="40%">Borna Gojo <img alt="Borna Gojo serve" height="8" src="https://www.tennislive.net/styles/images

### More Examples

Some other interesting match urls are tested below to test functionality of varying sets and number of tiebreaks.

In [24]:
url = 'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/'
extract_point_by_point(url).head(20)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Rudy Quan,0-0,1,False
1,15-0,Rudy Quan,0-0,1,False
2,15-15,Rudy Quan,0-0,1,False
3,30-15,Rudy Quan,0-0,1,False
4,30-30,Rudy Quan,0-0,1,False
5,40-30,Rudy Quan,0-0,1,False
6,40-40,Rudy Quan,0-0,1,False
7,40-A,Rudy Quan,0-0,1,True
8,40-40,Rudy Quan,0-0,1,False
9,A-40,Rudy Quan,0-0,1,False


In [25]:
# data.query("isBreakPoint = True")

In [26]:
url = 'https://www.tennislive.net/atp/match/jorge-plans-VS-mikel-lopez-hernaez/m25-bakio-2024/'
extract_point_by_point(url)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Jorge Plans,0-0,1,False
1,15-0,Jorge Plans,0-0,1,False
2,30-0,Jorge Plans,0-0,1,False
3,30-15,Jorge Plans,0-0,1,False
4,30-30,Jorge Plans,0-0,1,False
...,...,...,...,...,...
198,4-8,Mikel Lopez Hernaez,0-0,3,False
199,9-4,Jorge Plans,0-0,3,False
200,9-5,Jorge Plans,0-0,3,False
201,6-9,Mikel Lopez Hernaez,0-0,3,False


In [27]:
url = 'https://www.tennislive.net/atp/match/jorge-plans-VS-mikel-lopez-hernaez/m25-bakio-2024/'
extract_point_by_point(url)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Jorge Plans,0-0,1,False
1,15-0,Jorge Plans,0-0,1,False
2,30-0,Jorge Plans,0-0,1,False
3,30-15,Jorge Plans,0-0,1,False
4,30-30,Jorge Plans,0-0,1,False
...,...,...,...,...,...
198,4-8,Mikel Lopez Hernaez,0-0,3,False
199,9-4,Jorge Plans,0-0,3,False
200,9-5,Jorge Plans,0-0,3,False
201,6-9,Mikel Lopez Hernaez,0-0,3,False


In [28]:
url = 'https://www.tennislive.net/atp/match/mitchell-krueger-VS-abedallah-shelbayh/little-rock-challenger-2024/'
extract_point_by_point(url)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Abedallah Shelbayh,0-0,1,False
1,0-15,Abedallah Shelbayh,0-0,1,False
2,15-15,Abedallah Shelbayh,0-0,1,False
3,30-15,Abedallah Shelbayh,0-0,1,False
4,40-15,Abedallah Shelbayh,0-0,1,False
...,...,...,...,...,...
157,0-15,Mitchell Krueger,5-4,3,False
158,15-15,Mitchell Krueger,5-4,3,False
159,30-15,Mitchell Krueger,5-4,3,False
160,30-30,Mitchell Krueger,5-4,3,False


In [29]:
url = 'https://www.tennislive.net/atp/match/borna-gojo-VS-rudy-quan/sioux-falls-challenger-2024/'
extract_point_by_point(url)

IndexError: list index out of range

In [30]:
url = 'https://www.tennislive.net/atp/match/ethan-quinn-VS-rudy-quan/fairfield-challenger-2024/'
extract_point_by_point(url)

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Rudy Quan,0-0,1,False
1,15-0,Rudy Quan,0-0,1,False
2,30-0,Rudy Quan,0-0,1,False
3,30-15,Rudy Quan,0-0,1,False
4,40-15,Rudy Quan,0-0,1,False
...,...,...,...,...,...
153,0-0,Rudy Quan,2-5,3,False
154,0-15,Rudy Quan,2-5,3,False
155,0-30,Rudy Quan,2-5,3,False
156,15-30,Rudy Quan,2-5,3,False


# Input link and modify data frame to be formatted correctly

In [31]:
url = 'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/'
df = extract_point_by_point(url)

In [32]:
df[df['setNum'] == 1]

Unnamed: 0,pointScore,serverName,gameScore,setNum,isBreakPoint
0,0-0,Rudy Quan,0-0,1,False
1,15-0,Rudy Quan,0-0,1,False
2,15-15,Rudy Quan,0-0,1,False
3,30-15,Rudy Quan,0-0,1,False
4,30-30,Rudy Quan,0-0,1,False
...,...,...,...,...,...
69,30-40,Yuta Shimizu,5-4,1,True
70,40-40,Yuta Shimizu,5-4,1,False
71,40-A,Yuta Shimizu,5-4,1,True
72,40-40,Yuta Shimizu,5-4,1,False


## Add Columns

### Add pointNumber column

In [33]:
df.insert(0, 'pointNumber', range(1, len(df) + 1))

### Add gameNumber column

In [34]:
def game_score(game):
    return sum(map(int,('').join(game.split('-')))) + 1

df['gameNumber'] = df['gameScore'].apply(game_score)

### Add player1Name and player2Name columns

In [35]:
serverNames = df['serverName'].unique()

player1 = serverNames[0]
player2 = serverNames[1]

df['player1Name'] = player1
df['player2Name'] = player2

### Add returnName column

In [36]:
def switch_names(name):
    if name != player1:
        return player1
    else :
        return player2

df['returnerName'] = df['serverName'].apply(switch_names)

### Add tiebreakeScore column

In [37]:
# Transfer values from pointScore to tiebreakScore where gameScore is '6-6'
if 'tiebreakScore' not in df.columns:
    df['tiebreakScore'] = np.nan

for i in df['setNum'].unique():
    current_set = df[df['setNum'] == i]

    last_game_score = current_set['gameScore'].iloc[-1]

    if '6-6' in last_game_score or '0-0' in last_game_score:
        df.loc[(df['setNum'] == i) & (df['gameScore'] == last_game_score), 'tiebreakScore'] = df['pointScore']
        
        # Set the pointScore to NaN where tiebreakScore is not NaN
        df.loc[pd.notna(df['tiebreakScore']), 'pointScore'] = np.nan

### Add setScore column
#### WARNING: Manually input the setScores for desired player

In [38]:
set_list = ['0-0', '1-0']

def set_setScores(x):
    
    if 1 <= x <= len(set_list):
        return set_list[x - 1]
        
df['setScore'] = df['setNum'].apply(set_setScores)

In [39]:
df

Unnamed: 0,pointNumber,pointScore,serverName,gameScore,setNum,isBreakPoint,gameNumber,player1Name,player2Name,returnerName,tiebreakScore,setScore
0,1,0-0,Rudy Quan,0-0,1,False,1,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,0-0
1,2,15-0,Rudy Quan,0-0,1,False,1,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,0-0
2,3,15-15,Rudy Quan,0-0,1,False,1,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,0-0
3,4,30-15,Rudy Quan,0-0,1,False,1,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,0-0
4,5,30-30,Rudy Quan,0-0,1,False,1,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,0-0
...,...,...,...,...,...,...,...,...,...,...,...,...
128,129,0-15,Rudy Quan,3-5,2,False,9,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,1-0
129,130,0-30,Rudy Quan,3-5,2,False,9,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,1-0
130,131,0-40,Rudy Quan,3-5,2,True,9,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,1-0
131,132,15-40,Rudy Quan,3-5,2,True,9,Rudy Quan,Yuta Shimizu,Yuta Shimizu,,1-0


### Add Timestamp columns

In [40]:
df['Position'] = ''
df['pointEndPosition'] = ''
df['Duration'] = ''

### Add Team columns
#### WARNING: Manually set Team Names

In [41]:
df['clientTeam'] = 'UCLA'
df['opponentTeam'] = ''

### Add name column

In [42]:
df['Name'] = df.apply(lambda row: f"Set {row['setNum']}: {row['gameScore']}, {row['tiebreakScore']} {row['serverName']} Serving" if pd.notna(row['tiebreakScore']) else f"Set {row['setNum']}: {row['gameScore']}, {row['pointScore']} {row['serverName']} Serving", axis=1)

## Specify Order

In [43]:
df = df[['Name', 'pointNumber', 'setNum', 'gameNumber', 'player1Name', 'player2Name', 
         'pointScore', 'gameScore', 'setScore', 'tiebreakScore', 'serverName', 'returnerName',
        'Position', 'pointEndPosition', 'Duration', 'isBreakPoint', 'clientTeam', 'opponentTeam']]


In [44]:
df.head(5)

Unnamed: 0,Name,pointNumber,setNum,gameNumber,player1Name,player2Name,pointScore,gameScore,setScore,tiebreakScore,serverName,returnerName,Position,pointEndPosition,Duration,isBreakPoint,clientTeam,opponentTeam
0,"Set 1: 0-0, 0-0 Rudy Quan Serving",1,1,1,Rudy Quan,Yuta Shimizu,0-0,0-0,0-0,,Rudy Quan,Yuta Shimizu,,,,False,UCLA,
1,"Set 1: 0-0, 15-0 Rudy Quan Serving",2,1,1,Rudy Quan,Yuta Shimizu,15-0,0-0,0-0,,Rudy Quan,Yuta Shimizu,,,,False,UCLA,
2,"Set 1: 0-0, 15-15 Rudy Quan Serving",3,1,1,Rudy Quan,Yuta Shimizu,15-15,0-0,0-0,,Rudy Quan,Yuta Shimizu,,,,False,UCLA,
3,"Set 1: 0-0, 30-15 Rudy Quan Serving",4,1,1,Rudy Quan,Yuta Shimizu,30-15,0-0,0-0,,Rudy Quan,Yuta Shimizu,,,,False,UCLA,
4,"Set 1: 0-0, 30-30 Rudy Quan Serving",5,1,1,Rudy Quan,Yuta Shimizu,30-30,0-0,0-0,,Rudy Quan,Yuta Shimizu,,,,False,UCLA,


# Add in Timestamps

In [None]:
# # Put timestamps file here
# timestamp = pd.read_csv("timestamps_9uYpCdMy4eA.csv")

In [None]:
# if timestamp.shape[0] != point_df.shape[0]:
#     raise ValueError("Error: The number of rows in timestamp and point_df are not the same.")
# else:
#     # Assign values to point_df
#     point_df['Position'] = timestamp['pointStartTime'].values
#     point_df['pointEndPosition'] = timestamp['endStartTime'].values
#     print("\u2713 Check passed")

# Output the Point Visuals csv

In [None]:
# Save point_df to CSV file

# Assuming point_df is your DataFrame and player1Name and player2Name are the names from the first row
player1NameNoSpace = df.iloc[0]['player1Name'].replace(" ", "")
player2NameNoSpace = df.iloc[0]['player2Name'].replace(" ", "")

# Save DataFrame to CSV file with modified player names
df.to_csv(f'Point_Visuals_{player1NameNoSpace}_{player2NameNoSpace}.csv', index=False)