# Introduction

- Webscrape https://www.tennislive.net for pro-level match data
- Output two .csv files 
    - Single row dataframe for summary statistics dashboard
    - Point-by-point (PbP) level .csv file for the Match Viewer website (timestamps are attached afterwards using the website tagger)

### Import Packages

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


### Check Status Code
- 200 = success
- 404 = page not found (request failed)

In [3]:
# Player profile page to scrape; a later cell passes this `url` to player_profile().
url = 'https://www.tennislive.net/atp/govind-nanda/'
# Fetch the page once so the HTTP status can be inspected before scraping.
page = requests.get(url)
# Bare expression at the end of the cell: the notebook echoes the status code.
page.status_code

200

### Extract Match Data

In [4]:
# Columns that are always present, in the order they appear in each row.
_MATCH_INFO_COLUMNS = ['Date', 'Round', 'Player 1', 'Player 2', 'Score', 'Tournament']

# Statistic labels looked up on every match page (matched against <td> text).
_BASE_STATISTICS = (
    '1st SERVE %',
    '1st SERVE POINTS WON',
    '2nd SERVE POINTS WON',
    'TOTAL RETURN POINTS WON',
    'TOTAL POINTS WON',
    'DOUBLE FAULTS',
    'ACES',
)

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the whole scrape.
_REQUEST_TIMEOUT_SECONDS = 30


def _stat_column_name(stat):
    """Turn a statistic label such as '1st SERVE %' into a column-name stem."""
    return (stat.lower()
                .replace(' ', '_')
                .replace('%', 'percentage')
                .replace('/', '_')
                .replace('(', '')
                .replace(')', ''))


def _extract_statistic(soup, statistic):
    """Return the (player 1, player 2) values for one statistic row.

    The row is located through the <td> whose text equals ``statistic``; the
    second and third cells of that row hold the two players' values. Only the
    first space-separated token of each cell is kept. ``(None, None)`` is
    returned when the page has no cell with that label.
    """
    label_cell = soup.find('td', string=statistic)
    if label_cell is None:
        return None, None

    cells = label_cell.parent.find_all('td')
    return (cells[1].text.strip().split(' ')[0],
            cells[2].text.strip().split(' ')[0])


def extract_match_data(urls):
    """Scrape match pages into a DataFrame with one row per match.

    Args:
        urls: Iterable of match-page URLs.

    Returns:
        A pandas DataFrame sorted by ``Date`` in ascending order with a fresh
        0..n-1 index. Each row holds the columns ``Date``, ``Round``,
        ``Player 1``, ``Player 2``, ``Score`` and ``Tournament`` plus one
        ``<statistic>_player1`` / ``<statistic>_player2`` pair per statistic.
        Statistic values are ``None`` when the page has no populated
        statistics table. When ``urls`` is empty, an empty DataFrame with
        only the six match-info columns is returned.

    Raises:
        AttributeError, IndexError: If a page lacks the match-info elements
            this function looks up.
    """
    data_list = []

    for url in urls:
        # Fetch and parse the match page.
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Match information.
        date = soup.find('td', class_='w50').text.strip().split(' ')[0]
        round_info = soup.find_all('td', class_='w50')[1].text.strip()
        player_cells = soup.find_all('td', class_='w130')
        player1 = player_cells[0].text.strip()
        player2 = player_cells[1].text.strip()
        score = soup.find('span', id='score').text.strip()
        tournament = soup.find('td', class_='w200').find('a').text.strip()

        # A page with eight 'info_txt' cells is treated as also carrying a
        # 'BREAK POINTS WON' row, which is slotted in after the serve stats.
        statistics = list(_BASE_STATISTICS)
        if len(soup.find_all('td', class_='info_txt')) == 8:
            statistics.insert(3, 'BREAK POINTS WON')

        # Statistics are read only when the first stats table has content.
        tables = soup.find_all('table', class_='table_stats_match')
        has_stats = (len(tables) > 0
                     and tables[0].find('td', class_='info_txt') is not None)

        stat_data = {}
        for stat in statistics:
            if has_stats:
                player1_stat, player2_stat = _extract_statistic(soup, stat)
            else:
                player1_stat, player2_stat = None, None

            stat_name = _stat_column_name(stat)
            stat_data[f'{stat_name}_player1'] = player1_stat
            stat_data[f'{stat_name}_player2'] = player2_stat

        data_list.append({
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data,
        })

    # With no rows there is no 'Date' column to parse or sort on.
    if not data_list:
        return pd.DataFrame(columns=_MATCH_INFO_COLUMNS)

    df = pd.DataFrame(data_list)

    # Parse dates (dd.mm.yy) and sort chronologically, oldest match first.
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
    df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)

    return df

### Input individual links HERE
- optional

In [7]:
# urls = ['https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-shintaro-imai/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-filip-peliwo/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-stefan-kozlov/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-andres-andrade/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/']

# extract_match_data(urls)

# Accommodate Player Profile Page

In [5]:
def player_profile(url):
    """Collect the match links on a player profile page and scrape each one.

    Args:
        url: URL of a player profile page.

    Returns:
        Whatever ``extract_match_data`` returns for the list of match links
        found in the page's second ``table_pmatches`` table.

    Raises:
        IndexError: If the page has fewer than two ``table_pmatches`` tables.
    """
    page = requests.get(url, timeout=30)  # bounded wait so a stalled request cannot hang
    # Name the parser explicitly so the result does not depend on which
    # optional HTML parsers happen to be installed.
    soup = BeautifulSoup(page.text, 'html.parser')

    tables = soup.find_all('table', class_='table_pmatches')
    if len(tables) < 2:
        raise IndexError(
            f"expected at least two 'table_pmatches' tables at {url}, "
            f"found {len(tables)}"
        )
    table = tables[1]

    # Keep only anchors that point at individual match pages.
    match_links = [
        a['href']
        for a in table.find_all('a', href=True)
        if "https://www.tennislive.net/atp/match/" in a['href']
    ]

    return extract_match_data(match_links)

In [6]:
# Build the match DataFrame for the profile page at `url`; later cells read `df`.
df = player_profile(url)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2022-10-27,2nd round,Alexis Galarneau,Govind Nanda,"6-1, 2-1, - retired",Las Vegas,14/28,26/34,10/14,10/26,...,20/34,10/28,38/62,24/62,1.0,0.0,1.0,2.0,4/8,1/2
1,2023-01-06,1st round,Learner Tien,Govind Nanda,"6-1, 6-75, 11-9",M25 Malibu,,,,,...,,,,,,,,,,
2,2023-01-25,1st round,Govind Nanda,Huzaifa Abdul Rehman,"6-4, 6-4",M25 Wesley,53/82,52/82,33/53,31/52,...,38/82,33/82,87/164,77/164,1.0,3.0,1.0,3.0,4/15,2/14
3,2023-01-26,2nd round,Matthew Segura,Govind Nanda,"2-6, 7-63, 6-1",M25 Wesley,57/90,56/90,36/57,31/56,...,45/90,40/90,95/180,85/180,6.0,5.0,0.0,2.0,6/11,5/11
4,2023-04-17,qual.,Federico Agustin Gomez,Govind Nanda,"6-1, 6-4",Tallahassee Challenger,29/62,34/48,26/29,20/34,...,24/48,22/62,64/110,46/110,6.0,1.0,4.0,0.0,4/8,1/4
5,2023-04-17,q 1,Govind Nanda,Aidan Kim,"6-4, 6-2",Tallahassee Challenger,43/73,37/83,33/43,23/37,...,39/83,25/73,87/156,69/156,1.0,9.0,2.0,0.0,3/6,0/7
6,2023-04-23,q 1,Govind Nanda,Isaiah Strode,"2-6, 7-62, 6-3",Savannah Challenger,64/100,43/88,39/64,30/43,...,36/88,41/100,95/188,93/188,0.0,5.0,1.0,4.0,4/9,5/12
7,2023-04-24,qual.,Kyrian Jacquet,Govind Nanda,"7-5, 6-1",Savannah Challenger,30/59,30/53,19/30,16/30,...,27/53,24/59,62/112,50/112,5.0,2.0,1.0,2.0,5/7,2/4
8,2023-05-02,1st round,Govind Nanda,Thomas Brown,"7-64, 2-6, 6-3",M15 Orange,61/100,64/84,41/61,45/64,...,28/84,37/100,91/184,93/184,0.0,4.0,10.0,1.0,2/3,3/8
9,2023-05-04,2nd round,Govind Nanda,Huzaifa Abdul Rehman,"6-3, 6-2",M15 Orange,31/50,27/48,22/31,15/27,...,24/48,16/50,58/98,40/98,0.0,4.0,4.0,0.0,5/9,2/3


# Output the Point Visuals csv

In [None]:
# Save the DataFrame to a CSV file named after the two players.

# Strip spaces from the first row's player names so they can be used in the file name.
# NOTE(review): the `df` built by player_profile() has 'Player 1' / 'Player 2'
# columns, not 'player1Name' / 'player2Name', so these lookups raise KeyError
# unless `df` has been rebound to a point-level frame (the original comment
# referred to `point_df`) -- confirm which DataFrame this cell is meant for.
player1NameNoSpace = df.iloc[0]['player1Name'].replace(" ", "")
player2NameNoSpace = df.iloc[0]['player2Name'].replace(" ", "")

# Write the CSV without the index column, using the space-free player names.
df.to_csv(f'Point_Visuals_{player1NameNoSpace}_{player2NameNoSpace}.csv', index=False)