# Introduction

- Webscrape https://www.tennislive.net for pro-level match data
- Data information:
    - The `Player 1` column denotes the player who won the match

### Import Packages

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

### Check Status Code
- 200 = success
- 404 = page not found

### Insert Player Profile Link

In [15]:
# Profile page of the player to scrape; later cells reuse `url`.
url = 'https://www.tennislive.net/atp/alexander-hoogmartens/'
page = requests.get(url)
# Last expression of the cell, so the HTTP status code is shown as the cell output.
page.status_code

200

### Extract Match Data

In [16]:
def _stat_column_name(statistic):
    """Convert a statistic label (e.g. '1st SERVE %') into a column-name stem."""
    return (statistic.lower()
                     .replace(' ', '_')
                     .replace('%', 'percentage')
                     .replace('/', '_')
                     .replace('(', '')
                     .replace(')', ''))


def _extract_statistic(soup, statistic):
    """Return the (player 1, player 2) values from the row labelled `statistic`."""
    row = soup.find('td', string=statistic).parent
    cells = row.find_all('td')
    # Each value cell is reduced to its first space-separated token.
    player1_stat = cells[1].text.strip().split(' ')[0]
    player2_stat = cells[2].text.strip().split(' ')[0]
    return player1_stat, player2_stat


def extract_match_data(urls, timeout=30):
    """Scrape one row of match data per match-page URL.

    Parameters
    ----------
    urls : iterable of str
        Match-page URLs to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed match with the columns 'Date',
        'Round', 'Player 1', 'Player 2', 'Score', 'Tournament' and a
        '<stat>_player1' / '<stat>_player2' pair per statistic. Rows are
        sorted by 'Date', newest first. The frame is empty when no URL could
        be parsed.

    Notes
    -----
    A URL is skipped (with a printed message) when the request fails, when the
    page text contains "404 - ERROR", or when any header field is missing.
    Statistics that cannot be located are stored as None.
    """
    data_list = []

    for url in urls:
        # Fetch the page; a single network failure must not discard the
        # matches that were already collected.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Skipping URL due to request error: {url} ({exc})")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        # The 404 condition is detected from the page text.
        if "404 - ERROR" in soup.text:
            print(f"Skipping URL due to 404 error: {url}")
            continue

        # Header fields. A missing element shows up either as an
        # AttributeError (find() returned None) or as an IndexError (too few
        # cells in a find_all() result); both mean "skip this page".
        try:
            date_cells = soup.find_all('td', class_='w50')
            player_cells = soup.find_all('td', class_='w130')
            date = date_cells[0].text.strip().split(' ')[0]
            round_info = date_cells[1].text.strip()
            player1 = player_cells[0].text.strip()
            player2 = player_cells[1].text.strip()
            score = soup.find('span', id='score').text.strip()
            tournament = soup.find('td', class_='w200').find('a').text.strip()
        except (AttributeError, IndexError):
            print(f"Skipping URL due to missing data: {url}")
            continue

        # Statistics to extract, in output-column order.
        statistics = [
            '1st SERVE %',
            '1st SERVE POINTS WON',
            '2nd SERVE POINTS WON',
            'TOTAL RETURN POINTS WON',
            'TOTAL POINTS WON',
            'DOUBLE FAULTS',
            'ACES'
        ]

        # 'BREAK POINTS WON' is only looked up when the page carries eight
        # 'info_txt' cells.
        if len(soup.find_all('td', class_='info_txt')) == 8:
            statistics.insert(3, 'BREAK POINTS WON')

        # Only read values when the first stats table actually has data.
        tables = soup.find_all('table', class_='table_stats_match')
        has_stats = len(tables) > 0 and bool(tables[0].find('td', class_='info_txt'))

        stat_data = {}
        for stat in statistics:
            player1_stat, player2_stat = None, None
            if has_stats:
                try:
                    player1_stat, player2_stat = _extract_statistic(soup, stat)
                except (AttributeError, IndexError):
                    # Row or value cell missing for this statistic.
                    player1_stat, player2_stat = None, None

            stat_name = _stat_column_name(stat)
            stat_data[f'{stat_name}_player1'] = player1_stat
            stat_data[f'{stat_name}_player2'] = player2_stat

        data_list.append({
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data
        })

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data_list)

    # Sort DataFrame by Date in descending order
    if not df.empty:
        df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
        df = df.sort_values(by='Date', ascending=False).reset_index(drop=True)

    return df


### Input individual links HERE
- optional

In [17]:
# Two sample match pages passed straight to the extractor; in the recorded
# output below one row has statistics and the other has empty stat columns.
urls = ['https://www.tennislive.net/atp/match/david-ferrer-VS-novak-djokovic/open-romania-bucharest-2004/',
        'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/']
extract_match_data(urls)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2024-05-31,1/4,Yuta Shimizu,Rudy Quan,"6-4, 6-3",Little Rock,37/55,64/80,21/37,29/64,...,44/80,25/55,74/135,61/135,1.0,2.0,4.0,0.0,7/16,4/7
1,2004-09-16,2nd round,David Ferrer,Novak Djokovic,"4-6, 6-4, 6-4",Bucharest,,,,,...,,,,,,,,,,


# Accommodate the Player Profile Page

In [18]:
def complete_profile(url, timeout=30):
    """Return the per-year profile URLs linked from a player profile page.

    Parameters
    ----------
    url : str
        Player profile URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    list of str
        Absolute URLs built from every link whose href starts with '?y=' and
        that sits in a table row whose class matches 'pair' or 'unpair'.
        Links keep page order and each URL appears only once.
    """
    # Local import keeps this function self-contained within the notebook.
    from urllib.parse import urljoin

    page = requests.get(url, timeout=timeout)
    # NOTE(review): 'html' names a markup type rather than a parser, so bs4
    # chooses among the installed HTML parsers; extract_match_data pins
    # 'html.parser'. Confirm before unifying, since parsers can differ.
    soup = BeautifulSoup(page.text, 'html')

    rows = soup.find_all('tr', class_=re.compile(r'pair|unpair'))

    # urljoin replaces any query string already present on `url`, where plain
    # concatenation would produce '...?y=2022?y=2023'.
    year_urls = [
        urljoin(url, link['href'])
        for row in rows
        for link in row.find_all('a', href=True)
        if link['href'].startswith('?y=')
    ]

    # Drop repeated links while preserving first-seen order.
    return list(dict.fromkeys(year_urls))

In [19]:
# Preview the per-year profile URLs found for the player page in `url`.
complete_profile(url)

['https://www.tennislive.net/atp/alexander-hoogmartens/?y=2023',
 'https://www.tennislive.net/atp/alexander-hoogmartens/?y=2022',
 'https://www.tennislive.net/atp/alexander-hoogmartens/?y=2021',
 'https://www.tennislive.net/atp/alexander-hoogmartens/?y=2020']

In [20]:
def player_profile(urls, timeout=30):
    """Collect match data for every match linked from the given profile pages.

    Parameters
    ----------
    urls : iterable of str
        Profile page URLs (for example the per-year URLs returned by
        `complete_profile`).
    timeout : float, optional
        Seconds to wait for each profile-page request (default 30).

    Returns
    -------
    pandas.DataFrame
        The frames returned by `extract_match_data` for each page, stacked in
        the order of `urls` with a fresh 0..n-1 index. Empty when no page
        yielded any match.
    """
    match_prefix = "https://www.tennislive.net/atp/match/"
    yearly_frames = []

    for url in urls:
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html')

        # The match links are read from the second 'table_pmatches' table.
        # A page with fewer tables is skipped instead of raising IndexError.
        tables = soup.find_all('table', class_='table_pmatches')
        if len(tables) < 2:
            print(f"Skipping URL due to missing match table: {url}")
            continue
        table = tables[1]

        match_links = [
            a['href']
            for a in table.find_all('a', href=True)
            if match_prefix in a['href']
        ]

        df_year = extract_match_data(match_links)
        if not df_year.empty:
            yearly_frames.append(df_year)

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(yearly_frames, ignore_index=True)

    

In [21]:
# Rebinds `urls` (previously the sample match links) to the per-year profile
# pages, then scrapes every match linked from them into `df`.
urls = complete_profile(url)
df = player_profile(urls)

In [22]:
# Bare expression: renders the scraped match table as the cell output.
df

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2023-01-03,qual.,Noah Schachter,Alexander Hoogmartens,"1-6, 6-2, 10-5",M25 Malibu,,,,,...,,,,,,,,,,
1,2023-01-02,q 1,Alexander Hoogmartens,Stefan Menichella,"6-2, 6-1",M25 Malibu,,,,,...,,,,,,,,,,
2,2022-08-30,1st round,Dominik Boehler,Alexander Hoogmartens,"7-5, 7-5",M15 Haren,46/88,41/80,33/46,24/41,...,37/80,35/88,90/168,78/168,3.0,4.0,5.0,3.0,5/13,3/11
3,2022-08-25,2nd round,Thomas Deschamps,Alexander Hoogmartens,"3-6, 6-3, 7-5",M15 Lambermont,63/90,69/115,44/63,48/69,...,48/115,32/90,106/205,99/205,3.0,9.0,4.0,5.0,4/13,3/5
4,2022-08-24,1st round,Alexander Hoogmartens,Jack Loge,"7-5, 2-6, 6-1",M15 Lambermont,41/82,50/74,30/41,32/50,...,32/74,28/82,86/156,70/156,6.0,7.0,3.0,2.0,5/7,4/10
5,2022-08-15,1st round,Tibo Colson,Alexander Hoogmartens,"6-4, 6-2",M25 Koksijde,41/69,30/58,27/41,19/30,...,27/58,26/69,70/127,57/127,1.0,4.0,2.0,4.0,4/7,1/8
6,2022-08-09,1st round,Michiel De Krom,Alexander Hoogmartens,"7-5, 6-3",M15 Eupen,47/64,28/54,35/47,21/28,...,23/54,20/64,67/118,51/118,0.0,4.0,1.0,3.0,3/4,1/3
7,2022-07-26,1st round,Toby Samuel,Alexander Hoogmartens,"6-3, 6-2",M25 Nottingham,40/55,35/56,27/40,20/35,...,26/56,18/55,63/111,48/111,0.0,2.0,5.0,2.0,4/12,1/4
8,2022-07-25,qual.,Alexander Hoogmartens,Michael Shaw,"6-2, 6-1",M25 Nottingham,33/47,27/41,22/33,16/27,...,25/41,15/47,57/88,31/88,4.0,3.0,7.0,2.0,5/10,1/1
9,2022-07-24,q 1,Alexander Hoogmartens,Eric Morris,"6-4, 6-3",M25 Nottingham,45/71,52/67,34/45,29/52,...,33/67,28/71,76/138,62/138,4.0,0.0,3.0,1.0,4/9,2/5


In [23]:
# Re-fetch the profile page to read the player's name for the output file name.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

# NOTE(review): assumes the page has a div with class 'player_stats' whose
# first link text is the player's name; if either lookup returns None the
# next line raises AttributeError. Confirm against the site markup.
player_info = soup.find('div', class_ = 'player_stats')
# Spaces are removed so the name can be used directly in a file name.
player_name_nospace = player_info.find('a').text.strip().replace(' ', '')

In [24]:
# Write the match table to '<PlayerName>_player_profile.csv' (relative path,
# without the index column) and print the file name that was written.
df.to_csv(f'{player_name_nospace}_player_profile.csv', index=False)
print(f'{player_name_nospace}_player_profile.csv')

AlexanderHoogmartens_player_profile.csv
