# Introduction

- Webscrape https://www.tennislive.net for pro-level match data

### Import Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

### Check Status Code
- 200 = success
- 404 = page not found

### Insert Player Profile Link

In [2]:
# Player profile page to scrape; later cells reuse this `url`.
url = 'https://www.tennislive.net/atp/emon-van-loben-sels/'
page = requests.get(url)
# Bare last expression so the notebook displays the HTTP status code.
page.status_code

200

### Extract Match Data

In [4]:
# Labels of the statistic rows read from a match page, in output-column order.
_BASE_STATISTICS = (
    '1st SERVE %',
    '1st SERVE POINTS WON',
    '2nd SERVE POINTS WON',
    'TOTAL RETURN POINTS WON',
    'TOTAL POINTS WON',
    'DOUBLE FAULTS',
    'ACES',
)


def _stat_column_name(stat):
    """Turn a statistic label such as '1st SERVE %' into a column-name stem."""
    return (stat.lower()
                .replace(' ', '_')
                .replace('%', 'percentage')
                .replace('/', '_')
                .replace('(', '')
                .replace(')', ''))


def _extract_statistic(soup, statistic):
    """Return the (player1, player2) values from the row labelled *statistic*.

    Raises AttributeError when no cell carries that label and IndexError when
    the row has fewer than three cells.
    """
    cells = soup.find('td', string=statistic).parent.find_all('td')
    # Keep only the first whitespace-separated token of each cell.
    return (cells[1].text.strip().split(' ')[0],
            cells[2].text.strip().split(' ')[0])


def extract_match_data(urls, *, timeout=30):
    """Scrape match pages and return one DataFrame row per match.

    Args:
        urls: Iterable of match-page URLs to fetch.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of hanging the whole scrape.

    Returns:
        A pandas DataFrame with the columns 'Date', 'Round', 'Player 1',
        'Player 2', 'Score' and 'Tournament', followed by a
        ``<stat>_player1`` / ``<stat>_player2`` pair for every statistic.
        'Date' is parsed with the ``%d.%m.%y`` format and rows are sorted by
        it, oldest first. The frame is empty when no URL yielded a match.

    Pages whose text contains "404 - ERROR", or that lack any of the core
    match fields, are skipped with a printed message. Exceptions raised by
    ``requests.get`` are not caught here.
    """
    data_list = []

    for url in urls:
        # Fetch and parse the match page.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # The check is on the page text, not on the HTTP status code.
        if "404 - ERROR" in soup.text:
            print(f"Skipping URL due to 404 error: {url}")
            continue

        # Core match information. A missing element surfaces either as
        # AttributeError (find() returned None) or IndexError (too few cells);
        # both mean the page cannot be used.
        try:
            w50_cells = soup.find_all('td', class_='w50')
            w130_cells = soup.find_all('td', class_='w130')
            date = w50_cells[0].text.strip().split(' ')[0]
            round_info = w50_cells[1].text.strip()
            player1 = w130_cells[0].text.strip()
            player2 = w130_cells[1].text.strip()
            score = soup.find('span', id='score').text.strip()
            tournament = soup.find('td', class_='w200').find('a').text.strip()
        except (AttributeError, IndexError):
            print(f"Skipping URL due to missing data: {url}")
            continue

        # 'BREAK POINTS WON' is only looked up when the page has exactly
        # eight 'info_txt' cells.
        statistics = list(_BASE_STATISTICS)
        if len(soup.find_all('td', class_='info_txt')) == 8:
            statistics.insert(3, 'BREAK POINTS WON')

        # Statistics are read only if the first stats table has data cells;
        # otherwise every statistic column is filled with None.
        stats_tables = soup.find_all('table', class_='table_stats_match')
        has_stats = (bool(stats_tables)
                     and stats_tables[0].find('td', class_='info_txt') is not None)

        stat_data = {}
        for stat in statistics:
            if has_stats:
                try:
                    player1_stat, player2_stat = _extract_statistic(soup, stat)
                except (AttributeError, IndexError):
                    # This particular row is absent or malformed.
                    player1_stat, player2_stat = None, None
            else:
                player1_stat, player2_stat = None, None

            stat_name = _stat_column_name(stat)
            stat_data[f'{stat_name}_player1'] = player1_stat
            stat_data[f'{stat_name}_player2'] = player2_stat

        data_list.append({
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data,
        })

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(data_list)

    # Sort chronologically (oldest match first)
    if not df.empty:
        df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
        df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)

    return df


### Input individual links HERE
- optional

In [5]:
# Example match pages to scrape directly, without going through a player profile.
urls = ['https://www.tennislive.net/atp/match/david-ferrer-VS-novak-djokovic/open-romania-bucharest-2004/',
        'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/']
# The returned DataFrame is the cell's last expression, so the notebook renders it.
extract_match_data(urls)

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2004-09-16,2nd round,David Ferrer,Novak Djokovic,"4-6, 6-4, 6-4",Bucharest,,,,,...,,,,,,,,,,
1,2024-05-31,1/4,Yuta Shimizu,Rudy Quan,"6-4, 6-3",Little Rock,37/55,64/80,21/37,29/64,...,44/80,25/55,74/135,61/135,1.0,2.0,4.0,0.0,7/16,4/7


# Accommodate Player Profile Pages

In [6]:
def complete_profile(url, *, timeout=30):
    """Return the per-year page URLs linked from a player's profile page.

    Args:
        url: URL of the player profile page.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of hanging.

    Returns:
        A list of absolute URLs, one for every link whose ``href`` starts
        with '?y=' inside a table row whose class matches 'pair' or
        'unpair', in document order. Empty when no such link exists.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    from urllib.parse import urljoin

    page = requests.get(url, timeout=timeout)
    # 'html' names a markup type rather than a specific parser, so
    # BeautifulSoup chooses among the HTML parsers that are installed.
    soup = BeautifulSoup(page.text, 'html')

    rows = soup.find_all('tr', class_=re.compile(r'pair|unpair'))

    year_urls = []
    for row in rows:
        for link in row.find_all('a', href=True):
            href = link['href']
            if href.startswith('?y='):
                # urljoin swaps in the new query string, so the result is
                # well-formed even if `url` already carries a '?y=...' part.
                year_urls.append(urljoin(url, href))

    return year_urls

In [7]:
# Show the per-year profile URLs found on the player page in `url`.
complete_profile(url)

['https://www.tennislive.net/atp/emon-van-loben-sels/?y=2024',
 'https://www.tennislive.net/atp/emon-van-loben-sels/?y=2023']

In [8]:
def player_profile(urls, *, timeout=30):
    """Scrape every match linked from the given player pages.

    Args:
        urls: Iterable of player page URLs (for example the per-year pages
            produced by ``complete_profile``).
        timeout: Seconds handed to ``requests.get`` for each player page so
            that a stalled connection raises instead of hanging.

    Returns:
        A pandas DataFrame holding the rows that ``extract_match_data``
        returns for each page, concatenated in the order of ``urls`` with a
        fresh index. Empty when no page yielded any match.

    Pages with fewer than two 'table_pmatches' tables are skipped with a
    printed message. Exceptions raised by ``requests.get`` are not caught.
    """
    yearly_frames = []

    for url in urls:
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html')

        # The match links are read from the second 'table_pmatches' table;
        # without it there is nothing to scrape on this page.
        match_tables = soup.find_all('table', class_='table_pmatches')
        if len(match_tables) < 2:
            print(f"Skipping URL due to missing match table: {url}")
            continue
        table = match_tables[1]

        match_links = [
            a['href']
            for a in table.find_all('a', href=True)
            if "https://www.tennislive.net/atp/match/" in a['href']
        ]

        df_year = extract_match_data(match_links)
        # Empty frames add no rows or columns, so they are left out.
        if not df_year.empty:
            yearly_frames.append(df_year)

    # pd.concat raises on an empty list, so handle "nothing scraped" first.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(yearly_frames, ignore_index=True)

    

In [9]:
# Collect the player's per-year pages, then scrape every match they link to.
urls = complete_profile(url)
# Alternative: scrape a single season by listing its URL explicitly.
# urls = ['https://www.tennislive.net/atp/novak-djokovic/?y=2004']
df = player_profile(urls)

In [10]:
# Preview the first rows of the scraped match data.
df.head()

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,break_points_won_player1,break_points_won_player2,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2
0,2024-06-03,q 1,Emon Van Loben Sels,Perry Gregg,"6-2, 4-1, - retired",M15 San,,,,,...,,,,,,,,,,
1,2024-06-04,qual.,Alexander Petrov,Emon Van Loben Sels,"6-3, 6-3",M15 San,,,,,...,,,,,,,,,,
2,2024-06-17,q 1,Noah Zamora,Emon Van Loben Sels,"4-6, 6-4, 11-9",M15 Rancho,,,,,...,,,,,,,,,,
3,2024-06-24,q 1,Emon Van Loben Sels,Phoenix Weir,"6-0, 6-0",M15 Los,,,,,...,,,,,,,,,,
4,2024-06-25,qual.,Emon Van Loben Sels,Ryan Dickerson,"3-6, 6-1, 14-12",M15 Los,,,,,...,,,,,,,,,,


In [11]:
# Fetch the profile page again to build a name for the output file.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

player_info = soup.find('div', class_ = 'player_stats')
# Text of the first link inside the 'player_stats' div, with spaces removed.
# NOTE(review): presumably that link holds the player's name — confirm against the page.
player_name_nospace = player_info.find('a').text.strip().replace(' ', '')

In [12]:
# Save the scraped matches as '<player_name_nospace>_player_profile.csv', omitting the index column.
df.to_csv(f'{player_name_nospace}_player_profile.csv', index=False)