# Introduction

- Webscrape https://www.tennislive.net for pro-level match data

### Import Packages

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


### Check Status Code
- 200 = success
- 404 = page not found

### Insert Player Profile Link

In [10]:
# Profile page of the player to scrape; change this URL to target another player.
url = 'https://www.tennislive.net/atp/novak-djokovic/'
# The timeout keeps this cell from hanging indefinitely if the site does not respond.
page = requests.get(url, timeout=30)
# Last expression of the cell: displays the HTTP status code of the response.
page.status_code

200

### Extract Match Data

In [11]:
def _stat_column_name(stat):
    """Convert a stat label (e.g. '1st SERVE %') into a snake_case column stem."""
    return (stat.lower()
                .replace(' ', '_')
                .replace('%', 'percentage')
                .replace('/', '_')
                .replace('(', '')
                .replace(')', ''))


def _extract_statistic(soup, statistic):
    """Return the (player1, player2) values from the row labelled `statistic`.

    Gives (None, None) when no cell carries that exact label, or when the row
    has fewer than three cells, so one missing stat does not abort the scrape.
    """
    label_cell = soup.find('td', string=statistic)
    if label_cell is None:
        return None, None
    cells = label_cell.parent.find_all('td')
    if len(cells) < 3:
        return None, None
    # Keep only the first space-separated token of each player's cell.
    return (cells[1].text.strip().split(' ')[0],
            cells[2].text.strip().split(' ')[0])


def extract_match_data(urls, timeout=30):
    """Scrape match pages and return one DataFrame row per URL.

    Parameters
    ----------
    urls : iterable of str
        Match-page URLs to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP response (default 30); passed to
        ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Round', 'Player 1', 'Player 2', 'Score' and
        'Tournament', plus '<stat>_player1' / '<stat>_player2' for every
        statistic. Rows are sorted by 'Date' ascending with a fresh index.
        When `urls` is empty, an empty frame with the six base columns is
        returned.

    Raises
    ------
    requests.HTTPError
        If a page responds with an error status.
    IndexError, AttributeError
        If a page lacks the header cells (date, round, players, score,
        tournament) this function reads.
    """
    base_columns = ['Date', 'Round', 'Player 1', 'Player 2', 'Score', 'Tournament']
    base_statistics = [
        '1st SERVE %',
        '1st SERVE POINTS WON',
        '2nd SERVE POINTS WON',
        'TOTAL RETURN POINTS WON',
        'TOTAL POINTS WON',
        'DOUBLE FAULTS',
        'ACES'
    ]
    data_list = []

    for url in urls:
        # Fetch and parse the match page; fail loudly on HTTP errors.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Header information: look each cell group up once and index into it.
        w50_cells = soup.find_all('td', class_='w50')
        w130_cells = soup.find_all('td', class_='w130')
        date = w50_cells[0].text.strip().split(' ')[0]
        round_info = w50_cells[1].text.strip()
        player1 = w130_cells[0].text.strip()
        player2 = w130_cells[1].text.strip()
        score = soup.find('span', id='score').text.strip()
        tournament = soup.find('td', class_='w200').find('a').text.strip()

        # Pages with eight 'info_txt' cells also carry a break-points row.
        statistics = list(base_statistics)
        if len(soup.find_all('td', class_='info_txt')) == 8:
            statistics.insert(3, 'BREAK POINTS WON')

        # Read stats only when the first stats table actually contains data.
        tables = soup.find_all('table', class_='table_stats_match')
        has_stats = bool(tables) and tables[0].find('td', class_='info_txt') is not None

        stat_data = {}
        for stat in statistics:
            if has_stats:
                player1_stat, player2_stat = _extract_statistic(soup, stat)
            else:
                player1_stat, player2_stat = None, None
            stat_name = _stat_column_name(stat)
            stat_data[f'{stat_name}_player1'] = player1_stat
            stat_data[f'{stat_name}_player2'] = player2_stat

        data_list.append({
            'Date': date,
            'Round': round_info,
            'Player 1': player1,
            'Player 2': player2,
            'Score': score,
            'Tournament': tournament,
            **stat_data
        })

    # With no URLs there is no 'Date' column to parse; return an empty frame.
    if not data_list:
        return pd.DataFrame(columns=base_columns)

    df = pd.DataFrame(data_list)

    # Parse dates (dd.mm.yy) and sort oldest match first.
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%y')
    return df.sort_values(by='Date', ascending=True).reset_index(drop=True)

### Input individual links HERE
- optional

In [12]:
# urls = ['https://www.tennislive.net/atp/match/rudy-quan-VS-jonas-pelle-hartenstein/m15-orange-park-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-shintaro-imai/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-filip-peliwo/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-stefan-kozlov/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/rudy-quan-VS-andres-andrade/little-rock-challenger-2024/',
#         'https://www.tennislive.net/atp/match/yuta-shimizu-VS-rudy-quan/little-rock-challenger-2024/']

# extract_match_data(urls)

# Accommodate the Player Profile Page

In [13]:
def player_profile(url, timeout=30):
    """Collect the match links on a player profile page and scrape each match.

    Parameters
    ----------
    url : str
        URL of the player's profile page.
    timeout : float, optional
        Seconds to wait for the profile page response (default 30).

    Returns
    -------
    pandas.DataFrame
        Whatever ``extract_match_data`` returns for the match links found in
        the second 'table_pmatches' table of the page.

    Raises
    ------
    requests.HTTPError
        If the profile page responds with an error status.
    IndexError
        If the page has fewer than two 'table_pmatches' tables.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed; this matches the parser used for match pages.
    soup = BeautifulSoup(page.text, 'html.parser')

    tables = soup.find_all('table', class_='table_pmatches')
    if len(tables) < 2:
        raise IndexError(
            f"expected at least two 'table_pmatches' tables at {url}, "
            f"found {len(tables)}"
        )
    table = tables[1]

    # Keep only anchors that point at ATP match pages.
    match_links = [
        a['href']
        for a in table.find_all('a', href=True)
        if "https://www.tennislive.net/atp/match/" in a['href']
    ]

    return extract_match_data(match_links)

In [14]:
# Scrape the matches linked from the profile page at `url` into a DataFrame.
df = player_profile(url)

In [15]:
# Display the scraped match table.
df

Unnamed: 0,Date,Round,Player 1,Player 2,Score,Tournament,1st_serve_percentage_player1,1st_serve_percentage_player2,1st_serve_points_won_player1,1st_serve_points_won_player2,...,total_return_points_won_player1,total_return_points_won_player2,total_points_won_player1,total_points_won_player2,double_faults_player1,double_faults_player2,aces_player1,aces_player2,break_points_won_player1,break_points_won_player2
0,2023-11-23,,Novak Djokovic,Cameron Norrie,"6-4, 6-4",Davis Cup,27/48,53/75,24/27,35/53,...,30/75,8/48,70/123,53/123,1.0,1.0,8.0,4.0,,
1,2023-11-25,,Jannik Sinner,Novak Djokovic,"6-2, 2-6, 7-5",Davis Cup,67/105,46/77,48/67,35/46,...,24/77,40/105,89/182,93/182,5.0,1.0,12.0,6.0,3/4,2/9
2,2023-12-31,,Novak Djokovic,Zhizhen Zhang,"6-3, 6-2",Australia,28/46,35/57,24/28,24/35,...,26/57,10/46,62/103,41/103,1.0,2.0,7.0,4.0,,
3,2024-01-02,,Novak Djokovic,Jiri Lehecka,"6-1, 6-73, 6-1",Australia,49/80,47/75,36/49,30/47,...,36/75,25/80,91/155,64/155,2.0,1.0,3.0,4.0,6/11,2/5
4,2024-01-03,1/4,Alex De Minaur,Novak Djokovic,"6-4, 6-4",Australia,34/49,43/68,33/34,30/43,...,27/68,8/49,68/117,49/117,2.0,1.0,7.0,4.0,,
5,2024-01-14,1st round,Novak Djokovic,Dino Prizmic,"6-2, 6-75, 6-3, 6-4",Melbourne,77/129,84/123,51/77,55/84,...,56/123,46/129,139/252,113/252,1.0,7.0,11.0,7.0,8/17,4/9
6,2024-01-17,2nd round,Novak Djokovic,Alexei Popyrin,"6-3, 4-6, 7-64, 6-3",Melbourne,73/119,64/111,60/73,47/64,...,37/111,32/119,124/230,106/230,0.0,4.0,11.0,17.0,3/11,2/7
7,2024-01-19,3rd round,Novak Djokovic,Tomas Martin Etcheverry,"6-3, 6-3, 7-62",Melbourne,57/84,72/100,49/57,51/72,...,39/100,19/84,104/184,80/184,0.0,1.0,10.0,13.0,,
8,2024-01-21,4th round,Novak Djokovic,Adrian Mannarino,"6-0, 6-0, 6-3",Melbourne,48/69,34/58,39/48,17/34,...,33/58,17/69,85/127,42/127,5.0,0.0,17.0,1.0,7/11,0/3
9,2024-01-23,1/4,Novak Djokovic,Taylor Harry Fritz,"7-63, 4-6, 6-2, 6-3",Melbourne,72/115,90/154,59/72,59/90,...,64/154,31/115,148/269,121/269,3.0,2.0,20.0,16.0,4/21,2/6


In [8]:
# Write the scraped matches to player.csv, omitting the DataFrame index.
df.to_csv('player.csv', index=False)