In [1]:
import time
import requests
import unicodedata
import pandas as pd
import json

from random import random
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
def get_html(url):
    time.sleep(0.3 * random())
    r = requests.get(url)
    return r.text

In [3]:
def get_finals_data():
    url = 'https://en.wikipedia.org/wiki/List_of_Grand_Slam_singles_finals'
    html_text = get_html(url)
    soup = BeautifulSoup(html_text, 'html.parser')
    
    for it, table in enumerate(soup.find_all('table', class_ = 'sortable')):
        table = table.tbody
        
        data = []
        for row in table.find_all('tr'):
            crs = row.find_all('td')
            data.append((crs[0].get_text().strip(),
                         crs[1].get_text().strip(),
                         crs[2].get_text().strip(),
                         crs[3].get_text().strip(),
                         crs[4].get_text().strip()))

        df = pd.DataFrame(data[1:], columns = data[0])
        df.to_csv('data/' + ('wo' if it == 1 else '') + 'men_finals.csv', index = False)

In [52]:
def parse_birth_date(x):
    try:
        return x.split(')')[0].split('(')[1]
    except:
        return ''


def parse_height(x):
    try:
        x = x.split('(')
        ret = ''
        if 'in' in x[0]:
            ret = x[1].split(')')[0]
        else:
            ret = x[0]
        ret = ret.strip()
        return ret
    except:
        return ''


def get_players_info(gender):
    df = pd.read_csv('data/%s_finals.csv' % gender)
    players = set(df['Winner']) | set(df['Runner-up'])

    data = []
    for it, player in enumerate(players):
        if (it + 1) % 10 == 0:
            print('%3d/%3d' % (it + 1, len(players)))
        url = 'https://en.wikipedia.org/wiki/' + player
        html_text = get_html(url)
        if 'Wikipedia does not have an article with this exact name.' in html_text:
            print('Skipping: %s' % player)
            continue
        soup = BeautifulSoup(html_text, 'html.parser')

        cr = {'Name': player}
        for infobox in soup.find_all('table', class_ = 'infobox vcard'):
            for row in infobox.find_all('tr')[1:]:
                if len(row.find_all('th', class_ = 'infobox-header')) > 0:
                    break
                try:
                    attr = unicodedata.normalize('NFKD', row.find_all('th')[0].get_text().strip())
                    val = unicodedata.normalize('NFKD', row.find_all('td')[0].get_text().strip())
                    cr[attr] = val
                except:
                    pass
        data.append(cr)
        
        image = soup.find_all('td', class_ = 'infobox-image')
        if len(image) == 0:
            continue
        image_url = 'https:' + image[0].find_all('a')[0].img.attrs['src']
        with open('data/player_images/%s.jpg' % player, 'wb') as fw:
            fw.write(requests.get(image_url).content)

    df = pd.DataFrame(data)
    df['Born'] = df['Born'].apply(parse_birth_date)
    df['Height'] = df['Height'].apply(parse_height)
    df.to_csv('data/%s_players_info.csv' % gender, index = False)

In [53]:
# get_finals_data()
get_players_info('men')
get_players_info('women')

Skipping: Reginald F. Doherty
 10/294
Skipping: Geoff E. Brown
 20/294
Skipping: C. Gene Mako
Skipping: Francis Kovacs, 2d
 30/294
Skipping: E. Victor Seixas, Jr.
Skipping: Francis T. Hunter
 40/294
Skipping: George M. Lott, Jr.
Skipping: C.St.John
 50/294
Skipping: Tom P. Brown
Skipping: Cecil Parke
 60/294
Skipping: E. Victor Seixas Jr.
Skipping: Henry W. Slocum Jr.
 70/294
Skipping: Wilmer L. Allison
Skipping: S. Welby van Horn
Skipping: Giorgo de Stefani
 80/294
Skipping: R. Falkenburg
Skipping: Henry W. Slocum, Jr.
Skipping: Frederick R. Schroeder, Jr.
 90/294
100/294
Skipping: E. Pockley
110/294
Skipping: Hugh L. Doherty
Skipping: Fred H. Hovey
Skipping: Robert D. Wrenn
120/294
Skipping: Thomas C. Bundy
130/294
Skipping: C.R. McKinley
140/294
Skipping: H. Roper Barrett
150/294
Skipping: Richard N. Williams
160/294
Skipping: Francis X. Shields
170/294
Skipping: E.F. Parker
Skipping: V. St. Leger Gould
Skipping: Roderik Menzel
180/294
Skipping: F.R. Schroeder
Skipping: William A. L

In [13]:
def get_match_stats():
    data_list = []
    tournament_event_ids = []
    tournament_ids = [17, 18, 19, 21]
    for slam in tournament_ids:
        page = 1
        while True:
            url = 'https://www.ultimatetennisstatistics.com/tournamentEventsTable?tournamentId=%d' %slam + '&current=%d' %page
            html_text = json.loads(get_html(url))
            tournament_event_ids.extend([d['id'] for d in html_text['rows']])
            page += 1
            if html_text['rowCount'] == 0: break

    for tournament_event_id in tournament_event_ids:
        url = 'https://www.ultimatetennisstatistics.com/tournamentEvent?tournamentEventId=%d' % tournament_event_id
        html_text = get_html(url)
        soup = BeautifulSoup(html_text, 'html.parser')

        data = {
            'Tournament': soup.find_all('h3')[0].get_text().strip()
        }

        fields = [(2, 2), (3, 1), (4, 0), (4, 1), (4, 2)]
        for col, tr in fields:    
            cr = soup.find_all('div', class_ = 'col-md-%d' % col)[0].find_all('tr')[tr].get_text().strip().split('\n')
            data[cr[0]] = cr[-1]
        data['Winner'] = ' '.join(data['Winner'].split()[:-1])
        data['Runner-up'] = ' '.join(data['Runner-up'].split()[:-1])

        try:
            match_id = int(soup.find_all('table', class_ = 'table-condensed text-nowrap')[0].tbody.find_all('tr')[0].find_all('td', attrs = {'data-round-index': True})[6].find_all('a', attrs = {'onclick': True})[0].attrs['id'].split('-')[1])
            url = 'https://www.ultimatetennisstatistics.com/matchStats?matchId=%d' % match_id
            html_text = get_html(url)
            soup = BeautifulSoup(html_text, 'html.parser')
        except:
            match_id = None

        if match_id is None:
            print('Skipping: %s' % data['Tournament'])
            continue

        for stats in soup.find_all('div', class_ = 'tab-content')[0].find_all('div'):
            stats_type = stats.attrs['id'].split('%s' % match_id)[1]

            subtype = ''
            for substats in stats.find_all('table')[0].find_all('tr'):
                if len(substats.find_all('i')) > 0:
                    subtype = substats.find_all('th')[2].get_text().strip()
                    continue

                ths = substats.find_all('th')

                try:
                    subsubtype = substats.find_all('td')[0].get_text().strip()
                    data['%s_%s_%s_Winner' % (stats_type, subtype, subsubtype)] = ths[1].get_text().strip()
                    data['%s_%s_%s_Runner-up' % (stats_type, subtype, subsubtype)] = ths[2].get_text().strip()
                except:
                    pass

        data_list.append(data)

    df = pd.DataFrame(data_list)
    df.to_csv('data/men_match_stats.csv', index = False)


In [14]:
get_match_stats()

Skipping: Roland Garros 1990
Skipping: Roland Garros 1989
Skipping: Roland Garros 1988
Skipping: Roland Garros 1987
Skipping: Roland Garros 1986
Skipping: Roland Garros 1985
Skipping: Roland Garros 1984
Skipping: Roland Garros 1983
Skipping: Roland Garros 1982
Skipping: Roland Garros 1981
Skipping: Roland Garros 1980
Skipping: Roland Garros 1979
Skipping: Roland Garros 1978
Skipping: Roland Garros 1977
Skipping: Roland Garros 1976
Skipping: Roland Garros 1975
Skipping: Roland Garros 1974
Skipping: Roland Garros 1973
Skipping: Roland Garros 1972
Skipping: Roland Garros 1971
Skipping: Roland Garros 1970
Skipping: Roland Garros 1969
Skipping: Roland Garros 1968
Skipping: Wimbledon 1990
Skipping: Wimbledon 1989
Skipping: Wimbledon 1988
Skipping: Wimbledon 1987
Skipping: Wimbledon 1986
Skipping: Wimbledon 1985
Skipping: Wimbledon 1984
Skipping: Wimbledon 1983
Skipping: Wimbledon 1982
Skipping: Wimbledon 1981
Skipping: Wimbledon 1980
Skipping: Wimbledon 1979
Skipping: Wimbledon 1978
Skipping