In [12]:
# Defines procedures to crawl match statistics

import re
import pandas as pd
import urllib.request as request

from bs4 import BeautifulSoup


def tryParse(pattern, strVal):
    found = re.search(pattern, strVal)
    if found:
        return int(found.group(1))

    return None


def fetch(url):    
    page = request.urlopen(url).read()
    return BeautifulSoup(page, 'html.parser')


def normalize(bag):
    return [round(item / sum(bag), 2) for item in bag]


def crawlMatchUrl(url):
    doc = fetch(url)
    
    for table in doc.select('.scoringtable'):
        print(len(table.select('.linkItem.matchstats a')))
        for link in table.select('.linkItem.matchstats a'):
            yield link['href']


def crawlMatchStats(url):
    doc = fetch(url)

    team1_stats, team2_stats = [], []

    for row in doc.select('#summary #match-stats .row'):
        ignoredStats      = ['Winners']
        normalizableStats = ['Total points won']

        stat = row.select('.statlabel')[0].string

        if stat not in ignoredStats:
            str_team1 = row.select('.team.team1')[0].string
            str_team2 = row.select('.team.team2')[0].string

            percentPattern = r'(\d+)\%'
            val_team1 = tryParse(percentPattern, str_team1)
            val_team2 = tryParse(percentPattern, str_team2)

            if (val_team1 is None) or (val_team2 is None):
                speedPattern = r'(\d+) MPH'
                val_team1 = tryParse(speedPattern, str_team1)
                val_team2 = tryParse(speedPattern, str_team2)

            if (val_team1 is None) or (val_team2 is None):
                try:
                    val_team1 = int(str_team1)
                    val_team2 = int(str_team2)
                except ValueError:
                    val_team1 = float(str_team1)
                    val_team2 = float(str_team2)

            if stat in normalizableStats:
                val_team1, val_team2 = normalize([val_team1, val_team2])

            team1_stats.append(val_team1)
            team2_stats.append(val_team2)

    if 'team1' in doc.select('.crticon.winner')[0].parent['id']:
        winnerStat = 0
    else:
        winnerStat = 1

    return tuple(team1_stats + team2_stats + [winnerStat])

In [20]:
crawlMatchStats('http://www.usopen.org/en_US/scores/completed_matches/day7.html?event=MS')

#df.to_csv('USSSSopen.csv', index=False)

KeyError: 'id'

In [25]:
# Crawl match statistics from ausopen.com using defined procedures in previous cell.

domainUrl = 'http://www.usopen.org'
dayOfMatchesUrl = domainUrl + '/en_US/scores/'

matchUrls = crawlMatchUrl(dayOfMatchesUrl)
matchStat = [crawlMatchStats(domainUrl + url) for url in matchUrls]

In [19]:
# Convert collected statistics to pandas DataFrame

matchStat = [crawlMatchStats('http://www.usopen.org/en_US/scores/stats/day20/21601ms.html')]

templateLabels = [
    'Ace', 'Double faults',
    '1st serves in', '1st serve points won',
    '2nd serve points won', 'Fastest serve',
    'Average 1st serve speed', 'Average 2nd serve speed',
    'Net points won', 'Break points won',
    'Receiving points won', 'Unforced errors',
    'Total points won','Distance Covered (M)',
    'Dist. Covered/Pt. (M)',
]

team1ColLabels = [lbl + ' - Team 1' for lbl in templateLabels]
team2ColLabels = [lbl + ' - Team 2' for lbl in templateLabels]
colLabels  = team1ColLabels + team2ColLabels + ['Match Winner']

df = pd.DataFrame(matchStat, columns = colLabels).dropna()

df[colLabels[:12]] = df[colLabels[:12]].astype(int)
df[colLabels[15:27]] = df[colLabels[15:27]].astype(int)
df[colLabels[-1]]= df[colLabels[-1]].astype(int)

AssertionError: 31 columns passed, passed data had 27 columns

In [18]:
# Save to disk as csv format

df.to_csv('USSSSopen1.csv', index=False)