In [53]:
# Defines procedures to crawl match statistics

import re
import pandas as pd
import urllib.request as request

from bs4 import BeautifulSoup


def tryParse(pattern, strVal):
    """Search ``strVal`` for ``pattern`` and return its first group as an int.

    Args:
        pattern: Regular expression containing at least one capture group.
        strVal: Text to search.

    Returns:
        The first capture group converted with ``int``, or ``None`` when the
        pattern does not occur anywhere in ``strVal``.
    """
    match = re.search(pattern, strVal)
    return int(match.group(1)) if match else None


def fetch(url):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` document built with the ``'html.parser'`` backend.

    Any exception raised by ``urlopen`` (e.g. on a network or HTTP error)
    propagates to the caller.
    """
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if reading fails.
    with request.urlopen(url) as response:
        page = response.read()
    return BeautifulSoup(page, 'html.parser')





def normalize(bag):
    """Scale the numbers in ``bag`` so they sum to 1, rounded to 2 decimals.

    Args:
        bag: Iterable of numbers. It is materialised once, so one-shot
            iterators are handled correctly.

    Returns:
        A list with ``round(item / total, 2)`` for every item, where ``total``
        is the sum of all items. An empty ``bag`` yields an empty list.

    Raises:
        ZeroDivisionError: If ``bag`` is non-empty and its items sum to zero.
    """
    values = list(bag)
    # The total is loop-invariant: compute it once instead of once per item.
    total = sum(values)
    return [round(value / total, 2) for value in values]


def crawlMatchUrl(url):
    """Yield the match-statistics links found on a page of completed matches.

    The page at ``url`` is fetched and every ``.scoringtable`` element whose
    ``data-event`` attribute equals ``'MS'`` is scanned for
    ``.linkItem.matchstats a`` anchors.
    NOTE(review): 'MS' presumably selects one event type (e.g. men's
    singles) -- confirm against the site markup.

    Args:
        url: Address of the page listing the matches.

    Yields:
        The ``href`` attribute of each matching anchor, as found in the page.
    """
    doc = fetch(url)

    for table in doc.select('.scoringtable'):
        # .get() returns None for tables without a data-event attribute, so
        # such tables are skipped instead of aborting the crawl with KeyError.
        if table.get('data-event') != 'MS':
            continue

        for link in table.select('.linkItem.matchstats a'):
            yield link['href']


def crawlMatchStats(url):
    """Scrape both teams' statistics and the winner from one match page.

    Every ``#summary #match-stats .row`` element contributes one value per
    team, except rows whose label is listed in ``ignoredStats``. The two
    cell texts of a row are parsed as a pair, trying in order:

    1. a percentage (``'<digits> %'``) -> int;
    2. a speed (``'<digits> KMH'``) -> value divided by 1.6, rounded to 2
       decimals (presumably a km/h to mph conversion with an approximate
       factor -- confirm);
    3. a plain ``int``, falling back to ``float`` for both cells when either
       is not an integer literal.

    Rows listed in ``normalizableStats`` are then rescaled with ``normalize``
    so the two teams' values sum to 1.

    Args:
        url: Address of the match statistics page.

    Returns:
        tuple: team 1's values, then team 2's values, then the winner flag:
        ``0`` when the ``.crticon.winner`` element's parent id contains
        ``'team1'``, otherwise ``1``.

    Raises:
        IndexError: If a row lacks a label/team cell or the page has no
            ``.crticon.winner`` element.
        ValueError: If a pair of cells matches none of the formats above.
    """
    # Loop-invariant configuration, defined once rather than on every row.
    ignoredStats = ['Winners']
    normalizableStats = ['Total points won']
    percentPattern = r'(\d+) \%'
    speedPattern = r'(\d+) KMH'
    speedDivisor = 1.6

    doc = fetch(url)

    team1_stats, team2_stats = [], []

    for row in doc.select('#summary #match-stats .row'):
        stat = row.select('.statlabel')[0].string
        if stat in ignoredStats:
            continue

        str_team1 = row.select('.team.team1')[0].string
        str_team2 = row.select('.team.team2')[0].string

        val_team1 = tryParse(percentPattern, str_team1)
        val_team2 = tryParse(percentPattern, str_team2)

        if (val_team1 is None) or (val_team2 is None):
            val_team1 = tryParse(speedPattern, str_team1)
            val_team2 = tryParse(speedPattern, str_team2)
            # Convert only when BOTH speeds were parsed. The previous `or`
            # divided None by 1.6 (TypeError) when just one side matched.
            if (val_team1 is not None) and (val_team2 is not None):
                val_team1 = round(val_team1 / speedDivisor, 2)
                val_team2 = round(val_team2 / speedDivisor, 2)

        if (val_team1 is None) or (val_team2 is None):
            # Plain numbers: keep both as int when possible, otherwise parse
            # both as float so the pair shares one numeric type.
            try:
                val_team1 = int(str_team1)
                val_team2 = int(str_team2)
            except ValueError:
                val_team1 = float(str_team1)
                val_team2 = float(str_team2)

        if stat in normalizableStats:
            val_team1, val_team2 = normalize([val_team1, val_team2])

        team1_stats.append(val_team1)
        team2_stats.append(val_team2)

    # The winner marker sits inside an element whose id names the team.
    if 'team1' in doc.select('.crticon.winner')[0].parent['id']:
        winnerStat = 0
    else:
        winnerStat = 1

    return tuple(team1_stats + team2_stats + [winnerStat])

In [54]:
# Crawl match statistics from ausopen.com using the procedures defined in the previous cell.

domainUrl = 'http://www.ausopen.com'
# Page listing completed matches; the day ('day7') is hard-coded in the path.
dayOfMatchesUrl = domainUrl + '/en_AU/scores/completed_matches/day7.html'

# crawlMatchUrl is a generator: the listing page is only fetched once the
# comprehension below starts iterating, and matchUrls can be consumed only once.
matchUrls = crawlMatchUrl(dayOfMatchesUrl)
# Each yielded link is prefixed with domainUrl and scraped in turn, giving one
# tuple of statistics per match.
matchStat = [crawlMatchStats(domainUrl + url) for url in matchUrls]

7
6
64
69
50
125.0
113.12
90.0
70
38
39
38
180
3319.5
9.2
15
6
52
72
44
140.0
124.38
101.88
56
25
33
42
91
2300.9
11.6
5
7
63
63
32
135.62
127.5
107.5
54
0
28
43
68
2190.1
13.1
2
2
63
62
61
132.5
121.88
93.12
64
0
20
23
66
1210.5
8.2
8
4
58
67
51
125.0
109.38
86.88
55
67
36
27
113
1567.8
6.6
16
0
69
79
47
128.75
114.38
93.12
74
40
34
43
137
2677.1
10.3
12
2
64
89
68
146.25
122.5
115.62
73
67
40
20
84
1000.9
7.2
10
4
64
74
53
140.0
122.5
98.75
63
0
13
18
74
860.1
4.9
12
10
53
74
47
122.5
112.5
94.38
67
29
32
57
146
3270.8
10.5
12
2
70
84
58
129.38
117.5
101.88
79
33
37
25
133
1937.2
7.9
7
3
65
82
62
118.12
111.88
92.5
33
63
44
26
91
1785.5
11.5
7
1
51
71
58
0
40
43
36
109
7
4
60
72
31
56
33
24
43
89
7
1
66
61
50
123.12
108.12
86.88
65
44
39
48
114
2939.8
12.4
12
3
70
83
72
126.88
116.25
97.5
100
38
49
27
90
1459.3
10.1
7
5
56
53
47
55
36
30
65
85
3
2
58
87
74
53
56
45
20
86
16
9
54
74
52
69
33
29
42
113
34
8
61
89
51
129.38
116.25
98.75
70
9
34
51
187
2508.2
7.0
12
1
67
72
56
121.88
110

In [55]:
# Convert collected statistics to a pandas DataFrame

# Statistic names, in the order the values appear for each team in a
# matchStat tuple.
templateLabels = [
    'Ace', 'Double faults',
    '1st serves in', '1st serve points won',
    '2nd serve points won', 'Fastest serve',
    'Average 1st serve speed', 'Average 2nd serve speed',
    'Net points won', 'Break points won',
    'Receiving points won', 'Unforced errors',
    'Total points won','Distance Covered (M)',
    'Dist. Covered/Pt. (M)',
]

team1ColLabels = [lbl + ' - Team 1' for lbl in templateLabels]
team2ColLabels = [lbl + ' - Team 2' for lbl in templateLabels]
colLabels  = team1ColLabels + team2ColLabels + ['Match Winner']

# Rows containing any missing value are dropped.
df = pd.DataFrame(matchStat, columns = colLabels).dropna()

# Every statistic listed before 'Total points won' is stored as an integer for
# both teams, as is the winner flag. The bounds are derived from the label
# list instead of hard-coded slice indices (12 / 15 / 27), so they stay
# correct if templateLabels is edited.
intStatCount = templateLabels.index('Total points won')
intColLabels = (
    team1ColLabels[:intStatCount]
    + team2ColLabels[:intStatCount]
    + ['Match Winner']
)
# NOTE(review): astype(int) truncates the fractional part of the serve-speed
# columns (e.g. 113.12 -> 113) -- confirm this loss of precision is intended.
df = df.astype({label: int for label in intColLabels})

In [56]:
# Display the DataFrame of match statistics.
df

Unnamed: 0,Ace - Team 1,Double faults - Team 1,1st serves in - Team 1,1st serve points won - Team 1,2nd serve points won - Team 1,Fastest serve - Team 1,Average 1st serve speed - Team 1,Average 2nd serve speed - Team 1,Net points won - Team 1,Break points won - Team 1,...,Average 1st serve speed - Team 2,Average 2nd serve speed - Team 2,Net points won - Team 2,Break points won - Team 2,Receiving points won - Team 2,Unforced errors - Team 2,Total points won - Team 2,Distance Covered (M) - Team 2,Dist. Covered/Pt. (M) - Team 2,Match Winner
0,7,6,64,69,50,125,113,90,70,38,...,122,91,93,33,38,91,0.5,3026.3,8.4,1
1,15,6,52,72,44,140,124,101,56,25,...,106,89,63,36,42,23,0.54,2395.6,12.0,1
2,5,7,63,63,32,135,127,107,54,0,...,116,93,86,46,48,20,0.59,2308.7,13.8,1
3,2,2,63,62,61,132,121,93,64,0,...,116,99,67,11,39,21,0.55,1235.3,8.3,1
4,8,4,58,67,51,125,109,86,55,67,...,126,108,52,71,40,51,0.52,1928.9,8.1,1
5,16,0,69,79,47,128,114,93,74,40,...,115,92,63,7,31,35,0.47,2848.1,10.9,0
6,12,2,64,89,68,146,122,115,73,67,...,115,100,50,0,19,11,0.4,947.8,6.8,0
7,10,4,64,74,53,140,122,98,63,0,...,129,111,58,50,34,15,0.58,990.4,5.6,1
8,12,10,53,74,47,122,112,94,67,29,...,117,88,70,23,39,43,0.53,3260.6,10.4,1
9,12,2,70,84,58,129,117,101,79,33,...,120,105,59,0,24,55,0.45,1852.8,7.6,0


In [57]:
# Save the statistics to disk in CSV format; index=False leaves the row index
# out of the file.

df.to_csv('ausopen.csv', index=False)

In [58]:
# Load the saved CSV file back into a new DataFrame and display it.
localData = pd.read_csv('ausopen.csv')

localData

Unnamed: 0,Ace - Team 1,Double faults - Team 1,1st serves in - Team 1,1st serve points won - Team 1,2nd serve points won - Team 1,Fastest serve - Team 1,Average 1st serve speed - Team 1,Average 2nd serve speed - Team 1,Net points won - Team 1,Break points won - Team 1,...,Average 1st serve speed - Team 2,Average 2nd serve speed - Team 2,Net points won - Team 2,Break points won - Team 2,Receiving points won - Team 2,Unforced errors - Team 2,Total points won - Team 2,Distance Covered (M) - Team 2,Dist. Covered/Pt. (M) - Team 2,Match Winner
0,7,6,64,69,50,125,113,90,70,38,...,122,91,93,33,38,91,0.5,3026.3,8.4,1
1,15,6,52,72,44,140,124,101,56,25,...,106,89,63,36,42,23,0.54,2395.6,12.0,1
2,5,7,63,63,32,135,127,107,54,0,...,116,93,86,46,48,20,0.59,2308.7,13.8,1
3,2,2,63,62,61,132,121,93,64,0,...,116,99,67,11,39,21,0.55,1235.3,8.3,1
4,8,4,58,67,51,125,109,86,55,67,...,126,108,52,71,40,51,0.52,1928.9,8.1,1
5,16,0,69,79,47,128,114,93,74,40,...,115,92,63,7,31,35,0.47,2848.1,10.9,0
6,12,2,64,89,68,146,122,115,73,67,...,115,100,50,0,19,11,0.4,947.8,6.8,0
7,10,4,64,74,53,140,122,98,63,0,...,129,111,58,50,34,15,0.58,990.4,5.6,1
8,12,10,53,74,47,122,112,94,67,29,...,117,88,70,23,39,43,0.53,3260.6,10.4,1
9,12,2,70,84,58,129,117,101,79,33,...,120,105,59,0,24,55,0.45,1852.8,7.6,0
