In [12]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import urllib.parse
import numpy as np

pd.set_option('display.max_colwidth', None)

### AFL Game - data wrangling player statistics

In [13]:
# Read sample game
# pd.read_html returns a list of DataFrames (one per HTML <table> on the page),
# so `df` is a list here; the cells below work with df[0], df[2] and df[4].
df = pd.read_html("https://afltables.com/afl/stats/games/2022/151620220821.html")

In [14]:
# Preview table (St Kilda players' stats for the game)
df[2].head()

Unnamed: 0_level_0,St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game],St Kilda Match Statistics [Season][Game by Game]
Unnamed: 0_level_1,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,FA,BR,CP,UP,CM,MI,1%,BO,GA,%P
0,26,"Battle, Josh",11,3.0,2.0,13,,,,3.0,...,1.0,,3.0,10.0,,,1.0,,,49.0
1,16,"Butler, Dan",8,,3.0,11,1.0,,,7.0,...,2.0,,7.0,3.0,,,1.0,,,76.0
2,38,"Campbell, Tom",8,5.0,4.0,12,,,17.0,4.0,...,,,10.0,2.0,3.0,,5.0,,,84.0
3,11 ↓,"Clark, Hunter",4,,,4,,,,1.0,...,,,1.0,3.0,,,,,,34.0
4,10,"Hannebery, Dan",18,2.0,12.0,30,,,,6.0,...,1.0,3.0,13.0,17.0,,,,,,70.0


In [15]:
# Drop top table heading
df[2] = df[2].droplevel(0, axis=1) # St Kilda table
df[4] = df[4].droplevel(0, axis=1) # Sydney table

In [16]:
df[2].head()

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,FA,BR,CP,UP,CM,MI,1%,BO,GA,%P
0,26,"Battle, Josh",11,3.0,2.0,13,,,,3.0,...,1.0,,3.0,10.0,,,1.0,,,49.0
1,16,"Butler, Dan",8,,3.0,11,1.0,,,7.0,...,2.0,,7.0,3.0,,,1.0,,,76.0
2,38,"Campbell, Tom",8,5.0,4.0,12,,,17.0,4.0,...,,,10.0,2.0,3.0,,5.0,,,84.0
3,11 ↓,"Clark, Hunter",4,,,4,,,,1.0,...,,,1.0,3.0,,,,,,34.0
4,10,"Hannebery, Dan",18,2.0,12.0,30,,,,6.0,...,1.0,3.0,13.0,17.0,,,,,,70.0


In [17]:
# Record club name in each row
df[2]['Club'] = df[0][1][1]
df[4]['Club'] = df[0][1][2]

# Record opponent club name in each row
df[2]['Opponent'] = df[0][1][2]
df[4]['Opponent'] = df[0][1][1]

In [18]:
df[4].head()

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,CP,UP,CM,MI,1%,BO,GA,%P,Club,Opponent
0,22,"Blakey, Nick",12,2.0,5,17,,,,1.0,...,4,9,,,3.0,1.0,1.0,74.0,Sydney,St Kilda
1,4,"Clarke, Ryan",4,1.0,6,10,,,,5.0,...,4,7,,,3.0,,1.0,86.0,Sydney,St Kilda
2,13,"Florent, Oliver",16,5.0,5,21,,1.0,,1.0,...,10,13,,1.0,3.0,,,86.0,Sydney,St Kilda
3,42,"Fox, Robbie",11,5.0,6,17,,,,,...,3,15,,,,,,88.0,Sydney,St Kilda
4,23,"Franklin, Lance",8,4.0,3,11,2.0,1.0,,1.0,...,7,5,2.0,1.0,1.0,1.0,1.0,93.0,Sydney,St Kilda


In [19]:
# Record year and round in each row
# df[0][1][0] is a single text cell from the first table on the page.
#   Round: the text between 'Round: ' and ' Venue'.
#   Year:  the text after the second '-' in that cell, up to the next space.
# Both values are kept as strings (no int conversion is done here).
df[2]['Round'] = df[4]['Round'] = df[0][1][0].split('Round: ')[1].split(" Venue")[0]
df[2]['Year'] = df[4]['Year'] = df[0][1][0].split('-')[2].split(' ')[0]

In [20]:
df[4]

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,CM,MI,1%,BO,GA,%P,Club,Opponent,Round,Year
0,22,"Blakey, Nick",12,2.0,5,17,,,,1.0,...,,,3.0,1.0,1.0,74.0,Sydney,St Kilda,23,2022
1,4,"Clarke, Ryan",4,1.0,6,10,,,,5.0,...,,,3.0,,1.0,86.0,Sydney,St Kilda,23,2022
2,13,"Florent, Oliver",16,5.0,5,21,,1.0,,1.0,...,,1.0,3.0,,,86.0,Sydney,St Kilda,23,2022
3,42,"Fox, Robbie",11,5.0,6,17,,,,,...,,,,,,88.0,Sydney,St Kilda,23,2022
4,23,"Franklin, Lance",8,4.0,3,11,2.0,1.0,,1.0,...,2.0,1.0,1.0,1.0,1.0,93.0,Sydney,St Kilda,23,2022
5,21,"Gulden, Errol",15,5.0,5,20,1.0,,,4.0,...,,1.0,,,,77.0,Sydney,St Kilda,23,2022
6,9,"Hayward, Will",11,8.0,4,15,3.0,1.0,,2.0,...,1.0,1.0,1.0,,1.0,89.0,Sydney,St Kilda,23,2022
7,5,"Heeney, Isaac",16,6.0,7,23,2.0,2.0,,3.0,...,2.0,1.0,,,,86.0,Sydney,St Kilda,23,2022
8,31,"Hickey, Tom",6,1.0,4,10,,,24.0,1.0,...,1.0,,5.0,,1.0,72.0,Sydney,St Kilda,23,2022
9,44,"Lloyd, Jake",14,7.0,10,24,,,,1.0,...,,,3.0,,1.0,89.0,Sydney,St Kilda,23,2022


In [21]:
# Add result if the player's team won or lost the match
# The last '.'-separated piece of each score cell is the total points. Convert
# it to int before comparing: as strings, "100" > "99" is False, which would
# record the wrong winner whenever the two totals have different digit counts.
team1_score = int(df[0][5][1].split('.')[2])
team2_score = int(df[0][5][2].split('.')[2])

if team1_score > team2_score:
    df[2]['Result'] = 'W'
    df[4]['Result'] = 'L'
elif team2_score > team1_score:
    df[2]['Result'] = 'L'
    df[4]['Result'] = 'W'
else:
    # Equal totals: the match was drawn
    df[2]['Result'] = 'D'
    df[4]['Result'] = 'D'

In [22]:
# Add game margin
df[2]['Margin'] = int(team1_score) - int(team2_score)
df[4]['Margin'] = int(team2_score) - int(team1_score)

In [23]:
# Check the bottom of the table
df[2].tail()

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
21,44,"Wilkie, Callum",10,7,3,13,,,,,...,7.0,,,100.0,St Kilda,Sydney,23,2022,L,-14
22,17,"Windhager, Marcus",6,1,5,11,1,,,1.0,...,1.0,,,90.0,St Kilda,Sydney,23,2022,L,-14
23,Rushed,Rushed,Rushed,Rushed,Rushed,Rushed,Rushed,2.0,,,...,,,,,St Kilda,Sydney,23,2022,L,-14
24,Totals,Totals,244,120,132,376,11,8.0,38.0,59.0,...,52.0,4.0,6.0,,St Kilda,Sydney,23,2022,L,-14
25,Opposition,Opposition,219,84,128,347,13,10.0,28.0,56.0,...,48.0,3.0,10.0,,St Kilda,Sydney,23,2022,L,-14


In [24]:
# Replace NaN with 0s
df[2] = df[2].fillna(0)
df[4] = df[4].fillna(0)

In [25]:
# Drop 'Rushed behind' row and totals rows from the bottom
df[2] = df[2][(df[2]['#'] != 'Rushed') & (df[2]['#'] != 'Totals') & (df[2]['#'] != 'Opposition')]
df[4] = df[4][(df[4]['#'] != 'Rushed') & (df[4]['#'] != 'Totals') & (df[4]['#'] != 'Opposition')]

In [26]:
df[2]

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
0,26,"Battle, Josh",11,3,2,13,0,0.0,0.0,3.0,...,1.0,0.0,0.0,49.0,St Kilda,Sydney,23,2022,L,-14
1,16,"Butler, Dan",8,0,3,11,1,0.0,0.0,7.0,...,1.0,0.0,0.0,76.0,St Kilda,Sydney,23,2022,L,-14
2,38,"Campbell, Tom",8,5,4,12,0,0.0,17.0,4.0,...,5.0,0.0,0.0,84.0,St Kilda,Sydney,23,2022,L,-14
3,11 ↓,"Clark, Hunter",4,0,0,4,0,0.0,0.0,1.0,...,0.0,0.0,0.0,34.0,St Kilda,Sydney,23,2022,L,-14
4,10,"Hannebery, Dan",18,2,12,30,0,0.0,0.0,6.0,...,0.0,0.0,0.0,70.0,St Kilda,Sydney,23,2022,L,-14
5,22,"Higgins, Jack",9,7,4,13,1,2.0,0.0,1.0,...,2.0,0.0,0.0,73.0,St Kilda,Sydney,23,2022,L,-14
6,8,"Hill, Bradley",12,8,6,18,0,0.0,0.0,3.0,...,4.0,0.0,0.0,82.0,St Kilda,Sydney,23,2022,L,-14
7,20,"Howard, Dougal",9,9,3,12,0,0.0,0.0,0.0,...,5.0,0.0,0.0,100.0,St Kilda,Sydney,23,2022,L,-14
8,3,"Jones, Zak",7,4,7,14,0,0.0,0.0,3.0,...,1.0,0.0,0.0,76.0,St Kilda,Sydney,23,2022,L,-14
9,25 ↑,"Kent, Dean",2,1,2,4,0,0.0,0.0,4.0,...,1.0,0.0,2.0,38.0,St Kilda,Sydney,23,2022,L,-14


In [27]:
df[4]

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
0,22,"Blakey, Nick",12,2.0,5,17,0.0,0.0,0.0,1.0,...,3.0,1.0,1.0,74.0,Sydney,St Kilda,23,2022,W,14
1,4,"Clarke, Ryan",4,1.0,6,10,0.0,0.0,0.0,5.0,...,3.0,0.0,1.0,86.0,Sydney,St Kilda,23,2022,W,14
2,13,"Florent, Oliver",16,5.0,5,21,0.0,1.0,0.0,1.0,...,3.0,0.0,0.0,86.0,Sydney,St Kilda,23,2022,W,14
3,42,"Fox, Robbie",11,5.0,6,17,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,88.0,Sydney,St Kilda,23,2022,W,14
4,23,"Franklin, Lance",8,4.0,3,11,2.0,1.0,0.0,1.0,...,1.0,1.0,1.0,93.0,Sydney,St Kilda,23,2022,W,14
5,21,"Gulden, Errol",15,5.0,5,20,1.0,0.0,0.0,4.0,...,0.0,0.0,0.0,77.0,Sydney,St Kilda,23,2022,W,14
6,9,"Hayward, Will",11,8.0,4,15,3.0,1.0,0.0,2.0,...,1.0,0.0,1.0,89.0,Sydney,St Kilda,23,2022,W,14
7,5,"Heeney, Isaac",16,6.0,7,23,2.0,2.0,0.0,3.0,...,0.0,0.0,0.0,86.0,Sydney,St Kilda,23,2022,W,14
8,31,"Hickey, Tom",6,1.0,4,10,0.0,0.0,24.0,1.0,...,5.0,0.0,1.0,72.0,Sydney,St Kilda,23,2022,W,14
9,44,"Lloyd, Jake",14,7.0,10,24,0.0,0.0,0.0,1.0,...,3.0,0.0,1.0,89.0,Sydney,St Kilda,23,2022,W,14


In [31]:
# Combine both teams
df_concat = pd.concat([df[2], df[4]])

#### Alter format of player names to match the next dataset we will use

In [32]:
# Change format to initial of first name, followed by a space, then last name
# e.g. "Battle, Josh" -> "J Battle". This cell is not re-runnable: once the
# ', ' separator is gone, x.split(', ')[1] raises IndexError.
df_concat['Player'] = df_concat['Player'].apply(lambda x: f"{x.split(', ')[1][0]} {x.split(', ')[0]}")
# Change format of hyphenated last names to initialise first component
# After the line above every name starts with "<initial> <surname>", so [:3]
# keeps the initial, the space and the first letter of the surname,
# e.g. "N Wanganeen-Milera" -> "N W-Milera". Only the text between the first
# and second '-' is kept after the dash.
df_concat['Player'] = df_concat['Player'].apply(lambda x: f"{x.split('-')[0][:3]}-{x.split('-')[1]}" if '-' in x else x)

In [33]:
df_concat

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
0,26,J Battle,11,3.0,2,13,0.0,0.0,0.0,3.0,...,1.0,0.0,0.0,49.0,St Kilda,Sydney,23,2022,L,-14
1,16,D Butler,8,0.0,3,11,1.0,0.0,0.0,7.0,...,1.0,0.0,0.0,76.0,St Kilda,Sydney,23,2022,L,-14
2,38,T Campbell,8,5.0,4,12,0.0,0.0,17.0,4.0,...,5.0,0.0,0.0,84.0,St Kilda,Sydney,23,2022,L,-14
3,11 ↓,H Clark,4,0.0,0,4,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,34.0,St Kilda,Sydney,23,2022,L,-14
4,10,D Hannebery,18,2.0,12,30,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,70.0,St Kilda,Sydney,23,2022,L,-14
5,22,J Higgins,9,7.0,4,13,1.0,2.0,0.0,1.0,...,2.0,0.0,0.0,73.0,St Kilda,Sydney,23,2022,L,-14
6,8,B Hill,12,8.0,6,18,0.0,0.0,0.0,3.0,...,4.0,0.0,0.0,82.0,St Kilda,Sydney,23,2022,L,-14
7,20,D Howard,9,9.0,3,12,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,100.0,St Kilda,Sydney,23,2022,L,-14
8,3,Z Jones,7,4.0,7,14,0.0,0.0,0.0,3.0,...,1.0,0.0,0.0,76.0,St Kilda,Sydney,23,2022,L,-14
9,25 ↑,D Kent,2,1.0,2,4,0.0,0.0,0.0,4.0,...,1.0,0.0,2.0,38.0,St Kilda,Sydney,23,2022,L,-14


## Web scraping
To scrape the stats for every match we want, we first need to collect the link to each match's stats page.

### HTTP Request

In [18]:
# Get Request
response = requests.get('https://afltables.com/afl/seas/2022.html')

# Status Code check
response.status_code

200

### Soup Object

In [19]:
soup = BeautifulSoup(response.content, 'html.parser')
soup

<html><script src="/js/common.js" type="text/javascript"></script>
<script src="/js/css.js" type="text/javascript"></script>
<script src="/js/simpletabs_1.3.js" type="text/javascript"></script>
<script src="/js/standardista-table-sorting.js" type="text/javascript"></script>
<script src="/js/tabs.js" type="text/javascript"></script>
<title>AFL Tables -  2022 Season Scores</title>
<a name="top"></a>
<body background="../images/back.jpg" style="font: 12px Verdana;">
<center>[<a href="season_idx.html">Season Main</a>]
[<a href="../afl_index.html">AFL Main</a>]
<br/><br/>[<a href="2021.html">2021</a>]
[<a href="1897.html">1897</a>]
<h1> 2022 Season Scores and Results</h1>
<b>Rounds</b><br/>
[<a href="#1">1</a>]
[<a href="#2">2</a>]
[<a href="#3">3</a>]
[<a href="#4">4</a>]
[<a href="#5">5</a>]
[<a href="#6">6</a>]
[<a href="#7">7</a>]
[<a href="#8">8</a>]
[<a href="#9">9</a>]
[<a href="#10">10</a>]
[<a href="#11">11</a>]
[<a href="#12">12</a>]
[<a href="#13">13</a>]
[<a href="#14">14</a>]
[

In [20]:
url1 = 'https://afltables.com/afl'

# Number of matches at the end of the season list that are finals
N_FINALS = 9

# Concatenate the second half of each link to get all 2022 matches.
# Every 'Match stats' href is expected to contain '..'; the part after it is
# appended to the site root. find_all / string= are the current BeautifulSoup
# names for the legacy findAll / text= spellings.
match_stats_urls = [
    url1 + link.get('href').split('..')[1]
    for link in soup.find_all('a', href=True, string='Match stats')
]

# Drop the last 9 games, which are finals. Brownlow votes are not awarded in finals matches
match_stats_urls = match_stats_urls[:-N_FINALS]

### Create function to clean tables

In [40]:
def wrangle_stats(page_link):
    """
    Build one tidy player-statistics table for a single AFL Tables match page.

    The page is read with ``pd.read_html``. Table 0 supplies the club names,
    the round/date text and the final scores; tables 2 and 4 hold the player
    statistics of the first and second listed team respectively.

    Parameters
    ----------
    page_link : str
        Anything ``pd.read_html`` accepts; in this notebook, the URL of a
        match-statistics page.

    Returns
    -------
    pandas.DataFrame
        Player rows of both teams stacked (team 1 first) with the extra
        columns 'Club', 'Opponent', 'Round', 'Year', 'Result' ('W', 'L' or
        'D') and 'Margin' (signed points difference from that club's point of
        view). Missing values are replaced with 0 and the 'Rushed', 'Totals'
        and 'Opposition' summary rows are removed. 'Round' and 'Year' are
        kept as strings.
    """
    tables = pd.read_html(page_link)
    summary = tables[0]

    team1_club = summary[1][1]
    team2_club = summary[1][2]

    # Round is the text between 'Round: ' and ' Venue'; year is the text after
    # the second '-' up to the next space.
    match_info = summary[1][0]
    match_round = match_info.split('Round: ')[1].split(" Venue")[0]
    match_year = match_info.split('-')[2].split(' ')[0]

    # The last '.'-separated piece of the score cell is the total points.
    # Convert to int *before* comparing: as strings "100" > "99" is False,
    # which would record the wrong winner.
    team1_score = int(summary[5][1].split('.')[2])
    team2_score = int(summary[5][2].split('.')[2])

    # Labels in the '#' column that mark summary rows rather than players
    summary_rows = ['Rushed', 'Totals', 'Opposition']

    team_frames = []
    for table_idx, club, opponent, margin in (
        (2, team1_club, team2_club, team1_score - team2_score),
        (4, team2_club, team1_club, team2_score - team1_score),
    ):
        # Drop top table heading
        stats = tables[table_idx].droplevel(0, axis=1)

        stats['Club'] = club
        stats['Opponent'] = opponent
        stats['Round'] = match_round
        stats['Year'] = match_year

        # Result and margin from this club's point of view
        if margin > 0:
            stats['Result'] = 'W'
        elif margin < 0:
            stats['Result'] = 'L'
        else:
            stats['Result'] = 'D'
        stats['Margin'] = margin

        # Replace NaN with 0s
        stats = stats.fillna(0)

        # Drop 'Rushed behind' row and totals rows from the bottom
        stats = stats[~stats['#'].isin(summary_rows)]

        team_frames.append(stats)

    return pd.concat(team_frames)

In [39]:
# Get dataframe ready
# Preview the column dtypes produced by wrangle_stats for the first match.
# (.info() prints its report and returns None, so it must not be passed as dtype.)
wrangle_stats(match_stats_urls[0]).info()

# Start from an empty frame; the next cell appends every match to it.
df = pd.DataFrame()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 22
Data columns (total 31 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   #         46 non-null     object 
 1   Player    46 non-null     object 
 2   KI        46 non-null     object 
 3   MK        46 non-null     object 
 4   HB        46 non-null     object 
 5   DI        46 non-null     object 
 6   GL        46 non-null     object 
 7   BH        46 non-null     float64
 8   HO        46 non-null     float64
 9   TK        46 non-null     float64
 10  RB        46 non-null     float64
 11  IF        46 non-null     float64
 12  CL        46 non-null     float64
 13  CG        46 non-null     float64
 14  FF        46 non-null     float64
 15  FA        46 non-null     float64
 16  BR        46 non-null     float64
 17  CP        46 non-null     float64
 18  UP        46 non-null     float64
 19  CM        46 non-null     float64
 20  MI        46 non-null     float64


In [41]:
# Wrangle every match first, then concatenate once: calling pd.concat inside
# the loop re-copies all rows accumulated so far on every iteration.
df = pd.concat([df] + [wrangle_stats(match) for match in match_stats_urls])

In [42]:
df

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
0,12 ↑,"Bedford, Toby",5,3,4,9,0,1.0,0.0,2.0,...,2.0,1.0,0.0,55.0,Melbourne,Western Bulldogs,1,2022,W,26
1,17,"Bowey, Jake",8,2,1,9,1,0.0,0.0,0.0,...,2.0,0.0,0.0,73.0,Melbourne,Western Bulldogs,1,2022,W,26
2,10,"Brayshaw, Angus",12,6,11,23,0,0.0,0.0,3.0,...,1.0,0.0,0.0,83.0,Melbourne,Western Bulldogs,1,2022,W,26
3,50,"Brown, Ben",9,8,4,13,3,3.0,0.0,0.0,...,2.0,0.0,0.0,86.0,Melbourne,Western Bulldogs,1,2022,W,26
4,31,"Fritsch, Bayley",8,4,1,9,2,2.0,0.0,1.0,...,0.0,0.0,1.0,81.0,Melbourne,Western Bulldogs,1,2022,W,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,20,"Reid, Sam",6,4.0,6,12,0.0,2.0,4.0,5.0,...,2.0,0.0,0.0,79.0,Sydney,St Kilda,23,2022,W,14
19,8,"Rowbottom, James",13,3.0,11,24,1.0,1.0,0.0,7.0,...,2.0,0.0,0.0,79.0,Sydney,St Kilda,23,2022,W,14
20,3,"Stephens, Dylan",14,5.0,3,17,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,77.0,Sydney,St Kilda,23,2022,W,14
21,1,"Warner, Chad",10,0.0,10,20,1.0,1.0,0.0,5.0,...,1.0,1.0,0.0,89.0,Sydney,St Kilda,23,2022,W,14


### Footywire data

In [393]:
# Get Request
response = requests.get('https://www.footywire.com/afl/footy/ft_match_statistics?mid=10544&advv=Y') # First game of 2022

# Status Code check
response.status_code

200

In [394]:
soup = BeautifulSoup(response.content, 'html.parser')
soup


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
<title>AFL Match Statistics : Melbourne defeats Western Bulldogs at MCG Round 1 Wednesday, 16th March 2022</title>
<meta content="Melbourne defeats Western Bulldogs at MCG Round 1 Wednesday, 16th March 2022 AFL match statistics" name="description"/>
<meta content="AFL Match Statistics, AFL Game Statistics, AFL Match Stats" name="keywords"/>
<meta content="noindex" name="robots"/>
<style id="styleMatchStatistics">
td.subLegend {
	color: #555;
	padding-top: 5px;
}
span.subIcon {
	cursor: default;
	color: #FF4500;
}
td.unusedsubtext { font-style: italic; color: #666666; }
</style>
<style>
td.statdata {
  text-align:center;
  cursor:default;
}
div.toppadding { padding-top:10px; }
</style>
<script>
var template = "pages/match-statistics";
var slashIndex = template.indexOf('/');
var templateName = template.substring(slashIndex + 1);
var originalTemplate = templateName;
checkForFantasyTemplate();

var advv = 'N';


Hyphenated surnames such as "Neal-Bullen" and "Ugle-Hagen" are abbreviated to "N-Bullen" and "U-Hagen" in the Footywire tables. The full names are also stored on the page (in the `title` attribute of each player link), so we can overwrite the abbreviated form with the full name. This makes them consistent with the player names retrieved from the AFL Tables website.

In [395]:
# Get full player names
results = soup.findAll('tr', {'class':["darkcolor", 'lightcolor']})

In [396]:
players = []

# Get player names
# A row is a player row only if it has a left-aligned cell containing a link
# with a title; other rows are skipped explicitly rather than via a bare
# `except: pass`, which would also hide unrelated errors.
for x in results:
    name_cell = x.find('td', {'align':'left'})
    name_link = name_cell.find('a') if name_cell is not None else None
    if name_link is not None and name_link.get('title'):
        players.append(name_link.get('title'))

players[:10]

['Christian Petracca',
 'Clayton Oliver',
 'Jack Viney',
 'Angus Brayshaw',
 'Ed Langdon',
 'James Jordon',
 'Alex Neal-Bullen',
 'James Harmes',
 'Max Gawn',
 'Steven May']

In [397]:
for i in range(len(players)):
    # maxsplit=1 keeps multi-word surnames whole: "Tom De Koning" -> "T De Koning"
    # (a plain split(' ') would keep only "De").
    name_split = players[i].split(' ', 1) # Split into first name and the rest of the name
    players[i] = f"{name_split[0][0]} {name_split[1]}" # Replace name with first initial and last name

players[:10]

['C Petracca',
 'C Oliver',
 'J Viney',
 'A Brayshaw',
 'E Langdon',
 'J Jordon',
 'A Neal-Bullen',
 'J Harmes',
 'M Gawn',
 'S May']

In [426]:
# Get Footywire tables
fw = pd.read_html("https://www.footywire.com/afl/footy/ft_match_statistics?mid=10544&advv=Y")

In [427]:
# Fix headings
fw[12].columns = fw[17].columns = fw[12].iloc[0]
fw[12] = fw[12][1:]
fw[17] = fw[17][1:]

In [428]:
fw[12].head(8)

Unnamed: 0,Player,CP,UP,ED,DE%,CM,GA,MI5,1%,BO,CCL,SCL,SI,MG,TO,ITC,T5,TOG%
1,C Petracca,16,22,29,76.3,0,1,2,0,0,3,6,13,869,4,3,0,87
2,C Oliver,15,20,24,75.0,0,0,0,2,1,3,3,8,549,8,7,2,81
3,J Viney,12,11,16,69.6,0,0,1,1,0,1,3,5,329,4,7,0,79
4,A Brayshaw,4,19,15,65.2,0,0,0,1,0,0,0,3,264,3,7,0,83
5,E Langdon,11,12,17,77.3,0,1,0,2,1,0,1,6,446,4,7,0,100
6,J Jordon,8,12,16,80.0,0,0,0,0,0,0,0,2,259,5,8,0,70
7,A N-Bullen,8,10,13,72.2,1,3,0,2,2,0,0,8,320,4,4,1,88
8,J Harmes,7,10,14,82.4,1,0,1,3,0,0,2,4,163,3,2,2,77


In [401]:
# Check whether team 1 activated its substitute: the first character of cell
# fw[13][0][0] is then the '\u2197' arrow. When this is True, team 2's table is
# fw[17]; otherwise it is fw[16].
fw[13][0][0][0] == '\u2197' 

True

In [429]:
# Add team names to each row
fw[12]['Club'] = fw[8]['Team'][0]
fw[17]['Club'] = fw[8]['Team'][1]

In [430]:
fw[12].head()

Unnamed: 0,Player,CP,UP,ED,DE%,CM,GA,MI5,1%,BO,CCL,SCL,SI,MG,TO,ITC,T5,TOG%,Club
1,C Petracca,16,22,29,76.3,0,1,2,0,0,3,6,13,869,4,3,0,87,Melbourne
2,C Oliver,15,20,24,75.0,0,0,0,2,1,3,3,8,549,8,7,2,81,Melbourne
3,J Viney,12,11,16,69.6,0,0,1,1,0,1,3,5,329,4,7,0,79,Melbourne
4,A Brayshaw,4,19,15,65.2,0,0,0,1,0,0,0,3,264,3,7,0,83,Melbourne
5,E Langdon,11,12,17,77.3,0,1,0,2,1,0,1,6,446,4,7,0,100,Melbourne


In [431]:
# Put both teams together
combined_fw = pd.concat([fw[12], fw[17]])

# Add additional columns (for merging dataframes)
combined_fw['Round'] = soup.find('title').text.split('Round ')[1].split(' ')[0]
combined_fw['Year'] = soup.find('title').text.split(' ')[-1]

In [224]:
# Replace player names with the list from above
combined_fw['Player'] = players

In [229]:
# Keep only columns of interest (note we already have some of the statistics from AFL tables)
combined_fw = combined_fw[['Player', 'ED', 'DE%', 'SI', 'MG', 'TO', 'ITC' , 'T5', 'Round', 'Year', 'Club']]

In [230]:
combined_fw

Unnamed: 0,Player,ED,DE%,SI,MG,TO,ITC,T5,Round,Year,Club
1,C Petracca,29,76.3,13,869,4,3,0,1,2022,Melbourne
2,C Oliver,24,75,8,549,8,7,2,1,2022,Melbourne
3,J Viney,16,69.6,5,329,4,7,0,1,2022,Melbourne
4,A Brayshaw,15,65.2,3,264,3,7,0,1,2022,Melbourne
5,E Langdon,17,77.3,6,446,4,7,0,1,2022,Melbourne
6,J Jordon,16,80,2,259,5,8,0,1,2022,Melbourne
7,A Neal-Bullen,13,72.2,8,320,4,4,1,1,2022,Melbourne
8,J Harmes,14,82.4,4,163,3,2,2,1,2022,Melbourne
9,M Gawn,8,57.1,8,326,4,4,0,1,2022,Melbourne
10,S May,11,78.6,1,384,3,5,0,1,2022,Melbourne


### Get URLs to stats page for each match

In [231]:
# Get Request
response = requests.get('https://www.footywire.com/afl/footy/ft_match_list?year=2022')

# Status Code check
response.status_code

200

In [232]:
soup = BeautifulSoup(response.content, 'html.parser')
soup


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
<title>AFL Fixture 2022</title>
<meta content="AFL fixture and matches for season 2022." name="description"/>
<meta content="AFL Fixture 2022, AFL Games 2022, AFL Matches 2022" name="keywords"/>
<link href="https://www.footywire.com/afl/footy/ft_match_list?year=2022" rel="canonical"/>
<style id="styleMainDesktop">

.tabbg { background-color: #000077; vertical-align: middle; }
.blkbg { background-color: #000000; vertical-align: middle; }
.tabbdr { background-color: #d8dfea; vertical-align: middle; }
.wspace { background-color: #ffffff; vertical-align: middle; }

.greybg { background-color: #f4f5f1; }
.greybdr { background-color: #e3e4e0; }
.blackbdr { background-color: #000000; }
.lbgrey { background-color: #d4d5d1; text-decoration: none; color: #000000; vertical-align: middle; text-align: left; font-weight: bold; }

.caprow { background-color: #f2f4f7; text-decoration: none; color: #000000; vertical-align: 

In [262]:
table_rows = soup.findAll('tr', {'class':["darkcolor", 'lightcolor']})

In [263]:
table_rows[0].findAll('td', {'class': 'data'})

[<td class="data" height="24"> Wed 16 Mar 7:10pm</td>,
 <td class="data">
 <a href="th-melbourne-demons">Melbourne</a>
 v 
 <a href="th-western-bulldogs">Western Bulldogs</a>
 </td>,
 <td class="data">MCG</td>,
 <td align="center" class="data">58002</td>,
 <td align="center" class="data"><a href="ft_match_statistics?mid=10544">97-71</a></td>,
 <td class="data">
 <a href="ft_player_profile?pid=3800">J. Macrae</a> 39<br/>
 </td>,
 <td class="data">
 <a href="ft_player_profile?pid=6491">A. Naughton</a> 4<br/>
 </td>]

In [264]:
table_rows[0].findAll('td', {'class': 'data'})[4].find('a').get('href')

'ft_match_statistics?mid=10544'

In [284]:
fw_urls = []
url_p1 = "https://www.footywire.com/afl/footy/"

# Find url to advanced stats page for each home and away match in 2022 season
# The link to the match statistics sits in the fifth 'data' cell of a row.
# Rows without that cell, without a link in it, or without an href are skipped
# explicitly instead of through a bare `except: pass`.
for row in table_rows:
    data_cells = row.find_all('td', {'class': 'data'})
    if len(data_cells) < 5:
        continue
    stats_link = data_cells[4].find('a')
    if stats_link is None or not stats_link.get('href'):
        continue
    fw_urls.append(f"{url_p1}{stats_link.get('href')}&advv=Y")

In [285]:
fw_urls = fw_urls[:-9] # Cut out finals
fw_urls

['https://www.footywire.com/afl/footy/ft_match_statistics?mid=10544&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10545&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10546&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10547&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10548&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10549&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10550&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10551&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10552&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10553&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10554&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10555&advv=Y',
 'https://www.footywire.com/afl/footy/ft_match_statistics?mid=10556&advv=Y',

In [434]:
def format_names(url, soup=None):
    """
    Auxiliary function to reformat names of players

    Reads the full player names stored in the ``title`` attribute of the
    player links on a Footywire match-statistics page and shortens each one to
    "<first initial> <rest of the name>", e.g. "Alex Neal-Bullen" becomes
    "A Neal-Bullen" and "Tom De Koning" becomes "T De Koning".

    Parameters
    ----------
    url : str
        Address of the match-statistics page. Only requested when ``soup`` is
        not supplied.
    soup : optional
        An already parsed page (a BeautifulSoup object). Pass it to avoid
        downloading the same page a second time.

    Returns
    -------
    list of str
        Reformatted names in the order the rows appear on the page.

    Raises
    ------
    requests.HTTPError
        If the page has to be downloaded and the server answers with an error
        status.
    """
    if soup is None:
        # Use BeautifulSoup to overwrite the format of player names
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

    players = []
    # Get full player names
    for row in soup.find_all('tr', {'class': ["darkcolor", 'lightcolor']}):
        # Only rows with a left-aligned cell holding a titled link are player
        # rows; anything else is skipped explicitly (no bare except).
        name_cell = row.find('td', {'align': 'left'})
        name_link = name_cell.find('a') if name_cell is not None else None
        full_name = name_link.get('title') if name_link is not None else None
        full_name = (full_name or '').strip()
        if not full_name:
            continue

        # Split once so multi-word surnames stay whole
        first_name, _, rest = full_name.partition(' ')
        # A single-word name has nothing to abbreviate; keep it unchanged
        players.append(f"{first_name[0]} {rest}" if rest else full_name)

    return players

In [435]:
def wrangle_fw_stats(url):
    """
    Wrangle the footywire statistics

    Downloads one Footywire advanced match-statistics page and returns the
    player rows of both teams in a single frame.

    Parameters
    ----------
    url : str
        Address of the match-statistics page.

    Returns
    -------
    pandas.DataFrame
        Columns 'Player', 'ED', 'DE%', 'SI', 'MG', 'TO', 'ITC', 'T5', 'Round',
        'Year' and 'Club'. 'Player' holds the names returned by
        ``format_names``; 'Round' and 'Year' are strings taken from the page
        title.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the number of player names does not match the number of rows.
    """

    # Use BeautifulSoup to retrieve soup object
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Call format names to provide reformatted version of player names
    players = format_names(url)

    # Get Footywire tables
    tables = pd.read_html(url)

    # If substitute activated for team 1, the website shows an additional line below table 1, which pushes
    # the index for team 2 down to 17, otherwise it will be 16. '\u2197' is the unicode for the arrow symbol
    # that can be used to determine if team 1 activated a substitute in that match.
    # str() keeps the check from failing when the cell is not text (e.g. NaN).
    if str(tables[13][0][0])[:1] == '\u2197':
        t2_idx = 17
    else:
        t2_idx = 16

    # Fix headings: the first row of team 1's table holds the column names for
    # both tables. Copy the remaining rows so the columns added below are
    # written to independent frames (assigning onto a slice triggers pandas'
    # SettingWithCopyWarning).
    header = tables[12].iloc[0]
    team1 = tables[12][1:].copy()
    team2 = tables[t2_idx][1:].copy()
    team1.columns = header
    team2.columns = header

    # Add team names to each row
    team1['Club'] = tables[8]['Team'][0]
    team2['Club'] = tables[8]['Team'][1]

    # Put both teams together
    combined_fw = pd.concat([team1, team2])

    # Add additional columns (for merging dataframes)
    # Round is the word after 'Round ' in the page title; year is its last word.
    page_title = soup.find('title').text
    combined_fw['Round'] = page_title.split('Round ')[1].split(' ')[0]
    combined_fw['Year'] = page_title.split(' ')[-1]

    # Replace player names with the list from above
    if len(players) != len(combined_fw):
        raise ValueError(
            f"Found {len(players)} player names but {len(combined_fw)} table rows for {url}"
        )
    combined_fw['Player'] = players

    # Keep only columns of interest (note we already have some of the statistics from AFL tables)
    return combined_fw[['Player', 'ED', 'DE%', 'SI', 'MG', 'TO', 'ITC' , 'T5', 'Round', 'Year', 'Club']]

In [436]:
# Get dataframe ready
# Preview the column dtypes produced by wrangle_fw_stats for the first match.
# (.info() prints its report and returns None, so it must not be passed as dtype.)
wrangle_fw_stats(fw_urls[0]).info()

# Start from an empty frame; the next cell appends matches to it.
fw = pd.DataFrame()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 1 to 23
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Player  46 non-null     object
 1   ED      46 non-null     object
 2   DE%     46 non-null     object
 3   SI      46 non-null     object
 4   MG      46 non-null     object
 5   TO      46 non-null     object
 6   ITC     46 non-null     object
 7   T5      46 non-null     object
 8   Round   46 non-null     object
 9   Year    46 non-null     object
 10  Club    46 non-null     object
dtypes: object(11)
memory usage: 4.3+ KB


In [437]:
%%time
# Wrangle the first five matches, then concatenate once instead of growing the
# frame with pd.concat on every loop iteration.
fw = pd.concat([fw] + [wrangle_fw_stats(url) for url in fw_urls[:5]])

Wall time: 20.4 s


In [438]:
fw

Unnamed: 0,Player,ED,DE%,SI,MG,TO,ITC,T5,Round,Year,Club
1,C Petracca,29,76.3,13,869,4,3,0,1,2022,Melbourne
2,C Oliver,24,75,8,549,8,7,2,1,2022,Melbourne
3,J Viney,16,69.6,5,329,4,7,0,1,2022,Melbourne
4,A Brayshaw,15,65.2,3,264,3,7,0,1,2022,Melbourne
5,E Langdon,17,77.3,6,446,4,7,0,1,2022,Melbourne
...,...,...,...,...,...,...,...,...,...,...,...
19,A Sheldrick,6,75,5,38,1,0,1,1,2022,Sydney
20,H Cunningham,7,100,2,64,2,2,0,1,2022,Sydney
21,S Wicks,4,57.1,3,129,2,0,1,1,2022,Sydney
22,L McDonald,1,20,2,82,4,2,1,1,2022,Sydney


<font color = red>Need to alter the format of names in other dataset</font>

In [500]:
name = "Neal-Bullen, Alex"

In [455]:
name = "De Koning, Tom"

In [467]:
name = "Oliver, Clayton"

In [501]:
name_split = name.split(', ')
f"{name_split[1][0]} {name_split[0]}"

'A Neal-Bullen'

In [504]:
f"{name_split[1][0]} {name_split[0].split('-')[0][0]}-{name_split[0].split('-')[1]}"

'A N-Bullen'

In [471]:
'Clayton'.split('-')

['Clayton']

In [493]:
df_concat['Player'] = df_concat['Player'].apply(lambda x: f"{x.split(', ')[1][0]} {x.split(', ')[0]}")

In [497]:
df_concat[20:21]

Unnamed: 0,#,Player,KI,MK,HB,DI,GL,BH,HO,TK,...,1%,BO,GA,%P,Club,Opponent,Round,Year,Result,Margin
20,7,N Wanganeen-Milera,15,6,4,19,0,2.0,0.0,1.0,...,0.0,1.0,1.0,82.0,St Kilda,Sydney,23,2022,L,-14


In [507]:
f"{'N Wanganeen-Milera'.split('-')[0][:3]}-{'N Wanganeen-Milera'.split('-')[1]}"

'N W-Milera'

In [508]:
'-' in 'N Wanganeen-Milera'

True

In [515]:
df_concat['Player'].apply(lambda x: f"{x.split('-')[0][:3]}-{x.split('-')[1]}" if '-' in x else x)

0        J Battle
1        D Butler
2      T Campbell
3         H Clark
4     D Hannebery
5       J Higgins
6          B Hill
7        D Howard
8         Z Jones
9          D Kent
10         M King
11      J Lienert
12         B Long
13     R Marshall
14      T Membrey
15        M Owens
16        B Paton
17         S Ross
18     J Sinclair
19       J Steele
20     N W-Milera
21       C Wilkie
22    M Windhager
0        N Blakey
1        R Clarke
2       O Florent
3           R Fox
4      L Franklin
5        E Gulden
6       W Hayward
7        I Heeney
8        T Hickey
9         J Lloyd
10     P McCartin
11     T McCartin
12     L McDonald
13    J McInerney
14        C Mills
15       T Papley
16       L Parker
17        D Rampe
18         S Reid
19    J Rowbottom
20     D Stephens
21       C Warner
22        S Wicks
Name: Player, dtype: object