In [1]:
from io import StringIO

import pandas as pd
import requests

In [2]:
# Address of the Premier League statistics page on FBref.com; the scraping starts here.
standings_url = "https://fbref.com/en/comps/9/Premier-League_Stats"

In [3]:
# Download the standings page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() turns an HTTP error status (e.g. a blocked or
# rate-limited request) into an immediate exception instead of letting an error page
# be parsed by the cells below.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [4]:
# Peek at the beginning of the response body only; displaying data.text in full would
# embed the entire HTML document in the notebook output.
data.text[:1000]



In [5]:
# Importing the BeautifulSoup module from the bs4 library to enable parsing and navigating HTML and XML documents.
from bs4 import BeautifulSoup

In [6]:
# Parse the standings page. The parser is named explicitly so the result does not depend
# on which optional parsers are installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(data.text, "html.parser")

# First table on the page carrying the 'stats_table' class.
standings_table = soup.select('table.stats_table')[0]

# href of every anchor inside that table; .get() yields None for an anchor without an
# href, so those are dropped before the substring test (which would raise on None).
hrefs = [a.get("href") for a in standings_table.find_all('a')]

# Keep only the links that point at a squad page.
links = [h for h in hrefs if h and '/squads/' in h]

In [7]:
# Turn every site-relative squad link into an absolute FBref URL.
team_urls = []
for squad_path in links:
    team_urls.append(f"https://fbref.com{squad_path}")

In [8]:
# Fetch the page of the first team in 'team_urls'. The timeout bounds a stalled
# connection and raise_for_status() fails fast on an HTTP error status rather than
# handing an error page to the parsing cells.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [9]:
# Extract the table whose text matches "Scores & Fixtures" from the team page.
# pd.read_html returns a list of every matching table, so [0] takes the first one.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [10]:
# Parse the team page, naming the parser explicitly so the result does not depend on
# which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")

# href of every anchor on the page (None for anchors that have no href attribute).
hrefs = [a.get("href") for a in soup.find_all('a')]

# Keep only the links whose href contains 'all_comps/shooting/'.
links = [h for h in hrefs if h and 'all_comps/shooting/' in h]

In [11]:
# Fetch the first shooting link found above (site-relative, so the host is prepended).
# Fail fast on an HTTP error status and bound the wait with a timeout.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [12]:
# Extract the table matching "Shooting" from the downloaded page. pd.read_html returns a
# list of every matching table, so [0] takes the first one. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated in recent
# pandas releases.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

shooting.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,0,0,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,19.1,0.0,0,0,0.8,0.8,0.06,1.2,1.2,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,16.4,0.0,1,1,2.0,1.2,0.09,-1.0,-1.2,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,13.8,0.0,1,1,3.2,2.4,0.14,-1.2,-1.4,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,15.0,0.0,0,0,2.3,2.3,0.13,0.7,0.7,Match Report


In [13]:
# Drop the top level of the two-level column header so columns can be selected by their
# plain names. The check makes the cell safe to re-run: once the header is flat,
# droplevel() raises because an index must keep at least one level.
if isinstance(shooting.columns, pd.MultiIndex):
    shooting.columns = shooting.columns.droplevel()

In [14]:
# Attach the selected shooting columns to the fixture list, pairing rows on "Date".
team_data = pd.merge(
    matches,
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,4-3-3,Stuart Attwell,Match Report,Arsenal won on penalty kicks following normal ...,7,3,,,0,0
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,4-3-3,Michael Oliver,Match Report,,15,7,19.1,0.0,0,0
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,4-3-3,David Coote,Match Report,,13,2,16.4,0.0,1,1
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,4-3-3,Paul Tierney,Match Report,,18,9,13.8,0.0,1,1
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,4-3-3,Anthony Taylor,Match Report,,17,5,15.0,0.0,0,0


In [15]:
# Seasons to scrape, newest first: 2024 down to 2019 (the range stop, 2018, is excluded).
years = list(range(2024, 2018, -1))

# Accumulator for the per-team match DataFrames built by the scraping loop.
all_matches = []

In [16]:
# Premier League page on FBref.com (same address as assigned near the top of the notebook).
standings_url = "https://fbref.com/en/comps/9/Premier-League_Stats"
 

In [17]:
import time  # For pausing between requests

# Seconds to wait after every HTTP request so the server is not overwhelmed.
REQUEST_PAUSE_SECONDS = 1


def _fetch(url):
    """Download `url` and return the response.

    Raises requests.HTTPError on an HTTP error status so an error page is never parsed
    as data. The pause sits in `finally`, so it follows every request, failed or not.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response
    finally:
        time.sleep(REQUEST_PAUSE_SECONDS)


def _first_href(soup, fragment):
    """Return the first anchor href in `soup` that contains `fragment`.

    The test is made on the href string itself; `fragment in tag` would inspect the
    tag's children instead and never match. Raises LookupError when nothing matches.
    """
    for anchor in soup.find_all("a"):
        href = anchor.get("href")  # None when the anchor has no href attribute
        if href and fragment in href:
            return href
    raise LookupError(f"no link containing {fragment!r} found on the page")


def _read_stats_table(html, caption):
    """Return the first table in `html` matching `caption`, with a flat column header."""
    # StringIO: passing a literal HTML string to read_html is deprecated in recent pandas.
    table = pd.read_html(StringIO(html), match=caption)[0]
    if isinstance(table.columns, pd.MultiIndex):
        # Drop the top header level so columns can be selected by their plain names.
        table.columns = table.columns.droplevel()
    return table


# Walk backwards one season per entry in 'years', starting from 'standings_url'. A local
# copy is advanced so re-running this cell always starts from the same page.
# NOTE(review): the first entry of 'years' is assumed to be the season that
# 'standings_url' currently shows -- confirm before trusting the "Season" column.
season_url = standings_url

for year in years:
    standings_page = _fetch(season_url)
    season_soup = BeautifulSoup(standings_page.text, "html.parser")

    # Squad links of this season, taken from the first 'stats_table' on the page.
    standings_table = season_soup.select("table.stats_table")[0]
    squad_paths = [a.get("href") for a in standings_table.find_all("a")]
    season_team_urls = [f"https://fbref.com{p}" for p in squad_paths if p and "/squads/" in p]

    # Looping over each team URL of the season
    for team_url in season_team_urls:
        # Extracting team name from team URL
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Fixture list of the team
        team_page = _fetch(team_url)
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        # Links to the shooting and goalkeeping match logs
        team_soup = BeautifulSoup(team_page.text, "html.parser")
        shooting_link = _first_href(team_soup, 'all_comps/shooting/')
        # NOTE(review): FBref may name the goalkeeping log 'all_comps/keeper/' rather
        # than 'all_comps/goalkeeping/' -- verify against a live team page.
        goalie_link = _first_href(team_soup, 'all_comps/goalkeeping/')

        shooting = _read_stats_table(_fetch(f"https://fbref.com{shooting_link}").text, "Shooting")
        goalie = _read_stats_table(_fetch(f"https://fbref.com{goalie_link}").text, "Goalkeeping")

        # Merging matches, shooting statistics, and goalie statistics data.
        # "GA" exists in both 'matches' and the goalkeeping selection; the suffixes keep
        # the fixture columns under their original names and mark the goalkeeping
        # duplicate as "GA_gk" instead of renaming both to GA_x / GA_y.
        # NOTE(review): the goalkeeping column names are unverified -- the saved output
        # of this notebook contains no goalkeeping columns. A missing column raises
        # KeyError, which is deliberately not swallowed here.
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
            team_data = team_data.merge(
                goalie[["Date", "GA", "GA90", "SoTA", "Saves", "Save%", "CS", "CS%"]],
                on="Date",
                suffixes=("", "_gk"),
            )
        except ValueError as err:
            # Best-effort skip, as before, but no longer silent.
            print(f"Skipping {team_name} ({year}): {err}")
            continue

        # Keep Premier League matches only and tag them with season and team. assign()
        # builds a new frame, avoiding the SettingWithCopyWarning that column
        # assignment on a filtered slice triggers.
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(Season=year, Team=team_name)

        # Appending processed data to the list
        all_matches.append(team_data)

    # Move on to the previous season's standings page.
    # NOTE(review): assumes the "previous season" anchor carries the CSS class 'prev' --
    # verify against the live page.
    previous_season = season_soup.select("a.prev")
    if not previous_season:
        break
    season_url = f"https://fbref.com{previous_season[0].get('href')}"

KeyboardInterrupt: 

In [None]:
# Stack every per-team frame collected in 'all_matches' into one DataFrame.
match_df = pd.concat(objs=all_matches)

In [None]:
# Normalise the header: every column name is replaced by its lowercase form.
match_df.columns = list(map(lambda name: name.lower(), match_df.columns))
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,Match Report,,15.0,7.0,19.1,0.0,0,0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,Match Report,,13.0,2.0,16.4,0.0,1,1,2024,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,Match Report,,18.0,9.0,13.8,0.0,1,1,2024,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,Match Report,,17.0,5.0,15.0,0.0,0,0,2024,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,Match Report,,13.0,4.0,17.4,0.0,0,0,2024,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,...,Match Report,,9.0,3.0,21.6,0.0,0,0,2022,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,...,Match Report,,8.0,2.0,22.2,1.0,0,0,2022,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,...,Match Report,,9.0,5.0,17.0,0.0,0,0,2022,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,...,Match Report,,11.0,2.0,14.4,0.0,0,0,2022,Norwich City


In [None]:
# Persist 'match_df' as CSV; the DataFrame index is written as the first column.
match_df.to_csv(path_or_buf="epl_matches.csv")