In [152]:
from io import StringIO
import time

import requests
# Landing page for the Premier League standings on fbref.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Pause for a second on either side of the request.
time.sleep(1)
data = requests.get(standings_url)
# Raise requests.HTTPError right away on a 4xx/5xx reply instead of handing an
# error page to the parsing cells below, where it would surface as an unrelated
# IndexError.
data.raise_for_status()
time.sleep(1)

#data.text -- will show you literally all the text


In [153]:
#grab url for each team's matches webpage
#when doing inspect on the page, we need the a (anchor) tag that links to the page

from bs4 import BeautifulSoup #library used for parsing html

# Parse the downloaded HTML so elements can be looked up with CSS selectors.
# No sleeps are needed here: this cell only works on text that is already in
# memory and sends no request.
soup = BeautifulSoup(data.text)

# "table.stats_table" selects every <table> element carrying the class
# "stats_table"; the first match is kept as the standings table.
standings_table = soup.select('table.stats_table')[0]
    



In [154]:
# Collect the href of every anchor (<a>) tag inside the standings table and keep
# only the squad links.
# `.get("href")` returns None for an anchor without an href, so the truthiness
# check runs first to keep the substring test from raising TypeError on None.
links = [
    href
    for href in (anchor.get("href") for anchor in standings_table.find_all('a'))
    if href and '/squads/' in href
]


In [155]:
# The scraped hrefs lack the domain, so each one is appended to the fbref.com
# origin to form a complete team URL.
team_urls = list(map("https://fbref.com{}".format, links))

In [156]:
#extract match stats

# Use the first team URL as a worked example.
team_url = team_urls[0]
data = requests.get(team_url)
# Stop with requests.HTTPError on a 4xx/5xx reply rather than passing an error
# page on to read_html.
data.raise_for_status()

#grab matches table and turn it into pandas dataframe

import pandas as pd

# read_html scans the tables on the page and returns a list of those whose text
# matches the `match` string.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

#matches[0] -- will show the table in the proper format

In [157]:
#get match shooting stats

# Re-parse the team page and collect the href of every anchor on it.
soup = BeautifulSoup(data.text)
links = [anchor.get("href") for anchor in soup.find_all('a')]

# Keep only hrefs containing 'all_comps/shooting/'; anchors without an href give
# None and are dropped by the truthiness check.
links = [href for href in links if href and 'all_comps/shooting/' in href]

# Fetch the first matching shooting page and stop on an HTTP error reply.
data = requests.get(f"https://fbref.com{links[0]}")
data.raise_for_status()

# Wrap the HTML in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases. [0] takes the first table that matches.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
#shooting.head() #.head only shows first 5 rows






In [158]:
#clean and merge data

# Drop the outer level of the doubled column headers so columns can be addressed
# by plain names such as "Sh".
# The nlevels guard makes this cell safe to re-run: calling droplevel() on columns
# that are already single-level raises ValueError.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()
#shooting.head()

In [159]:
# Join the fixtures table with the selected shooting columns, pairing rows by
# their shared "Date" value.
team_data = pd.merge(
    matches[0],
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)




In [160]:
#get data for all teams and seasons in a loop
# Season labels, newest first: 2024, 2023, 2022, 2021. The first label is attached
# to whichever season standings_url serves when this cell runs.
# NOTE(review): 2024 is hard-coded as the label for the "current" page -- confirm
# it still matches the live season before re-running.
years = list(range(2024, 2020, -1))

all_matches = [] #will hold the dataframe for each team in one season

standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

for year in years: #loop through each year
    data = requests.get(standings_url)
    data.raise_for_status()  # stop on an HTTP error page instead of parsing it
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    # hrefs of the squad pages linked from the standings table; anchors without
    # an href give None and are skipped
    links = [a.get("href") for a in standings_table.find_all('a')]
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # the page's "previous season" link becomes the standings URL for the next
    # pass of the outer loop
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for url in team_urls:
        # e.g. ".../Manchester-City-Stats" -> "Manchester City"
        team_name = url.split("/")[-1].replace("-Stats","").replace("-"," ")

        data = requests.get(url)
        data.raise_for_status()
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

        soup = BeautifulSoup(data.text)
        links = [a.get("href") for a in soup.find_all('a')]
        links = [href for href in links if href and 'all_comps/shooting/' in href]

        data = requests.get(f"https://fbref.com{links[0]}")
        data.raise_for_status()
        # sites dont want you to scrape too quickly, so you may get blocked if
        # you're making too many requests too quickly. The pause sits right after
        # the request so it also runs for teams skipped by the `continue` below.
        time.sleep(10)

        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # drop the outer level of the doubled column headers when there is one
        if shooting.columns.nlevels > 1:
            shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches[0].merge(shooting[["Date", "Sh","SoT", "Dist", "FK", "PK", "PKatt"]], on = "Date")
        except (ValueError, KeyError):
            # some shooting stats for some teams are unavailable, skip over those
            # (selecting a missing column raises KeyError)
            continue

        # only get prem games; .copy() gives an independent frame so the column
        # assignments below do not write into a slice of the merged frame
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        # add season and team columns bc this info is on the title of the website
        # itself but its not in the data that we're scraping
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

            



In [161]:
# Stack the per-team, per-season frames collected in `all_matches` into a single
# table.
match_df = pd.concat(all_matches, axis=0)

# Normalise every column label to lowercase.
match_df.columns = list(map(str.lower, match_df.columns))

# Write the combined table to matches.csv in the working directory.
match_df.to_csv("matches.csv")

match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,18.2,0.0,0,0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,13.4,1.0,0,0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,18.5,0.0,0,0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,18.3,1.0,0,0,2021,Sheffield United
