In [None]:
import requests
import pandas as pd
import time
from io import StringIO
from bs4 import BeautifulSoup

# Fetch the 2022-2023 Premier League stats page and build the list of squad page URLs
# from the links inside the first table with class "stats_table".
standings_url = "https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats"
data = requests.get(standings_url) #grab info from the page using the url
data.raise_for_status() #fail on an HTTP error status instead of trying to parse an error page
time.sleep(5) #pause so the next request to the site does not follow immediately

soup = BeautifulSoup(data.text, "html.parser") #name the parser explicitly so parsing does not depend on which parsers are installed
standings_table = soup.select('table.stats_table')[0] #grab first table element with class stats_table
links = standings_table.find_all('a') #find a tags within the html we have grabbed being held in standings_table
links = [l.get("href") for l in links] #for every a tag in our list, grab the href property (None when the tag has no href)
links = [l for l in links if l and '/squads/' in l] #skip missing hrefs, then keep only links containing /squads/
team_urls = [f"https://fbref.com{l}" for l in links] #hrefs are site-relative, so prefix the host to get full links
team_urls

In [None]:
# Download the first squad page and read its "Scores & Fixtures" table into `matches`.
team_url = team_urls[0]
data = requests.get(team_url)

team_html = StringIO(str(data.text))  # read_html is given a file-like object wrapping the page text
scores_and_fixtures = pd.read_html(team_html, match="Scores & Fixtures")
matches = scores_and_fixtures[0]  # keep the first table whose text matched
# matches.head()

In [None]:
# Find the link to the all-competitions shooting page on the team page held in `data`,
# download that page and load its "Shooting" table into `shooting`.
soup = BeautifulSoup(data.text, "html.parser") #name the parser explicitly so parsing does not depend on which parsers are installed
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'all_comps/shooting/' in l] #getting shooting stats page via link on the scores and fixtures page
if not links:
    #explain the failure instead of letting links[0] raise a bare IndexError
    raise RuntimeError("no 'all_comps/shooting/' link found on the team page")
#use a separate name for the shooting response so `data` still holds the team page and this cell can be re-run
shooting_page = requests.get(f"https://fbref.com{links[0]}")
shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
shooting.columns = shooting.columns.droplevel() #drop the top header level, as there were two rows of headers and the top row is not needed
# shooting.head()

In [None]:
# Join the selected shooting columns onto the match log, pairing rows by "Date".
# `matches` is already a DataFrame (the first table returned by read_html), so merge on it
# directly; `matches[0]` would instead look up a column labelled 0 and raise KeyError.
team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
team_data.shape

In [2]:
import requests
import pandas as pd
import time
from io import StringIO
from bs4 import BeautifulSoup

# Scrape match logs plus shooting stats for every squad linked from the standings page,
# for each season in `years`, walking backwards through the "previous season" link.
# The combined result is kept in `match_df` and written to matches.csv.
years = list(range(2023, 2020, -1)) #2023, 2022, 2021 -- newest first, matching the backwards walk below
all_matches = []
standings_url = "https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats"


def fetch_page(url, delay=5):
    """Download ``url`` and return the response, then pause for ``delay`` seconds.

    The pause is applied after every request, so no code path (including the
    ``continue`` branches in the loop below) can skip the throttling that keeps
    the scraper from hitting the site too quickly.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status() #fail here rather than later with a confusing parsing error
    time.sleep(delay) #slow down how quickly scraping occurs, to keep site bandwidth low and not get blocked from the site
    return response


for year in years:
    data = fetch_page(standings_url)
    soup = BeautifulSoup(data.text, "html.parser") #explicit parser: do not depend on which parsers are installed
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l] #skip anchors without an href before the substring test
    team_urls = [f"https://fbref.com{l}" for l in links]

    #the next outer iteration loads the page behind the "previous season" link
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        #last URL segment without the "-Stats" suffix, dashes turned into spaces
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        team_page = fetch_page(team_url)
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        team_soup = BeautifulSoup(team_page.text, "html.parser")
        shooting_links = [l.get("href") for l in team_soup.find_all('a')]
        shooting_links = [l for l in shooting_links if l and 'all_comps/shooting/' in l]
        if not shooting_links:
            #report and skip instead of crashing with IndexError on shooting_links[0]
            print(f"Skipping {team_name} ({year}): no shooting stats link found")
            continue

        shooting_page = fetch_page(f"https://fbref.com{shooting_links[0]}")
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel() #drop the top level of the two-row header

        #attempt merge, if ValueError occurs skip this team but say so
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError as err:
            print(f"Skipping {team_name} ({year}): merge failed ({err})")
            continue

        team_data = team_data[team_data["Comp"] == "Premier League"] #filter data to only include prem games
        #assign returns a new frame, avoiding SettingWithCopyWarning from writing to a filtered slice
        team_data = team_data.assign(Season=year, Team=team_name)
        all_matches.append(team_data)

match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns] #make column names lowercase (just preference)
match_df.to_csv("matches.csv") #write to csv file

In [3]:
# Show the combined match DataFrame as this cell's output.
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2023,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2023,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2023,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2023,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2023,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,18.2,0.0,0,0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,13.4,1.0,0,0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,18.5,0.0,0,0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,18.3,1.0,0,0,2021,Sheffield United
