In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [44]:
DRIVER = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [55]:
FBREF_HOMEPAGE_URL = "https://fbref.com/"
PREM_URL = "https://fbref.com/en/comps/9/Premier-League-Stats"
LEAGUE_TEAM_TABLE_ID = "results2022-202391_overall"

In [3]:
player_name = "douglas luiz"
player_team = "aston villa"

In [92]:
def get_full_url(url_tail):
    return urljoin(FBREF_HOMEPAGE_URL, url_tail)


def get_table_info(url, driver, table_tag_id = "matchlogs_for"):
    # Load the page using Selenium
    driver.get(url)
    
    wait = WebDriverWait(driver, 10)
    table_wait = wait.until(EC.presence_of_element_located((By.ID, table_tag_id)))

    # create a Beautiful Soup object from the response content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # find the table with id 'div_matchlogs_for'
    table = soup.find('table', {'id': table_tag_id})

    return table

def get_teams_urls(table):
    # extract the table rows
    rows = table.find_all('tr')

    # extract the data from each row and store it in a list
    url_data = []
    i = 0
    for row in rows:
        i += 1
        team_cell = row.find('td', {'data-stat': 'team'})
        if team_cell:
            team_name = team_cell.find("a").text
            team_url_tail = team_cell.find("a")['href']
            team_uid = team_url_tail.split("/")[3]
            team_url = get_full_url(team_url_tail)
            url_data.append({"Team Name": team_name, "Unique ID": team_uid, "URL Link": team_url})
 
    df = pd.DataFrame(url_data)
    return df

In [90]:
table_prem_teams = get_table_info(PREM_URL, DRIVER, table_tag_id = LEAGUE_TEAM_TABLE_ID)
df_team_urls = get_teams_urls(table_prem_teams)

In [91]:
df_team_urls

Unnamed: 0,Team Name,Unique ID,URL Link
0,Manchester City,b8fd03ef,https://fbref.com/en/squads/b8fd03ef/Mancheste...
1,Arsenal,18bb7c10,https://fbref.com/en/squads/18bb7c10/Arsenal-S...
2,Manchester Utd,19538871,https://fbref.com/en/squads/19538871/Mancheste...
3,Newcastle Utd,b2b47a98,https://fbref.com/en/squads/b2b47a98/Newcastle...
4,Liverpool,822bd0ba,https://fbref.com/en/squads/822bd0ba/Liverpool...
5,Brighton,d07537b9,https://fbref.com/en/squads/d07537b9/Brighton-...
6,Aston Villa,8602292d,https://fbref.com/en/squads/8602292d/Aston-Vil...
7,Tottenham,361ca564,https://fbref.com/en/squads/361ca564/Tottenham...
8,Brentford,cd051869,https://fbref.com/en/squads/cd051869/Brentford...
9,Fulham,fd962109,https://fbref.com/en/squads/fd962109/Fulham-Stats


In [76]:
def get_team_matches(table):
    # Get the table headers
    headers = [header.text for header in table.find_all('th')]
    headers_front = headers[:19]
    headers_date = headers[19:]
    
    # Get the table rows
    rows = table.find_all('tr')
    
    table_data = []
    row_index = 0
    for row in rows[1:]:  # We skip the first row, because it contains the headers
        row_data = []
        cells = row.find_all('td')
        # get URLs for team names (position 8 from Time column onwards) and for Match reports (position -2 from back)
        row_data = [cell.text.strip() if i != len(cells)-2 and i != 8 else get_full_url(cell.find('a')["href"]) for i, cell in enumerate(cells)]

        row_data.insert(0, str(headers_date[row_index]))
        table_data.append(row_data)
        row_index += 1
    
    # Create a pandas DataFrame from the scraped data
    df = pd.DataFrame(table_data, columns=headers_front)
    return df

In [77]:
table_team_matches = get_table_info("https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats", DRIVER, table_tag_id = "matchlogs_for")
df_team_matches = get_team_matches(table_team_matches) # need to indicate the team i am scraping for in the table.

In [80]:
df_team_matches.head(3)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2022-07-30,17:00 (00:00),Community Shield,FA Community Shield,Sat,Neutral,L,1,3,https://fbref.com/en/squads/822bd0ba/Liverpool...,,,57,,Rúben Dias,4-3-3,Craig Pawson,https://fbref.com/en/matches/341bf7a1/Liverpoo...,
1,2022-08-07,16:30 (23:30),Premier League,Matchweek 1,Sun,Away,W,2,0,https://fbref.com/en/squads/7c21e445/West-Ham-...,2.2,0.5,75,62443.0,İlkay Gündoğan,4-3-3,Michael Oliver,https://fbref.com/en/matches/ece62baf/West-Ham...,
2,2022-08-13,15:00 (22:00),Premier League,Matchweek 2,Sat,Home,W,4,0,https://fbref.com/en/squads/4ba7cbea/Bournemou...,1.7,0.1,67,53453.0,İlkay Gündoğan,4-2-3-1,David Coote,https://fbref.com/en/matches/311d705c/Manchest...,


In [81]:
df_team_matches["Match Report"][3]

'https://fbref.com/en/matches/b513d9fe/Newcastle-United-Manchester-City-August-21-2022-Premier-League'

In [141]:
def load_scorebox_div(url, driver, tag_id = "scorebox"):
    # Load the page using Selenium
    driver.get(url)
    
    wait = WebDriverWait(driver, 10)
    scorebox_wait = wait.until(EC.presence_of_element_located((By.CLASS_NAME, tag_id)))

    # create a Beautiful Soup object from the response content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # find the div with id 'scorebox'
    div = soup.find('div', {'class': "scorebox"})

    return div

def get_teams_playing_uids(scorebox_div):
    divs = scorebox_div.find_all("div", recursive=False, limit=2)
    home_team_div, away_team_div = divs[0], divs[1]
    home_team_uid = home_team_div.find('a')['href'].split('/')[3]
    away_team_uid = away_team_div.find('a')['href'].split('/')[3]

    return [home_team_uid, away_team_uid]

In [142]:
sb_div = load_scorebox_div("https://fbref.com/en/matches/b513d9fe/Newcastle-United-Manchester-City-August-21-2022-Premier-League", DRIVER)
team_uids = get_teams_playing_uids(sb_div) # home_team_uid, away_team_uid

In [143]:
team_uids

['b2b47a98', 'b8fd03ef']

In [154]:
def load_outfield_perf_soup(uids, url, driver):
    # Load the page using Selenium
    driver.get(url)
    
    wait = WebDriverWait(driver, 10)    
    wait.until(EC.presence_of_element_located((By.ID, f"stats_{uids[0]}_summary")))
    wait.until(EC.presence_of_element_located((By.ID, f"stats_{uids[1]}_summary")))

    # create a Beautiful Soup object from the response content
    soup = BeautifulSoup(DRIVER.page_source, 'html.parser')
    return soup

def get_outfield_perf(team_uids, soup):

    df_list = []
    # find the div with id 'stats_teamuid_summary'
    for team_uid in team_uids:
        table = soup.find('table', {'id': f"stats_{team_uid}_summary"})

        data = []
        table_headers = table.find_all('th')

        # retrive column headers (aka names of the statistics)
        col_stats_names = [header.get('aria-label') for header in table_headers][7:38] # ignore 0-6, take 7-37
        col_stats_names.insert(0, "player_uid")
        col_stats_names.insert(0, "team_uid")

        # retrieve player names
        player_info = table_headers[38:-1]
        player_names = [player.get_text().strip() for player in player_info]
        player_uid = [player.find('a')['href'].split('/')[3] for player in player_info]
        
        table_body = table.find('tbody')

        rows = table_body.find_all('tr')
        row_index = 0
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            cols.insert(0, player_names[row_index])
            cols.insert(0, player_uid[row_index])
            cols.insert(0, team_uid)
            row_index += 1
            data.append([ele for ele in cols if ele])

        # Convert data to DataFrame
        df = pd.DataFrame(data, columns = col_stats_names)
        df_list.append(df)

    final_match_df = pd.concat(df_list, ignore_index=True)
    return final_match_df

In [155]:
soup_outfield_perf = load_outfield_perf_soup(team_uids, "https://fbref.com/en/matches/b513d9fe/Newcastle-United-Manchester-City-August-21-2022-Premier-League", DRIVER)
final_match_df = get_outfield_perf(team_uids, soup_outfield_perf)

In [156]:
final_match_df

Unnamed: 0,team_uid,player_uid,Player,Shirt Number,Nation,Position,Age,Minutes,Goals,Assists,...,Shot-Creating Actions,Goal-Creating Actions,Passes Completed,Passes Attempted,Pass Completion %,Progressive Passes,Carries,Progressive Carries,Take-Ons Attempted,Successful Take-Ons
0,b2b47a98,c596fcb0,Callum Wilson,9,eng ENG,FW,30-175,69,1,0,...,0,0,9,13,69.2,0,9,1,0,0.0
1,b2b47a98,4e9a0555,Chris Wood,20,nz NZL,FW,30-257,21,0,0,...,0,0,7,9,77.8,1,4,1,0,0.0
2,b2b47a98,2b16cb1a,Allan Saint-Maximin,10,fr FRA,LW,25-162,90,0,2,...,8,4,15,23,65.2,7,23,9,7,5.0
3,b2b47a98,862a1c15,Miguel Almirón,24,py PAR,RW,28-192,82,1,0,...,1,0,14,20,70.0,5,11,0,2,0.0
4,b2b47a98,de112b84,Jacob Murphy,23,eng ENG,RW,27-178,8,0,0,...,0,0,0,0,0.0,1,0,0,0,
5,b2b47a98,c17bfb65,Joelinton,7,br BRA,LM,26-007,90,0,0,...,4,0,23,27,85.2,4,12,1,2,2.0
6,b2b47a98,82518f62,Bruno Guimarães,39,br BRA,CM,24-278,90,0,0,...,3,1,26,31,83.9,7,26,0,0,0.0
7,b2b47a98,a3b03921,Joe Willock,28,eng ENG,RM,23-001,69,0,0,...,1,0,14,22,63.6,0,7,4,2,2.0
8,b2b47a98,a2b105e0,Sean Longstaff,36,eng ENG,RM,24-295,21,0,0,...,0,0,3,5,60.0,0,2,0,0,0.0
9,b2b47a98,b2d31e83,Dan Burn,33,eng ENG,LB,30-104,89,0,0,...,1,0,18,25,72.0,1,4,0,1,0.0


In [None]:
## to do keeper performances for match report.
# eg; "https://fbref.com/en/matches/b513d9fe/Newcastle-United-Manchester-City-August-21-2022-Premier-League", separate db from players.
# then need to edit the code to only run on 1 load of the entire page instead of in chunks with wait() function on specific tables separately.s


In [158]:
DRIVER.quit()