In [230]:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
import copy

In [None]:
# Premier League results page for the 2023-2024 season
url = "https://www.premierleague.com/results?co=1&se=578&cl=-1"

# Launch a (visible) Chrome session and open the results page
driver = webdriver.Chrome()
driver.get(url)

# Fixed pause to give the page time to render its initial content
time.sleep(5)

def scroll_down(driver, pause=4, max_scrolls=None):
    """Scroll to the bottom of a page repeatedly until no new content loads.

    After every scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as the height no longer
    changes.

    Parameters
    ----------
    driver
        Object exposing ``execute_script(script)`` (e.g. a Selenium WebDriver).
    pause : float, default 4
        Seconds to wait after each scroll so lazily loaded content can appear.
    max_scrolls : int or None, default None
        Upper bound on the number of scrolls. ``None`` keeps the original
        behaviour of scrolling until the height stabilises; a number guards
        against pages that grow without end.

    Returns
    -------
    None
    """
    # Height before the first scroll; used to detect whether scrolling loaded more.
    last_height = driver.execute_script("return document.body.scrollHeight")

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Jump to the current bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1

        # Give the page time to fetch and render additional content.
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended, so the end of the page was reached.
            break

        last_height = new_height


# Scroll until every fixture is loaded, then snapshot the rendered HTML.
# The try/finally guarantees the browser is closed even if scrolling or
# parsing raises, so no Chrome process is left running.
try:
    scroll_down(driver)

    # Get the page source
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Close the driver
    driver.quit()

# Extract game elements
games = soup.select('.match-fixture__container')
print(f"Number of games found: {len(games)}") # There should be a total of 380 games

Number of games found: 380


In [232]:
# Parallel lists: index i in each list describes the same fixture
game_links, game_id = [], []
home_team, away_team = [], []
home_goals, away_goals = [], []

# Walk over every fixture container and pull out link, teams and score
for fixture in games:

    # The match-stats link is stored on the container's parent element;
    # the game id is taken as the last five characters of that link
    href = fixture.find_parent().get('data-href')
    game_links.append(href)
    game_id.append(href[-5:])

    # Short team names appear in document order: home first, away second
    names = [span.text for span in fixture.find_all('span', class_='match-fixture__short-name')]
    home_team.append(names[0])
    away_team.append(names[1])

    # The score element's text pieces; index 0 and index 2 are used as the
    # home and away goals (index 1 is skipped)
    score_parts = list(fixture.find('span', class_='match-fixture__score').stripped_strings)
    home_goals.append(score_parts[0])
    away_goals.append(score_parts[2])


In [237]:
# List to store data tables (one parsed stats <tbody> per successfully scraped match)
match_stats = []

# The headless options are the same for every match, so build them once
options = Options()
options.add_argument('--headless')

# Loop through all matches
# NOTE: a match that fails to scrape is reported and skipped, so match_stats
# can end up shorter than game_links.
for link in game_links:

    # A fixture without a link cannot be opened; report it instead of failing
    # on string concatenation with an undefined/stale URL in the error handler
    if not link:
        print(f"Skipping fixture with missing match link: {link!r}")
        continue

    match_url = 'https:' + link
    stat_driver = None

    try:
        # Setup driver (a fresh headless browser per match)
        stat_driver = webdriver.Chrome(options=options)
        stat_driver.get(match_url)
        wait = WebDriverWait(stat_driver, 10)

        # Accept cookies; until() returns the located element, and the click
        # is issued through JavaScript
        cookies = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#onetrust-accept-btn-handler'))
        )
        stat_driver.execute_script("arguments[0].click();", cookies)

        # Click on the stats menu
        stats = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.matchCentreSquadLabelContainer+ li'))
        )
        stat_driver.execute_script("arguments[0].click();", stats)

        # Wait for the data table to load
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".matchCentreStatsContainer tr"))
        )

        # Get the data table
        table_page = bs(stat_driver.page_source, 'html.parser')

        # Append the data table to the list
        match_stats.append(table_page.find('tbody', class_='matchCentreStatsContainer'))

    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C can still stop the run;
        # report which match failed and why
        print(f"Failed to scrape {match_url}: {type(exc).__name__}")

    finally:
        # Always close the browser, including when a wait timed out
        if stat_driver is not None:
            stat_driver.quit()

In [238]:
# Per-statistic lists for the home side; the key order defines the order in
# which the statistics are collected
home_stats = {
    'Possession %': [],
    'Shots on target': [],
    'Shots': [],
    'Touches': [],
    'Passes': [],
    'Tackles': [],
    'Clearances': [],
    'Corners': [],
    'Offsides': [],
    'Yellow cards': [],
    'Fouls conceded': [],
    'Red cards': []
}
# Independent copy with the same keys for the away side
away_stats = copy.deepcopy(home_stats)

# Loop through matches
for stats_table in match_stats:

    # Stats found in this match's table, keyed by the stat label
    home_row, away_row = {}, {}

    # Only rows with exactly three cells are read: home value | label | away value
    for table_row in stats_table.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) != 3:
            continue

        label = cells[1].get_text(strip=True)
        home_row[label] = cells[0].get_text(strip=True)
        away_row[label] = cells[2].get_text(strip=True)

    # Record every tracked stat; a stat missing from the table
    # (example: no red cards in the game) is stored as 0 for both sides
    for label in home_stats:
        home_stats[label].append(home_row.get(label, 0))
        away_stats[label].append(away_row.get(label, 0))

# Create CSV File

In [245]:
# Collect every column's values in output order: fixture info first,
# then a home/away pair for each tracked statistic
values = [game_id, home_team, away_team, home_goals, away_goals]
for stat in home_stats:
    values.extend((home_stats[stat], away_stats[stat]))

# Column names, listed in the same order as `values`
# (`columns` is kept as a second name for the same list)
column_names = columns = [
    'game_ID', 'home_team', 'away_team', 'home_goals', 'away_goals', 
    'possession_home', 'possession_away', 'shots_on_target_home', 'shots_on_target_away', 
    'shots_home', 'shots_away', 'touches_home', 
    'touches_away', 'passes_home', 'passes_away', 'tackles_home', 
    'tackles_away', 'clearances_home', 'clearances_away', 'corners_home', 
    'corners_away', 'offsides_home', 'offsides_away', 'yellow_cards_home', 
    'yellow_cards_away', 'fouls_conceded_home', 'fouls_conceded_away', 
    'red_cards_home', 'red_cards_away'
]

# Pair each value list with its column name by position; indexing
# column_names raises if there are more value lists than names
data = {column_names[position]: column for position, column in enumerate(values)}

# Create data frame and display it as the cell output
df = pd.DataFrame(data)
df


Unnamed: 0,game_ID,home_team,away_team,home_goals,away_goals,possession_home,possession_away,shots_on_target_home,shots_on_target_away,shots_home,...,corners_home,corners_away,offsides_home,offsides_away,yellow_cards_home,yellow_cards_away,fouls_conceded_home,fouls_conceded_away,red_cards_home,red_cards_away
0,93691,Arsenal,Everton,2,1,69.1,30.9,5,2,26,...,8,1,1,4,4,3,8,11,0,0
1,93692,Brentford,Newcastle,2,4,53.8,46.2,5,7,10,...,3,0,2,1,4,4,15,11,0,0
2,93693,Brighton,Man Utd,0,2,54.8,45.2,3,4,17,...,7,5,0,1,1,3,10,9,0,0
3,93694,Burnley,Nott'm Forest,1,2,72.6,27.4,3,6,20,...,4,3,0,2,1,0,11,5,0,0
4,93695,Chelsea,Bournemouth,2,1,61.4,38.6,6,5,16,...,6,5,0,1,2,3,5,9,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,93324,Brighton,Luton,4,1,70.7,29.3,12,3,27,...,6,7,2,3,2,2,11,12,0,0
376,93325,Everton,Fulham,0,1,40.6,59.4,9,2,19,...,10,4,6,3,0,2,12,6,0,0
377,93327,Sheffield Utd,Crystal Palace,0,1,32.2,67.8,1,8,8,...,5,5,2,2,3,0,18,11,0,0
378,93326,Newcastle,Aston Villa,5,1,52.6,47.4,13,6,17,...,6,5,2,0,4,4,12,17,0,0


Save as .csv file

In [246]:
# Write the assembled match table to disk, leaving out the row index
df.to_csv(path_or_buf='EPL_match_data.csv', index=False)