The point of this notebook is to web-scrape MLS data for years 2022 through 2024.
I found a pretty useful set on Kaggle, covering years 1996 - 2022.
I included 2022 in the web-scraping to get a better understanding of how to fit 2023 and
2024 data to the existing set. I haven't completed scraping all the data I want, so currently
this data is not included when creating my initial models.

# Essential Libraries + Other

In [1]:
%load_ext autoreload

%autoreload 2

import configparser
import os
import time
import random
import pandas as pd
import requests
import openpyxl
import sqlite3


from io import StringIO
from pathlib import Path
from bs4 import BeautifulSoup
from alive_progress import alive_bar
import datetime

# for file saving
today = datetime.datetime.now()
today = today.strftime("%Y_%m_%d")

# some stuff I set up in a config file so I don't have to keep updating certain
# variables in every script
config = configparser.ConfigParser()
config.read('../src/config.ini')

config = configparser.ConfigParser()
config.read('../src/config.ini')

# the output path is specified in the config.ini file
output = Path(config['paths']['output'])
# I want data for the 2022 through 2024 season
yearly_directories = [Path(output/f"mls_{year}") for year in range(2022,2025)]

# create output directory and sub-directories if doesnt exist
for directory in yearly_directories+[output]:
    try:
        assert directory.exists()
    except:
        os.mkdir(directory)

# Starting Point

In [2]:
# I will be web-scraping alot, so I made this function as a result

def get_html_data(url, parser='html.parser') -> BeautifulSoup:
    '''
    Extract html data from specified url and return a bs4 object.
    Parser can be specified if needed. Default is html.parser.
    '''
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, parser)
    
    return soup

In [3]:
# I ended up using this a lot in the end
def get_table_data_from_html(soup) -> list:
    '''
    Extract tables from bs4 object and return a list of dataframes.
    '''
    
    # get all tables in the html
    tables = soup.findAll('table')

    # create dfs for each table and append each one to a list
    dfs_from_tables = []
    for table in tables:
        dfs_from_tables.append(pd.read_html(StringIO(str(table)))[0])
    
    return dfs_from_tables

In [4]:
def get_all_player_match_data(excel_file_path) -> tuple[pd.DataFrame, list]:
    '''
    Input an excel file of the copied tables from the websites and return a tuple
    where the first element is a dataframe of all player data and the second element
    is a list of URLs where extraction failed. This attempts to extract data from
    the failed URLs once more before completing.
    '''
    # get year from directory the excel file is in
    year = os.path.dirname(excel_file_path).split('\\')[-1].replace('mls_', '')
    # extract all links from the excel file
    # returned as a df in case of future use with an idea I had
    player_links = get_all_player_match_data_links(excel_file_path, year)
    # initialize empty df for all player data
    player_data_df = pd.DataFrame()
    
    # generate the initial player data df and failed links list
    player_data_df, failed_links = generate_player_df(list(player_links['stat_link']), player_data_df)
    
    # if first round has failed links, retry them
    # sometimes extracting the data the first time just doesn't work but rerunning usually does
    if len(failed_links)>0:
        player_data_df, failed_links = generate_player_df(failed_links, player_data_df)

    correct_col_names = ['Date', 'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 'Opponent',
       'Start', 'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att_TakeOn', 'Succ',
       'Match Report', 'player_url']
    player_data_df.columns = correct_col_names

    return player_data_df, failed_links 

def generate_player_df(player_links, df=pd.DataFrame()) -> tuple[pd.DataFrame, list]:
    '''
    Given an iterable of individual player links (and a df to modify), return a tuple (DataFrame, list)
    where the dataframe contains all player data and the list contains URLs with failed extractions.
    '''
    # get total number of players for alive_bar
    total_players = len(player_links)
    # to append failed links to a list
    failed_links = []
    
    # progress bar, force_tty=True might be needed depending if animations don't show for you
    # I used jupyter notebooks, so I needed this
    with alive_bar(total_players, force_tty=True) as bar:
        for player_url in player_links:
            # limited to 10 requests a minute per website rules :(
            # I set this as 7 seconds just to be safe but you can adjust as needed
            # in the config file
            time.sleep(int(config['other']['request_time_limit']))
            
            # if you get a player df, append it to df, otherwise add to failed links
            temp_df = attempt_data_extraction(player_url)
            if type(temp_df)==type(None):
                failed_links.append(player_url)
            else:
                temp_df['player'] = player_url
                df = pd.concat([df, temp_df], ignore_index=True)
            bar()
    # return df and failed links
    return df, failed_links

def attempt_data_extraction(url) -> pd.DataFrame | None:
    '''
    Attempt to get a player data dataframe. Return the df if successful,
    otherwise return None.
    '''
    # try to get a player_df and return it
    try:
        player_df = get_player_data_df(url)
        return player_df
    # otherwise, let the user know and return nothing
    except:
        print(f"Could not get player data for {url}")
        return None

def get_all_player_match_data_links(excel_file_path, year) -> pd.DataFrame:
    '''
    Extract all links from the excel that is a copy of the website data. Return
    a dataframe of the player names and URLs.
    '''
    # read excel file
    all_players = pd.read_excel(excel_file_path)
    wb = openpyxl.load_workbook(excel_file_path)
    sheets = wb.sheetnames
    ws = wb[sheets[0]]
    # get hyper links from file
    # in this situation, links didn't start until +2 and are found in column 37
    # this may need to be adjusted depending on how the data comes out
    all_players['stat_link'] = [ws.cell(row=i+2, column=37).hyperlink.target for i in range(all_players.shape[0])]
    # save all links out
    all_players[['Player', 'stat_link']].to_csv(output/f'mls_{year}' / 'player_links.csv', index=False)
    
    # return df of just the player name and stat link
    return all_players[['Player', 'stat_link']]
    
    
def get_player_data_df(url) -> pd.DataFrame:
    '''
    Using the URL for the individual player, get the first table on the website
    and return it as a dataframe.
    '''
    # get html data from url
    player_data_html = get_html_data(url)
    # generate tables list
    player_df = get_table_data_from_html(player_data_html)
    # add player url to data in first df in list
    # the lists always had 1 df but would break unless I left it as-is
    # player_df[0]['player'] = player_url
    
    return player_df[0]

In [20]:
soup = get_html_data(r'https://fbref.com/en/players/1339039e/matchlogs/2024/Liel-Abada-Match-Logs')

In [21]:
tables = soup.findAll('table')

# create dfs for each table and append each one to a list
dfs_from_tables = []
for table in tables:
    dfs_from_tables.append(pd.read_html(StringIO(str(table)))[0])

In [40]:
base_player_link = 'https://fbref.com/en/players/1339039e/matchlogs/2024/Liel-Abada-Match-Logs'
table_links = ['summary', 'passing', 'passing_types', 'gca', 'defense', 'possession', 'misc']

table_links_column_names =[
    'date', 'day', 'comp', 'round', 'venue', 'result', 'squad', 'opponent', 'start', 'pos', 'min', 'gls', 'ast', 'pk', 'pk_att', 'sh', 'sot', 'crdy', 'crdr', 'touches', 'tkl', 'int', 'blocks', 'xg', 'npxg', 'xag', 'sca', 'gca', 'cmp', 'att', 'cmp_perc', 'prgp', 'carries', 'prgc', 'att', 'succ'
]


https://fbref.com/en/players/1339039e/matchlogs/2024/summary/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/passing/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/passing_types/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/gca/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/defense/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/possession/Liel-Abada-Match-Logs
https://fbref.com/en/players/1339039e/matchlogs/2024/misc/Liel-Abada-Match-Logs


In [38]:
temp = base_player_link.split('/')
temp.insert(-1,'test')
temp

['https:',
 '',
 'fbref.com',
 'en',
 'players',
 '1339039e',
 'matchlogs',
 '2024',
 'test',
 'Liel-Abada-Match-Logs']

In [50]:
def generate_player_df(player_links, df=pd.DataFrame()) -> tuple[pd.DataFrame, list]:
    '''
    Given an iterable of individual player links (and a df to modify), return a tuple (DataFrame, list)
    where the dataframe contains all player data and the list contains URLs with failed extractions.
    '''
    # get total number of players for alive_bar
    total_players = len(player_links)
    # to append failed links to a list
    failed_links = []
    table_links = ['summary', 'passing', 'passing_types', 'gca', 'defense', 'possession', 'misc']
    
    # progress bar, force_tty=True might be needed depending if animations don't show for you
    # I used jupyter notebooks, so I needed this
    with alive_bar(total_players, force_tty=True) as bar:
        for player_url in player_links:
            player_tables = []
            for table_ in table_links:
                temp = base_player_link.split('/')
                temp.insert(-1, table_)
                temp = '/'.join(temp)
                player_tables.append(temp)
            # limited to 10 requests a minute per website rules :(
            # I set this as 7 seconds just to be safe but you can adjust as needed
            # in the config file
            temp_df = pd.DataFrame()
            for table_url in player_tables:
                print(table_url)
                time.sleep(int(config['other']['request_time_limit']))
            
        # if you get a player df, append it to df, otherwise add to failed links
                temp = attempt_data_extraction(table_url)
                temp_df = pd.concat([temp_df, temp], ignore_index=True, axis=1)

            if type(temp_df)==type(None):
                failed_links.append(player_url)
            else:
                temp_df['player'] = player_url
                df = pd.concat([df, temp_df], ignore_index=True)
            bar()
    # return df and failed links
    return df, failed_links

In [51]:
temp_df = generate_player_df(['https://fbref.com/en/players/1339039e/matchlogs/2024/Liel-Abada-Match-Logs'])

on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/summary/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/passing/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/passing_types/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/gca/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/defense/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/possession/Liel-Abada-Match-Logs
on 0: https://fbref.com/en/players/1339039e/matchlogs/2024/misc/Liel-Abada-Match-Logs
|████████████████████████████████████████| 1/1 [100%] in 52.9s (0.02/s)         


In [60]:
temp_df[0].to_csv('test.csv', index=False)

# Current Season - 2024

In [None]:
# this url gives me a list of all players in the current league
base_url = 'https://fbref.com/en/comps/22/Major-League-Soccer-Stats'

# this page gives me a bunch of tables for team stats in the current moment
# not sure how much of the data here will be useful, but I'll grab it just-in-case
# html = get_html_data(base_url + 'players/')

In [None]:
# team_stat_dfs = get_table_data_from_html(html)

# Individual Player Data
I had to copy and paste the "Player Standard Stats" table from this url (https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats) since I failed to do so with bs4. I saved it
out as an excel file. I just want to extract the urls that go directly to the players stats.

In [None]:
player_data_2024_df, failed_links_2024 = get_all_player_match_data(config['paths']['all_players_2024'])

In [None]:
player_data_2024_df

In [None]:
player_data_2024_df.to_csv(output/"mls_2024"/f"all_player_data_2024_{today}.csv", index=False)

# PRIOR YEARS
I recommend running each year/season individually. At the time of creating this script,
the website has a limit of 10 requests/minute. As a result, I had to stall with a timer
of 7 seconds before running each request. This causes each year to take ~2 hours just to
get the individual player performance data for each match. It would be best to finish
tackling a year before moving onto the next. Depending on your PC, you might also need
to take resources into account as well and save each year out before moving onto the next.

# 2023 Season

In [None]:
player_data_2023_df, failed_links_2023 = get_all_player_match_data(config['paths']['all_players_2023'])

In [None]:
player_data_2023_df

In [None]:
player_data_2023_df.to_csv(output/"mls_2023"/f"all_player_data_2023_{today}.csv", index=False)

# 2022 Season

In [None]:
player_data_2022_df, failed_links_2022 = get_all_player_match_data(config['paths']['all_players_2022'])

In [None]:
player_data_2022_df

In [None]:
player_data_2022_df.to_csv(output/"mls_2022"/f"all_player_data_2022_{today}.csv", index=False)

# UPDATED METHOD

In [399]:
def get_all_teams(url):
    soup = get_html_data(url)
    year = soup.find('h1').text[10:14]
    eastern_conference = soup.find('table', {'id': f'results{year}221Eastern-Conference_overall'})
    western_conference = soup.find('table', {'id': f'results{year}221Western-Conference_overall'})

    # Extract links from each table

    eastern_links = extract_table_links(eastern_conference)
    western_links = extract_table_links(western_conference)

    all_teams = pd.DataFrame(eastern_links + western_links)
    all_teams.columns = ['team', 'team_url']
    
    all_teams.to_csv(output / f"mls_{year}/all_teams.csv", index=False)

    return all_teams, year

def extract_table_links(table):
    links = []
    if table:
        rows = table.find_all('tr')
        for row in rows:
            link_tag = row.find('a')
            if link_tag:
                links.append((link_tag.text, f"https://fbref.com{link_tag['href']}"))
    return links

def extract_table_links_and_positions(table):
    links = []
    if table:
        rows = table.find_all('tr')
        for row in rows:
            link_tag = row.find('a')
            try:
                position = row.find('td',{'data-stat':'position'})
                position = position.text
            except:
                position = None
            if link_tag:
                links.append((link_tag.text, f"https://fbref.com{link_tag['href']}", position))
    return links

def get_team_players(team):
    time.sleep(int(config['other']['request_time_limit']))
    soup = get_html_data(team['team_url'])
    players_df = extract_table_links_and_positions(soup.find('table', {'id': 'stats_standard_22'}))
    players_df = pd.DataFrame(players_df)
    players_df.columns = ['player_name', 'player_url', 'position']
    players_df['team'] = team['team']
    return players_df

def get_all_players(all_teams_df, year):
    all_team_players = list(all_teams_df.apply(get_team_players,axis=1))

    all_players_df = pd.DataFrame()
    for players in all_team_players:
        all_players_df = pd.concat([all_players_df, players], ignore_index=True)
    all_players_df.drop_duplicates(inplace=True)

    all_players_df.to_csv(output / f"mls_{year}/all_players.csv", index=False)

    return all_players_df

def get_teams_and_players(url):
    all_teams_df, year = get_all_teams(url)

    all_players_df = get_all_players(all_teams_df, year)

    return year, all_teams_df, all_players_df

In [400]:
year, all_teams_df, all_players_df = get_teams_and_players('https://fbref.com/en/comps/22/Major-League-Soccer-Stats')

In [401]:
year

'2024'

In [402]:
all_teams_df

Unnamed: 0,team,team_url
0,Inter Miami,https://fbref.com/en/squads/cb8b86a2/Inter-Mia...
1,Columbus Crew,https://fbref.com/en/squads/529ba333/Columbus-...
2,FC Cincinnati,https://fbref.com/en/squads/e9ea41b2/FC-Cincin...
3,Orlando City,https://fbref.com/en/squads/46ef01d0/Orlando-C...
4,Charlotte,https://fbref.com/en/squads/eb57545a/Charlotte...
5,NYCFC,https://fbref.com/en/squads/64e81410/New-York-...
6,NY Red Bulls,https://fbref.com/en/squads/69a0fb10/New-York-...
7,CF Montréal,https://fbref.com/en/squads/fc22273c/CF-Montre...
8,Atlanta Utd,https://fbref.com/en/squads/1ebc1a5b/Atlanta-U...
9,D.C. United,https://fbref.com/en/squads/44117292/DC-United...


In [403]:
all_players_df

Unnamed: 0,player_name,player_url,position,team
0,Drake Callender,https://fbref.com/en/players/c4d9567d/Drake-Ca...,GK,Inter Miami
1,Julian Gressel,https://fbref.com/en/players/acd47bc0/Julian-G...,"MF,FW",Inter Miami
2,Sergio Busquets,https://fbref.com/en/players/5ab0ea87/Sergio-B...,"MF,DF",Inter Miami
3,Tomás Avilés,https://fbref.com/en/players/f51b9ae1/Tomas-Av...,DF,Inter Miami
4,Jordi Alba,https://fbref.com/en/players/4601e194/Jordi-Alba,DF,Inter Miami
...,...,...,...,...
938,Beau Leroux,https://fbref.com/en/players/6a9ab308/Beau-Leroux,MF,SJ Earthquakes
939,Riley Lynch,https://fbref.com/en/players/9eb24ed6/Riley-Lynch,FW,SJ Earthquakes
940,Cruz Medina,https://fbref.com/en/players/89d44509/Cruz-Medina,MF,SJ Earthquakes
941,Emi Ochoa,https://fbref.com/en/players/30a08779/Emi-Ochoa,GK,SJ Earthquakes


In [248]:
table_stat_types = ['keeper', 'summary', 'passing', 'passing_types', 'gca', 'defense', 'possession', 'misc']

def generate_player_stat_links(player_url, year):
    player_stat_links = []
    for stat_type in table_stat_types:
        temp = player_url.split('/')
        temp.insert(-1, f"matchlogs/{year}/{stat_type}")
        temp = '/'.join(temp)
        player_stat_links.append(temp)
    return player_stat_links

def generate_individual_player_df(player_url):
    player_stat_links = generate_player_stat_links(player_url)
    player_df = pd.DataFrame()
    for stat_link in player_stat_links:
        time.sleep(int(config['other']['request_time_limit']))
        soup = get_html_data(stat_link)
        stat_df = soup.find('table')
        stat_df = pd.read_html(StringIO(str(stat_df)))[0]
        player_df = pd.concat([player_df, stat_df], axis=1)
    return player_df

In [392]:
all_players_df[all_players_df['player_name' == 'Aziel Jackson']]

KeyError: False

In [393]:
table.find('td')

<td class="left poptip" data-stat="nationality" data-tip="birth"><a href="/en/country/USA/United-States-Football"><span style="white-space: nowrap"><span class="f-i f-us" style="">us</span> USA</span></a></td>