# Essential Libraries

In [1]:
%load_ext autoreload

%autoreload 2

import configparser
import os
import time
import random
import pandas as pd
import requests
import openpyxl

from io import StringIO
from pathlib import Path
from bs4 import BeautifulSoup

# some stuff I set up in a config file so I don't have to keep updating certain
# variables in every script
config = configparser.ConfigParser()
config.read('../src/config.ini')

# Starting Point

In [11]:
# the output path is specified in the config.ini file
output = Path(config['paths']['output'])
# team stats data at time of running code will be placed here
# not sure if the data placed here will be very useful
team_stats = Path(output/'team_stats')
mls_2023 = Path(output/'mls_2023')

directories = [output, team_stats, mls_2023]

# create output directory and sub-directories if doesnt exist
for directory in directories:
    try:
        assert directory.exists()
    except:
        os.mkdir(directory)

In [7]:
# I will be web-scraping alot, so I made this function as a result

def get_html_data(url, parser='html.parser'):
    '''
    Extract html data from specified url and return a bs4 object.
    Parser can be specified if needed. Default is html.parser.
    '''
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, parser)
    
    return soup

In [8]:
# I ended up using this a lot in the end
def get_table_data_from_html(soup)->list:
    '''
    Extract tables from bs4 object and return a list of dataframes.
    '''
    
    # get all tables in the html
    tables = soup.findAll('table')

    # create dfs for each table and append each one to a list
    dfs_from_tables = []
    for table in tables:
        dfs_from_tables.append(pd.read_html(StringIO(str(table)))[0])
    
    return dfs_from_tables

# Current Season - 2024

In [5]:
# this url gives me a list of all players in the current league
base_url = 'https://fbref.com/en/comps/22/Major-League-Soccer-Stats'

# this page gives me a bunch of tables for team stats in the current moment
# not sure how much of the data here will be useful, but I'll grab it just-in-case
html = get_html_data(base_url + 'players/')

In [6]:
team_stat_dfs = get_table_data_from_html(html)

In [7]:
# table names in the html
tables = ['eastern_conference',
          'western_conference',
          'squad_standard_stats',
          'squad_goalkeeping',
          'squad_advanced_goalkeeping',
          'squad_shooting',
          'squad_passing',
          'squad_pass_types',
          'squad_goal_and_shot_creation',
          'squad_defensive_actions',
          'squad_possession',
          'squad_playing_time',
          'squad_miscellaneous_stats']

# table pairings based on index in the list
pairs = list(zip([i for i in range(0,26,2)], [i for i in range(1,27,2)]))

# create csv files for each table
for index, pair in enumerate(pairs):
    pair_df = pd.concat([team_stat_dfs[pair[0]], team_stat_dfs[pair[1]]],axis=1)
    pair_df.to_csv(team_stats/f'{tables[index]}.csv')

# Individual Player Data
I had to copy and paste the "Player Standard Stats" table from this url (https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats) since I failed to do so with bs4. I saved it
out as an excel file. I just want to extract the urls that go directly to the players stats.

In [9]:
all_players = pd.read_excel(config['paths']['all_players'])
wb = openpyxl.load_workbook(config['paths']['all_players'])
sheets = wb.sheetnames
ws = wb[sheets[0]]

all_players['stat_link'] = [ws.cell(row=i+2, column=37).hyperlink.target for i in range(all_players.shape[0])]
all_players[['Player', 'stat_link']].to_csv(output/'player_links.csv')

In [240]:
all_players['stat_link'].value_counts().keys()

Index(['https://fbref.com/en/players/10d1139c/matchlogs/2024/summary/DeJuan-Jones-Match-Logs',
       'https://fbref.com/en/players/9862259f/matchlogs/2024/summary/Aziel-Jackson-Match-Logs',
       'https://fbref.com/en/players/6e327d0d/matchlogs/2024/summary/Caden-Clark-Match-Logs',
       'https://fbref.com/en/players/bd7b916c/matchlogs/2024/summary/Ariel-Lassiter-Match-Logs',
       'https://fbref.com/en/players/59059f1e/matchlogs/2024/summary/Derrick-Etienne-Match-Logs',
       'https://fbref.com/en/players/23c75879/matchlogs/2024/summary/Henry-Kessler-Match-Logs',
       'https://fbref.com/en/players/8ef72983/matchlogs/2024/summary/McKinze-Gaines-Match-Logs',
       'https://fbref.com/en/players/3d14c62e/matchlogs/2024/summary/Javain-Brown-Match-Logs',
       'https://fbref.com/en/players/86429529/matchlogs/2024/summary/Samuel-Adeniran-Match-Logs',
       'https://fbref.com/en/players/7f2b7640/matchlogs/2024/summary/Xavier-Arreaga-Match-Logs',
       ...
       'https://fbref.com/

In [199]:
player_data = get_html_data('https://fbref.com/en/players/1339039e/matchlogs/2024/summary/Liel-Abada-Match-Logs')

In [245]:
total_players = len(all_players['stat_link'].value_counts().keys())

In [28]:
player_data_df = pd.DataFrame()
i = 0
failed_links = []
for player_url in list(all_players['stat_link'].value_counts().keys()):
    time.sleep(random.randint(7,10))
    print(i)
    try:
        player_data = get_html_data(player_url)
        # get all tables in the html
        tables = player_data.findAll('table')

        # create dfs for each table and append each one to a list
        table_dfs = []
        for table in tables:
            table_dfs.append(pd.read_html(StringIO(str(table)))[0])
        table_dfs[0]['player'] = player_url
        player_data_df = pd.concat([player_data_df, table_dfs[0]],ignore_index=True)
        i+=1
    except:
        print(f"Could not get player data for {player_url}")
        failed_links.append(player_url)
        i+=1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
Could not get player data for https://fbref.com/en/players/fdffba5d/matchlogs/2024/summary/Cody-Baker-Match-Logs
70
Could not get player data for https://fbref.com/en/players/816d7aec/matchlogs/2024/summary/Reed-Baker-Whiting-Match-Logs
71
Could not get player data for https://fbref.com/en/players/8b379fcd/matchlogs/2024/summary/Monsef-Bakrar-Match-Logs
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189


In [30]:
player_data_df['player'].value_counts()

player
https://fbref.com/en/players/16ceb862/matchlogs/2024/summary/Olivier-Giroud-Match-Logs     57
https://fbref.com/en/players/ca366055/matchlogs/2024/summary/Maxime-Chanot-Match-Logs      50
https://fbref.com/en/players/18a8e594/matchlogs/2024/summary/Tai-Baribo-Match-Logs         48
https://fbref.com/en/players/7ad32ae1/matchlogs/2024/summary/Henrich-Ravas-Match-Logs      47
https://fbref.com/en/players/7af31216/matchlogs/2024/summary/Matti-Peltola-Match-Logs      46
                                                                                           ..
https://fbref.com/en/players/9f925166/matchlogs/2024/summary/Anthony-Ramirez-Match-Logs     2
https://fbref.com/en/players/50c9bf12/matchlogs/2024/summary/Mykhi-Joyner-Match-Logs        2
https://fbref.com/en/players/729f8330/matchlogs/2024/summary/Rory-ODriscoll-Match-Logs      2
https://fbref.com/en/players/4eecde21/matchlogs/2024/summary/Piero-Elias-Match-Logs         2
https://fbref.com/en/players/7b02a7e1/matchlogs/2024/

In [31]:
failed_links

['https://fbref.com/en/players/fdffba5d/matchlogs/2024/summary/Cody-Baker-Match-Logs',
 'https://fbref.com/en/players/816d7aec/matchlogs/2024/summary/Reed-Baker-Whiting-Match-Logs',
 'https://fbref.com/en/players/8b379fcd/matchlogs/2024/summary/Monsef-Bakrar-Match-Logs']

In [32]:
player_data_df.to_csv(output/'all_player_data.csv')

In [33]:
failed_df = pd.DataFrame()
i = 0
for player_url in failed_links:
    time.sleep(random.randint(7,10))
    print(i)
    try:
        player_data = get_html_data(player_url)
        # get all tables in the html
        tables = player_data.findAll('table')

        # create dfs for each table and append each one to a list
        table_dfs = []
        for table in tables:
            table_dfs.append(pd.read_html(StringIO(str(table)))[0])
        table_dfs[0]['player'] = player_url
        failed_df = pd.concat([failed_df, table_dfs[0]],ignore_index=True)
    except:
        print(f"Could not get player data for {player_url}")

0
0
0


In [34]:
failed_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Passes,Passes,Passes,Passes,Carries,Carries,Take-Ons,Take-Ons,Unnamed: 36_level_0,player
Unnamed: 0_level_1,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,...,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att,Succ,Match Report,Unnamed: 21_level_1
0,2024-02-24,Sat,MLS,Regular Season,Away,L 1–2,Seattle Sounders FC,LAFC,N,RB,...,18,24,75.0,2,14,1,1,0,Match Report,https://fbref.com/en/players/fdffba5d/matchlog...
1,,,,,,,,,,,...,,,,,,,,,,https://fbref.com/en/players/fdffba5d/matchlog...
2,2024-03-16,Sat,MLS,Regular Season,Home,D 1–1,Seattle Sounders FC,Colorado Rapids,N,RB,...,0,2,0.0,0,1,1,1,1,Match Report,https://fbref.com/en/players/fdffba5d/matchlog...
3,2024-03-23,Sat,MLS,Regular Season,Away,L 2–3,Seattle Sounders FC,SJ Earthquakes,N,CM,...,18,24,75.0,2,15,1,0,0,Match Report,https://fbref.com/en/players/fdffba5d/matchlog...
4,2024-03-30,Sat,MLS,Regular Season,Away,L 0–1,Seattle Sounders FC,LA Galaxy,Y,RB,...,30,44,68.2,4,26,1,1,0,Match Report,https://fbref.com/en/players/fdffba5d/matchlog...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,2024-09-21,Sat,MLS,Regular Season,Home,D 1–1,NYCFC,Inter Miami,N,FW,...,5,7,71.4,1,3,0,0,0,Match Report,https://fbref.com/en/players/8b379fcd/matchlog...
82,2024-09-28,Sat,MLS,Regular Season,Away,W 5–1,NYCFC,NY Red Bulls,N,FW,...,2,6,33.3,0,7,0,0,0,Match Report,https://fbref.com/en/players/8b379fcd/matchlog...
83,2024-10-02,Wed,MLS,Regular Season,Home,W 3–2,NYCFC,FC Cincinnati,N,FW,...,1,3,33.3,0,6,0,0,0,Match Report,https://fbref.com/en/players/8b379fcd/matchlog...
84,2024-10-06,Sun,MLS,Regular Season,Home,W 3–1,NYCFC,Nashville SC,N,FW,...,3,4,75.0,0,5,0,1,0,Match Report,https://fbref.com/en/players/8b379fcd/matchlog...


In [35]:
player_data_df_all = pd.concat([player_data_df, failed_df],ignore_index=True)

In [36]:
player_data_df.to_csv(output/'all_player_data_v2.csv')

# 2023 DATA


In [12]:
# this url gives me a list of all players in the current league
base_url_2023 = 'https://fbref.com/en/comps/22/2023/2023-Major-League-Soccer-Stats'

# this page gives me a bunch of tables for team stats in the current moment
# not sure how much of the data here will be useful, but I'll grab it just-in-case
html = get_html_data(base_url_2023)

In [14]:
team_stat_dfs_2023 = get_table_data_from_html(html)

In [16]:
# table names in the html
tables = ['eastern_conference',
          'western_conference',
          'squad_standard_stats',
          'squad_goalkeeping',
          'squad_advanced_goalkeeping',
          'squad_shooting',
          'squad_passing',
          'squad_pass_types',
          'squad_goal_and_shot_creation',
          'squad_defensive_actions',
          'squad_possession',
          'squad_playing_time',
          'squad_miscellaneous_stats']

# table pairings based on index in the list
pairs = list(zip([i for i in range(0,26,2)], [i for i in range(1,27,2)]))

# create csv files for each table
for index, pair in enumerate(pairs):
    pair_df = pd.concat([team_stat_dfs_2023[pair[0]], team_stat_dfs_2023[pair[1]]],axis=1)
    pair_df.to_csv(mls_2023/f'{tables[index]}.csv')

In [20]:
all_players_2023 = pd.read_excel(config['paths']['all_players_2023'])
wb = openpyxl.load_workbook(config['paths']['all_players_2023'])
sheets = wb.sheetnames
ws = wb[sheets[0]]

all_players_2023['stat_link'] = [ws.cell(row=i+2, column=37).hyperlink.target for i in range(all_players_2023.shape[0])]
all_players_2023[['Player', 'stat_link']].to_csv(mls_2023/'player_links.csv')

In [21]:
len(all_players_2023['stat_link'])

855

In [24]:
total_players_2023 = len(all_players_2023['stat_link'].value_counts().keys())

In [29]:
player_data_2023_df = pd.DataFrame()
i = 0
failed_links = []
for player_url in list(all_players_2023['stat_link'].value_counts().keys()):
    time.sleep(random.randint(7,10))
    print(f"\r{float(i/826)}")
    try:
        player_data = get_html_data(player_url)
        # get all tables in the html
        tables = player_data.findAll('table')

        # create dfs for each table and append each one to a list
        table_dfs = []
        for table in tables:
            table_dfs.append(pd.read_html(StringIO(str(table)))[0])
        table_dfs[0]['player'] = player_url
        player_data_2023_df = pd.concat([player_data_2023_df, table_dfs[0]],ignore_index=True)
        i+=1
    except:
        print(f"Could not get player data for {player_url}")
        failed_links.append(player_url)
        i+=1

0.0012106537530266344
1.0012106537530265
2.0012106537530268
3.0012106537530268
4.001210653753026
5.001210653753026
6.001210653753026
7.001210653753026
8.001210653753027
9.001210653753027
10.001210653753027
11.001210653753027
12.001210653753027
13.001210653753027
14.001210653753027
15.001210653753027
16.001210653753027
17.001210653753027
18.001210653753027
19.001210653753027
20.001210653753027
21.001210653753027
22.001210653753027
23.001210653753027
24.001210653753027
25.001210653753027
26.001210653753027
27.001210653753027
28.001210653753027
29.001210653753027
30.001210653753027
31.001210653753027
32.00121065375303
33.00121065375303
34.00121065375303
35.00121065375303
36.00121065375303
37.00121065375303
38.00121065375303
39.00121065375303
40.00121065375303
41.00121065375303
42.00121065375303
43.00121065375303
44.00121065375303
45.00121065375303
46.00121065375303
47.00121065375303
48.00121065375303
49.00121065375303
50.00121065375303
51.00121065375303
52.00121065375303
53.00121065375303

In [30]:
failed_links

[]

In [31]:
player_data_2023_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,...,Passes,Passes,Passes,Passes,Carries,Carries,Take-Ons,Take-Ons,Unnamed: 36_level_0,player
Unnamed: 0_level_1,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,...,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att,Succ,Match Report,Unnamed: 21_level_1
0,2023-02-25,Sat,MLS,Regular Season,Away,L 2–3,Toronto FC,D.C. United,N,"LW,FW",...,16,21,76.2,0,13,1,0,0,Match Report,https://fbref.com/en/players/214d2406/matchlog...
1,2023-03-04,Sat,MLS,Regular Season,Away,D 1–1,Toronto FC,Atlanta Utd,N,FW,...,7,8,87.5,0,6,0,0,0,Match Report,https://fbref.com/en/players/214d2406/matchlog...
2,,,,,,,,,,,...,,,,,,,,,,https://fbref.com/en/players/214d2406/matchlog...
3,2023-03-18,Sat,MLS,Regular Season,Home,W 2–0,Toronto FC,Inter Miami,N,FW,...,4,5,80.0,0,12,1,2,0,Match Report,https://fbref.com/en/players/214d2406/matchlog...
4,,,,,,,,,,,...,,,,,,,,,,https://fbref.com/en/players/214d2406/matchlog...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26162,2023-10-04,Wed,MLS,Regular Season,Away,L 0–3,St. Louis,Vancouver W'caps,Y,CB,...,34,44,77.3,4,28,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...
26163,2023-10-21,Sat,MLS,Regular Season,Home,L 0–2,St. Louis,Seattle Sounders FC,N,"On matchday squad, but did not play",...,"On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play",Match Report,https://fbref.com/en/players/672f4300/matchlog...
26164,2023-10-29,Sun,MLS,Round One,Home,L 1–4,St. Louis,Sporting KC,N,"On matchday squad, but did not play",...,"On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play","On matchday squad, but did not play",Match Report,https://fbref.com/en/players/672f4300/matchlog...
26165,2023-11-05,Sun,MLS,Round One,Away,L 1–2,St. Louis,Sporting KC,Y,CB,...,32,41,78.0,3,18,0,0,0,Match Report,https://fbref.com/en/players/672f4300/matchlog...


In [32]:
player_data_2023_df.to_csv(mls_2023/'all_player_data_2023.csv')