In [3]:
import pandas as pd
import matplotlib as plt
import numpy as py

In [4]:
# Salary-guide page on mlsplayers.org that the tables are read from.
url = "https://mlsplayers.org/resources/salary-guide"

In [5]:
# Fetch the page and parse the HTML tables found on it.
tables = pd.read_html(io=url)
# Check what kind of container read_html handed back.
type(tables)

list

In [6]:
# Use the first parsed table as the working frame. Take a copy so that later
# cleaning steps which modify df do not also alter tables[0]; re-running from
# this cell then starts again from the data exactly as it was scraped.
df = tables[0].copy()
df

Unnamed: 0,First Name,Last Name,Club,Position(s),Base Salary,Guaranteed Compensation
0,Brenden,Aaronson,Philadelphia Union,M-F,"$70,000.08","$98,309.48"
1,Saad,Abdul-Salaam,Seattle Sounders FC,D,"$70,250.04","$70,250.04"
2,Lalas,Abubakar,Colorado Rapids,D,"$135,000.00","$144,937.50"
3,David,Accam,Columbus Crew,M-F,"$1,010,004.00","$1,137,920.00"
4,Kellyn,Acosta,Colorado Rapids,D-M,"$549,999.96","$664,999.96"
...,...,...,...,...,...,...
709,Reto,Ziegler,FC Dallas,D,"$820,000.08","$870,000.08"
710,Walker,Zimmerman,LAFC,D,"$600,000.00","$600,000.00"
711,Kyle,Zobeck,FC Dallas,GK,"$70,875.00","$70,875.00"
712,Ethan,Zubak,LA Galaxy,F,"$75,000.00","$78,125.00"


In [7]:
# Convert 'Base Salary' and 'Guaranteed Compensation' from object to float
# dtype so they can be aggregated.
def _currency_to_float(column):
    """Strip '$', ',' and ')' from a Series of money strings and cast to float.

    Any cell replaced by the text 'NaN' in the second step becomes a missing
    value once cast to float.
    """
    # Raw string: '\$' inside a normal literal is an invalid escape sequence,
    # which current Python versions warn about. The pattern itself is unchanged.
    stripped = column.replace(r'[\$,)]', '', regex=True)
    # NOTE(review): pandas appears to treat the empty pattern as a literal
    # match on '' rather than as a regex - confirm on the installed version.
    return stripped.replace('', 'NaN', regex=True).astype(float)


# One helper applied to both money columns instead of two copy-pasted lines.
for money_column in ('Base Salary', 'Guaranteed Compensation'):
    df[money_column] = _currency_to_float(df[money_column])
df

Unnamed: 0,First Name,Last Name,Club,Position(s),Base Salary,Guaranteed Compensation
0,Brenden,Aaronson,Philadelphia Union,M-F,70000.08,98309.48
1,Saad,Abdul-Salaam,Seattle Sounders FC,D,70250.04,70250.04
2,Lalas,Abubakar,Colorado Rapids,D,135000.00,144937.50
3,David,Accam,Columbus Crew,M-F,1010004.00,1137920.00
4,Kellyn,Acosta,Colorado Rapids,D-M,549999.96,664999.96
...,...,...,...,...,...,...
709,Reto,Ziegler,FC Dallas,D,820000.08,870000.08
710,Walker,Zimmerman,LAFC,D,600000.00,600000.00
711,Kyle,Zobeck,FC Dallas,GK,70875.00,70875.00
712,Ethan,Zubak,LA Galaxy,F,75000.00,78125.00


In [8]:
# Count the distinct club names present in the table.
df["Club"].nunique()

27

In [9]:
# Number of listed players per club, largest roster first.
df["Club"].value_counts()

FC Cincinnati             33
Toronto FC                32
Atlanta United            32
Real Salt Lake            31
FC Dallas                 31
San Jose Earthquakes      31
LA Galaxy                 30
Montreal Impact           30
Vancouver Whitecaps       30
Orlando City SC           30
New York Red Bulls        29
Philadelphia Union        29
Columbus Crew             29
Sporting Kansas City      29
Houston Dynamo            29
Seattle Sounders FC       28
Chicago Fire              28
Minnesota United          28
Colorado Rapids           28
New York City FC          28
Portland Timbers          27
LAFC                      27
DC United                 27
New England Revolution    26
Major League Soccer        8
Inter Miami                2
Nashville SC               2
Name: Club, dtype: int64

In [10]:
# Identify the teams that are not in MLS in 2019 and drop them from the table.
non_mls_2019_clubs = ["Major League Soccer", "Nashville SC", "Inter Miami"]
# Index labels of every row whose club is one of the entries above.
outliers = df.index[df["Club"].isin(non_mls_2019_clubs)]
# Rebind df rather than dropping in place, so the frame an earlier cell
# displayed is not silently changed underneath the reader.
df = df.drop(index=outliers)
df

Unnamed: 0,First Name,Last Name,Club,Position(s),Base Salary,Guaranteed Compensation
0,Brenden,Aaronson,Philadelphia Union,M-F,70000.08,98309.48
1,Saad,Abdul-Salaam,Seattle Sounders FC,D,70250.04,70250.04
2,Lalas,Abubakar,Colorado Rapids,D,135000.00,144937.50
3,David,Accam,Columbus Crew,M-F,1010004.00,1137920.00
4,Kellyn,Acosta,Colorado Rapids,D-M,549999.96,664999.96
...,...,...,...,...,...,...
709,Reto,Ziegler,FC Dallas,D,820000.08,870000.08
710,Walker,Zimmerman,LAFC,D,600000.00,600000.00
711,Kyle,Zobeck,FC Dallas,GK,70875.00,70875.00
712,Ethan,Zubak,LA Galaxy,F,75000.00,78125.00


In [18]:
# Shorten the name and position headers; the other columns keep their names.
short_column_names = {
    "First Name": "First",
    "Last Name": "Last",
    "Position(s)": "Position",
}
df.rename(columns=short_column_names, inplace=True)
df

Unnamed: 0,First,Last,Club,Position,Base Salary,Guaranteed Compensation
0,Brenden,Aaronson,Philadelphia Union,M-F,70000.08,98309.48
1,Saad,Abdul-Salaam,Seattle Sounders FC,D,70250.04,70250.04
2,Lalas,Abubakar,Colorado Rapids,D,135000.00,144937.50
3,David,Accam,Columbus Crew,M-F,1010004.00,1137920.00
4,Kellyn,Acosta,Colorado Rapids,D-M,549999.96,664999.96
...,...,...,...,...,...,...
709,Reto,Ziegler,FC Dallas,D,820000.08,870000.08
710,Walker,Zimmerman,LAFC,D,600000.00,600000.00
711,Kyle,Zobeck,FC Dallas,GK,70875.00,70875.00
712,Ethan,Zubak,LA Galaxy,F,75000.00,78125.00


In [26]:
# Print the index range, per-column non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 702 entries, 0 to 713
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   First                    693 non-null    object 
 1   Last                     702 non-null    object 
 2   Club                     702 non-null    object 
 3   Position                 701 non-null    object 
 4   Base Salary              702 non-null    float64
 5   Guaranteed Compensation  702 non-null    float64
dtypes: float64(2), object(4)
memory usage: 38.4+ KB


In [29]:
# Show every row with at least one missing field, to judge whether the gaps
# will affect the intended results.
has_missing_field = df.isna().any(axis="columns")
df[has_missing_field]

Unnamed: 0,First,Last,Club,Position,Base Salary,Guaranteed Compensation
45,,Artur,Columbus Crew,D-M,360000.0,411633.33
51,,Auro,Toronto FC,D,225000.0,276666.67
100,,Bressan,FC Dallas,D,485300.04,521931.29
304,,Ilsinho,Philadelphia Union,M,330000.0,357000.0
324,,Judson,San Jose Earthquakes,D-M,300000.0,305000.0
326,,Juninho,LA Galaxy,M,150000.0,163125.0
328,,Kaku,New York Red Bulls,M-F,799999.92,799999.92
469,,Nani,Orlando City SC,M-F,2333333.04,2486249.7
559,,Robinho,Orlando City SC,M-F,198000.0,205500.0
589,Michael,Salazar,Houston Dynamo,,70250.04,70250.04


In [46]:
# Pull up the rows for a given last name to check whether it belongs to a
# single player.
is_salazar = df["Last"].eq("Salazar")
df[is_salazar]

Unnamed: 0,First,Last,Club,Position,Base Salary,Guaranteed Compensation
589,Michael,Salazar,Houston Dynamo,,70250.04,70250.04
