## Scraping MLS Player Salaries

This notebook scrapes MLS player salary tables from www.americansocceranalysis.com.

#### Import Necessary Modules

In [1]:
import os
import time
from datetime import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

### Extract Salary Links from Menu

#### Send Request, Check Response, Parse HTML

In [2]:
# Landing page of the site; the salary links are extracted from its menu below
base_url = 'http://www.americansocceranalysis.com/'
response = requests.get(base_url)

In [3]:
# Show the HTTP status and the final URL of the landing-page request
response.status_code, response.url

(200, 'http://www.americansocceranalysis.com/')

In [4]:
# Parse the landing-page HTML using the lxml parser
soup = BeautifulSoup(response.text,'lxml')

#### Find Salaries Menu

In [5]:
# Collect every anchor carrying aria-haspopup="true"; the salaries menu entry
# is looked up among these in the next cell
popups = soup.findAll('a', {'aria-haspopup': 'true'})

In [6]:
# Locate the menu entry labelled 'MLS Player Salaries'; its parent element is
# searched for the individual salary-release links in the next cell.
salary_folder = None
for p in popups:
    label = p.findChild()
    # Anchors without a child element cannot be the labelled menu entry;
    # skip them instead of raising AttributeError on `.text`
    if label is not None and label.text == 'MLS Player Salaries':
        salary_folder = p.findParent()

# Fail here with a clear message rather than with a NameError further down
if salary_folder is None:
    raise ValueError("'MLS Player Salaries' menu not found on " + base_url)

In [8]:
# Build the absolute URL and the release date for every link in the salaries menu
years = salary_folder.findAll('a', href=True)
salary_urls = []
salary_dates = []
for y in years:
    # urljoin resolves the href against base_url. os.path.join is meant for
    # filesystem paths and would insert backslashes on Windows.
    salary_urls.append(urljoin(base_url, y['href']))
    # The link's title attribute is parsed into a datetime
    salary_dates.append(parser.parse(y['title']))

#### Define Salary Scraper

In [9]:
def send_request(url):
    """Send a GET request and return the response only if it succeeded.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    requests.Response or None
        The response when its status code is in the 2xx range, otherwise None.
    """
    # Send Request
    response = requests.get(url)
    # Log the final URL and status so throttling (e.g. 429) is visible
    print(response.url, '\n', response.status_code, response.reason)
    # Check Status, return None if not successful
    if not 200 <= response.status_code < 300:
        return None
    # Previously the function fell off the end here and returned None even
    # for successful requests, so callers never received any page content.
    return response

In [10]:
def scrape_salary_data(salary_url, salary_dt):
    """Scrape one salary release page into a DataFrame.

    Parameters
    ----------
    salary_url : str
        URL of the salary release page.
    salary_dt : datetime.datetime
        Release date; its ``year`` fills the ``Year`` column.

    Returns
    -------
    pandas.DataFrame or None
        One row per data row of the salary table, with the table's header
        cells as columns plus ``Year``. The ``#`` rank column is dropped when
        present. ``None`` if the request failed or no salary table was found.
    """
    response = send_request(salary_url)
    if response is None:
        return None

    # Parse the page that was just fetched. The previous version searched the
    # notebook-level `soup` (the landing page) instead.
    page = BeautifulSoup(response.text, 'lxml')

    # The salary table is identified by its header cells. As before, the last
    # table sharing at least one header with `of_interest` wins.
    of_interest = {'Last Name', 'First Name'}
    salary_table = None
    for table in page.findAll('table'):
        table_header = {h.text for h in table.findAll('th')}
        if table_header & of_interest:
            salary_table = table
    if salary_table is None:
        # No matching table: report failure instead of raising NameError
        return None

    # Extract salary data
    salary_header = []
    salary_data = []
    for row in salary_table.findAll('tr'):
        first_cell = row.findChild()
        if first_cell is None:
            # Empty <tr>: nothing to extract
            continue
        if first_cell.name == 'th':
            # Take the header from the first header row only; repeated header
            # rows would otherwise duplicate the column names.
            if not salary_header:
                salary_header = [h.text for h in row.findAll('th')]
        else:
            salary_data.append([cell.text for cell in row.findAll('td')])

    # Compile dataframe (fall back to default column labels without a header)
    salary_df = pd.DataFrame(salary_data, columns=salary_header or None)

    # Add year column (the scalar is broadcast to every row)
    salary_df['Year'] = salary_dt.year

    # Drop rank column if it exists. Check this frame, not the global `df`.
    if '#' in salary_df.columns:
        salary_df = salary_df.drop('#', axis=1)

    return salary_df

#### Scrape Salary Data

In [30]:
# Scrape every salary release and combine the results into one frame
salary_frames = []
for url, dt in zip(salary_urls, salary_dates):
    salary_df = scrape_salary_data(url, dt)
    # Skip releases that could not be scraped. The previous check tested `df`,
    # which is never None, so failures were not filtered.
    if salary_df is not None:
        salary_frames.append(salary_df)
    # Pause to prevent 429 status (also after a failed request)
    # Note: Need to explore backoff package
    print(datetime.now())
    time.sleep(5)
# Concatenate once instead of growing the frame inside the loop;
# ignore_index replaces the per-page indices with a fresh 0..n-1 index.
if salary_frames:
    df = pd.concat(salary_frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame()

http://www.americansocceranalysis.com/september-15-2017/ 
 200 OK
2018-01-24 19:37:24.126687
http://www.americansocceranalysis.com/april-15-2017/ 
 200 OK
2018-01-24 19:37:26.356009
http://www.americansocceranalysis.com/september-15-2016/ 
 200 OK
2018-01-24 19:37:28.588585
http://www.americansocceranalysis.com/may-15-2016/ 
 200 OK
2018-01-24 19:37:30.821782
http://www.americansocceranalysis.com/september-15-2015/ 
 200 OK
2018-01-24 19:37:33.110668
http://www.americansocceranalysis.com/mls-player-salaries/ 
 200 OK
2018-01-24 19:37:35.330695
http://www.americansocceranalysis.com/september-15-2014/ 
 200 OK
2018-01-24 19:37:37.559043
http://www.americansocceranalysis.com/april-1-2014/ 
 200 OK
2018-01-24 19:37:39.813425
http://www.americansocceranalysis.com/september-15-2013/ 
 200 OK
2018-01-24 19:37:42.033493
http://www.americansocceranalysis.com/new-page-2/ 
 200 OK
2018-01-24 19:37:44.286197
http://www.americansocceranalysis.com/may-1-2013/ 
 429 Too Many Requests
2018-01-24 19:37

In [28]:
# Display the current timestamp
datetime.now()

datetime.datetime(2018, 1, 24, 19, 36, 11, 506058)

#### Inspect Data

In [58]:
# Summarise columns, non-null counts and dtypes of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12426 entries, 0 to 12425
Data columns (total 7 columns):
Club                       12426 non-null object
Last Name                  12426 non-null object
First Name                 12426 non-null object
Pos                        12426 non-null object
Base Salary                12426 non-null object
Guaranteed Compensation    12426 non-null object
Year                       12426 non-null int64
dtypes: int64(1), object(6)
memory usage: 679.6+ KB


In [13]:
# Preview the first and last 15 rows of the combined frame together
pd.concat([df.head(15), df.tail(15)], axis=0)

NameError: name 'df' is not defined

In [64]:
# Confirm that the combined frame has a 'Year' column
'Year' in df.columns

True

In [20]:
# Day of the year for `dt`, the release date left over from the scrape loop
dt.timetuple().tm_yday

258

In [21]:
# Show the full struct_time for `dt`. The trailing `.tm_` was an unfinished
# attribute name and raises AttributeError when the cell is re-run.
dt.timetuple()

time.struct_time(tm_year=2017, tm_mon=9, tm_mday=15, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=4, tm_yday=258, tm_isdst=-1)

In [31]:
# Release dates as datetimes. `salary_dates` is already parsed when the menu
# links are read, and dateutil's parser rejects datetime input, so only
# entries that are not yet datetimes are parsed here.
release_dates = [
    d if isinstance(d, datetime) else parser.parse(d)
    for d in salary_dates
]

In [39]:
# Reference date used to filter the release dates in the next cell
test_dt = datetime(2017, 1, 1, 0, 0)

In [48]:
# Latest release date that falls strictly after test_dt
rd = np.array(release_dates)
is_after = rd > test_dt
max(rd[is_after])
    

datetime.datetime(2017, 9, 15, 0, 0)

In [32]:
# Display the list of release dates
release_dates

[datetime.datetime(2017, 9, 15, 0, 0),
 datetime.datetime(2017, 4, 15, 0, 0),
 datetime.datetime(2016, 9, 15, 0, 0),
 datetime.datetime(2016, 5, 15, 0, 0),
 datetime.datetime(2015, 9, 15, 0, 0),
 datetime.datetime(2015, 7, 15, 0, 0),
 datetime.datetime(2014, 9, 15, 0, 0),
 datetime.datetime(2014, 4, 1, 0, 0),
 datetime.datetime(2013, 9, 15, 0, 0),
 datetime.datetime(2013, 8, 1, 0, 0),
 datetime.datetime(2013, 5, 1, 0, 0),
 datetime.datetime(2012, 10, 1, 0, 0),
 datetime.datetime(2012, 8, 1, 0, 0),
 datetime.datetime(2012, 5, 15, 0, 0),
 datetime.datetime(2011, 9, 1, 0, 0),
 datetime.datetime(2010, 8, 12, 0, 0),
 datetime.datetime(2009, 9, 15, 0, 0),
 datetime.datetime(2008, 10, 7, 0, 0),
 datetime.datetime(2007, 8, 31, 0, 0)]

In [41]:
# Constant year column built from a repeated Python list (one entry per row);
# note this mutates the `salary_df` left over from the scrape loop
salary_df['Year2'] = [release_dates[0].year] * len(salary_df)

In [39]:
# Constant year column built with np.repeat (one entry per row);
# note this mutates the `salary_df` left over from the scrape loop
salary_df['Year'] = np.repeat(release_dates[0].year, len(salary_df))

In [42]:
# Display the frame to compare the 'Year' and 'Year2' columns
salary_df

Unnamed: 0,Club,Last Name,First Name,Pos,Base Salary,Guaranteed Compensation,Year,Year2
0,ORL,Kaka,,M,"$6,660,000.00","$7,167,500.00",2017,2017
1,TOR,Giovinco,Sebastian,F,"$5,600,000.00","$7,115,555.67",2017,2017
2,TOR,Bradley,Michael,M,"$6,000,000.00","$6,500,000.00",2017,2017
3,NYCFC,Pirlo,Andrea,M,"$5,600,000.00","$5,915,690.00",2017,2017
4,NYCFC,Villa,David,F,"$5,610,000.00","$5,610,000.00",2017,2017
5,LA,Dos Santos,Giovani,F,"$3,750,000.00","$5,500,000.00",2017,2017
6,CHI,Schweinsteiger,Bastian,M,"$5,400,000.00","$5,400,000.00",2017,2017
7,TOR,Altidore,Jozy,F,"$4,875,000.00","$4,875,000.00",2017,2017
8,SEA,Dempsey,Clint,F,"$3,200,000.00","$3,892,933.50",2017,2017
9,POR,Valeri,Diego,M,"$2,227,500.00","$2,607,500.00",2017,2017
