## Scraping MLS Player Salaries

Scraping www.americansocceranalysis.com for player salaries

### Import Necessary Modules

In [1]:
import os
import time
from datetime import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

### Extract Salary Data

#### Define Salary Scraper and Helpers

In [23]:
def send_request(url, attempts=3, pause=15):
    """Send a GET request to ``url``, retrying when the status is not 2xx.

    Parameters
    ----------
    url : str
        Address to request.
    attempts : int, optional
        Maximum number of requests to send (default 3).
    pause : float, optional
        Seconds to wait after a failed attempt before the next one
        (default 15).

    Returns
    -------
    requests.Response or None
        The first response whose status code is in the 2xx range, or
        ``None`` when every attempt came back with another status.
    """
    for attempt in range(1, attempts + 1):
        # Send Request
        response = requests.get(url)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if 200 <= response.status_code < 300:
            return response
        # If bad status, pause before trying again.
        # After the final attempt there is nothing left to wait for.
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(pause)
    return None

In [15]:
def find_salary_table(soup):
    """Return the table on the page whose header names a player.

    A table qualifies when at least one of its ``<th>`` texts (ignoring
    surrounding whitespace) is 'Last Name', 'First Name', 'Last' or 'First'.
    When several tables qualify, the last one on the page is returned.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``findAll`` (e.g. a ``BeautifulSoup`` instance).

    Returns
    -------
    object
        The matching table element.

    Raises
    ------
    ValueError
        If no table on the page has one of the expected header cells.
    """
    of_interest = {'Last Name', 'First Name', 'Last', 'First'}

    salary_table = None
    for table in soup.findAll('table'):
        # Header texts of this table, stripped so padded cells still match
        header_names = {h.text.strip() for h in table.findAll('th')}
        if header_names & of_interest:
            # Keep scanning: a later matching table takes precedence
            salary_table = table

    if salary_table is None:
        raise ValueError(
            'No table with a player name column '
            '(Last Name / First Name / Last / First) found on page'
        )
    return salary_table

In [31]:
def extract_salary_data(salary_table):
    """Convert a salary table element into a DataFrame of raw cell text.

    The column names come from the first header row (a ``<tr>`` whose first
    child is a ``<th>``); any later header rows are ignored. Every other row
    contributes the text of its ``<td>`` cells as one record. Rows with no
    child elements or no ``<td>`` cells are skipped.

    Column names are stripped of surrounding whitespace and normalised:
    'Last' -> 'Last Name', 'First' -> 'First Name',
    'Total Compensation' -> 'Guaranteed Compensation'. A rank column named
    '#' is dropped when present.

    Parameters
    ----------
    salary_table : object
        Table element exposing ``findAll`` (e.g. a bs4 ``Tag``).

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table; all values are strings.
    """
    salary_header = []
    salary_data = []
    for row in salary_table.findAll('tr'):
        first_cell = row.findChild()
        if first_cell is None:
            # Empty <tr>: nothing to extract
            continue

        if first_cell.name == 'th':
            # Header row. Read only this row's cells, and only once, so a
            # repeated header row does not duplicate the column names.
            if not salary_header:
                salary_header = [h.text.strip() for h in row.findAll('th')]
            continue

        row_data = [cell.text for cell in row.findAll('td')]
        if row_data:
            salary_data.append(row_data)

    # Compile salary dataframe
    salary_df = pd.DataFrame(salary_data, columns=salary_header)

    # Harmonise column names across the differently formatted pages;
    # keys that are not present are simply ignored by rename.
    salary_df = salary_df.rename(columns={
        'Last': 'Last Name',
        'First': 'First Name',
        'Total Compensation': 'Guaranteed Compensation',
    })

    # Drop rank column if it exists
    salary_df = salary_df.drop(columns='#', errors='ignore')

    return salary_df

In [17]:
def scrape_salary_data(salary_url, salary_date):
    """Download one salary page and return its salary table as a DataFrame.

    Parameters
    ----------
    salary_url : str
        Address of the salary page.
    salary_date : object
        Value stored in the 'Date' column of every returned row.

    Returns
    -------
    pandas.DataFrame or None
        The extracted salary rows with an added 'Date' column, or ``None``
        when ``send_request`` returned no response.
    """
    page = send_request(salary_url)
    if page is not None:
        # Parse HTML, then pull the salary table out of it
        parsed_html = BeautifulSoup(page.text, 'lxml')
        salaries = extract_salary_data(find_salary_table(parsed_html))

        # Stamp every row with the date this page refers to
        salaries['Date'] = np.repeat(salary_date, len(salaries))
        return salaries

    return None

#### Send Request, Check Response, Parse HTML

In [6]:
# Fetch the site's landing page; the links to the individual salary pages
# are read from its navigation menu in the cells below.
base_url = 'http://www.americansocceranalysis.com/'
response = requests.get(base_url)

In [7]:
# Show the HTTP status code and the final URL of the landing page request
response.status_code, response.url

(200, 'http://www.americansocceranalysis.com/')

In [8]:
# Parse the landing page HTML using the lxml parser
soup = BeautifulSoup(response.text,'lxml')

#### Find Salaries Menu

In [9]:
# Collect every anchor marked aria-haspopup="true"; the salaries menu is
# picked out of these candidates in the next cell.
popups = soup.findAll('a', {'aria-haspopup': 'true'})

In [10]:
# Find the menu entry labelled 'MLS Player Salaries' and keep its parent
# element, which is searched for the salary page links afterwards.
# Start from None so a stale value from an earlier run is never reused.
salary_folder = None
for p in popups:
    label = p.findChild()
    # Anchors without a child element cannot carry the label; skip them
    if label is not None and label.text == 'MLS Player Salaries':
        salary_folder = p.findParent()

if salary_folder is None:
    raise ValueError("Could not find the 'MLS Player Salaries' menu on the landing page")

In [11]:
# Build the absolute URL and the date for every link in the salaries menu.
# The date is parsed from each link's title attribute.
years = salary_folder.findAll('a', href=True)
salary_urls = []
salary_dates = []
for y in years:
    # urljoin resolves the href against the site root with URL semantics;
    # os.path.join would insert backslashes on Windows.
    salary_urls.append(urljoin(base_url, y['href']))
    salary_dates.append(parser.parse(y['title']).date())

#### Scrape Salary Data

In [33]:
# Scrape every salary page, collecting one DataFrame per page
salary_frames = []
last_idx = len(salary_urls) - 1
for idx, (url, dt) in enumerate(zip(salary_urls, salary_dates)):
    salary_df = scrape_salary_data(url, dt)
    if salary_df is None:
        continue
    salary_frames.append(salary_df)
    print(datetime.now())
    # Pause to prevent 429 status (not needed after the final page)
    # Note: Need to explore "backoff" package
    if idx < last_idx:
        time.sleep(np.random.uniform(10, 20))

# Concatenate once at the end instead of growing a frame inside the loop;
# ignore_index=True gives a fresh 0..n-1 index without the duplicate labels
# that stacking the per-page frames would otherwise produce.
df = pd.concat(salary_frames, ignore_index=True) if salary_frames else pd.DataFrame()

http://www.americansocceranalysis.com/september-15-2017/ 
 200 OK
2018-01-25 09:09:16.636923
http://www.americansocceranalysis.com/april-15-2017/ 
 200 OK
2018-01-25 09:09:35.188963
http://www.americansocceranalysis.com/september-15-2016/ 
 200 OK
2018-01-25 09:09:53.555323
http://www.americansocceranalysis.com/may-15-2016/ 
 200 OK
2018-01-25 09:10:13.348774
http://www.americansocceranalysis.com/september-15-2015/ 
 200 OK
2018-01-25 09:10:27.890607
http://www.americansocceranalysis.com/mls-player-salaries/ 
 200 OK
2018-01-25 09:10:41.112321
http://www.americansocceranalysis.com/september-15-2014/ 
 200 OK
2018-01-25 09:10:59.964325
http://www.americansocceranalysis.com/april-1-2014/ 
 200 OK
2018-01-25 09:11:18.289470
http://www.americansocceranalysis.com/september-15-2013/ 
 200 OK
2018-01-25 09:11:33.428716
http://www.americansocceranalysis.com/new-page-2/ 
 200 OK
2018-01-25 09:11:50.949511
http://www.americansocceranalysis.com/may-1-2013/ 
 200 OK
2018-01-25 09:12:09.688548
http

#### Inspect Data

In [34]:
# Summarise the combined frame: row count, columns, non-null counts, dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10130 entries, 0 to 10129
Data columns (total 7 columns):
Club                       10130 non-null object
Last Name                  10130 non-null object
First Name                 10130 non-null object
Pos                        10130 non-null object
Base Salary                10130 non-null object
Guaranteed Compensation    10130 non-null object
Date                       10130 non-null object
dtypes: object(7)
memory usage: 554.1+ KB


In [35]:
# Preview the first rows of the combined salary data
df.head(20)

Unnamed: 0,Club,Last Name,First Name,Pos,Base Salary,Guaranteed Compensation,Date
0,ORL,Kaka,,M,"$6,660,000.00","$7,167,500.00",2017-09-15
1,TOR,Giovinco,Sebastian,F,"$5,600,000.00","$7,115,555.67",2017-09-15
2,TOR,Bradley,Michael,M,"$6,000,000.00","$6,500,000.00",2017-09-15
3,NYCFC,Pirlo,Andrea,M,"$5,600,000.00","$5,915,690.00",2017-09-15
4,NYCFC,Villa,David,F,"$5,610,000.00","$5,610,000.00",2017-09-15
5,LA,Dos Santos,Giovani,F,"$3,750,000.00","$5,500,000.00",2017-09-15
6,CHI,Schweinsteiger,Bastian,M,"$5,400,000.00","$5,400,000.00",2017-09-15
7,TOR,Altidore,Jozy,F,"$4,875,000.00","$4,875,000.00",2017-09-15
8,SEA,Dempsey,Clint,F,"$3,200,000.00","$3,892,933.50",2017-09-15
9,POR,Valeri,Diego,M,"$2,227,500.00","$2,607,500.00",2017-09-15


### Sandbox