## Scraping MLS Player Salaries

Scraping www.americansocceranalysis.com for player salaries

### Import Necessary Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time

### Extract Salary Data

#### Define Salary Scraper and Helpers

In [23]:
def send_request(url, attempts=3, pause=15, timeout=30):
    """Send a GET request, retrying on failure.

    Parameters
    ----------
    url : str
        Address to request.
    attempts : int
        Maximum number of requests to send before giving up.
    pause : float
        Seconds to wait between consecutive attempts.
    timeout : float
        Seconds to wait for the server before treating the attempt as failed.

    Returns
    -------
    requests.Response or None
        The first response with a 2xx status code, or None if every
        attempt failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            # A timeout keeps a stalled connection from hanging the scrape
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # Network-level failure (DNS, refused connection, timeout, ...):
            # treat it like a bad status and retry
            print(url, '\n', 'Request failed:', err)
        else:
            # Check Status
            print(response.url, '\n', response.status_code, response.reason)
            if 200 <= response.status_code < 300:
                return response
        # Only pause when another attempt will actually follow
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(pause)
    return None

In [15]:
def find_salary_table(soup):
    """Locate the player salary table in a parsed page.

    A table is considered the salary table when at least one of its header
    cells (ignoring surrounding whitespace) is a player-name column:
    'Last Name', 'First Name', 'Last' or 'First'. If several tables match,
    the last one on the page is returned.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a salary page.

    Returns
    -------
    The matching table element.

    Raises
    ------
    ValueError
        If no table on the page has a player-name header.
    """
    of_interest = {'Last Name', 'First Name', 'Last', 'First'}

    salary_table = None
    for table in soup.findAll('table'):
        # Strip header text so padded headers still match; the column names
        # are stripped the same way when the table is extracted
        header_names = {h.text.strip() for h in table.findAll('th')}
        if header_names & of_interest:
            salary_table = table

    if salary_table is None:
        raise ValueError(
            'No salary table found: no table has a header in '
            + str(sorted(of_interest))
        )
    return salary_table

In [16]:
def extract_salary_data(salary_table):
    """Convert an HTML salary table into a DataFrame.

    Rows whose first child is a <th> are header rows; every other non-empty
    row is a data row whose <td> texts become one record. Column names are
    stripped of whitespace, 'Last'/'First' are renamed to
    'Last Name'/'First Name', and a '#' rank column is dropped if present.

    Parameters
    ----------
    salary_table
        Table element providing findAll('tr') and rows providing
        findChild() / findAll().

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table; all values are the raw cell text.
    """
    salary_header = []
    salary_data = []
    for row in salary_table.findAll('tr'):
        first_child = row.findChild()
        if first_child is None:
            # Empty <tr>: there is nothing to classify or extract
            continue

        if first_child.name == 'th':
            # Take the column names from the first header row only, so a
            # repeated header row (e.g. a footer) does not duplicate them
            if not salary_header:
                salary_header = [h.text for h in row.findAll('th')]
        else:
            salary_data.append([cell.text for cell in row.findAll('td')])

    # Compile salary dataframe
    salary_df = pd.DataFrame(salary_data, columns=salary_header)

    # Strip any whitespace from column names
    salary_df.columns = salary_df.columns.str.strip()

    # Normalize the name columns; rename ignores labels that are absent
    salary_df = salary_df.rename(
        columns={'Last': 'Last Name', 'First': 'First Name'}
    )

    # Drop rank column if it exists
    if '#' in salary_df.columns:
        salary_df = salary_df.drop('#', axis=1)

    return salary_df

In [17]:
def scrape_salary_data(salary_url, salary_date):
    """Download one salary page and return its table as a DataFrame.

    Parameters
    ----------
    salary_url : str
        Address of the salary page to scrape.
    salary_date
        Value written to every row of the 'Date' column.

    Returns
    -------
    pandas.DataFrame or None
        The extracted salary table with an added 'Date' column, or None
        when the page could not be retrieved.
    """
    page = send_request(salary_url)
    if page is None:
        return None

    # Parse the HTML, locate the salary table and convert it in one pass
    parsed_page = BeautifulSoup(page.text, 'lxml')
    frame = extract_salary_data(find_salary_table(parsed_page))

    # Tag every row with the date this salary snapshot belongs to
    n_rows = len(frame)
    frame['Date'] = np.repeat(salary_date, n_rows)

    return frame

#### Send Request, Check Response, Parse HTML

In [6]:
# Request the site's home page; its HTML is parsed below to find the salary menu
base_url = 'http://www.americansocceranalysis.com/'
response = requests.get(base_url)

In [7]:
# Display the HTTP status code and the final URL of the response
response.status_code, response.url

(200, 'http://www.americansocceranalysis.com/')

In [8]:
# Parse the response body into a navigable tree using the lxml parser
soup = BeautifulSoup(response.text,'lxml')

#### Find Salaries Menu

In [9]:
# Collect every anchor tag whose aria-haspopup attribute is 'true'
# (presumably the site's drop-down menu triggers -- verify against the page)
popups = soup.findAll('a', {'aria-haspopup': 'true'})

In [10]:
# Find the menu entry labelled 'MLS Player Salaries' and keep its parent
# element, which contains the links to the individual salary pages
salary_folder = None
for p in popups:
    label = p.findChild()
    # Anchors without a child element cannot carry the menu label
    if label is not None and label.text == 'MLS Player Salaries':
        salary_folder = p.findParent()

# Fail here with a clear message rather than later with a NameError
if salary_folder is None:
    raise RuntimeError("Could not find the 'MLS Player Salaries' menu on the page")

In [11]:
# Build the absolute URL and the snapshot date for every salary page link
from urllib.parse import urljoin

years = salary_folder.findAll('a', href=True)
salary_urls = []
salary_dates = []
for y in years:
    # urljoin resolves the href against the site root; unlike os.path.join
    # it is meant for URLs, so it is OS-independent and also copes with
    # hrefs that are relative or already absolute
    salary_urls.append(urljoin(base_url, y['href']))
    # The link title holds the snapshot date as text
    salary_dates.append(parser.parse(y['title']).date())

#### Scrape Salary Data

In [24]:
# Scrape every salary page, collecting one DataFrame per page
salary_frames = []
for url, dt in zip(salary_urls, salary_dates):
    salary_df = scrape_salary_data(url, dt)
    if salary_df is None:
        # The page could not be retrieved; skip it
        continue
    salary_frames.append(salary_df)
    # Pause to prevent 429 status
    # Note: Need to explore "backoff" package
    print(datetime.now())
    time.sleep(np.random.uniform(10, 20))

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration; ignore_index replaces the duplicate per-page indices
# created during concatenation with a fresh 0..n-1 index
if salary_frames:
    df = pd.concat(salary_frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame()

http://www.americansocceranalysis.com/september-15-2017/ 
 200 OK
2018-01-25 08:51:42.345306
http://www.americansocceranalysis.com/april-15-2017/ 
 200 OK
2018-01-25 08:51:54.481084
http://www.americansocceranalysis.com/september-15-2016/ 
 200 OK
2018-01-25 08:52:10.791262
http://www.americansocceranalysis.com/may-15-2016/ 
 200 OK
2018-01-25 08:52:25.584795
http://www.americansocceranalysis.com/september-15-2015/ 
 200 OK
2018-01-25 08:52:42.354984
http://www.americansocceranalysis.com/mls-player-salaries/ 
 200 OK
2018-01-25 08:52:59.580521
http://www.americansocceranalysis.com/september-15-2014/ 
 200 OK
2018-01-25 08:53:12.258191
http://www.americansocceranalysis.com/april-1-2014/ 
 200 OK
2018-01-25 08:53:24.735859
http://www.americansocceranalysis.com/september-15-2013/ 
 200 OK
2018-01-25 08:53:37.685317
http://www.americansocceranalysis.com/new-page-2/ 
 200 OK
2018-01-25 08:53:49.163956
http://www.americansocceranalysis.com/may-1-2013/ 
 200 OK
2018-01-25 08:54:03.433762
http

#### Inspect Data

In [25]:
# Summarize the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10130 entries, 0 to 10129
Data columns (total 8 columns):
Base Salary                10130 non-null object
Club                       10130 non-null object
Date                       10130 non-null object
First Name                 10130 non-null object
Guaranteed Compensation    8990 non-null object
Last Name                  10130 non-null object
Pos                        10130 non-null object
Total Compensation         1140 non-null object
dtypes: object(8)
memory usage: 633.2+ KB


In [27]:
# Preview the first 20 rows of the combined data
df.head(20)

Unnamed: 0,Base Salary,Club,Date,First Name,Guaranteed Compensation,Last Name,Pos,Total Compensation
0,"$6,660,000.00",ORL,2017-09-15,,"$7,167,500.00",Kaka,M,
1,"$5,600,000.00",TOR,2017-09-15,Sebastian,"$7,115,555.67",Giovinco,F,
2,"$6,000,000.00",TOR,2017-09-15,Michael,"$6,500,000.00",Bradley,M,
3,"$5,600,000.00",NYCFC,2017-09-15,Andrea,"$5,915,690.00",Pirlo,M,
4,"$5,610,000.00",NYCFC,2017-09-15,David,"$5,610,000.00",Villa,F,
5,"$3,750,000.00",LA,2017-09-15,Giovani,"$5,500,000.00",Dos Santos,F,
6,"$5,400,000.00",CHI,2017-09-15,Bastian,"$5,400,000.00",Schweinsteiger,M,
7,"$4,875,000.00",TOR,2017-09-15,Jozy,"$4,875,000.00",Altidore,F,
8,"$3,200,000.00",SEA,2017-09-15,Clint,"$3,892,933.50",Dempsey,F,
9,"$2,227,500.00",POR,2017-09-15,Diego,"$2,607,500.00",Valeri,M,


In [29]:
# Show only the rows that have a value in the 'Total Compensation' column
df[df['Total Compensation'].notnull()]

Unnamed: 0,Base Salary,Club,Date,First Name,Guaranteed Compensation,Last Name,Pos,Total Compensation
2411,"$50,000",NY,2015-09-15,Anatole,,Abang,F,"$50,000.00"
2412,"$50,000",PHI,2015-09-15,Eric,,Ayuk,M,"$50,000.00"
2413,"$50,000",COL,2015-09-15,Dominique,,Badji,F,"$50,000.00"
2414,"$50,000",NYCFC,2015-09-15,Connor,,Brandt,M,"$50,000.00"
2415,"$50,000",CHI,2015-09-15,Kingsley,,Bryce,M,"$50,000.00"
2416,"$50,000",KC,2015-09-15,Amadou,,Dia,D,"$50,000.00"
2417,"$50,000",CHI,2015-09-15,Patrick,,Doody,D,"$50,000.00"
2418,"$50,000",ORL,2015-09-15,Earl,,Edwards,GK,"$50,000.00"
2419,"$50,000",SEA,2015-09-15,Oniel,,Fisher,D,"$50,000.00"
2420,"$50,000",POR,2015-09-15,George,,Fochive,M,"$50,000.00"


### Sandbox