## Scraping MLS Player Stats

Scraping www.mlssoccer.com/ for player statistics

### Import Necessary Modules

In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools

### Scrape Statistics

#### Define Scraper and Helpers

In [2]:
def send_request(url, attempts=3, pause=15):
    """Send a GET request to *url*, retrying when the status is not 2xx.

    Args:
        url: Address to request.
        attempts: Maximum number of requests to send before giving up.
        pause: Seconds to wait between a failed request and the next try.

    Returns:
        The response of the first request answered with a 2xx status code,
        or ``None`` if every attempt came back with another status.
    """
    for attempt in range(1, attempts + 1):
        # Send Request
        response = requests.get(url)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if 200 <= response.status_code < 300:
            return response
        # If bad status, pause before trying again; after the final
        # attempt there is nothing left to wait for.
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(pause)
    return None

In [127]:
# Expand the per-parameter option lists into one query dict per combination
def generate_param_combos(param_groups):
    """Build a query-parameter dict for every combination of options.

    Args:
        param_groups: Sequence of iterables whose first five entries hold
            the options for page, year, season_type, franchise and group,
            in that order.

    Returns:
        A list of dicts, one per element of the Cartesian product of the
        groups, keyed by 'page', 'year', 'season_type', 'franchise' and
        'group'.
    """
    keys = ('page', 'year', 'season_type', 'franchise', 'group')
    return [
        {key: combo[position] for position, key in enumerate(keys)}
        for combo in itertools.product(*param_groups)
    ]

In [91]:
# Report whether a parsed stats page actually contains statistics
def check_for_data(soup):
    """Return whether the parsed page holds any statistics rows.

    Args:
        soup: Parsed page exposing ``findAll`` (a BeautifulSoup object).

    Returns:
        ``False`` when the page has no ``<tr class="odd">`` rows or when
        the first such row reads 'Stats Unavailable'; ``True`` otherwise.
    """
    odd_rows = soup.findAll('tr', {'class': 'odd'})
    # A page without any odd rows has nothing to extract; indexing [0]
    # on the empty result would raise IndexError.
    if not odd_rows:
        return False
    return odd_rows[0].text.strip() != 'Stats Unavailable'

In [132]:
def extract_statistics(stat_table):
    """Convert a parsed HTML statistics table into a DataFrame.

    Args:
        stat_table: Parsed ``<table>`` element exposing ``findAll``
            (a BeautifulSoup tag).

    Returns:
        A ``pandas.DataFrame`` with one row per data row of the table.
        Column names are the texts of the first header row, stripped of
        surrounding whitespace; cell values are the raw cell texts.
    """
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        # The first child tag tells a header row (th) from a data row
        first_cell = row.findChild()
        if first_cell is None:
            # Empty <tr>: nothing to extract
            continue

        if first_cell.name == 'th':
            # Read the header from this row only, and only once, so a
            # repeated header row cannot duplicate the column names.
            if not stat_header:
                stat_header = [h.text.strip() for h in row.findAll('th')]
        else:
            stat_data.append([cell.text for cell in row.findAll('td')])

    # Without a header row, let pandas number the columns rather than
    # fail on a zero-length column list.
    return pd.DataFrame(stat_data, columns=stat_header or None)

#### Send Request to Base URL and Verify Site is Up

In [5]:
# Landing page of the season statistics; fetched here without any query
# parameters so the response can be inspected before scraping.
base_url = 'http://www.mlssoccer.com/stats/season'
response = requests.get(base_url)

In [6]:
# Show the status code and the URL the request ended up at
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [94]:
# Candidate values for each query parameter of the stats page
page = np.arange(2)
year = np.arange(2007, 2018)
season_type = ['REG', 'PS']
group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']

# Read the list of teams from the franchise popup menu of the base page
soup = BeautifulSoup(response.text, 'lxml')
franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# Skip the first <option>: it is the generic "Select A Club" entry
franchise = [team['value'] for team in franchise_select.findAll('option')[1:]]

In [129]:
# Overwrite the option lists with a small subset: three pages of a single
# year, season type, franchise and stat group.
page = [0, 1, 2]
year = [2017]
season_type = ['REG']
franchise = [11091]
group = ['goals']
# Order matters: generate_param_combos reads the groups by position as
# page, year, season_type, franchise, group.
param_groups = [page, year, season_type, franchise, group]
combos = generate_param_combos(param_groups)

In [130]:
# Request the stats page for the first parameter combination; the dict is
# sent as the URL query string.
response = requests.get(base_url, params=combos[0])
response.status_code

200

In [131]:
# Parse the returned HTML with the lxml parser
soup = BeautifulSoup(response.text, 'lxml')

In [133]:
# Take the first <table> element found on the page
stat_table = soup.find('table')

In [134]:
# Turn the parsed table into a DataFrame
df = extract_statistics(stat_table)

In [136]:
# Print the requested URL, including the query string built from the params
print(response.url)

https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=11091&group=goals
