## Scraping MLS Player Stats

Scraping www.mlssoccer.com/ for player statistics

### Import Necessary Modules

In [232]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re

### Scrape Statistics

#### Define Scraper and Helpers

In [142]:
def send_request(url, params, attempts=3):

    cnt = 0
    while cnt <= attempts:
        cnt += 1        
        # Send Request
        response = requests.get(url, params)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if (response.status_code >= 200) and (response.status_code < 300):           
            return response
        # If bad status, pause before trying again
        print('Pause, then retry')
        time.pause(15)
    return None

In [182]:
def map_id_to_club(franchise_id):
    
    franchise_map = {11091: 'ATL',
                     1207: 'CHI',
                     436: 'COL',
                     454: 'CLB',
                     1326: 'DC',
                     1903: 'DAL',
                     1897: 'HOU',
                     1230: 'LA',
                     11690: 'LAFC',
                     6977: 'MNUFC',
                     1616: 'MTL',
                     928: 'NE',
                     9668: 'NYCFC',
                     399: 'NYRB',
                     6900: 'ORL',
                     5513: 'PHI',
                     1581: 'POR',
                     1899: 'RSL',
                     1131: 'SJ',
                     3500: 'SEA',
                     421: 'KC',
                     2077: 'TOR',
                     1708: 'VAN'}
    return franchise_map[franchise_id]
        

In [143]:
def generate_param_combos(param_groups):
    params = []
    for combo in itertools.product(*param_groups):
        params.append({'page': combo[0],
                       'year': combo[1], 
                       'season_type': combo[2],
                       'franchise': combo[3],
                       'group': combo[4]})
    return params

In [144]:
def check_for_data(stats_table):
    # Check if the table has any 
    odd_rows = stats_table.findAll('tr', {'class': 'odd'})
    if odd_rows[0].text.strip() == 'Stats Unavailable':
        return False
    return True    

In [166]:
def extract_stats(stat_table):
    
    # Extract salary data
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        row_data = []

        # Get row type and check if header or data row
        row_type = row.findChild().name
        if row_type == 'th':
            # Extract header
            for h in stat_table.findAll('th'):
                stat_header.append(h.text) 
        else:
            # Extract data
            for data in row.findAll('td'):
                row_data.append(data.text)
            stat_data.append(row_data)    
            
    # Compile salary dataframe
    stat_df = pd.DataFrame(stat_data, columns=stat_header)
    
    # Strip any whitespace from column names
    stat_df.columns = stat_df.columns.str.strip()
        
    return stat_df

In [184]:
def scrape_player_stats(base_url, params):

    response = send_request(base_url, params)
    if response is None:
        return None
    
    # Parse HTML
    soup = BeautifulSoup(response.text,'lxml')
    
    stats_table = soup.find('table')
    
    if not check_for_data(stats_table):
        return None
    
    stats_df = extract_stats(stats_table)
    
    # Add year column
    stats_df['Club'] = np.repeat(map_id_to_club(params['franchise']), len(stats_df))
    
    return stats_df    

In [210]:
def split_player_name(df):
    first = []
    last = []
    for t in df.Player.str.split():
        if len(t) == 1:
            first.append(t[0])
            last.append('')
        elif len(t) == 2:
            first.append(t[0])
            last.append(t[1])
        else:
            first.append(t[0])
            last.append(' '.join(t[1:]))
    
    df['Last'] = last
    df['First'] = first  
    
    df = df.drop('Player', axis=1)
    
    return df
        

In [273]:
# # Find and parse franchise popup menu to get list of teams
# soup = BeautifulSoup(response.text, 'lxml')
# franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# franchise = []
# # start with second index since first is generic "Select A Club"
# for team in franchise_select.findAll('option')[1:]:
#     franchise.append(int(team['value']))

#### Send Request to Base URL and Verify Site is Up

In [272]:
base_url = 'http://www.mlssoccer.com/stats/season'
response = requests.get(base_url)
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [276]:
page = np.arange(0, 2)
year = np.arange(2007, 2018)
season_type = ['REG', 'PS']
group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
franchise = ['select']

combos = {}
for grp in group:
    combos[grp] = {'reg': generate_param_combos([page, year, ['REG'], [grp], ['select']]), 
                   'ps': generate_param_combos([page, year, ['PS'], [grp], ['select']])}  

In [278]:
combos['goals']

{'ps': [{'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2007},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2008},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2009},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2010},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2011},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2012},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2013},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2014},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'season_type': 'PS',
   'year': 2015},
  {'franchise': 'goals',
   'group': 'select',
   'page': 0,
   'se

In [None]:
for grp in combos.keys():
    combos[grp]

In [279]:
combos.keys

<function dict.keys>

#### Scrape Stats

In [214]:
# Scrape 
df = pd.DataFrame()
print('Combos:', len(param_combos))
for params in param_combos:
    stats_df = scrape_player_stats(base_url, params)
    if stats_df is None:
        continue
    df = pd.concat([df, stats_df], axis=0)
    # Pause to prevent 429 status 
    # Note: Need to explore "backoff" package
    print(datetime.now())
    time.sleep(np.random.uniform(2, 5)) 
# Reset index inplace to remove duplicates 
# created during concatentaiton    
df.reset_index(inplace=True, drop=True) 

https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=11091&group=goals 
 200 OK
2018-01-25 18:32:16.845350
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=1207&group=goals 
 200 OK
2018-01-25 18:32:25.984528
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=436&group=goals 
 200 OK
2018-01-25 18:32:40.297527
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=454&group=goals 
 200 OK
2018-01-25 18:32:57.251346
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=1326&group=goals 
 200 OK
2018-01-25 18:33:02.260027
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=1903&group=goals 
 200 OK
2018-01-25 18:33:06.713490
https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=1897&group=goals 
 200 OK
2018-01-25 18:33:21.954157
https://www.mlssoccer.com/stats/season?page=0&year=2017&

https://www.mlssoccer.com/stats/season?page=2&year=2017&season_type=REG&franchise=421&group=goals 
 200 OK
https://www.mlssoccer.com/stats/season?page=2&year=2017&season_type=REG&franchise=2077&group=goals 
 200 OK
https://www.mlssoccer.com/stats/season?page=2&year=2017&season_type=REG&franchise=1708&group=goals 
 200 OK


In [217]:
df.to_pickle('stats.pkl')

In [260]:
get_base_params()

{'franchise': 11091,
 'group': 'goals',
 'page': 0,
 'season_type': 'REG',
 'year': 2017}

### Sandbox

In [263]:
def get_base_params():
    
    base_params = {'page': 0,
                   'year': 2017,
                   'season_type': 'REG',
                   'group': 'goals',
                   'franchise': 'select'}
    
    return base_params

In [270]:
def get_last_page(base_url, group):
    
    params = {'page': 0, 'group': group}
    response = requests.get(base_url, params)
    print(response.url)
    
    soup = BeautifulSoup(response.text, 'lxml')
    last_pg_url = soup.find('li', {'class': 'pager-last last'}).findChild()['href']
    
    last_pg = int(re.search('(?<=page=)\d+', tmp).group())
    
    return last_pg

In [None]:
def scrape_goals(base_url):
    
    base_params = {'page': 0,
                   'year': 2017,
                   'season_type': 'REG',
                   'group': 'goals',
                   'franchise': 'select'}
    
    get_last_page(base_url, base_params)
    
    params = generate_param_combos(param_groups)
    

In [271]:
group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
for grp in group:
    print(grp, get_last_page(base_url, grp))

https://www.mlssoccer.com/stats/season?page=0&group=goals
goals 23
https://www.mlssoccer.com/stats/season?page=0&group=assists
assists 23
https://www.mlssoccer.com/stats/season?page=0&group=shots
shots 23
https://www.mlssoccer.com/stats/season?page=0&group=fouls
fouls 23
https://www.mlssoccer.com/stats/season?page=0&group=goalkeeping
goalkeeping 23


In [257]:
get_last_page(base_url)

23