## Scraping MLS Player Stats

Scraping www.mlssoccer.com/ for player statistics

### Import Necessary Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle

### Scrape Statistics

#### Define Scraper and Helpers

In [2]:
def send_request(url, params={}, attempts=3):

    cnt = 0
    while cnt <= attempts:
        cnt += 1        
        # Send Request
        response = requests.get(url, params)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if (response.status_code >= 200) and (response.status_code < 300):           
            return response
        # If bad status, pause before trying again
        print('Pause, then retry')
        time.sleep(15)
    return None

In [3]:
def map_id_to_club(franchise_id):
    
    franchise_map = {11091: 'ATL',
                     1207: 'CHI',
                     436: 'COL',
                     454: 'CLB',
                     1326: 'DC',
                     1903: 'DAL',
                     1897: 'HOU',
                     1230: 'LA',
                     11690: 'LAFC',
                     6977: 'MNUFC',
                     1616: 'MTL',
                     928: 'NE',
                     9668: 'NYCFC',
                     399: 'NYRB',
                     6900: 'ORL',
                     5513: 'PHI',
                     1581: 'POR',
                     1899: 'RSL',
                     1131: 'SJ',
                     3500: 'SEA',
                     421: 'KC',
                     2077: 'TOR',
                     1708: 'VAN'}
    return franchise_map[franchise_id]
        

In [4]:
def check_for_data(stats_table):
    # Check if the table has any 
    odd_rows = stats_table.findAll('tr', {'class': 'odd'})
    if odd_rows[0].text.strip() == 'Stats Unavailable':
        return False
    return True    

In [5]:
def split_player_name(df):
    first = []
    last = []
    for t in df.Player.str.split():
        if len(t) == 1:
            first.append(t[0])
            last.append('')
        elif len(t) == 2:
            first.append(t[0])
            last.append(t[1])
        else:
            first.append(t[0])
            last.append(' '.join(t[1:]))
    
    df['Last'] = last
    df['First'] = first  
    
    df = df.drop('Player', axis=1)
    
    return df
        

In [6]:
# # Find and parse franchise popup menu to get list of teams
# soup = BeautifulSoup(response.text, 'lxml')
# franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# franchise = []
# # start with second index since first is generic "Select A Club"
# for team in franchise_select.findAll('option')[1:]:
#     franchise.append(int(team['value']))

In [7]:
def generate_param_combos(param_groups):
    params = []
    for combo in itertools.product(*param_groups):
        params.append({'franchise': combo[0],
                       'group': combo[1],
                       'season_type': combo[2],
                       'year': combo[3],
                       'page': combo[4]})
    return params

In [8]:
def scrape_player_stats(base_url, params, return_last_pg=False):

    response = send_request(base_url, params)
    if response is None:
        return None
    
    # Parse HTML
    soup = BeautifulSoup(response.text,'lxml')
    
    stats_table = soup.find('table')
    
    if not check_for_data(stats_table):
        return None
    
    stats_df = extract_stats(stats_table)
    # Add Year Column
    stats_df['Year'] = np.repeat(int(params['year']), len(stats_df))
    # Add Season Column
    stats_df['Season'] = np.repeat(params['season_type'], len(stats_df))
    
    if return_last_pg:
        return stats_df, get_last_page(soup)
    
    return stats_df     

In [9]:
url = 'https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2009&page=14'

In [10]:
response = send_request(url)

https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2009&page=14 
 200 OK


In [11]:
soup = BeautifulSoup(response.text, 'lxml')

In [12]:
stat_table = soup.find('table')
# stat_table.findAll('tr')[-1]

In [13]:
    # Extract salary data
    stat_header = []
    stat_data = []
    t = []
    for row in stat_table.findAll('tr'):
        row_data = []

        # Get row type and check if header or data row
        row_type = row.findChild().name
        if row_type == 'th':
            # Extract header
            for h in stat_table.findAll('th'):
                stat_header.append(h.text) 
        else:
            # Extract data
            for data in row.findAll('td'):
                row_data.append(data.text)
            stat_data.append(row_data) 

In [14]:
def extract_stats(stat_table):
    
    # Extract salary data
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        row_data = []

        # Get row type and check if header or data row
        row_type = row.findChild().name
        if row_type == 'th':
            # Extract header
            for h in stat_table.findAll('th'):
                stat_header.append(h.text) 
        else:
            # Extract data
            for data in row.findAll('td'):
                row_data.append(data.text)
            stat_data.append(row_data)    
            
    # Compile stat dataframe
    stat_df = pd.DataFrame(stat_data, columns=stat_header)
    
    # Strip any whitespace from column names
    stat_df.columns = stat_df.columns.str.strip()
    
    # Replace blanks with nans
    stat_df = stat_df.replace('', np.nan)
        
    return stat_df

In [15]:
def get_last_page(soup):

    last_pg_url = soup.find('li', {'class': 'pager-last last'}).find('a')['href']
    last_pg = int(re.search('(?<=page=)\d+', last_pg_url).group())
    
    return last_pg

In [16]:
def scrape_group_stats(base_url, group, year=np.arange(2007, 2018), franchise='select', season_type='REG'):
    
    group_df = pd.DataFrame()
    combos = generate_param_combos([[franchise], [group], [season_type], year, [0]])
    for params in combos: 
        stats_df, last_pg = scrape_player_stats(base_url, params, True)
        
        if stats_df is None:
            continue
            
        # Scrape first page of results
        df = pd.concat([group_df, stats_df], axis=0)
        # Add pause to prevent 429 status
        print(datetime.now())
        time.sleep(np.random.uniform(2, 5)) 
        
        # Scrape next through last
        for idx in range(1, last_pg+1):
            params['page'] = idx
            stats_df = scrape_player_stats(base_url, params)
            if stats_df is None:
                continue
            group_df = pd.concat([group_df, stats_df], axis=0)            
            # Add pause to prevent 429 status
            print(datetime.now())
            time.sleep(np.random.uniform(2, 5)) 
            
    # Reset index to remove duplicates created during concatenation
    group_df.reset_index(inplace=True, drop=True)
    
    return group_df        

#### Send Request to Base URL and Verify Site is Up

In [17]:
base_url = 'http://www.mlssoccer.com/stats/season'
response = requests.get(base_url)
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [18]:
# group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
# for grp in group:
#     print('Scraping:', grp, datetime.now(), '\n')
#     df = scrape_group_stats(base_url, grp, np.arange(2007, 2018, 1))
#     df.to_pickle(grp + '_df.pkl')
#     print('Completed Scraping:', grp)

In [20]:
 shots_df = scrape_group_stats(base_url, 'shots')

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=0 
 200 OK
2018-01-26 11:37:25.482855
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 11:37:35.339322
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 11:37:41.171442
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 11:37:49.494501
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 11:37:54.847556
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 11:38:00.448073
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 11:38:10.472340
https://www.mlssoccer.com/stats/season?fr

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=13 
 200 OK
2018-01-26 11:43:54.052947
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=14 
 200 OK
2018-01-26 11:44:00.938440
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=15 
 200 OK
2018-01-26 11:44:07.367009
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=16 
 200 OK
2018-01-26 11:44:12.208808
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=0 
 200 OK
2018-01-26 11:44:19.402089
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=1 
 200 OK
2018-01-26 11:44:26.743412
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=2 
 200 OK
2018-01-26 11:44:32.753719
https://www.mlssoccer.com/stats/seaso

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=14 
 200 OK
2018-01-26 11:50:37.219695
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=15 
 200 OK
2018-01-26 11:50:44.151007
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=16 
 200 OK
2018-01-26 11:50:51.842793
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=17 
 200 OK
2018-01-26 11:50:59.322806
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=18 
 200 OK
2018-01-26 11:51:05.656245
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=19 
 200 OK
2018-01-26 11:51:11.641156
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=20 
 200 OK
2018-01-26 11:51:19.718923
https://www.mlssoccer.com/stats/se

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=8 
 200 OK
2018-01-26 12:01:08.793575
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=9 
 200 OK
2018-01-26 12:01:16.730003
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=10 
 200 OK
2018-01-26 12:01:23.991835
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=11 
 200 OK
2018-01-26 12:01:30.935718
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=12 
 200 OK
2018-01-26 12:01:37.504936
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=13 
 200 OK
2018-01-26 12:01:45.189847
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=14 
 200 OK
2018-01-26 12:01:50.411194
https://www.mlssoccer.com/stats/seas

In [21]:
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5041 entries, 0 to 5040
Data columns (total 15 columns):
Player    5041 non-null object
Club      5037 non-null object
POS       5041 non-null object
GP        5037 non-null object
GS        5037 non-null object
MINS      5037 non-null object
G         5037 non-null object
A         5037 non-null object
SHTS      5037 non-null object
SOG       5037 non-null object
PKG/A     5041 non-null object
SC%       5041 non-null object
SOG%      5041 non-null object
Year      5041 non-null int64
Season    5041 non-null object
dtypes: int64(1), object(14)
memory usage: 590.8+ KB


In [22]:
shots_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,PKG/A,SC%,SOG%,Year,Season
0,Yura Movsisyan,RSL,F,23,9,993,5,0,42,13,0/0,11.9,31.0,2007,REG
1,Carlos Marinelli,KC,M,26,19,1674,1,5,42,21,1/2,2.4,50.0,2007,REG
2,Pat Noonan,NE,F,27,21,1736,7,4,42,24,0/0,16.7,57.1,2007,REG
3,Shalrie Joseph,NE,M,27,27,2366,4,5,42,14,3/4,9.5,33.3,2007,REG
4,Collin Samuel,TOR,F,18,18,1510,3,0,41,24,1/1,7.3,58.5,2007,REG
5,Adam Cristman,NE,F,28,14,1421,4,4,40,16,0/0,10.0,40.0,2007,REG
6,Sacha Kljestan,OCS,M,25,25,2186,4,13,40,21,1/1,10.0,52.5,2007,REG
7,Joseph Ngwenya,CLB,F,30,30,2362,7,4,40,23,0/0,17.5,57.5,2007,REG
8,Chris Klein,LA,M,32,32,2880,2,4,40,16,0/0,5.0,40.0,2007,REG
9,Guillermo Barros Schelotto,CLB,F,22,19,1605,5,11,39,18,1/1,12.8,46.2,2007,REG


In [23]:
shots_df.to_pickle('shots_df.pickle')

### Sandbox