## Scraping MLS Player Stats

This notebook scrapes player statistics from www.mlssoccer.com.

### Import Necessary Modules

In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools

### Scrape Statistics

#### Define Scraper and Helpers

In [142]:
def send_request(url, params, attempts=3, pause=15):
    """Send a GET request, retrying when the response status is not 2xx.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict
        Query-string parameters handed to ``requests.get``.
    attempts : int, optional
        Maximum number of requests to send (default 3).
    pause : float, optional
        Seconds to wait between a failed request and the next one
        (default 15).

    Returns
    -------
    requests.Response or None
        The first response with a 2xx status, or ``None`` when every
        attempt came back with another status.
    """
    for attempt in range(1, attempts + 1):
        # Send Request
        response = requests.get(url, params)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if 200 <= response.status_code < 300:
            return response
        # If bad status, pause before trying again; skip the wait once the
        # final attempt has failed since nothing follows it
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(pause)
    return None

In [182]:
def map_id_to_club(franchise_id):
    """Translate an MLS franchise id into the club's abbreviation.

    Raises ``KeyError`` when the id is not in the table below.
    """
    club_by_id = dict([
        (11091, 'ATL'),
        (1207, 'CHI'),
        (436, 'COL'),
        (454, 'CLB'),
        (1326, 'DC'),
        (1903, 'DAL'),
        (1897, 'HOU'),
        (1230, 'LA'),
        (11690, 'LAFC'),
        (6977, 'MNUFC'),
        (1616, 'MTL'),
        (928, 'NE'),
        (9668, 'NYCFC'),
        (399, 'NYRB'),
        (6900, 'ORL'),
        (5513, 'PHI'),
        (1581, 'POR'),
        (1899, 'RSL'),
        (1131, 'SJ'),
        (3500, 'SEA'),
        (421, 'KC'),
        (2077, 'TOR'),
        (1708, 'VAN'),
    ])
    return club_by_id[franchise_id]
        

In [143]:
def generate_param_combos(param_groups):
    """Build one query-parameter dict per combination of the option lists.

    ``param_groups`` is a sequence of iterables read by position as page,
    year, season_type, franchise and group. The result holds a dict for
    every element of their Cartesian product, in ``itertools.product`` order.
    """
    field_names = ('page', 'year', 'season_type', 'franchise', 'group')
    return [
        {field: values[position] for position, field in enumerate(field_names)}
        for values in itertools.product(*param_groups)
    ]

In [144]:
def check_for_data(stats_table):
    """Report whether the stats table holds any data rows.

    Parameters
    ----------
    stats_table : bs4 Tag or None
        The ``<table>`` element to inspect.

    Returns
    -------
    bool
        ``False`` when the table is missing, has no ``<tr class="odd">`` row,
        or its first such row reads 'Stats Unavailable'; ``True`` otherwise.
    """
    if stats_table is None:
        return False
    # Only the first row with class "odd" is inspected
    odd_rows = stats_table.find_all('tr', {'class': 'odd'})
    if not odd_rows:
        # No such row at all: treat the table as empty instead of indexing into []
        return False
    return odd_rows[0].text.strip() != 'Stats Unavailable'

In [166]:
def extract_stats(stat_table):
    """Convert an HTML stats table into a DataFrame of cell text.

    Parameters
    ----------
    stat_table : bs4 Tag
        The ``<table>`` element holding the statistics.

    Returns
    -------
    pandas.DataFrame
        One row per data ``<tr>`` with the text of its ``<td>`` cells. The
        column names are the whitespace-stripped ``<th>`` texts of the first
        header row; if the table has no header row, pandas assigns default
        integer column labels.
    """
    stat_header = []
    stat_data = []
    for row in stat_table.find_all('tr'):
        # Classify the row by its first tag: <th> marks a header row
        first_cell = row.find(True)
        if first_cell is None:
            # Row without any tag inside: nothing to extract
            continue
        if first_cell.name == 'th':
            # Read the header from this row only, and only once, so a
            # repeated header row cannot duplicate the column names
            if not stat_header:
                stat_header = [h.text.strip() for h in row.find_all('th')]
        else:
            # Extract data
            stat_data.append([cell.text for cell in row.find_all('td')])

    # Compile stats dataframe
    return pd.DataFrame(stat_data, columns=stat_header or None)

In [184]:
def scrape_player_stats(base_url, params):
    """Fetch one stats page and return its table as a DataFrame.

    Parameters
    ----------
    base_url : str
        Address of the stats page.
    params : dict
        Query parameters for the request; ``params['franchise']`` is also
        used to look up the club abbreviation.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with an added ``Club`` column, or ``None`` when the
        request failed, the page has no ``<table>``, or the table holds no data.
    """
    response = send_request(base_url, params)
    if response is None:
        return None

    # Parse HTML
    soup = BeautifulSoup(response.text, 'lxml')

    stats_table = soup.find('table')
    # soup.find returns None when the page has no table at all
    if stats_table is None or not check_for_data(stats_table):
        return None

    stats_df = extract_stats(stats_table)

    # Add club column (the scalar is broadcast to every row)
    stats_df['Club'] = map_id_to_club(params['franchise'])

    return stats_df

#### Send Request to Base URL and Verify Site is Up

In [185]:
# Address of the season stats page; the scrape requests below are sent here
base_url = 'http://www.mlssoccer.com/stats/season'
# Plain request with no query parameters, used to check that the site responds
response = requests.get(base_url)
# Display the status code and the URL the response came from
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define Parameter Options

In [186]:
# Candidate values for each query parameter
page = np.arange(0, 2)
year = np.arange(2007, 2018)
season_type = ['REG', 'PS']
group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']

# Read the list of teams from the franchise popup menu on the stats page
soup = BeautifulSoup(response.text, 'lxml')
franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# Skip the first option: it is the generic "Select A Club" entry
club_options = franchise_select.findAll('option')[1:]
franchise = [int(option['value']) for option in club_options]

In [187]:
# Narrow every option list to one season, one club and one stat group for a
# small trial run; these assignments replace the values defined above
page = [0, 1, 2]
year = [2017]
season_type = ['REG']
franchise = [11091]
group = ['goals']
# Order matters: generate_param_combos reads the groups by position
param_groups = [page, year, season_type, franchise, group]
param_combos = generate_param_combos(param_groups)

#### Scrape Stats

In [195]:
# Scrape every parameter combination, collecting the per-request frames in a
# list so they are concatenated once instead of re-copied on every pass
stat_frames = []
for params in param_combos:
    stats_df = scrape_player_stats(base_url, params)
    if stats_df is None:
        continue
    stat_frames.append(stats_df)
    # TODO: pause between requests to prevent 429 status, e.g.
    #     print(datetime.now())
    #     time.sleep(np.random.uniform(10, 20))
    # Note: Need to explore "backoff" package

# ignore_index rebuilds the row index, removing the duplicate labels
# created during concatenation
df = pd.concat(stat_frames, ignore_index=True) if stat_frames else pd.DataFrame()

https://www.mlssoccer.com/stats/season?page=0&year=2017&season_type=REG&franchise=11091&group=goals 
 200 OK
https://www.mlssoccer.com/stats/season?page=1&year=2017&season_type=REG&franchise=11091&group=goals 
 200 OK
https://www.mlssoccer.com/stats/season?page=2&year=2017&season_type=REG&franchise=11091&group=goals 
 200 OK


In [196]:
# Display the scraped stats
df

Unnamed: 0,Player,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,Club
0,Josef Martinez,F,20,17,1528,19,1,67,32,3,2/2,16,3,1.12,28.4,ATL
1,Hector Villalba,M,34,34,2762,13,11,86,44,4,0/0,7,6,0.42,15.1,ATL
2,Miguel Almiron,M,30,27,2394,9,14,84,35,3,1/1,7,2,0.34,10.7,ATL
3,Yamil Asad,M,32,31,2765,7,13,53,23,1,1/1,5,2,0.23,13.2,ATL
4,Julian Gressel,M,32,24,2048,5,9,35,9,1,0/0,3,2,0.22,14.3,ATL
5,Jacob Peterson,F,10,1,165,3,0,7,3,0,0/0,1,2,1.64,42.9,ATL
6,Greg Garza,D,26,25,2150,2,5,12,5,0,0/0,1,1,0.08,16.7,ATL
7,Carlos Carmona,M,31,30,2684,2,2,9,2,0,0/0,1,1,0.07,22.2,ATL
8,Anton Walkes,D,20,17,1528,2,0,7,4,1,0/0,2,0,0.12,28.6,ATL
9,Kenwyne Jones,F,17,5,499,2,0,11,4,0,0/0,1,1,0.36,18.2,ATL


In [199]:
# Exploratory: split each player name on whitespace into a list of parts
tmp = df.Player.str.split()

In [210]:
def split_player_name(df):
    """Replace the ``Player`` column with ``Last`` and ``First`` columns.

    The first whitespace-separated word of each name becomes ``First``; the
    remaining words, joined by single spaces, become ``Last``. A one-word
    name gets an empty ``Last``. An empty or non-string name gets empty
    strings in both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Player`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Player`` and with ``Last`` and ``First``
        appended, in that order.
    """
    first = []
    last = []
    for name in df['Player']:
        # Missing names (e.g. NaN) are not strings and have no parts
        parts = name.split() if isinstance(name, str) else []
        first.append(parts[0] if parts else '')
        last.append(' '.join(parts[1:]))

    # drop() returns a new frame, so the caller's frame is left untouched
    return df.drop('Player', axis=1).assign(Last=last, First=first)
        

In [211]:
# Display the frame with Player split into Last/First; the result is not stored
split_player_name(df)

Unnamed: 0,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,Club,Last,First
0,F,20,17,1528,19,1,67,32,3,2/2,16,3,1.12,28.4,ATL,Martinez,Josef
1,M,34,34,2762,13,11,86,44,4,0/0,7,6,0.42,15.1,ATL,Villalba,Hector
2,M,30,27,2394,9,14,84,35,3,1/1,7,2,0.34,10.7,ATL,Almiron,Miguel
3,M,32,31,2765,7,13,53,23,1,1/1,5,2,0.23,13.2,ATL,Asad,Yamil
4,M,32,24,2048,5,9,35,9,1,0/0,3,2,0.22,14.3,ATL,Gressel,Julian
5,F,10,1,165,3,0,7,3,0,0/0,1,2,1.64,42.9,ATL,Peterson,Jacob
6,D,26,25,2150,2,5,12,5,0,0/0,1,1,0.08,16.7,ATL,Garza,Greg
7,M,31,30,2684,2,2,9,2,0,0/0,1,1,0.07,22.2,ATL,Carmona,Carlos
8,D,20,17,1528,2,0,7,4,1,0/0,2,0,0.12,28.6,ATL,Walkes,Anton
9,F,17,5,499,2,0,11,4,0,0/0,1,1,0.36,18.2,ATL,Jones,Kenwyne
