## Scraping MLS Player Stats

Scraping www.mlssoccer.com/ for player statistics

### Import Necessary Modules

In [47]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle

### Scrape Statistics

#### Define Scraper and Helpers

In [2]:
def send_request(url, params={}, attempts=3):

    cnt = 0
    while cnt <= attempts:
        cnt += 1        
        # Send Request
        response = requests.get(url, params)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if (response.status_code >= 200) and (response.status_code < 300):           
            return response
        # If bad status, pause before trying again
        print('Pause, then retry')
        time.pause(15)
    return None

In [3]:
def map_id_to_club(franchise_id):
    
    franchise_map = {11091: 'ATL',
                     1207: 'CHI',
                     436: 'COL',
                     454: 'CLB',
                     1326: 'DC',
                     1903: 'DAL',
                     1897: 'HOU',
                     1230: 'LA',
                     11690: 'LAFC',
                     6977: 'MNUFC',
                     1616: 'MTL',
                     928: 'NE',
                     9668: 'NYCFC',
                     399: 'NYRB',
                     6900: 'ORL',
                     5513: 'PHI',
                     1581: 'POR',
                     1899: 'RSL',
                     1131: 'SJ',
                     3500: 'SEA',
                     421: 'KC',
                     2077: 'TOR',
                     1708: 'VAN'}
    return franchise_map[franchise_id]
        

In [4]:
def check_for_data(stats_table):
    # Check if the table has any 
    odd_rows = stats_table.findAll('tr', {'class': 'odd'})
    if odd_rows[0].text.strip() == 'Stats Unavailable':
        return False
    return True    

In [5]:
def extract_stats(stat_table):
    
    # Extract salary data
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        row_data = []

        # Get row type and check if header or data row
        row_type = row.findChild().name
        if row_type == 'th':
            # Extract header
            for h in stat_table.findAll('th'):
                stat_header.append(h.text) 
        else:
            # Extract data
            for data in row.findAll('td'):
                row_data.append(data.text)
            stat_data.append(row_data)    
            
    # Compile salary dataframe
    stat_df = pd.DataFrame(stat_data, columns=stat_header)
    
    # Strip any whitespace from column names
    stat_df.columns = stat_df.columns.str.strip()
        
    return stat_df

In [6]:
def split_player_name(df):
    first = []
    last = []
    for t in df.Player.str.split():
        if len(t) == 1:
            first.append(t[0])
            last.append('')
        elif len(t) == 2:
            first.append(t[0])
            last.append(t[1])
        else:
            first.append(t[0])
            last.append(' '.join(t[1:]))
    
    df['Last'] = last
    df['First'] = first  
    
    df = df.drop('Player', axis=1)
    
    return df
        

In [7]:
# # Find and parse franchise popup menu to get list of teams
# soup = BeautifulSoup(response.text, 'lxml')
# franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# franchise = []
# # start with second index since first is generic "Select A Club"
# for team in franchise_select.findAll('option')[1:]:
#     franchise.append(int(team['value']))

In [8]:
def generate_param_combos(param_groups):
    params = []
    for combo in itertools.product(*param_groups):
        params.append({'franchise': combo[0],
                       'group': combo[1],
                       'season_type': combo[2],
                       'year': combo[3],
                       'page': combo[4]})
    return params

In [40]:
def scrape_player_stats(base_url, params, return_last_pg=False):

    response = send_request(base_url, params)
    if response is None:
        return None
    
    # Parse HTML
    soup = BeautifulSoup(response.text,'lxml')
    
    stats_table = soup.find('table')
    
    if not check_for_data(stats_table):
        return None
    
    stats_df = extract_stats(stats_table)
    # Add Year Column
    stats_df['Year'] = np.repeat(int(params['year']), len(stats_df))
    # Add Season Column
    stats_df['Season'] = np.repeat(params['season_type'], len(stats_df))
    
    if return_last_pg:
        return stats_df, get_last_page(soup)
    
    return stats_df     

In [41]:
def get_last_page(soup):

    last_pg_url = soup.find('li', {'class': 'pager-last last'}).find('a')['href']
    last_pg = int(re.search('(?<=page=)\d+', last_pg_url).group())
    
    return last_pg

In [64]:
def scrape_group_stats(base_url, group, year=np.arange(2007, 2018), franchise='select', season_type='REG'):
    
    group_df = pd.DataFrame()
    combos = generate_param_combos([[franchise], [group], [season_type], year, [0]])
    for params in combos: 
        stats_df, last_pg = scrape_player_stats(base_url, params, True)
        
        if stats_df is None:
            continue
        # Scrape first page of results
        df = pd.concat([group_df, stats_df], axis=0)
        # Scrape next through last
        for idx in range(1, last_pg+1):
            params['page'] = idx
            stats_df = scrape_player_stats(base_url, params)
            if stats_df is None:
                continue
            group_df = pd.concat([group_df, stats_df], axis=0)            
            # Add pause to prevent 429 status
            print(datetime.now())
            time.sleep(np.random.uniform(2, 5)) 
    return group_df        

#### Send Request to Base URL and Verify Site is Up

In [42]:
base_url = 'http://www.mlssoccer.com/stats/season'
response = requests.get(base_url)
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [65]:
group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
for grp in group:
    print('Scraping:', grp, datetime.now(), '\n')
    df = scrape_group_stats(base_url, grp, np.arange(2007, 2018, 1))
    df.to_pickle(grp + '_df.pkl')
    print('Completed Scraping:', grp)

Scraping: goals 2018-01-26 06:58:40.085499 

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=0 
 200 OK
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 06:58:42.961550
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 06:58:48.953486
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 06:58:54.371046
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 06:59:00.596889
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 06:59:08.440063
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 06:59:16.319587
https://www.mlssoccer.c

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=14 
 200 OK
2018-01-26 07:06:55.495347
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=15 
 200 OK
2018-01-26 07:07:00.468335
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=16 
 200 OK
2018-01-26 07:07:05.552657
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=0 
 200 OK
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=1 
 200 OK
2018-01-26 07:07:15.082427
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=2 
 200 OK
2018-01-26 07:07:24.348307
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=3 
 200 OK
2018-01-26 07:07:31.679566
https://www.mlssoccer.com/stats/season?franchise=select&group=goa

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=16 
 200 OK
2018-01-26 07:14:47.361666
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=17 
 200 OK
2018-01-26 07:14:57.563583
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=18 
 200 OK
2018-01-26 07:15:10.091610
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=19 
 200 OK
2018-01-26 07:15:17.228198
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=20 
 200 OK
2018-01-26 07:15:24.634123
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=21 
 200 OK
2018-01-26 07:15:31.861405
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2014&page=0 
 200 OK
https://www.mlssoccer.com/stats/season?franchise=select&group=

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=11 
 200 OK
2018-01-26 07:23:21.698954
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=12 
 200 OK
2018-01-26 07:23:39.322670
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=13 
 200 OK
2018-01-26 07:23:47.343333
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=14 
 200 OK
2018-01-26 07:23:55.354304
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=15 
 200 OK
2018-01-26 07:24:10.325179
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=16 
 200 OK
2018-01-26 07:24:20.951693
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=17 
 200 OK
2018-01-26 07:24:39.988814
https://www.mlssoccer.com/stats/se

https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=10 
 200 OK
2018-01-26 07:32:32.870605
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=11 
 200 OK
2018-01-26 07:32:44.006172
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=12 
 200 OK
2018-01-26 07:32:53.084609
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=13 
 200 OK
2018-01-26 07:33:03.511943
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=14 
 200 OK
2018-01-26 07:33:15.129996
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=15 
 200 OK
2018-01-26 07:33:22.298887
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2008&page=16 
 200 OK
2018-01-26 07:33:31.383996
https://www.mlssocce

AttributeError: module 'time' has no attribute 'pause'

In [69]:
 df = scrape_group_stats(base_url, 'assists', np.arange(2007, 2018, 1))

https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=0 
 200 OK
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 08:38:59.979760
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 08:39:06.490318
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 08:39:20.154638
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 08:39:35.429924
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 08:39:44.294692
https://www.mlssoccer.com/stats/season?franchise=select&group=assists&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 08:39:53.237138
https://www.mlssoccer.com/stats/season?franchise=selec

KeyboardInterrupt: 

### Sandbox

In [66]:
goals_df = pickle.load(open('goals_df.pkl', 'rb'))

In [67]:
goals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5041 entries, 0 to 10
Data columns (total 18 columns):
Player     5041 non-null object
Club       5041 non-null object
POS        5041 non-null object
GP         5041 non-null object
GS         5041 non-null object
MINS       5041 non-null object
G          5041 non-null object
A          5041 non-null object
SHTS       5041 non-null object
SOG        5041 non-null object
GWG        5041 non-null object
PKG/A      5041 non-null object
HmG        5041 non-null object
RdG        5041 non-null object
G/90min    5041 non-null object
SC%        5041 non-null object
Year       5041 non-null int64
Season     5041 non-null object
dtypes: int64(1), object(17)
memory usage: 748.3+ KB


In [68]:
goals_df = pickle.load(open('goals_df.pkl', 'rb'))

In [61]:
goals_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,SHTS.1,SOG.1,GWG,PKG/A,HmG,RdG,G/90min,SC%,Year,Season
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,10.0,1.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,10.0,1.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,11.0,2.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,11.0,5.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,11.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,12.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,16.0,2.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,16.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,17.0,10.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,18.0,4.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG


In [51]:
fouls_df = pickle.load(open('fouls_df.pkl', 'rb'))

In [52]:
fouls_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,...,SHTS.1,SOG.1,GWG,PKG/A,HmG,RdG,G/90min,SC%,Year,Season
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,...,10.0,1.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,...,10.0,1.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,...,11.0,2.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,...,11.0,5.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,...,11.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,...,12.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,...,16.0,2.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,...,16.0,3.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,...,17.0,10.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,...,18.0,4.0,0.0,0/0,0.0,0.0,0.0,0.0,2017.0,REG
