## Scraping MLS Player Stats

Scrape season-level player statistics from https://www.mlssoccer.com/stats/season.

### Import Necessary Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle

### Scrape Statistics

#### Define Scraper and Helpers

In [2]:
def send_request(url, params=None, attempts=3, timeout=30):
    """Send a GET request, retrying when the request fails.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.
    attempts : int
        Maximum number of requests to send before giving up.
    timeout : float
        Seconds to wait for the server; a timed-out request counts as a
        failed attempt.

    Returns
    -------
    requests.Response or None
        The first response with a 2xx status code, or None when every
        attempt failed.
    """
    # Avoid a shared mutable default argument
    if params is None:
        params = {}

    for attempt in range(1, attempts + 1):
        try:
            # Send Request
            response = requests.get(url, params, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # Connection problems and timeouts are retried like a bad status
            print(url, '\n', 'Request failed:', err)
        else:
            # Check Status
            print(response.url, '\n', response.status_code, response.reason)
            if 200 <= response.status_code < 300:
                return response
        # Pause only when another attempt will follow
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(15)
    return None

In [3]:
def map_id_to_club(franchise_id):
    """Look up the club abbreviation stored for a numeric franchise id.

    Raises KeyError when the id is not part of the lookup table.
    """
    id_abbreviation_pairs = (
        (11091, 'ATL'),
        (1207, 'CHI'),
        (436, 'COL'),
        (454, 'CLB'),
        (1326, 'DC'),
        (1903, 'DAL'),
        (1897, 'HOU'),
        (1230, 'LA'),
        (11690, 'LAFC'),
        (6977, 'MNUFC'),
        (1616, 'MTL'),
        (928, 'NE'),
        (9668, 'NYCFC'),
        (399, 'NYRB'),
        (6900, 'ORL'),
        (5513, 'PHI'),
        (1581, 'POR'),
        (1899, 'RSL'),
        (1131, 'SJ'),
        (3500, 'SEA'),
        (421, 'KC'),
        (2077, 'TOR'),
        (1708, 'VAN'),
    )
    club_by_id = dict(id_abbreviation_pairs)
    return club_by_id[franchise_id]
        

In [4]:
def check_for_data(stats_table):
    """Report whether a parsed stats table holds any data rows.

    Parameters
    ----------
    stats_table : parsed table element or None
        Object exposing ``findAll`` (e.g. the result of ``soup.find('table')``).

    Returns
    -------
    bool
        False when there is no table, when the table has no rows with class
        'odd', or when the first such row reads 'Stats Unavailable';
        True otherwise.
    """
    # No table was found on the page
    if stats_table is None:
        return False

    odd_rows = stats_table.findAll('tr', {'class': 'odd'})
    # A table without any 'odd' row is treated as having no data
    if not odd_rows:
        return False

    # The first row carries a placeholder message when nothing is available
    return odd_rows[0].text.strip() != 'Stats Unavailable'

In [5]:
def split_player_name(df):
    """Replace the 'Player' column with separate 'Last' and 'First' columns.

    The first whitespace-separated token becomes 'First'; all remaining
    tokens, joined by single spaces, become 'Last' (an empty string for
    single-token names).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Player' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'Player' and with 'Last' and 'First' appended.
        Non-string names (e.g. NaN) give NaN in both columns; empty names
        give empty strings in both columns.
    """
    first = []
    last = []
    for name in df['Player']:
        # Missing names cannot be split; keep them missing
        if not isinstance(name, str):
            first.append(np.nan)
            last.append(np.nan)
            continue
        tokens = name.split()
        first.append(tokens[0] if tokens else '')
        # Joining an empty slice yields '' for single-token and empty names
        last.append(' '.join(tokens[1:]))

    # drop() returns a new frame, so the caller's frame is left untouched
    return df.drop('Player', axis=1).assign(Last=last, First=first)
        

In [6]:
# # Find and parse franchise popup menu to get list of teams
# soup = BeautifulSoup(response.text, 'lxml')
# franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# franchise = []
# # start with second index since first is generic "Select A Club"
# for team in franchise_select.findAll('option')[1:]:
#     franchise.append(int(team['value']))

In [7]:
def generate_param_combos(param_groups):
    """Build one query-parameter dict per combination of the given options.

    ``param_groups`` holds five iterables, in the order franchise, group,
    season_type, year, page. The cartesian product of those iterables is
    returned as a list of dicts keyed by those five names.
    """
    keys = ('franchise', 'group', 'season_type', 'year', 'page')
    return [
        {key: combo[position] for position, key in enumerate(keys)}
        for combo in itertools.product(*param_groups)
    ]

In [8]:
def scrape_player_stats(base_url, params, return_last_pg=False):
    """Request one stats page and return its table as a DataFrame.

    Parameters
    ----------
    base_url : str
        Address of the stats page.
    params : dict
        Query parameters; 'year' and 'season_type' are also copied into the
        result as the 'Year' and 'Season' columns.
    return_last_pg : bool
        When True, also return the last page number read from the pager.

    Returns
    -------
    pandas.DataFrame or None
        When ``return_last_pg`` is False. None means the request failed or
        the page had no stats.
    tuple
        When ``return_last_pg`` is True: ``(DataFrame, last_page)``, or
        ``(None, None)`` when nothing could be scraped, so that callers can
        always unpack two values.
    """
    # Keep the "nothing scraped" result the same shape as the success result
    no_data = (None, None) if return_last_pg else None

    response = send_request(base_url, params)
    if response is None:
        return no_data

    # Parse HTML
    soup = BeautifulSoup(response.text, 'lxml')
    stats_table = soup.find('table')

    # find() yields None when the page has no table at all
    if stats_table is None or not check_for_data(stats_table):
        return no_data

    stats_df = extract_stats(stats_table)
    # Add Year and Season columns (scalars are broadcast to every row)
    stats_df['Year'] = int(params['year'])
    stats_df['Season'] = params['season_type']

    if return_last_pg:
        return stats_df, get_last_page(soup)

    return stats_df

In [14]:
def extract_stats(stat_table):
    """Convert a parsed HTML stats table into a DataFrame of strings.

    Parameters
    ----------
    stat_table : parsed table element
        Object exposing ``findAll('tr')`` whose rows expose ``findChild()``
        and ``findAll('th' / 'td')``.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table. Column names come from the first
        header row (whitespace stripped); empty cells are replaced by NaN.
    """
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        first_cell = row.findChild()
        # Skip rows without any child element
        if first_cell is None:
            continue

        if first_cell.name == 'th':
            # Read the header from this row only, and only once, so repeated
            # header rows do not duplicate the column names
            if not stat_header:
                stat_header = [h.text.strip() for h in row.findAll('th')]
        else:
            # Extract data
            stat_data.append([cell.text for cell in row.findAll('td')])

    # Compile stat dataframe; fall back to default column labels when the
    # table has no header row
    stat_df = pd.DataFrame(stat_data, columns=stat_header or None)

    # Replace blanks with nans
    stat_df = stat_df.replace('', np.nan)

    return stat_df

In [15]:
def get_last_page(soup):
    """Read the number of the last results page from the pager.

    Looks for the ``<li class="pager-last last">`` element and extracts the
    digits following ``page=`` in its link.

    Returns
    -------
    int
        The page number found in the link, or 0 when the pager element, its
        link, or the ``page=`` value is absent (assumed to mean the results
        fit on the first page, page 0).
    """
    last_item = soup.find('li', {'class': 'pager-last last'})
    if last_item is None:
        return 0

    link = last_item.find('a')
    if link is None:
        return 0

    # Raw string so that \d is not treated as an (invalid) string escape
    match = re.search(r'(?<=page=)\d+', link.get('href', ''))
    if match is None:
        return 0

    return int(match.group())

In [83]:
def scrape_group_stats(base_url, group, year=np.arange(2007, 2018), franchise='select', season_type='REG'):
    """Scrape every results page of one stat group for the given seasons.

    Parameters
    ----------
    base_url : str
        Address of the stats page.
    group : str
        Value sent as the 'group' query parameter (e.g. 'goals').
    year : iterable
        Seasons to scrape; one pass over all pages is made per entry.
    franchise : str or int
        Value sent as the 'franchise' query parameter.
    season_type : str
        Value sent as the 'season_type' query parameter.

    Returns
    -------
    pandas.DataFrame
        All scraped pages stacked with a fresh 0..n-1 index; an empty frame
        when nothing could be scraped.
    """
    frames = []
    combos = generate_param_combos([[franchise], [group], [season_type], year, [0]])
    for params in combos:
        # Scrape first page of results, which also tells us the last page
        first_page = scrape_player_stats(base_url, params, True)
        # A failed scrape may come back as None or as a (None, ...) pair
        if first_page is None:
            continue
        stats_df, last_pg = first_page
        if stats_df is None:
            continue

        frames.append(stats_df)
        # Add pause to prevent 429 status
        print(datetime.now())
        time.sleep(np.random.uniform(2, 5))

        # Scrape next through last
        for idx in range(1, last_pg + 1):
            # Work on a copy so the original combination is not modified
            page_params = dict(params, page=idx)
            stats_df = scrape_player_stats(base_url, page_params)
            if stats_df is None:
                continue
            frames.append(stats_df)
            # Add pause to prevent 429 status
            print(datetime.now())
            time.sleep(np.random.uniform(2, 5))

    if not frames:
        return pd.DataFrame()

    # Concatenate once; ignore_index removes the duplicate per-page indices
    return pd.concat(frames, axis=0, ignore_index=True)

#### Send Request to Base URL and Verify Site is Up

In [17]:
# Address of the season stats page; the scraper appends its query parameters to it
base_url = 'http://www.mlssoccer.com/stats/season'
# Plain request without parameters as a quick availability check
response = requests.get(base_url)
# Show the status code and the final URL of the response
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [18]:
# group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
# for grp in group:
#     print('Scraping:', grp, datetime.now(), '\n')
#     df = scrape_group_stats(base_url, grp, np.arange(2007, 2018, 1))
#     df.to_pickle(grp + '_df.pkl')
#     print('Completed Scraping:', grp)

In [84]:
 # Trial run: scrape the 'goals' group for the 2007 season only
 goals2007_df = scrape_group_stats(base_url, 'goals', [2007])

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=0 
 200 OK
2018-01-26 15:17:20.618558
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 15:17:29.201032
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 15:17:37.990632
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 15:17:49.175847
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 15:17:54.956976
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 15:18:03.792831
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 15:18:11.953407
https://www.mlssoccer.com/stats/season?fr

In [85]:
# Display the 2007 trial scrape for a visual check
goals2007_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,Year,Season
0,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,4,0/0,12,8,0.75,25.3,2007,REG
1,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,5,5/5,13,6,0.80,19.6,2007,REG
2,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,5,0/0,7,9,0.63,17.8,2007,REG
3,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,6,1/1,8,7,0.63,20.0,2007,REG
4,Maykel Galindo,CHV,F,28,24,2021,12,5,55,28,5,0/0,8,4,0.53,21.8,2007,REG
5,Ante Razov,CHV,F,26,24,2041,11,8,85,42,4,1/1,9,2,0.49,12.9,2007,REG
6,Christian Gomez,DC,M,27,27,2272,10,9,82,44,2,2/3,8,2,0.40,12.2,2007,REG
7,Jozy Altidore,TOR,F,22,15,1399,9,4,43,20,3,0/0,7,2,0.58,20.9,2007,REG
8,Landon Donovan,LA,M-F,25,24,2191,8,13,44,20,1,5/6,5,3,0.33,18.2,2007,REG
9,Robbie Findley,LA,F,25,14,1353,8,0,31,16,3,0/0,5,3,0.53,25.8,2007,REG


In [None]:
 # Full run: scrape the 'goals' group for every default season (year defaults to np.arange(2007, 2018))
 goals_df = scrape_group_stats(base_url, 'goals')

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=0 
 200 OK
2018-01-26 16:15:34.592901
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 16:15:45.070853
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 16:15:51.031936
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 16:15:59.064267
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 16:16:12.445243
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 16:16:22.072119
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 16:16:30.166087
https://www.mlssoccer.com/stats/season?fr

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=13 
 200 OK
2018-01-26 16:23:49.889315
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=14 
 200 OK
2018-01-26 16:23:57.039785
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=15 
 200 OK
2018-01-26 16:24:03.894000
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2010&page=16 
 200 OK
2018-01-26 16:24:09.021475
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=0 
 200 OK
2018-01-26 16:24:17.833004
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=1 
 200 OK
2018-01-26 16:24:25.441085
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2011&page=2 
 200 OK
2018-01-26 16:24:33.691323
https://www.mlssoccer.com/stats/seaso

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=14 
 200 OK
2018-01-26 16:32:05.161453
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=15 
 200 OK
2018-01-26 16:32:11.714776
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=16 
 200 OK
2018-01-26 16:32:19.675333
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=17 
 200 OK
2018-01-26 16:32:27.993061
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=18 
 200 OK
2018-01-26 16:32:38.750689
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=19 
 200 OK
2018-01-26 16:32:45.919189
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2013&page=20 
 200 OK
2018-01-26 16:32:55.864551
https://www.mlssoccer.com/stats/se

https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=8 
 200 OK
2018-01-26 16:40:36.089010
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=9 
 200 OK
2018-01-26 16:40:44.738335
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=10 
 200 OK
2018-01-26 16:40:52.009099
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=11 
 200 OK
2018-01-26 16:41:00.197762
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=12 
 200 OK
2018-01-26 16:41:07.376463
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=13 
 200 OK
2018-01-26 16:41:14.949322
https://www.mlssoccer.com/stats/season?franchise=select&group=goals&season_type=REG&year=2016&page=14 
 200 OK
2018-01-26 16:41:23.083761
https://www.mlssoccer.com/stats/seas

In [20]:
# Inspect column dtypes and non-null counts of the scraped frame
goals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5041 entries, 0 to 5040
Data columns (total 18 columns):
Player     5041 non-null object
Club       5037 non-null object
POS        5041 non-null object
GP         5037 non-null object
GS         5037 non-null object
MINS       5037 non-null object
G          5037 non-null object
A          5037 non-null object
SHTS       5037 non-null object
SOG        5037 non-null object
GWG        5037 non-null object
PKG/A      5041 non-null object
HmG        5037 non-null object
RdG        5037 non-null object
G/90min    5041 non-null object
SC%        5041 non-null object
Year       5041 non-null int64
Season     5041 non-null object
dtypes: int64(1), object(17)
memory usage: 709.0+ KB


In [21]:
# Display the scraped frame
goals_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,PKG/A,HmG,RdG,G/90min,SC%,Year,Season
0,Danny Dichio,TOR,F,17,14,1175,6,1,20,13,0,0/0,4,2,0.46,30.0,2007,REG
1,Jeff Cunningham,RSL,F,23,20,1636,6,1,32,21,0,1/1,5,1,0.33,18.8,2007,REG
2,Juan Toja,DAL,M,27,27,2388,6,1,33,16,3,0/0,1,5,0.23,18.2,2007,REG
3,Jovan Kirovski,COL,M-F,28,28,2161,6,1,36,14,2,4/4,4,2,0.25,16.7,2007,REG
4,Guillermo Barros Schelotto,CLB,F,22,19,1605,5,11,39,18,3,1/1,3,2,0.28,12.8,2007,REG
5,Eddie Gaven,CLB,M,27,21,1923,5,7,59,33,0,0/0,4,1,0.23,8.5,2007,REG
6,Stuart Holden,HOU,M,22,10,1030,5,5,24,13,3,0/0,2,3,0.44,20.8,2007,REG
7,Abe Thompson,DAL,F,24,15,1227,5,4,19,11,2,2/2,2,3,0.37,26.3,2007,REG
8,Edson Buddle,LA,F,26,15,1251,5,2,47,23,2,0/0,4,1,0.36,10.6,2007,REG
9,Chris Brown,RSL,M-F,22,12,1083,5,0,23,13,0,0/0,2,3,0.42,21.7,2007,REG


In [22]:
# Save the scraped frame so the scrape does not have to be repeated
goals_df.to_pickle('goals_df.pkl')

### Sandbox

In [67]:
# Load the saved frame; read_pickle opens and closes the file itself,
# so no file handle is left open
goals_df = pd.read_pickle('goals_df.pkl')

In [68]:
# Split the 'PKG/A' column at '/' into 'PKG' and 'PKA', then drop the original.
# The guard makes re-running this cell a no-op instead of a KeyError once
# 'PKG/A' has already been removed.
if 'PKG/A' in goals_df.columns:
    goals_df[['PKG', 'PKA']] = goals_df['PKG/A'].str.split('/', expand=True)
    goals_df = goals_df.drop('PKG/A', axis=1)

In [69]:
# Cast the listed stat columns from text to numeric dtypes, column by column
cols = [
    'GP', 'GS', 'MINS', 'G', 'A',
    'SHTS', 'SOG', 'GWG', 'HmG', 'RdG',
    'G/90min', 'SC%', 'Year', 'PKG', 'PKA',
]
goals_df[cols] = goals_df[cols].apply(pd.to_numeric, axis=0)

In [70]:
# Summary statistics of the numeric columns after conversion
goals_df.describe()

Unnamed: 0,GP,GS,MINS,G,A,SHTS,SOG,GWG,HmG,RdG,G/90min,SC%,Year,PKG,PKA
count,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5041.0,5041.0,5041.0,5037.0,5037.0
mean,14.693071,11.351598,1020.098471,1.052214,1.295414,11.998412,4.167957,0.283899,0.632916,0.419297,0.094569,5.915929,2012.480063,0.061346,0.082787
std,10.766452,10.296154,897.450863,1.745143,2.110454,15.144354,5.875291,0.654614,1.179507,0.858824,0.666906,10.540524,3.082215,0.364761,0.434895
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2007.0,0.0,0.0
25%,4.0,1.0,147.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.0,0.0,0.0
50%,15.0,9.0,831.0,0.0,0.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,2013.0,0.0,0.0
75%,25.0,20.0,1769.0,1.0,2.0,17.0,6.0,0.0,1.0,1.0,0.11,9.7,2015.0,0.0,0.0
max,35.0,34.0,3060.0,11.0,20.0,121.0,46.0,6.0,9.0,8.0,45.0,100.0,2017.0,5.0,6.0


In [71]:
# Display the converted frame
goals_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,GWG,HmG,RdG,G/90min,SC%,Year,Season,PKG,PKA
0,Danny Dichio,TOR,F,17.0,14.0,1175.0,6.0,1.0,20.0,13.0,0.0,4.0,2.0,0.46,30.0,2007,REG,0.0,0.0
1,Jeff Cunningham,RSL,F,23.0,20.0,1636.0,6.0,1.0,32.0,21.0,0.0,5.0,1.0,0.33,18.8,2007,REG,1.0,1.0
2,Juan Toja,DAL,M,27.0,27.0,2388.0,6.0,1.0,33.0,16.0,3.0,1.0,5.0,0.23,18.2,2007,REG,0.0,0.0
3,Jovan Kirovski,COL,M-F,28.0,28.0,2161.0,6.0,1.0,36.0,14.0,2.0,4.0,2.0,0.25,16.7,2007,REG,4.0,4.0
4,Guillermo Barros Schelotto,CLB,F,22.0,19.0,1605.0,5.0,11.0,39.0,18.0,3.0,3.0,2.0,0.28,12.8,2007,REG,1.0,1.0
5,Eddie Gaven,CLB,M,27.0,21.0,1923.0,5.0,7.0,59.0,33.0,0.0,4.0,1.0,0.23,8.5,2007,REG,0.0,0.0
6,Stuart Holden,HOU,M,22.0,10.0,1030.0,5.0,5.0,24.0,13.0,3.0,2.0,3.0,0.44,20.8,2007,REG,0.0,0.0
7,Abe Thompson,DAL,F,24.0,15.0,1227.0,5.0,4.0,19.0,11.0,2.0,2.0,3.0,0.37,26.3,2007,REG,2.0,2.0
8,Edson Buddle,LA,F,26.0,15.0,1251.0,5.0,2.0,47.0,23.0,2.0,4.0,1.0,0.36,10.6,2007,REG,0.0,0.0
9,Chris Brown,RSL,M-F,22.0,12.0,1083.0,5.0,0.0,23.0,13.0,0.0,2.0,3.0,0.42,21.7,2007,REG,0.0,0.0


In [72]:
# Save the converted frame.
# NOTE(review): this writes to the same 'goals_df.pkl' that the Sandbox section
# loads at its start, so afterwards the file holds the frame without 'PKG/A' —
# confirm that overwriting the raw scrape is intended.
goals_df.to_pickle('goals_df.pkl')