## Scraping MLS Player Stats

Scraping www.mlssoccer.com/ for player statistics

### Import Necessary Modules

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime 
from dateutil import parser
import os
import time
import itertools
import re
import pickle

### Scrape Statistics

#### Define Scraper and Helpers

In [17]:
def send_request(url, params=None, attempts=3):
    """GET ``url`` and return the first response that has a 2xx status.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters handed to
            ``requests.get``.
        attempts: Maximum number of requests to send before giving up.

    Returns:
        The response object of the first 2xx reply, or ``None`` when every
        attempt came back with a non-2xx status.
    """
    for attempt in range(1, attempts + 1):
        # Send Request
        response = requests.get(url, params)
        # Check Status
        print(response.url, '\n', response.status_code, response.reason)
        if 200 <= response.status_code < 300:
            return response
        # If bad status, pause before trying again; there is nothing to wait
        # for once the final attempt has failed.
        if attempt < attempts:
            print('Pause, then retry')
            time.sleep(15)
    return None

In [18]:
def map_id_to_club(franchise_id):
    """Translate a numeric franchise id into its club abbreviation.

    Raises ``KeyError`` when the id is not one of the known franchises.
    """
    id_club_pairs = (
        (11091, 'ATL'),
        (1207, 'CHI'),
        (436, 'COL'),
        (454, 'CLB'),
        (1326, 'DC'),
        (1903, 'DAL'),
        (1897, 'HOU'),
        (1230, 'LA'),
        (11690, 'LAFC'),
        (6977, 'MNUFC'),
        (1616, 'MTL'),
        (928, 'NE'),
        (9668, 'NYCFC'),
        (399, 'NYRB'),
        (6900, 'ORL'),
        (5513, 'PHI'),
        (1581, 'POR'),
        (1899, 'RSL'),
        (1131, 'SJ'),
        (3500, 'SEA'),
        (421, 'KC'),
        (2077, 'TOR'),
        (1708, 'VAN'),
    )
    clubs_by_id = dict(id_club_pairs)
    club = clubs_by_id[franchise_id]
    return club
        

In [19]:
def check_for_data(stats_table):
    """Report whether a parsed stats table holds any player rows.

    Args:
        stats_table: The table element to inspect, or ``None`` when the page
            contained no table.

    Returns:
        ``False`` when the table is missing, has no rows with class ``odd``,
        or its first such row reads 'Stats Unavailable'; ``True`` otherwise.
    """
    # A page without a table has nothing to extract.
    if stats_table is None:
        return False
    odd_rows = stats_table.findAll('tr', {'class': 'odd'})
    # Without any 'odd' row there is no first row to inspect.
    if not odd_rows:
        return False
    return odd_rows[0].text.strip() != 'Stats Unavailable'

In [20]:
def split_player_name(df):
    """Replace the ``Player`` column with separate ``Last`` and ``First`` columns.

    The first whitespace-separated token becomes ``First``; every remaining
    token, joined by single spaces, becomes ``Last``. A single-token name gets
    an empty ``Last``; a missing or blank name gets empty strings for both.

    Args:
        df: DataFrame with a ``Player`` column. It is not modified.

    Returns:
        A new DataFrame without ``Player`` and with ``Last`` and ``First``
        appended, in that order.
    """
    first = []
    last = []
    for name in df['Player']:
        # Non-string entries (e.g. NaN/None) have no tokens to split.
        tokens = name.split() if isinstance(name, str) else []
        first.append(tokens[0] if tokens else '')
        last.append(' '.join(tokens[1:]))

    # Build the result on a copy so the caller's frame keeps its columns.
    return df.drop('Player', axis=1).assign(Last=last, First=first)
        

In [21]:
# # Find and parse franchise popup menu to get list of teams
# soup = BeautifulSoup(response.text, 'lxml')
# franchise_select = soup.find('select', {'id': 'edit-franchise', 'name': 'franchise'})
# franchise = []
# # start with second index since first is generic "Select A Club"
# for team in franchise_select.findAll('option')[1:]:
#     franchise.append(int(team['value']))

In [22]:
def generate_param_combos(param_groups):
    """Expand groups of option values into one request-parameter dict per combination.

    ``param_groups`` is a sequence of iterables whose positions correspond to
    franchise, group, season_type, year and page. The result lists a dict for
    every element of their Cartesian product, in ``itertools.product`` order.
    """
    field_names = ('franchise', 'group', 'season_type', 'year', 'page')
    return [
        {name: combo[position] for position, name in enumerate(field_names)}
        for combo in itertools.product(*param_groups)
    ]

In [23]:
def scrape_player_stats(base_url, params, return_last_pg=False):
    """Fetch one page of player stats and return it as a DataFrame.

    Args:
        base_url: Address of the stats page.
        params: Query parameters for the request; ``params['year']`` and
            ``params['season_type']`` are also copied into the result as the
            ``Year`` and ``Season`` columns.
        return_last_pg: When true, also return the last page number read
            from the page's pager.

    Returns:
        The stats DataFrame, or ``(DataFrame, last_page)`` when
        ``return_last_pg`` is true. If the request fails, the page has no
        table, or the table has no data, returns ``None`` (or
        ``(None, None)`` when ``return_last_pg`` is true).
    """
    # The failure value has the same shape as the success value so that
    # callers unpacking a two-item result do not raise on a failed page.
    no_data = (None, None) if return_last_pg else None

    response = send_request(base_url, params)
    if response is None:
        return no_data

    # Parse HTML
    soup = BeautifulSoup(response.text, 'lxml')

    stats_table = soup.find('table')
    if stats_table is None or not check_for_data(stats_table):
        return no_data

    stats_df = extract_stats(stats_table)
    # Add Year and Season columns (a scalar is broadcast to every row)
    stats_df['Year'] = int(params['year'])
    stats_df['Season'] = params['season_type']

    if return_last_pg:
        return stats_df, get_last_page(soup)

    return stats_df

In [24]:
def extract_stats(stat_table):
    """Convert a parsed stats table into a DataFrame of cell text.

    Column names come from the ``th`` cells of the first header row; each
    remaining row contributes the text of its ``td`` cells. Column names are
    stripped of surrounding whitespace and empty cells become NaN.

    Args:
        stat_table: Parsed table element exposing ``findAll``.

    Returns:
        A DataFrame with one row per data row of the table; all values are
        the raw cell strings (or NaN for blanks).
    """
    stat_header = []
    stat_data = []
    for row in stat_table.findAll('tr'):
        # The first child element tells a header row from a data row.
        first_cell = row.findChild()
        if first_cell is None:
            # Empty row: nothing to classify or extract.
            continue

        if first_cell.name == 'th':
            # Read the header from this row only, and only once, so a
            # repeated header row cannot duplicate the column names.
            if not stat_header:
                stat_header = [h.text for h in row.findAll('th')]
        else:
            stat_data.append([data.text for data in row.findAll('td')])

    # Compile stat dataframe
    stat_df = pd.DataFrame(stat_data, columns=stat_header)

    # Strip any whitespace from column names
    stat_df.columns = stat_df.columns.str.strip()

    # Replace blanks with nans
    stat_df = stat_df.replace('', np.nan)

    return stat_df

In [25]:
def get_last_page(soup):
    """Return the page number found in the pager's "last" link.

    Looks up the ``li`` element with class ``pager-last last``, takes the
    ``href`` of its link and extracts the digits following ``page=``.

    Args:
        soup: Parsed page exposing ``find``.

    Returns:
        The page number as an int, or 0 when the page has no such pager
        item, link, or ``page=`` value.
        NOTE(review): 0 assumes a page without a "last" link is the only
        page of results - confirm against the site's pager markup.
    """
    pager_last = soup.find('li', {'class': 'pager-last last'})
    if pager_last is None:
        return 0

    link = pager_last.find('a')
    if link is None:
        return 0

    # Raw string keeps \d a regex token rather than an invalid str escape.
    match = re.search(r'(?<=page=)\d+', link.get('href') or '')
    return int(match.group()) if match else 0

In [26]:
def scrape_group_stats(base_url, group, year=np.arange(2007, 2018), franchise='select', season_type='REG'):
    """Scrape every results page of one stat group for each requested year.

    For each year the first page (page 0) is fetched together with the last
    page number, then pages 1 through that number are fetched in turn. A
    random 2-5 second pause follows every successfully scraped page.

    Args:
        base_url: Address of the stats page.
        group: Value for the ``group`` query parameter (e.g. 'shots').
        year: Iterable of years to scrape.
        franchise: Value for the ``franchise`` query parameter.
        season_type: Value for the ``season_type`` query parameter.

    Returns:
        One DataFrame holding the rows of all scraped pages with a fresh
        0..n-1 index; an empty DataFrame when nothing could be scraped.
    """
    # Collect pages in a list and concatenate once; concatenating inside the
    # loop re-copies all previously gathered rows on every page.
    frames = []
    combos = generate_param_combos([[franchise], [group], [season_type], year, [0]])
    for params in combos:
        # Scrape first page of results
        result = scrape_player_stats(base_url, params, True)

        # A failed first page may come back as None or as a pair whose first
        # item is None; either way the year is skipped because its page
        # count is unknown.
        if result is None or result[0] is None:
            continue
        stats_df, last_pg = result

        frames.append(stats_df)
        # Add pause to prevent 429 status
        print(datetime.now())
        time.sleep(np.random.uniform(2, 5))

        # Scrape next through last
        for idx in range(1, last_pg + 1):
            params['page'] = idx
            stats_df = scrape_player_stats(base_url, params)
            if stats_df is None:
                continue
            frames.append(stats_df)
            # Add pause to prevent 429 status
            print(datetime.now())
            time.sleep(np.random.uniform(2, 5))

    if not frames:
        return pd.DataFrame()

    # ignore_index replaces the duplicate per-page indices with a new one
    return pd.concat(frames, axis=0, ignore_index=True)

#### Send Request to Base URL and Verify Site is Up

In [27]:
# Address of the season stats page that every scrape request targets
base_url = 'http://www.mlssoccer.com/stats/season'
# Plain GET with no query parameters, just to check that the site responds
response = requests.get(base_url)
# Display the status code and the final URL of the response
response.status_code, response.url

(200, 'https://www.mlssoccer.com/stats/season')

#### Define parameter options

In [28]:
# group = ['goals', 'assists', 'shots', 'fouls', 'goalkeeping']
# for grp in group:
#     print('Scraping:', grp, datetime.now(), '\n')
#     df = scrape_group_stats(base_url, grp, np.arange(2007, 2018, 1))
#     df.to_pickle(grp + '_df.pkl')
#     print('Completed Scraping:', grp)

In [29]:
 # Scrape the 'shots' stat group using the function's default years, franchise and season type
 shots_df = scrape_group_stats(base_url, 'shots')

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=0 
 200 OK
2018-01-26 16:57:43.724943
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=1 
 200 OK
2018-01-26 16:57:51.150469
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=2 
 200 OK
2018-01-26 16:58:00.405183
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=3 
 200 OK
2018-01-26 16:58:08.692904
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=4 
 200 OK
2018-01-26 16:58:17.583495
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=5 
 200 OK
2018-01-26 16:58:25.169148
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2007&page=6 
 200 OK
2018-01-26 16:58:31.728751
https://www.mlssoccer.com/stats/season?fr

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=13 
 200 OK
2018-01-26 17:06:07.175106
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=14 
 200 OK
2018-01-26 17:06:19.376806
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=15 
 200 OK
2018-01-26 17:06:31.207682
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2010&page=16 
 200 OK
2018-01-26 17:06:41.166394
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=0 
 200 OK
2018-01-26 17:06:47.359852
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=1 
 200 OK
2018-01-26 17:06:55.265048
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2011&page=2 
 200 OK
2018-01-26 17:07:02.215285
https://www.mlssoccer.com/stats/seaso

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=14 
 200 OK
2018-01-26 17:15:02.442244
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=15 
 200 OK
2018-01-26 17:15:10.175667
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=16 
 200 OK
2018-01-26 17:15:19.224392
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=17 
 200 OK
2018-01-26 17:15:26.166772
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=18 
 200 OK
2018-01-26 17:15:34.690605
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=19 
 200 OK
2018-01-26 17:15:43.397701
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2013&page=20 
 200 OK
2018-01-26 17:15:51.382354
https://www.mlssoccer.com/stats/se

https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=8 
 200 OK
2018-01-26 17:23:56.349599
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=9 
 200 OK
2018-01-26 17:24:05.671653
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=10 
 200 OK
2018-01-26 17:24:17.172411
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=11 
 200 OK
2018-01-26 17:24:27.277749
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=12 
 200 OK
2018-01-26 17:24:36.382057
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=13 
 200 OK
2018-01-26 17:24:44.375562
https://www.mlssoccer.com/stats/season?franchise=select&group=shots&season_type=REG&year=2016&page=14 
 200 OK
2018-01-26 17:24:54.175971
https://www.mlssoccer.com/stats/seas

In [30]:
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5316 entries, 0 to 5315
Data columns (total 15 columns):
Player    5316 non-null object
Club      5312 non-null object
POS       5316 non-null object
GP        5312 non-null object
GS        5312 non-null object
MINS      5312 non-null object
G         5312 non-null object
A         5312 non-null object
SHTS      5312 non-null object
SOG       5312 non-null object
PKG/A     5316 non-null object
SC%       5316 non-null object
SOG%      5316 non-null object
Year      5316 non-null int64
Season    5316 non-null object
dtypes: int64(1), object(14)
memory usage: 623.0+ KB


In [31]:
shots_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,PKG/A,SC%,SOG%,Year,Season
0,Juan Pablo Angel,NY,F,24,24,2125,19,5,97,53,5/5,19.6,54.6,2007,REG
1,Taylor Twellman,NE,F,26,25,2283,16,3,90,55,0/0,17.8,61.1,2007,REG
2,Ante Razov,CHV,F,26,24,2041,11,8,85,42,1/1,12.9,49.4,2007,REG
3,Christian Gomez,DC,M,27,27,2272,10,9,82,44,2/3,12.2,53.7,2007,REG
4,Luciano Emilio,DC,F,29,28,2410,20,1,79,47,0/0,25.3,59.5,2007,REG
5,Eddie Johnson,KC,F,24,24,2149,15,6,75,43,1/1,20.0,57.3,2007,REG
6,Davy Arnaud,KC,M,28,28,2489,4,9,74,29,0/0,5.4,39.2,2007,REG
7,Dwayne De Rosario,HOU,M,24,22,1973,6,4,62,24,1/2,9.7,38.7,2007,REG
8,Arturo Alvarez,HOU,M,27,21,1898,3,3,61,30,0/0,4.9,49.2,2007,REG
9,Carlos Ruiz,DAL,F,22,19,1721,7,2,59,25,1/1,11.9,42.4,2007,REG


In [32]:
# Save the scraped shots table to disk as a pickle file
shots_df.to_pickle('shots_df.pkl')

### Sandbox

In [10]:
# Reload a previously saved shots table; the context manager closes the file
# handle once the pickle has been read.
# NOTE(review): this reads 'shots_df.pickle', while the scrape section saves
# 'shots_df.pkl' - confirm the differing file name is intentional.
with open('shots_df.pickle', 'rb') as pickle_file:
    shots_df = pickle.load(pickle_file)

In [11]:
shots_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,PKG/A,SC%,SOG%,Year,Season
0,Yura Movsisyan,RSL,F,23,9,993,5,0,42,13,0/0,11.9,31.0,2007,REG
1,Carlos Marinelli,KC,M,26,19,1674,1,5,42,21,1/2,2.4,50.0,2007,REG
2,Pat Noonan,NE,F,27,21,1736,7,4,42,24,0/0,16.7,57.1,2007,REG
3,Shalrie Joseph,NE,M,27,27,2366,4,5,42,14,3/4,9.5,33.3,2007,REG
4,Collin Samuel,TOR,F,18,18,1510,3,0,41,24,1/1,7.3,58.5,2007,REG
5,Adam Cristman,NE,F,28,14,1421,4,4,40,16,0/0,10.0,40.0,2007,REG
6,Sacha Kljestan,OCS,M,25,25,2186,4,13,40,21,1/1,10.0,52.5,2007,REG
7,Joseph Ngwenya,CLB,F,30,30,2362,7,4,40,23,0/0,17.5,57.5,2007,REG
8,Chris Klein,LA,M,32,32,2880,2,4,40,16,0/0,5.0,40.0,2007,REG
9,Guillermo Barros Schelotto,CLB,F,22,19,1605,5,11,39,18,1/1,12.8,46.2,2007,REG


In [7]:
# Split PKG/A column into to and drop
shots_df[['PKG', 'PKA']] = shots_df['PKG/A'].str.split('/', expand=True)
shots_df.drop('PKG/A', axis=1, inplace=True)

In [8]:
# Columns to convert from scraped text to numeric dtypes
# NOTE(review): 'SOG%' is not in this list, so it is left unconverted -
# confirm whether that omission is intentional.
cols = ['GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'PKG', 'PKA', 'SC%', 'Year']
# Apply pd.to_numeric to each listed column
shots_df[cols] = shots_df[cols].apply(pd.to_numeric)

In [9]:
shots_df.describe()

Unnamed: 0,GP,GS,MINS,G,A,SHTS,SOG,SC%,Year,PKG,PKA
count,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5037.0,5041.0,5041.0,5037.0,5037.0
mean,14.643041,11.305539,1015.740917,1.124876,1.294223,11.600953,4.109986,6.169431,2012.480063,0.077626,0.09867
std,10.722307,10.250756,893.116685,2.010519,2.104591,13.918319,5.680588,10.788747,3.082215,0.443076,0.510408
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2007.0,0.0,0.0
25%,4.0,1.0,147.0,0.0,0.0,1.0,0.0,0.0,2010.0,0.0,0.0
50%,15.0,9.0,831.0,0.0,0.0,6.0,2.0,0.0,2013.0,0.0,0.0
75%,24.0,20.0,1754.0,1.0,2.0,17.0,6.0,10.0,2015.0,0.0,0.0
max,35.0,34.0,3060.0,19.0,20.0,68.0,36.0,100.0,2017.0,6.0,6.0


In [12]:
shots_df

Unnamed: 0,Player,Club,POS,GP,GS,MINS,G,A,SHTS,SOG,PKG/A,SC%,SOG%,Year,Season
0,Yura Movsisyan,RSL,F,23,9,993,5,0,42,13,0/0,11.9,31.0,2007,REG
1,Carlos Marinelli,KC,M,26,19,1674,1,5,42,21,1/2,2.4,50.0,2007,REG
2,Pat Noonan,NE,F,27,21,1736,7,4,42,24,0/0,16.7,57.1,2007,REG
3,Shalrie Joseph,NE,M,27,27,2366,4,5,42,14,3/4,9.5,33.3,2007,REG
4,Collin Samuel,TOR,F,18,18,1510,3,0,41,24,1/1,7.3,58.5,2007,REG
5,Adam Cristman,NE,F,28,14,1421,4,4,40,16,0/0,10.0,40.0,2007,REG
6,Sacha Kljestan,OCS,M,25,25,2186,4,13,40,21,1/1,10.0,52.5,2007,REG
7,Joseph Ngwenya,CLB,F,30,30,2362,7,4,40,23,0/0,17.5,57.5,2007,REG
8,Chris Klein,LA,M,32,32,2880,2,4,40,16,0/0,5.0,40.0,2007,REG
9,Guillermo Barros Schelotto,CLB,F,22,19,1605,5,11,39,18,1/1,12.8,46.2,2007,REG
