In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sys import argv
from urllib.request import urlopen
from urllib.error import HTTPError
import requests
import time

In [66]:
def scrape_players(url):
    # Make Soup
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Save These Subsections for Later
    pergame_html = page.text[page.text.find('<h2>Per Game</h2>'):]
    advanced_html = page.text[page.text.find('<h2>Advanced</h2>'):]
    salary_html = page.text[page.text.find('<h2>Salaries</h2>'):]
    shooting_html = page.text[page.text.find('<h2>Shooting</h2>'):]
    
    # List for Storing Player Dictionaries
    dict_list = []
      
    for i in range(len(soup.find_all('tr'))):
        if i==0: continue
            
        player_str = str(soup.find_all('tr')[i])
        
        # Get Player Name
        name = re.search('.html">([\D\'\.\-]{1,50})</a>', player_str).group(1)
        
        # Get Team Abbreviation
        team_abb = re.search('teams/(.*)/\d{4}', url).group(1)
        
        # Get Year
        year = re.search('/(\d{4}).html', url).group(1)
        
        # Get Position
        if re.search('"pos">([A-Z]{1,2})<', player_str) is None: pos = 'NA'
        else: pos = re.search('"pos">([A-Z]{1,2})<', player_str).group(1)
        
        # Get Height (George Kar, SAS 1977 is 6-2, not 0)
        if re.search('"height">([\d,-]+)<', player_str) is None: height = 0
        else: height = re.search('"height">([\d,-]+)<', player_str).group(1)
        
        # Get Years of Experience
        exp = re.search('"years_experience">([\d,R]+)<', player_str).group(1)
        
        #### Get Per-Game Data ####
        
        # Trim to Relevant HTML
        player_html = pergame_html[pergame_html.find(name):]
        
        # Get Games Played
        if re.search('">([\d]{1,3})</a></td>', player_html) is None: GP = 0.0
        else: GP = int(re.search('">([\d]{1,3})</a></td>', player_html).group(1))
        
        # Get PPG
        if re.search('"pts_per_g" >([\d,\.]+)</td>', player_html) is None: ppg = 0.0
        else: ppg = float(re.search('"pts_per_g" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get RPG
        if re.search('"trb_per_g" >([\d,\.]+)</td>', player_html) is None: rpg = 0.0
        else: rpg = float(re.search('"trb_per_g" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get APG
        if re.search('"ast_per_g" >([\d,\.]+)</td>', player_html) is None: apg = 0.0
        else: apg = float(re.search('"ast_per_g" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get TPG
        if re.search('"tov_per_g" >([\d,\.]+)</td>', player_html) is None: tpg = 0.0
        else: tpg = float(re.search('"tov_per_g" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get 2-point %
        if re.search('"fg2_pct" >([\d,\.]+)</td>', player_html) is None: fg2_pct = 0.0
        else: fg2_pct = float(re.search('"fg2_pct" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get 3-point %
        if re.search('"fg3_pct" >([\d,\.]+)</td>', player_html) is None: fg3_pct = 0.0
        else: fg3_pct = float(re.search('"fg3_pct" >([\d,\.]+)</td>', player_html).group(1))
        
        # Get Minutes
        if re.search('"mp_per_g" >([\d,\.]+)</td>', player_html) is None: mpg = 0.0
        else: mpg = float(re.search('"mp_per_g" >([\d,\.]+)</td>', player_html).group(1))
        
        #### Get Win Shares ####
        
        # Trim to Relevant HTML
        player_html = advanced_html[advanced_html.find(name):]
        
        if re.search('data-stat="obpm" >([\d\.]{1,5})<', player_html) is None: obpm = None
        else: obpm = float(re.search('data-stat="obpm" >([\d\.]{1,5})<', player_html).group(1))
            
        if re.search('data-stat="dbpm" >([\d\.]{1,5})<', player_html) is None: dbpm = None
        else: dbpm = float(re.search('data-stat="dbpm" >([\d\.]{1,5})<', player_html).group(1))
            
        if re.search('data-stat="usg_pct" >([\d\.]{1,5})<', player_html) is None: usg_pct = 0.0
        else: usg_pct = float(re.search('data-stat="usg_pct" >([\d\.]{1,5})<', player_html).group(1))
            
        #### Get % of Shots Taken From 3 ####
        
        # Trim to Relevant HTML
        player_html = shooting_html[shooting_html.find(name):]
        
        # Get % of shots taken from 3
        if re.search('"fg3a_pct_fga" >([\d,\.]+)</td>', player_html) is None: fg3_pct_fga = None
        else: fg3_pct_fga = float(re.search('"fg3a_pct_fga" >([\d,\.]+)</td>', player_html).group(1))
        
        #### Get Salaries ####
    
        # Trim to Relevant HTML
        player_html = salary_html[salary_html.find(name):]
        
        if re.search('csk="(\d{1,10})', player_html) is None: continue
        else: salary = int(re.search('csk="(\d{1,10})', player_html).group(1))
            
        ##########################################    
    
        player_dict = {'Player': name,
                       'Team': team_abb,
                       'Year': year,
                       'Pos': pos,
                       'Height': height,
                       'Exp': exp,
                       'Salary': salary,
                       'MPG': mpg,
                       'GP': GP,
                       'OBPM': obpm,
                       'DBPM': dbpm,
                       'USG': usg_pct,
                       'PPG': ppg,
                       'RPG': rpg,
                       'APG': apg,
                       'TPG': tpg,
                       'FG2_pct': fg2_pct,
                       'FG3_pct': fg3_pct,
                       'pct_from_3': fg3_pct_fga}
        
        dict_list.append(player_dict)
        
    return pd.DataFrame(data=dict_list)


In [67]:
def get_team_urls(year: int):

    # Make Soup
    url = 'https://www.basketball-reference.com/leagues/NBA_' + str(year) + '.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    url_list = []
    
    # For Each Team
    teams = soup.find_all('th', scope='row', class_='left')
    if len(teams) > 32: teams = teams[:int(len(teams)/2)]
        
    for team in teams:
        
        # Get URL of Team
        team_url = 'https://www.basketball-reference.com' + re.search('<a href="(.*.html)', str(team)).group(1)
        
        # Append to List
        url_list.append(team_url)
        
    return url_list

In [68]:
# DataFrame Containing Player Info, Salaries
players_df = pd.DataFrame()

for year in range(1998,2019):
    for url in get_team_urls(year):
        
        # Append Info to DataFrame
        players_df = players_df.append(scrape_players(url))
        #time.sleep(1)
players_df.head(20)

Unnamed: 0,APG,DBPM,Exp,FG2_pct,FG3_pct,GP,Height,MPG,OBPM,PPG,Player,Pos,RPG,Salary,TPG,Team,USG,Year,pct_from_3
0,0.6,1.7,7,0.375,0.284,46,6-7,14.8,1.3,2.4,Keith Askins,SF,2.2,750000,0.6,MIA,10.7,1998,
1,1.7,0.9,4,0.476,0.0,52,6-10,26.4,1.3,12.7,Isaac Austin,C,6.3,384000,2.3,MIA,25.5,1998,
2,1.4,2.9,4,0.471,0.376,74,6-11,31.9,0.1,9.6,P.J. Brown,PF,8.6,3844800,1.3,MIA,15.3,1998,
3,0.1,1.6,7,0.416,0.375,37,7-0,9.8,1.3,2.4,Duane Causwell,C,2.7,1211705,0.5,MIA,15.2,1998,
4,1.4,0.0,5,0.474,0.167,5,6-6,13.8,1.3,6.0,Todd Day,SG,1.2,272250,0.6,MIA,25.7,1998,
5,8.3,2.9,7,0.472,0.351,81,6-0,37.4,5.1,18.9,Tim Hardaway,PG,3.7,4800000,2.8,MIA,25.6,1998,
6,2.2,2.9,2,0.441,0.405,81,6-4,32.4,1.8,12.6,Voshon Lenard,SG,3.6,2364597,1.2,MIA,18.4,1998,
7,2.2,1.1,9,0.507,0.376,72,6-6,26.8,1.3,7.2,Dan Majerle,SG,3.7,2820000,0.9,MIA,12.7,1998,
8,2.8,0.1,4,0.47,0.303,48,6-8,36.0,0.5,15.1,Jamal Mashburn,SF,4.9,4433334,2.3,MIA,21.2,1998,
9,0.8,0.2,7,0.448,0.309,50,6-10,15.6,1.3,4.2,Terry Mills,PF,3.0,1000000,0.9,MIA,15.8,1998,


In [69]:
# Write To CSV
players_df.to_csv('PlayerData.csv')

In [65]:
pergame_html = page.text[page.text.find('<h2>Per Game</h2>'):]
player_html = pergame_html[pergame_html.find('Kyle Lowry'):]

#re.search('<a href=".+"([\d]+)<a>', player_html)
int(re.search('">([\d]{1,3})</a></td>', player_html).group(1))
#player_html

78