In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import pickle

In [2]:
def get_links():
    page = requests.get('https://www.basketball-reference.com/leagues/NBA_2019_per_game.html')
    soup = BeautifulSoup(page.content)
    table = soup.find('table')
    rows = [row for row in table.find_all('tr')]
    
    base_url = 'https://www.basketball-reference.com'
    log = '/gamelog/2019/'

    links = []
    
    for row in rows[1:]:
        try:
            part = row.find('td').findChild().get('href')
            short = part[:-5] # remove the .html because we need to go one 'click' further
            full = base_url + short + log
            links.append(full)
        except:
            continue
            
    return list(set(links))

links = get_links()

In [3]:
links[:5]

['https://www.basketball-reference.com/players/z/zizican01/gamelog/2019/',
 'https://www.basketball-reference.com/players/m/mooreet01/gamelog/2019/',
 'https://www.basketball-reference.com/players/l/loydjo01/gamelog/2019/',
 'https://www.basketball-reference.com/players/b/bazemke01/gamelog/2019/',
 'https://www.basketball-reference.com/players/s/smartma01/gamelog/2019/']

In [8]:
def get_player_stats(url):
    headers = ['Player','Rank','Game','Date','Age','Team','Away','Opp','Result','Started','min_played','fgm','fga','fgp',
           '3pm','3pa','3pp','ftm','fta','ftp','orb','drb','trb','ast','stl','blk','tov','pf',
           'pts','GmSc','plus_minus']
    stats_list = []
    player_page = requests.get(url)
    player_soup = BeautifulSoup(player_page.content)
    player_table = player_soup.find('table',id='pgl_basic')
    player_rows = [row for row in player_table.find_all('tr')]
    
    for row in player_rows[1:]:
        try:
            stats = {}
            player = player_soup.find('title').text.split('2018-19')[0].strip()
            game = row.find('th').text
            items = row.find_all('td')
            stats = dict(zip(headers,([player] + [game] + [item.text for item in items])))
            stats_list.append(stats)
        except:
            continue
    return stats_list



In [None]:
def get_all_stats():
    total = []
    for link in links:
        try:
            player = get_player_stats(link)
            total += player
            time.sleep(1+2*np.random.uniform())
        except:
            continue
    return total

In [None]:
#final = get_all_stats()
#with open('nba_data.pickle','wb') as to_write:
#    pickle.dump(final,to_write)

In [None]:
with open('nba_data.pickle','rb') as read_file:
    stats = pickle.load(read_file)
    
stats[:5]

In [10]:
total = []
for link in links[:3]:
    try:
        player = get_player_stats(link)
        total += player
        time.sleep(1+2*np.random.uniform())
    except:
        continue
        
total

[{'Player': 'Ante Žižić',
  'Rank': '1',
  'Game': '1',
  'Date': '2018-10-17',
  'Age': '21-286',
  'Team': 'CLE',
  'Away': '@',
  'Opp': 'TOR',
  'Result': 'L (-12)',
  'Started': '0',
  'min_played': '14:24',
  'fgm': '2',
  'fga': '2',
  'fgp': '1.000',
  '3pm': '0',
  '3pa': '0',
  '3pp': '',
  'ftm': '1',
  'fta': '2',
  'ftp': '.500',
  'orb': '0',
  'drb': '3',
  'trb': '3',
  'ast': '2',
  'stl': '0',
  'blk': '0',
  'tov': '0',
  'pf': '2',
  'pts': '5',
  'GmSc': '5.5',
  'plus_minus': '+11'},
 {'Player': 'Ante Žižić',
  'Rank': '2',
  'Game': '2',
  'Date': '2018-10-19',
  'Age': '21-288',
  'Team': 'CLE',
  'Away': '@',
  'Opp': 'MIN',
  'Result': 'L (-8)',
  'Started': '0',
  'min_played': '10:51',
  'fgm': '2',
  'fga': '3',
  'fgp': '.667',
  '3pm': '0',
  '3pa': '0',
  '3pp': '',
  'ftm': '3',
  'fta': '4',
  'ftp': '.750',
  'orb': '1',
  'drb': '0',
  'trb': '1',
  'ast': '2',
  'stl': '1',
  'blk': '0',
  'tov': '0',
  'pf': '1',
  'pts': '7',
  'GmSc': '8.0',
  'p

In [12]:
pd.DataFrame(total).Player.unique()

array(['Ante Žižić', "E'Twaun Moore", 'Jordan Loyd'], dtype=object)