## NBA Data

In [1]:
import re
import os
from pprint import pprint
from typing import Dict, List
from concurrent import futures


import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [2]:
TEAMS_URL = "https://www.nba.com/stats/teams"
CHROME_DRIVER = f"{os.getcwd()}/chromedriver.exe"

def get_team_url(team_id: int, season: str) -> str:
    return f"https://www.nba.com/stats/team/{team_id}?Season={season}"

def get_player_url(player_id: int) -> str:
    return f"https://www.nba.com/stats/player/{player_id}/career?PerMode=Totals"

def get_player_stats_url(player_id: int) -> str:
    return f"https://www.nba.com/stats/player/{player_id}?SeasonType=Regular+Season&PerMode=Totals"

In [42]:
def get_nba_teams() -> pd.DataFrame:
    response = requests.get(TEAMS_URL)
    teams = {}
    team_links = {}

    if not response.ok:
        print(response)
        raise Exception("There was an issue getting the NBA Team Listings")
    
    soup = BeautifulSoup(response.content, "html.parser")
    team_list_div = soup.find("h1", string="Teams List").parent.find_next_sibling()
    team_list = list(team_list_div.children)

    for section in team_list:
        for team in section:
            header_attrs = team.attrs["class"][0]
            if "StatsTeamsList_divName" in header_attrs:
                current_team = team.text
                teams[current_team] = []
            elif "StatsTeamsList_team" in header_attrs:
                teams[current_team].append(team.text)
    
    # Get the team IDs
    for team in team_list:
        for team_info in team.find_all("a"):
            team_links[team_info.text] = f'https://www.nba.com{team_info["href"]}'

    # Convert the dictionary to a pandas DataFrame
    teams_df = \
    pd.DataFrame(teams)\
        .stack()\
        .reset_index()\
        .drop(columns="level_0")\
        .rename(columns={"level_1": "Region", 0: "Teams"})
    
    teams_df["Team ID"] = \
    teams_df["Teams"].map(lambda team_name: team_links[team_name][len("https://www.nba.com/stats/team/"):])

    return teams_df

In [43]:
teams = get_nba_teams()

In [52]:
def get_team_roster(team_id: int, season: str) -> pd.DataFrame:
    capa = DesiredCapabilities.CHROME
    capa["pageLoadStrategy"] = "none"
    driver = webdriver.Chrome(service=Service(CHROME_DRIVER), desired_capabilities=capa)
    wait = WebDriverWait(driver, 30)
    driver.get(get_team_url(team_id=team_id, season=season)) 
    
    try:
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        page_html = driver.page_source
    except:
        driver.execute_script("window.stop();")

    print(team_id)    
    
    try:
        soup_team = BeautifulSoup(page_html, "html.parser")
        players, staff = soup_team.find_all("table")
        df = pd.read_html(str(players))[0][f"{season} Team Roster"]
        df = df.iloc[:, :-1]
        df["Team ID"] = team_id
        df["Season"] = season
        
        player_id = {}

        for a_tag in players.find_all("a"):
            if match := re.search("/stats/player/(\d+)/", a_tag["href"], re.IGNORECASE):
                player_id[a_tag.text] = match.group(1)
        df["Player ID"] = df["Player"].map(lambda player_name: player_id[player_name])
        return df
    except Exception as e:
        print(f"Exception: {e}")
        print("Roster Not Available")
        return pd.DataFrame()

In [50]:
# get_team_roster(1610612751, "2022-23")

In [56]:
def get_all_nba_players(season: str) -> pd.DataFrame:
#     teams = get_nba_teams()
#     with futures.ThreadPoolExecutor() as executor: # default/optimized number of threads
#         player_list = list(executor.map(lambda id_: get_team_roster(id_, season), teams["Team ID"].to_list()))
    player_list = list(map(lambda id_: get_team_roster(id_, season), teams["Team ID"].to_list()))
    return pd.concat(player_list)

In [57]:
players = get_all_nba_players(season="2021-22")

1610612738
1610612741
1610612737
1610612743
1610612744
1610612742
1610612751
1610612739
1610612766
1610612750
1610612746
1610612745
1610612752
1610612765
1610612748
1610612760
1610612747
1610612763
1610612755
1610612754
1610612753
1610612757
1610612756
1610612740
1610612761
1610612749
Exception: local variable 'page_html' referenced before assignment
Roster Not Available
1610612764
1610612762
1610612758
1610612759


In [58]:
players

Unnamed: 0,Player,No.,Pos,Height,Weight,Birthdate,Age,Exp,School,Team ID,Season,Player ID
0,Matt Ryan,#,F,6-7,215 lbs,"APR 17, 1997",25,R,Tennessee-Chattanooga,1610612738,2021-22,1630346
1,Jayson Tatum,#0,F-G,6-8,210 lbs,"MAR 03, 1998",24,4,Duke,1610612738,2021-22,1628369
2,Jaylen Brown,#7,G-F,6-6,223 lbs,"OCT 24, 1996",25,5,California,1610612738,2021-22,1627759
3,Malik Fitts,#8,F,6-5,230 lbs,"JUL 04, 1997",24,1,,1610612738,2021-22,1630238
4,Derrick White,#9,G,6-4,190 lbs,"JUL 02, 1994",27,4,Colorado,1610612738,2021-22,1628401
...,...,...,...,...,...,...,...,...,...,...,...,...
12,Robert Woodard II,#28,F,6-6,235 lbs,"SEP 22, 1999",22,1,Mississippi State,1610612759,2021-22,1630218
13,Keita Bates-Diop,#31,F,6-8,229 lbs,"JAN 23, 1996",26,3,Ohio State,1610612759,2021-22,1628966
14,Tre Jones,#33,G,6-1,185 lbs,"JAN 08, 2000",22,1,Duke,1610612759,2021-22,1630200
15,Jock Landale,#34,C,6-11,255 lbs,"OCT 25, 1995",26,R,St. Mary's,1610612759,2021-22,1629111


In [70]:
def get_quick_stats(player_id: int) -> Dict[str, int]:
    player_response = requests.get(get_player_stats_url(player_id))
    player_soup = BeautifulSoup(player_response.content, "html.parser")
    regex = re.compile("PlayerSummary_playerStat*")
    player_stats = player_soup.find_all("div", class_=regex)
    
    stats = {}
    for i in player_stats:
        stat_tags = list(i.children)
        label, value = stat_tags
        stats[label.text] = float(value.text)
    return stats

In [71]:
get_quick_stats(1630178)

{'PPG': 17.5, 'RPG': 3.2, 'APG': 4.3, 'PIE': 10.5}

In [178]:
def get_player_stats(player_id: int) -> pd.DataFrame:
    driver = webdriver.Chrome(service=Service(CHROME_DRIVER))
    wait = WebDriverWait(driver, 60)
    driver.get(get_player_url(player_id)) 

    try:
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        stats = driver.page_source
    except:
        driver.execute_script("window.stop();")
    print(player_id)
    
    soup = BeautifulSoup(stats, "html.parser")
    col_names = list(map(lambda col: col.text, soup.find("table").find_all("th")[3:]))
    
    table_row = soup.find("table").find("tbody").find_all("tr")
    
    rows = []
    for row in table_row:
        tds = row.find_all("td")
        row = [td.text for td in tds]
        rows.append(row)
    
    try:
        df = pd.DataFrame(rows, columns=col_names)
        df["Player ID"] = player_id
        return df
    except:
        print("There is no available stat on this player")
        return pd.DataFrame()

In [179]:
player = get_player_stats(1630723)

1630723
There is no available stat on this player


In [174]:
player

Unnamed: 0,Season,TEAM,AGE,GP,GS,MIN,PTS,FGM,FGA,FG%,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,Player ID
0,2021-22,BOS,36,69,69,2005,701,266,569,46.7,...,84.2,108,422,530,232,49,92,65,130,201143
1,2020-21,OKC,35,28,28,782,398,162,360,45.0,...,81.8,29,159,188,94,25,26,29,48,201143
2,2019-20,PHI,34,67,61,2026,798,319,709,45.0,...,76.3,103,353,456,270,52,61,80,142,201143
3,2018-19,BOS,33,68,68,1973,925,387,723,53.5,...,82.1,120,338,458,283,59,86,102,126,201143
4,2017-18,BOS,32,72,72,2277,927,368,753,48.9,...,78.3,103,428,531,339,43,78,132,138,201143
5,2016-17,BOS,31,68,68,2193,952,379,801,47.3,...,80.0,95,370,465,337,52,86,115,138,201143
6,2015-16,ATL,30,82,82,2631,1249,529,1048,50.5,...,79.8,148,448,596,263,68,121,107,163,201143
7,2014-15,ATL,29,76,76,2318,1156,519,965,53.8,...,75.9,131,413,544,244,68,98,100,121,201143
8,2013-14,ATL,28,29,29,958,538,238,420,56.7,...,68.2,66,178,244,76,27,44,64,56,201143
9,2012-13,ATL,27,74,74,2756,1289,576,1060,54.3,...,64.4,195,562,757,240,78,78,147,163,201143


In [187]:
philly = players[players["Team ID"] == "1610612755"]

In [188]:
def get_all_player_stats():
    with futures.ThreadPoolExecutor() as executor: # default/optimized number of threads
        player_list = list(executor.map(get_player_stats, philly["Player ID"].to_list()))
    return pd.concat(player_list)

In [189]:
stats = get_all_player_stats()

201935
1630178
202699
200782
1630531
2039541627777
1626149

1629003
There is no available stat on this player
There is no available stat on this player
1629001
There is no available stat on this player1631198
1629718

There is no available stat on this player
1627863
1629680
1630194
1630577
1630701
There is no available stat on this player
1629635
1627788


In [193]:
stats[stats["Player ID"] == "1630701"]

Unnamed: 0,Season,SCHOOL,AGE,GP,GS,MIN,PTS,FGM,FGA,FG%,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,Player ID,TEAM
