# DSCI 511: Data Aquistion and Preprocessing

## NBA API Final Project

### Members:
* Dara Kasrovi
* Ao Wang

## Import Libraries

In [1]:
# !pip install pandas
# !pip install selenium
# !pip install webdriver-manager

In [2]:
import re
import json
import os
from concurrent import futures
from typing import List, Dict, Any

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

## Get NBA Teams 

After running the function, the output should look like this:

|   | Division | Team               |    Team ID |
|--:|---------:|-------------------:|-----------:|
| 0 | Atlantic |     Boston Celtics | 1610612738 |
| 1 | Atlantic |      Brooklyn Nets | 1610612751 |
| 2 | Atlantic |    New York Knicks | 1610612752 |
| 3 | Atlantic | Philadelphia 76ers | 1610612755 |
| 4 | Atlantic |    Toronto Raptors | 1610612761 |


In [3]:
def get_nba_teams() -> pd.DataFrame:
    """The function goes to the NBA team stats page and scrapes all the team data,
    including division, team, and team ID
    
    Returns:
        pd.DataFrame: the NBA teams
    """
    url = "https://www.nba.com/stats/teams"
    response = requests.get(url)
    
    # Check that the response was successful, i.e.  200 - good, 401, 404, etc - bad
    if not response.ok:
        print("Something went wrong in getting team listings")
        return pd.DataFrame()
    
    # Use bs4 and regex to get the table of teams
    soup = BeautifulSoup(response.content, "html.parser")
    regex = re.compile("^StatsTeamsList_divContent")
    table = soup.find("div", {"class": regex})
    
    # Atlantic, Central, Southeast, etc are basketball divisions
    chart = list(table.children)
    output = []
    
    # Get the division, team, and team id in the HTML
    for html in chart:
        division = html.find("h2").text
        teams = html.find_all("a")
        for team in teams:
            href = team["href"].replace("/stats/team/", "")
            team_name = team.text
            output.append([division, team_name, href])
    return pd.DataFrame(output, columns=["Division", "Team", "Team ID"])

In [4]:
teams = get_nba_teams()
teams

Unnamed: 0,Division,Team,Team ID
0,Atlantic,Boston Celtics,1610612738
1,Atlantic,Brooklyn Nets,1610612751
2,Atlantic,New York Knicks,1610612752
3,Atlantic,Philadelphia 76ers,1610612755
4,Atlantic,Toronto Raptors,1610612761
5,Central,Chicago Bulls,1610612741
6,Central,Cleveland Cavaliers,1610612739
7,Central,Detroit Pistons,1610612765
8,Central,Indiana Pacers,1610612754
9,Central,Milwaukee Bucks,1610612749


## Save NBA Teams to Directory

In [None]:
# if not os.path.isdir("data"):
#     os.makedirs("data")
# teams.to_csv(os.path.join("data", "teams.csv"))

## Get Team Roster

In [5]:
def create_team_query(team_id: int, season: str = "2022-23") -> str:
    """Create URL for team roster
    
    Args:
        team_id (int): the NBA team ID, i.e. 1610612755
        season (str): the year of the roster, i.e. 2022-23
    
    Returns:
        str: the URL for team roster
    """
    return f"https://www.nba.com/stats/team/{team_id}?Season={season}"

In [6]:
def get_team_roster(team_id: int, season: str = "2022-23") -> pd.DataFrame:
    """Gets the NBA team roster
    
    Args:
        team_id (int): the NBA team ID, i.e. 1610612755
        season (str): the year of the roster, i.e. 2022-23
    
    Returns:
        pd.DataFrame: the NBA team roster
    """
    print(team_id)
    url = create_team_query(team_id=team_id, season=season)
    response = requests.get(url)
    i = 0
    
    # There are times where requests return a 502 - bad gateway,
    # so just in case, make multiple requests
    while not response.ok:
        print(f"There was an issue getting team id={team_id}!!")
        print(f"Reattempting! Iteration {i + 1}")
        i += 1
        response = requests.get(url)
    
    # Find roster information
    soup = BeautifulSoup(response.content, "html.parser")
    output = json.loads(soup.find("script", {"id": "__NEXT_DATA__"}).text)
    return pd.DataFrame(output["props"]["pageProps"]["team"]["roster"])

In [7]:
philly_team = get_team_roster(team_id=1610612755)
philly_team

1610612755


Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
0,1610612755,2022,0,Tyrese Maxey,Tyrese,tyrese-maxey,0,G,6-2,200,"NOV 04, 2000",22,2,Kentucky,1630178,#21 Pick in 2020 Draft
1,1610612755,2022,0,James Harden,James,james-harden,1,G,6-5,220,"AUG 26, 1989",33,13,Arizona State,201935,Traded from BKN on 02/10/22
2,1610612755,2022,0,Montrezl Harrell,Montrezl,montrezl-harrell,5,F-C,6-7,240,"JAN 26, 1994",28,7,Louisville,1626149,Signed on 09/13/22
3,1610612755,2022,0,De'Anthony Melton,De'Anthony,deanthony-melton,8,G,6-2,200,"MAY 28, 1998",24,4,Southern California,1629001,Traded from MEM on 06/24/22
4,1610612755,2022,0,Jaden Springer,Jaden,jaden-springer,11,G,6-4,202,"SEP 25, 2002",20,1,Tennessee,1630531,#28 Pick in 2021 Draft
5,1610612755,2022,0,Tobias Harris,Tobias,tobias-harris,12,F,6-7,226,"JUL 15, 1992",30,11,Tennessee,202699,Traded from LAC on 02/06/19
6,1610612755,2022,0,P.J. Tucker,P.J.,pj-tucker,17,F,6-5,245,"MAY 05, 1985",37,11,Texas,200782,Signed on 07/06/22
7,1610612755,2022,0,Shake Milton,Shake,shake-milton,18,G-F,6-5,205,"SEP 26, 1996",26,4,Southern Methodist,1629003,Draft Rights Traded from DAL on 06/22/18
8,1610612755,2022,0,Georges Niang,Georges,georges-niang,20,F,6-7,230,"JUN 17, 1993",29,6,Iowa State,1627777,Signed on 08/06/21
9,1610612755,2022,0,Joel Embiid,Joel,joel-embiid,21,C-F,7-0,280,"MAR 16, 1994",28,6,Kansas,203954,#3 Pick in 2014 Draft


## Get all Players in NBA

In [11]:
def get_all_players(team_ids: List[int]) -> pd.DataFrame:
    """Get team rosters from all teams in NBA
    
    Args: 
        team_ids (List[int]): list of team IDs
    
    Returns:
        pd.DataFrame: all the NBA players
    """
    
    # Used threads to speed up queries
    with futures.ThreadPoolExecutor() as executor:
        player_list = list(executor.map(get_team_roster, team_ids))
    return pd.concat(player_list).reset_index(drop=True)

In [12]:
players = get_all_players(team_ids=teams["Team ID"].to_list())
players

1610612738
1610612751
1610612752
1610612755
1610612761
1610612741
1610612739
1610612765
1610612754
1610612749
1610612737
1610612766
1610612748
1610612753
1610612764
1610612743
1610612750
1610612760
1610612757
1610612762
1610612744
1610612746
1610612747
1610612756
1610612758
1610612742
1610612745
1610612763
1610612740
1610612759


Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
0,1610612738,2022,00,Jayson Tatum,Jayson,jayson-tatum,0,F-G,6-8,210,"MAR 03, 1998",24,5,Duke,1628369,#3 Pick in 2017 Draft
1,1610612738,2022,00,Noah Vonleh,Noah,noah-vonleh,4,F,6-10,257,"AUG 24, 1995",27,7,Indiana,203943,Signed on 09/07/22
2,1610612738,2022,00,Jaylen Brown,Jaylen,jaylen-brown,7,G-F,6-6,223,"OCT 24, 1996",26,6,California,1627759,#3 Pick in 2016 Draft
3,1610612738,2022,00,Danilo Gallinari,Danilo,danilo-gallinari,8,F,6-10,236,"AUG 08, 1988",34,13,Olimpia Milano,201568,Signed on 07/12/22
4,1610612738,2022,00,Derrick White,Derrick,derrick-white,9,G,6-4,190,"JUL 02, 1994",28,5,Colorado,1628401,Traded from SAS on 02/10/22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,1610612759,2022,00,Keita Bates-Diop,Keita,keita-bates-diop,31,F,6-8,229,"JAN 23, 1996",26,4,Ohio State,1628966,Signed on 11/29/20
497,1610612759,2022,00,Tre Jones,Tre,tre-jones,33,G,6-1,185,"JAN 08, 2000",22,2,Duke,1630200,#41 Pick in 2020 Draft
498,1610612759,2022,00,Romeo Langford,Romeo,romeo-langford,35,G-F,6-5,216,"OCT 25, 1999",23,3,Indiana,1629641,Traded from BOS on 02/10/22
499,1610612759,2022,00,Gorgui Dieng,Gorgui,gorgui-dieng,41,C,6-10,248,"JAN 18, 1990",32,9,Louisville,203476,Signed on 08/09/22


## Preprocess Team Roster

In [14]:
def feet_to_meter(height: str) -> float:
    """Converts feet'inches to meters"""
    feet, inches = list(map(int, height.split("-")))
    foot_to_inch_conversion = 12
    inch_to_meter = 2.54
    return (feet * foot_to_inch_conversion + inches) * inch_to_meter

def process_team_roster(df: pd.DataFrame) -> pd.DataFrame:
    """Cleans the team roster data"""
    pounds_to_kilograms = 0.45359237
    output = df.copy(deep=True)
    
    # Drop columns we don't need
    output.drop(columns=["LeagueID", "NICKNAME", "PLAYER_SLUG", "HOW_ACQUIRED"], inplace=True)
    
    # Format columns to have title case and reformat columns with ID
    output.columns = output.columns.str.title()
    output.rename(columns={
        "Teamid": "Team ID",
        "Player_Id": "Player ID",
        "Birth_Date": "Birth Date"
    }, inplace=True)
    
    # Convert feet and inches to meter
    output["Height"] = output["Height"].map(feet_to_meter)
    output["Age"] = output["Age"].astype(int)
    
    # Convert pounds to kilograms
    output["Weight"] = output["Weight"].astype(int)
    output["Weight"] = output["Weight"].map(lambda weight: weight * pounds_to_kilograms)
    
    # Convert to DateTime
    output["Birth Date"] = pd.to_datetime(output["Birth Date"])
    return output

In [15]:
processed_team_roster = process_team_roster(df=players)
processed_team_roster

Unnamed: 0,Team ID,Season,Player,Num,Position,Height,Weight,Birth Date,Age,Exp,School,Player ID
0,1610612738,2022,Jayson Tatum,0,F-G,203.20,95.254398,1998-03-03,24,5,Duke,1628369
1,1610612738,2022,Noah Vonleh,4,F,208.28,116.573239,1995-08-24,27,7,Indiana,203943
2,1610612738,2022,Jaylen Brown,7,G-F,198.12,101.151099,1996-10-24,26,6,California,1627759
3,1610612738,2022,Danilo Gallinari,8,F,208.28,107.047799,1988-08-08,34,13,Olimpia Milano,201568
4,1610612738,2022,Derrick White,9,G,193.04,86.182550,1994-07-02,28,5,Colorado,1628401
...,...,...,...,...,...,...,...,...,...,...,...,...
496,1610612759,2022,Keita Bates-Diop,31,F,203.20,103.872653,1996-01-23,26,4,Ohio State,1628966
497,1610612759,2022,Tre Jones,33,G,185.42,83.914588,2000-01-08,22,2,Duke,1630200
498,1610612759,2022,Romeo Langford,35,G-F,195.58,97.975952,1999-10-25,23,3,Indiana,1629641
499,1610612759,2022,Gorgui Dieng,41,C,208.28,112.490908,1990-01-18,32,9,Louisville,203476


In [None]:
# processed_team_roster.to_csv(os.path.join("nba", "players.csv"))

## Get Player Dashboard Stats

In [16]:
def create_player_query(player_id: int) -> str:
    """Creates URL for player career regular season stats
    
    Args:
        player_id (int): the player's ID
    
    Returns:
        str: the URL for player stats
    """
    return f"https://www.nba.com/stats/player/{player_id}/career?PerMode=Totals"

In [17]:
def get_quick_stats(player_id: int) -> Dict[str, Any]:
    """The function gets a player's quick stats that show up on the dashboard, such as the
    PPG, RPG, APG, and PIE

    Args:
        player_id (int): the player's ID
    
    Returns:
        Dict[str, Any]: the player's quick stats
    """
    url = create_player_query(player_id=player_id)
    print(url)
    
    player_response = requests.get(url)
    player_soup = BeautifulSoup(player_response.content, "html.parser")
    regex = re.compile("PlayerSummary_playerStat*")
    player_stats = player_soup.find_all("div", class_=regex)
    
    stats = {"Player ID": player_id}
    for i in player_stats:
        stat_tags = list(i.children)
        label, value = stat_tags
        stats[label.text] = float(value.text) if value.text != "--" else np.nan
    return stats

In [18]:
def get_all_player_quick_stats(player_ids: List[int]) -> pd.DataFrame:
    """Get all the NBA player's quick stats from the dashboard
    
    Args:
        player_ids (List[int]): all the player's IDs
    
    Returns:
        pd.DataFrame: all the player's quick stats
    """
    # Used threads to speed up queries
    with futures.ThreadPoolExecutor() as executor:
        player_list = list(executor.map(get_quick_stats, player_ids))
    return pd.DataFrame(player_list)


In [21]:
player_ids = players["PLAYER_ID"].to_list()
quick_stats = get_all_player_quick_stats(player_ids=player_ids)
quick_stats

https://www.nba.com/stats/player/1628369/career?PerMode=Totalshttps://www.nba.com/stats/player/203943/career?PerMode=Totals

https://www.nba.com/stats/player/1627759/career?PerMode=Totals
https://www.nba.com/stats/player/201568/career?PerMode=Totals
https://www.nba.com/stats/player/1628401/career?PerMode=Totals
https://www.nba.com/stats/player/1630202/career?PerMode=Totals
https://www.nba.com/stats/player/1629684/career?PerMode=Totals
https://www.nba.com/stats/player/1627763/career?PerMode=Totals
https://www.nba.com/stats/player/1631120/career?PerMode=Totals
https://www.nba.com/stats/player/1629662/career?PerMode=Totals
https://www.nba.com/stats/player/1630573/career?PerMode=Totals
https://www.nba.com/stats/player/203935/career?PerMode=Totals
https://www.nba.com/stats/player/1628436/career?PerMode=Totals
https://www.nba.com/stats/player/201143/career?PerMode=Totals
https://www.nba.com/stats/player/1629057/career?PerMode=Totals
https://www.nba.com/stats/player/1628382/career?PerMode=Tot

https://www.nba.com/stats/player/204456/career?PerMode=Totals
https://www.nba.com/stats/player/1631112/career?PerMode=Totals
https://www.nba.com/stats/player/1629052/career?PerMode=Totals
https://www.nba.com/stats/player/201949/career?PerMode=Totals
https://www.nba.com/stats/player/1630678/career?PerMode=Totals
https://www.nba.com/stats/player/1630543/career?PerMode=Totals
https://www.nba.com/stats/player/1630174/career?PerMode=Totals
https://www.nba.com/stats/player/1627741/career?PerMode=Totalshttps://www.nba.com/stats/player/1630188/career?PerMode=Totals
https://www.nba.com/stats/player/1628464/career?PerMode=Totals

https://www.nba.com/stats/player/1626167/career?PerMode=Totals
https://www.nba.com/stats/player/1629048/career?PerMode=Totals
https://www.nba.com/stats/player/1630699/career?PerMode=Totals
https://www.nba.com/stats/player/201588/career?PerMode=Totals
https://www.nba.com/stats/player/1628975/career?PerMode=Totals
https://www.nba.com/stats/player/204060/career?PerMode=Tot

https://www.nba.com/stats/player/1630639/career?PerMode=Totals
https://www.nba.com/stats/player/1629162/career?PerMode=Totals
https://www.nba.com/stats/player/1631111/career?PerMode=Totals
https://www.nba.com/stats/player/1631169/career?PerMode=Totals
https://www.nba.com/stats/player/1627854/career?PerMode=Totals
https://www.nba.com/stats/player/1629675/career?PerMode=Totals
https://www.nba.com/stats/player/1627752/career?PerMode=Totals
https://www.nba.com/stats/player/1630233/career?PerMode=Totals
https://www.nba.com/stats/player/203085/career?PerMode=Totals
https://www.nba.com/stats/player/203497/career?PerMode=Totals
https://www.nba.com/stats/player/1626157/career?PerMode=Totals
https://www.nba.com/stats/player/1630568/career?PerMode=Totals
https://www.nba.com/stats/player/1628983/career?PerMode=Totals
https://www.nba.com/stats/player/1630581/career?PerMode=Totals
https://www.nba.com/stats/player/1629652/career?PerMode=Totals
https://www.nba.com/stats/player/1631119/career?PerMode=T

https://www.nba.com/stats/player/1631099/career?PerMode=Totals
https://www.nba.com/stats/player/1630558/career?PerMode=Totals
https://www.nba.com/stats/player/1631320/career?PerMode=Totals
https://www.nba.com/stats/player/1626158/career?PerMode=Totals
https://www.nba.com/stats/player/1631165/career?PerMode=Totals
https://www.nba.com/stats/player/203458/career?PerMode=Totalshttps://www.nba.com/stats/player/1629644/career?PerMode=Totals

https://www.nba.com/stats/player/203084/career?PerMode=Totals
https://www.nba.com/stats/player/1626168/career?PerMode=Totals
https://www.nba.com/stats/player/1629674/career?PerMode=Totals
https://www.nba.com/stats/player/201580/career?PerMode=Totals
https://www.nba.com/stats/player/1629033/career?PerMode=Totals
https://www.nba.com/stats/player/1630267/career?PerMode=Totals
https://www.nba.com/stats/player/1630702/career?PerMode=Totals
https://www.nba.com/stats/player/203939/career?PerMode=Totals
https://www.nba.com/stats/player/1630182/career?PerMode=Tot

Unnamed: 0,Player ID,PPG,RPG,APG,PIE
0,1628369,31.9,7.4,3.9,17.5
1,203943,1.9,3.0,0.3,2.5
2,1627759,25.4,6.8,3.5,12.0
3,201568,11.7,4.7,1.5,9.9
4,1628401,8.9,3.2,3.1,8.1
...,...,...,...,...,...
496,1628966,,,,
497,1630200,,,,
498,1629641,,,,
499,203476,,,,


In [None]:
# quick_stats.to_csv(os.path.join("nba", "player_quick_stats.csv"))

## Get Player Career Stats

In [22]:
def loading_completed(driver) -> bool:
    """Allows the driver to know when the loading animation is over
    
    Args:
        driver (selenium.webdriver.chrome.webdriver.WebDriver): the Selenium driver
    
    Returns:
        bool: whether the loading animation is over
    """
    try:
        element = driver.find_element(By.CSS_SELECTOR, "div[class*=LoadingOverlay]")
    except NoSuchElementException:
        return False
    return element.get_attribute("data-hidden") == "true"


In [23]:
def get_page_source(url: str) -> str:
    """Opens the player stats page and gets the HTML page source
    
    Args:
        url (str): the player stats URL
    
    Returns:
        str: the player stats HTML
    """
    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # Timeout in seconds
    TIMEOUT = 30
    
    try:
        wait = WebDriverWait(driver, timeout=TIMEOUT)
        driver.get(url)
        wait.until(loading_completed)
        page_source = driver.page_source
        driver.quit()
        return page_source
    except Exception:
        raise Exception("Something went wrong!")


In [24]:
def get_player_info(player_id: int) -> pd.DataFrame:
    """Gets the player's career regular season stats
    
    Args:
        player_id: the player's ID
    
    Returns:
        pd.DataFrame: the player's career stats
    """
    url = create_player_query(player_id=player_id)
    print(url)
    
    # Retrieve the player's stats page
    # Tries multiple times
    i = 0
    while True:
        try:
            page_content = get_page_source(url)
        except Exception:
            print(f"Iteration #{i}: Issue getting page contents")
            i += 1
    
    # Looks for the CAREER REGULAR SEASON STATS table
    soup = BeautifulSoup(page_content, "html.parser")
    table = soup.find("table", {"class": re.compile("Crom_table*")})

    try:
        # Collect and format the table data
        cols = [elem.text for elem in table.find_all("th")[3:]]
        rows = [row.text for row in table.find_all("td")]
        output = []
        row = []
        for i, v in enumerate(rows):
            if i != 0 and i % len(cols) == 0:
                output.append(row)
                row = []

            row.append(v)

        df = pd.DataFrame(output, columns=cols)
        df["PLAYER ID"] = player_id
        return df
    except Exception:
        # New players don't have stats, so return an empty DataFrame
        if soup.find("div", string="No data available"):
            print(f"No data available for player: {player_id}")
            return pd.DataFrame()
        print("There seems to be another issue!!")


In [27]:
def get_all_player_info(player_ids: List[int]) -> pd.DataFrame:
    """Get all the player's career statistics. I don't use threads with Selenium because
    Chrome is a RAM-hog, and it wastes more time than just visiting a site one after another.
    
    Args:
        player_ids (List[int]): all the player IDs
    
    Returns:
        pd.DataFrame: all player career stats
    """
    output = []
    for idx, player_id in enumerate(player_ids):
        print(f"#{idx}", end=" ")
        output.append(get_player_info(player_id))
    return pd.concat(output).reset_index(drop=True)

In [28]:
players_info = get_all_player_info(player_ids=player_ids[:5])

#0 https://www.nba.com/stats/player/1628369/career?PerMode=Totals


[WDM] - Downloading: 100%|█████████████████| 7.72M/7.72M [00:00<00:00, 31.7MB/s]


Iteration #0: Issue getting page contents
Iteration #1: Issue getting page contents



KeyboardInterrupt



In [29]:
def process_career_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Process the career stats. Changing the column names and making the statistics into numeric types
    instead of strings.
    
    Args:
        df (pd.DataFrame): the raw career stats
        
    Returns:
        pd.DataFrame: the processed career stats
    """
    df = df.copy(deep=True)
    }    df.rename(columns={
        "TEAM": "Team", 
        "AGE": "Age",
        "PLAYER ID": "Player ID",
    }, inplace=True)
    
    int_cols = list(set(df.columns) - {"Season", "Team"})
    df[int_cols] = df[int_cols].apply(pd.to_numeric)
    return df

In [30]:
career_stats = process_career_stats(players_info)

NameError: name 'players_info' is not defined

In [None]:
# career_stats.to_csv(os.path.join("nba", "player_info.csv"))