# DSCI 511: Data Acquisition and Preprocessing

## NBA API Final Project

### Members:
* Dara Kasrovi
* Ao Wang

## Import Libraries

In [None]:
# !pip install pandas
# !pip install selenium
# !pip install webdriver-manager

In [None]:
import re
import json
import os
from concurrent import futures
from typing import List, Dict, Any

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

## Get NBA Teams 

After running the function, the output should look like this:

|   | Division | Team               |    Team ID |
|--:|---------:|-------------------:|-----------:|
| 0 | Atlantic |     Boston Celtics | 1610612738 |
| 1 | Atlantic |      Brooklyn Nets | 1610612751 |
| 2 | Atlantic |    New York Knicks | 1610612752 |
| 3 | Atlantic | Philadelphia 76ers | 1610612755 |
| 4 | Atlantic |    Toronto Raptors | 1610612761 |


In [None]:
def get_nba_teams() -> pd.DataFrame:
    """Scrape the NBA team stats page for every team's division, name, and ID.

    The team ID is taken from each team link's ``href`` with the
    ``/stats/team/`` prefix removed, so it is returned as a string.

    Returns:
        pd.DataFrame: one row per team with the columns "Division", "Team" and
        "Team ID". An empty DataFrame with those columns is returned when the
        page cannot be fetched or the team listing cannot be located.
    """
    columns = ["Division", "Team", "Team ID"]
    url = "https://www.nba.com/stats/teams"

    # A timeout keeps the call from hanging forever on a stalled connection
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print(f"Something went wrong in getting team listings: {err}")
        return pd.DataFrame(columns=columns)

    # Check that the response was successful, i.e.  200 - good, 401, 404, etc - bad
    if not response.ok:
        print("Something went wrong in getting team listings")
        return pd.DataFrame(columns=columns)

    # Use bs4 and regex to get the table of teams
    soup = BeautifulSoup(response.content, "html.parser")
    regex = re.compile("^StatsTeamsList_divContent")
    table = soup.find("div", {"class": regex})
    if table is None:
        print("Could not find the team listing on the page")
        return pd.DataFrame(columns=columns)

    output = []

    # Atlantic, Central, Southeast, etc are basketball divisions.
    # Only direct child tags are visited; bare text nodes between them have
    # no <h2>/<a> children and would otherwise raise.
    for section in table.find_all(True, recursive=False):
        heading = section.find("h2")
        if heading is None:
            # Not a division block, nothing to collect here
            continue
        division = heading.text

        # Get the team name and team id from every link that carries an href
        for team in section.find_all("a", href=True):
            team_id = team["href"].replace("/stats/team/", "")
            output.append([division, team.text, team_id])
    return pd.DataFrame(output, columns=columns)

In [None]:
# Scrape the team listing once and keep it for the cells below.
# The bare name on the last line shows the DataFrame as the cell output.
teams = get_nba_teams()
teams

## Save NBA Teams to Directory

In [None]:
# if not os.path.isdir("nba"):
#     os.makedirs("nba")
# teams.to_csv(os.path.join("nba", "teams.csv"))

## Get Team Roster

In [None]:
def create_team_query(team_id: int, season: str = "2022-23") -> str:
    """Build the nba.com stats URL for one team's page in a given season.

    Args:
        team_id (int): the NBA team ID, i.e. 1610612755
        season (str): the season of the roster, i.e. 2022-23

    Returns:
        str: the team page URL with the season as a query parameter
    """
    url_template = "https://www.nba.com/stats/team/{}?Season={}"
    return url_template.format(team_id, season)

In [None]:
def get_team_roster(team_id: int, season: str = "2022-23", max_retries: int = 5) -> pd.DataFrame:
    """Gets the NBA team roster

    The roster is read from the JSON embedded in the team page's
    ``__NEXT_DATA__`` script tag.

    Args:
        team_id (int): the NBA team ID, i.e. 1610612755
        season (str): the year of the roster, i.e. 2022-23
        max_retries (int): how many times to retry after the first failed
            request before giving up

    Returns:
        pd.DataFrame: the NBA team roster

    Raises:
        RuntimeError: if the page could not be fetched after all retries, or
            the page does not contain the ``__NEXT_DATA__`` script
    """
    print(team_id)
    url = create_team_query(team_id=team_id, season=season)

    # There are times where requests return a 502 - bad gateway,
    # so just in case, make multiple requests. The number of attempts is
    # bounded so a permanent failure cannot loop forever.
    for attempt in range(max_retries + 1):
        if attempt:
            print(f"There was an issue getting team id={team_id}!!")
            print(f"Reattempting! Iteration {attempt}")
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            # Connection errors and timeouts are retried like a bad status
            continue
        if response.ok:
            break
    else:
        raise RuntimeError(
            f"Could not fetch roster for team id={team_id} after {max_retries} retries"
        )

    # Find roster information
    soup = BeautifulSoup(response.content, "html.parser")
    next_data = soup.find("script", {"id": "__NEXT_DATA__"})
    if next_data is None:
        raise RuntimeError(f"No __NEXT_DATA__ script found for team id={team_id}")
    output = json.loads(next_data.text)
    return pd.DataFrame(output["props"]["pageProps"]["team"]["roster"])

In [None]:
# Fetch a single roster using the example team ID from the docstrings
# (default season) and show it as the cell output.
philly_team = get_team_roster(team_id=1610612755)
philly_team

## Get all Players in NBA

In [None]:
def get_all_players(team_ids: List[int]) -> pd.DataFrame:
    """Get team rosters from all teams in NBA

    Args:
        team_ids (List[int]): list of team IDs

    Returns:
        pd.DataFrame: all the NBA players, with a fresh 0..n-1 index. An empty
        DataFrame is returned when no team IDs are given.
    """
    team_ids = list(team_ids)

    # pd.concat raises on an empty list, so handle "no teams" up front
    if not team_ids:
        return pd.DataFrame()

    # Used threads to speed up queries
    with futures.ThreadPoolExecutor() as executor:
        player_list = list(executor.map(get_team_roster, team_ids))
    return pd.concat(player_list).reset_index(drop=True)

In [None]:
# Download the roster of every team listed in `teams` and show the combined
# table as the cell output.
players = get_all_players(team_ids=teams["Team ID"].to_list())
players

## Preprocess Team Roster

In [None]:
def feet_to_meter(height: str) -> float:
    """Converts a "feet-inches" height string to meters

    Args:
        height (str): the height written as feet and inches separated by a
            dash, i.e. "6-10"

    Returns:
        float: the height in meters, or NaN when the height is missing
        (not a string, or an empty string)

    Raises:
        ValueError: if a non-empty string is not two integers separated by "-"
    """
    # Missing heights become NaN instead of crashing the whole column map
    if not isinstance(height, str) or not height.strip():
        return float("nan")

    feet, inches = map(int, height.split("-"))
    inches_per_foot = 12
    # 1 inch = 0.0254 m (2.54 is the inch-to-centimeter factor)
    meters_per_inch = 0.0254
    return (feet * inches_per_foot + inches) * meters_per_inch

def process_team_roster(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw roster table.

    The input is left untouched. On the copy, four unused columns are removed,
    the remaining column names are title-cased (with the ID and birth date
    columns given spaced names), "Height" is converted with ``feet_to_meter``,
    "Age" and "Weight" are cast to integers, "Weight" is scaled from pounds to
    kilograms, and "Birth Date" is parsed into datetimes.

    Args:
        df (pd.DataFrame): the raw roster data

    Returns:
        pd.DataFrame: the cleaned roster data
    """
    kg_per_pound = 0.45359237
    unused_columns = ["LeagueID", "NICKNAME", "PLAYER_SLUG", "HOW_ACQUIRED"]
    spaced_names = {
        "Teamid": "Team ID",
        "Player_Id": "Player ID",
        "Birth_Date": "Birth Date",
    }

    def to_kilograms(pounds):
        return pounds * kg_per_pound

    # Work on a deep copy and get rid of the columns we don't need
    roster = df.copy(deep=True)
    roster = roster.drop(columns=unused_columns)

    # Title-case every column name, then fix up the ones that need spaces
    roster.columns = roster.columns.str.title()
    roster = roster.rename(columns=spaced_names)

    # Height string -> number, age -> int
    roster["Height"] = roster["Height"].map(feet_to_meter)
    roster["Age"] = roster["Age"].astype(int)

    # Pounds -> kilograms
    roster["Weight"] = roster["Weight"].astype(int).map(to_kilograms)

    # Birth date string -> DateTime
    roster["Birth Date"] = pd.to_datetime(roster["Birth Date"])
    return roster

In [None]:
# Clean the combined roster; `players` itself is not modified because
# process_team_roster works on a copy.
processed_team_roster = process_team_roster(df=players)
processed_team_roster

In [None]:
# processed_team_roster.to_csv(os.path.join("nba", "players.csv"))

## Get Player Dashboard Stats

In [None]:
def create_player_query(player_id: int) -> str:
    """Build the nba.com stats URL for a player's career page (totals mode).

    Args:
        player_id (int): the player's ID

    Returns:
        str: the URL of the player's career stats page
    """
    url_template = "https://www.nba.com/stats/player/{}/career?PerMode=Totals"
    return url_template.format(player_id)

In [None]:
def get_quick_stats(player_id: int) -> Dict[str, Any]:
    """The function gets a player's quick stats that show up on the dashboard, such as the
    PPG, RPG, APG, and PIE

    Each matching stat element is expected to have exactly two children: a
    label and a value. A value of "--" is stored as NaN, anything else is
    parsed as a float.

    Args:
        player_id (int): the player's ID

    Returns:
        Dict[str, Any]: the player's quick stats keyed by stat label, plus a
        "Player ID" entry. Only "Player ID" is present when the page could not
        be fetched.

    Raises:
        ValueError: if a stat element does not have exactly two children or a
            value is not numeric
    """
    url = create_player_query(player_id=player_id)
    print(url)

    stats = {"Player ID": player_id}

    # A failed download yields a row with just the ID instead of aborting
    # every other download running in the thread pool
    try:
        player_response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print(f"Request failed for player id={player_id}: {err}")
        return stats
    if not player_response.ok:
        print(f"Got HTTP {player_response.status_code} for player id={player_id}")
        return stats

    player_soup = BeautifulSoup(player_response.content, "html.parser")
    # Class names starting with this prefix; the previous trailing "*" was a
    # glob-style wildcard that, as a regex, only made the final "t" optional
    regex = re.compile(r"PlayerSummary_playerStat")

    for stat in player_soup.find_all("div", class_=regex):
        label, value = stat.children
        stats[label.text] = float(value.text) if value.text != "--" else np.nan
    return stats

In [None]:
def get_all_player_quick_stats(player_ids: List[int]) -> pd.DataFrame:
    """Collect the dashboard quick stats of every given player into one table.

    Args:
        player_ids (List[int]): all the player's IDs

    Returns:
        pd.DataFrame: one row of quick stats per player, in the order of
        ``player_ids``
    """
    # The lookups are fanned out over a thread pool to speed up the queries
    with futures.ThreadPoolExecutor() as pool:
        stat_rows = [*pool.map(get_quick_stats, player_ids)]
    return pd.DataFrame(stat_rows)


In [None]:
# IDs come from the raw `players` table, whose column is still named
# "PLAYER_ID" (the rename to "Player ID" only exists in the processed copy).
player_ids = players["PLAYER_ID"].to_list()
quick_stats = get_all_player_quick_stats(player_ids=player_ids)

In [None]:
# quick_stats.to_csv(os.path.join("nba", "player_quick_stats.csv"))

## Get Player Career Stats

In [None]:
def loading_completed(driver) -> bool:
    """Tell the driver whether the page's loading overlay has been hidden.

    Intended to be passed to ``WebDriverWait.until`` as the wait condition.

    Args:
        driver (selenium.webdriver.chrome.webdriver.WebDriver): the Selenium driver

    Returns:
        bool: True when the overlay element's "data-hidden" attribute is
        "true"; False when it has another value or the overlay is not found
    """
    overlay_selector = "div[class*=LoadingOverlay]"
    try:
        overlay = driver.find_element(By.CSS_SELECTOR, overlay_selector)
    except NoSuchElementException:
        # The overlay is not in the DOM (yet), so keep waiting
        return False
    else:
        hidden_flag = overlay.get_attribute("data-hidden")
        return hidden_flag == "true"


In [None]:
def get_page_source(url: str, timeout: int = 30) -> str:
    """Opens the player stats page and gets the HTML page source

    A new Chrome driver is started for every call and is always shut down
    again, whether or not the page loaded.

    Args:
        url (str): the player stats URL
        timeout (int): how many seconds to wait for the loading overlay to
            disappear

    Returns:
        str: the player stats HTML

    Raises:
        Exception: if the page could not be loaded within the timeout or the
            driver failed; the original error is attached as the cause
    """
    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    try:
        wait = WebDriverWait(driver, timeout=timeout)
        driver.get(url)
        wait.until(loading_completed)
        return driver.page_source
    except Exception as err:
        # Same exception type and message as before, now with the cause chained
        raise Exception("Something went wrong!") from err
    finally:
        # Always close the browser; otherwise every failed attempt leaves a
        # Chrome process running
        driver.quit()


In [None]:
def get_player_info(player_id: int, max_retries: int = 5) -> pd.DataFrame:
    """Gets the player's career regular season stats

    Args:
        player_id (int): the player's ID
        max_retries (int): maximum number of attempts at loading the page

    Returns:
        pd.DataFrame: the player's career stats with an added "PLAYER ID"
        column. An empty DataFrame is returned when the page could not be
        loaded, the page reports "No data available", or the table could not
        be parsed.
    """
    url = create_player_query(player_id=player_id)
    print(url)

    # Retrieve the player's stats page
    # Tries multiple times, stopping at the first success (the previous
    # `while True` loop never exited, even after a successful download)
    page_content = None
    for i in range(max_retries):
        try:
            page_content = get_page_source(url)
        except Exception:
            # get_page_source raises a plain Exception, so this is as narrow
            # as the handler can be
            print(f"Iteration #{i}: Issue getting page contents")
        else:
            break

    if page_content is None:
        print(f"Giving up on player {player_id} after {max_retries} attempts")
        return pd.DataFrame()

    # Looks for the CAREER REGULAR SEASON STATS table
    soup = BeautifulSoup(page_content, "html.parser")
    # Class names containing "Crom_table"; the previous trailing "*" was a
    # glob-style wildcard that, as a regex, only made the final "e" optional
    table = soup.find("table", {"class": re.compile(r"Crom_table")})

    try:
        # Collect and format the table data
        cols = [elem.text for elem in table.find_all("th")[3:]]
        cells = [cell.text for cell in table.find_all("td")]
        output = []
        row = []
        for i, v in enumerate(cells):
            # A new row starts every len(cols) cells
            if i != 0 and i % len(cols) == 0:
                output.append(row)
                row = []

            row.append(v)
        # NOTE(review): the final `row` is never appended, so the last table
        # row is dropped. Kept as-is because it may be deliberate (e.g. a
        # summary row) -- confirm against the live page.

        df = pd.DataFrame(output, columns=cols)
        df["PLAYER ID"] = player_id
        return df
    except Exception:
        # New players don't have stats, so return an empty DataFrame
        if soup.find("div", string="No data available"):
            print(f"No data available for player: {player_id}")
            return pd.DataFrame()
        print("There seems to be another issue!!")
        # Return an empty frame rather than an implicit None so the return
        # type matches the annotation
        return pd.DataFrame()


In [None]:
def get_all_player_info(player_ids: List[int]) -> pd.DataFrame:
    """Get all the player's career statistics. I don't use threads with Selenium because
    Chrome is a RAM-hog, and it wastes more time than just visiting a site one after another.

    Args:
        player_ids (List[int]): all the player IDs

    Returns:
        pd.DataFrame: all player career stats, with a fresh 0..n-1 index. An
        empty DataFrame is returned when there is nothing to combine.
    """
    output = []
    for idx, player_id in enumerate(player_ids):
        print(f"#{idx}", end=" ")
        info = get_player_info(player_id)
        # Skip missing results so they cannot reach pd.concat
        if info is not None:
            output.append(info)

    # pd.concat raises on an empty list, so handle "no results" explicitly
    if not output:
        return pd.DataFrame()
    return pd.concat(output).reset_index(drop=True)

In [None]:
# Only the first five player IDs are scraped here.
players_info = get_all_player_info(player_ids=player_ids[:5])

In [None]:
def process_career_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw career stats without modifying the input.

    The "SCHOOL" column is removed, the team, age and player ID columns are
    renamed, and every column other than "Season" and "Team" is converted
    from strings to a numeric type.

    Args:
        df (pd.DataFrame): the raw career stats

    Returns:
        pd.DataFrame: the processed career stats
    """
    text_columns = ("Season", "Team")
    new_names = {
        "TEAM": "Team",
        "AGE": "Age",
        "PLAYER ID": "Player ID",
    }

    # Start from a deep copy so the caller's DataFrame stays as it was
    stats = df.copy(deep=True)
    stats = stats.drop(columns="SCHOOL")
    stats = stats.rename(columns=new_names)

    # Everything that is not a text column holds numbers stored as strings
    numeric_columns = [name for name in set(stats.columns) if name not in text_columns]
    stats[numeric_columns] = stats[numeric_columns].apply(pd.to_numeric)
    return stats

In [None]:
# Clean the scraped career stats; `players_info` itself is left unchanged.
career_stats = process_career_stats(players_info)

In [None]:
# career_stats.to_csv(os.path.join("nba", "player_info.csv"))