# DSCI 511: Data Acquisition and Preprocessing

## NBA API

### Members:
* Dara Kasrovi
* Ao Wang

## Import Libraries

In [1]:
import json
import os
import re
import time
from concurrent import futures
from typing import List, Optional

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

## Constants

In [2]:
# Page requested by get_nba_teams() to read the division/team listing.
TEAMS_URL = "https://www.nba.com/stats/teams"

## Get NBA Teams 

After running the function, the first few rows of the output should look like this:

|   | Division | Team               |    Team_ID |
|--:|---------:|-------------------:|-----------:|
| 0 | Atlantic |     Boston Celtics | 1610612738 |
| 1 | Atlantic |      Brooklyn Nets | 1610612751 |
| 2 | Atlantic |    New York Knicks | 1610612752 |
| 3 | Atlantic | Philadelphia 76ers | 1610612755 |
| 4 | Atlantic |    Toronto Raptors | 1610612761 |


In [3]:
def get_nba_teams() -> pd.DataFrame:
    """Scrape the NBA team stats page into a table of divisions and teams.

    Requests ``TEAMS_URL`` and collects every team link found under each
    division section of the team listing.

    Returns:
        A DataFrame with the columns ``Division``, ``Team`` and ``Team_ID``,
        one row per team link. ``Team_ID`` is the link's ``href`` with the
        ``/stats/team/`` prefix removed, so it is a string. If the response
        is not successful, or the team listing cannot be located in the page,
        a message is printed and an empty DataFrame with the same three
        columns is returned.
    """
    columns = ["Division", "Team", "Team_ID"]

    # A timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(TEAMS_URL, timeout=30)

    # Check that the response was successful, i.e. 200 - good, 401, 404, etc - bad
    if not response.ok:
        print("Something went wrong in getting team listings")
        return pd.DataFrame(columns=columns)

    # Use bs4 and regex to get the container that holds the teams
    soup = BeautifulSoup(response.content, "html.parser")
    regex = re.compile("^StatsTeamsList_divContent")
    table = soup.find("div", {"class": regex})
    if table is None:
        print("Could not find the team listing in the page")
        return pd.DataFrame(columns=columns)

    output = []

    # Only direct child *tags* are considered; `.children` would also yield
    # bare text nodes, which have no usable `find("h2")`.
    for section in table.find_all(True, recursive=False):
        heading = section.find("h2")
        if heading is None:
            # Not a division section (no heading), nothing to read here
            continue
        division = heading.text

        # href=True skips anchors that carry no link target at all
        for team in section.find_all("a", href=True):
            team_id = team["href"].replace("/stats/team/", "")
            output.append([division, team.text, team_id])

    return pd.DataFrame(output, columns=columns)

In [4]:
# Scrape the division/team listing once and keep it for the cells below.
teams = get_nba_teams()

In [5]:
# Display the scraped team table.
teams

Unnamed: 0,Division,Team,Team_ID
0,Atlantic,Boston Celtics,1610612738
1,Atlantic,Brooklyn Nets,1610612751
2,Atlantic,New York Knicks,1610612752
3,Atlantic,Philadelphia 76ers,1610612755
4,Atlantic,Toronto Raptors,1610612761
5,Central,Chicago Bulls,1610612741
6,Central,Cleveland Cavaliers,1610612739
7,Central,Detroit Pistons,1610612765
8,Central,Indiana Pacers,1610612754
9,Central,Milwaukee Bucks,1610612749


In [6]:
def create_team_query(team_id: int, season: str = "2022-23") -> str:
    """Build the nba.com stats page URL for one team in one season.

    Args:
        team_id: Identifier placed in the URL path.
        season: Value used for the ``Season`` query parameter.

    Returns:
        The complete URL as a string.
    """
    team_page = f"https://www.nba.com/stats/team/{team_id}"
    return f"{team_page}?Season={season}"

In [7]:
# Example: team page URL for team id 1610612755 with the default season.
create_team_query(1610612755)

'https://www.nba.com/stats/team/1610612755?Season=2022-23'

In [8]:
def get_team_roster(
    team_id: int,
    season: str = "2022-23",
    max_retries: int = 5,
    retry_delay: float = 1.0,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """Download one team's page and return its roster as a DataFrame.

    The roster is read from the JSON embedded in the page's
    ``<script id="__NEXT_DATA__">`` element, under
    ``props -> pageProps -> team -> roster``.

    Args:
        team_id: Team identifier used to build the page URL.
        season: Season string passed through to ``create_team_query``.
        max_retries: How many times an unsuccessful response is retried
            before giving up.
        retry_delay: Base pause in seconds between retries; the pause grows
            linearly with the attempt number.
        timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        A DataFrame built from the roster records. If the page still has not
        been fetched successfully after ``max_retries`` retries, a message is
        printed and an empty DataFrame is returned.

    Raises:
        ValueError: If the page contains no ``__NEXT_DATA__`` script element.
        KeyError: If the embedded JSON lacks the expected roster path.
    """
    print(team_id)
    url = create_team_query(team_id=team_id, season=season)
    response = requests.get(url, timeout=timeout)

    # Bounded retry with a growing pause, instead of re-requesting in a tight
    # loop that never ends while the server keeps answering with an error.
    for attempt in range(1, max_retries + 1):
        if response.ok:
            break
        print(f"There was an issue getting team id={team_id}!!")
        print(f"Reattempting! Iteration {attempt}")
        time.sleep(retry_delay * attempt)
        response = requests.get(url, timeout=timeout)

    if not response.ok:
        print(f"Giving up on team id={team_id} after {max_retries} retries")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, "html.parser")
    script = soup.find("script", {"id": "__NEXT_DATA__"})
    if script is None:
        raise ValueError(f"No __NEXT_DATA__ script found for team id={team_id}")

    output = json.loads(script.text)
    return pd.DataFrame(output["props"]["pageProps"]["team"]["roster"])

In [9]:
# Example: roster for a single team (id 1610612755) with the default season.
philly_team = get_team_roster(1610612755)

1610612755


In [10]:
def get_all_players(team_ids: List[int], max_workers: Optional[int] = None) -> pd.DataFrame:
    """Fetch the rosters of several teams concurrently and stack them.

    Args:
        team_ids: Team identifiers, each passed to ``get_team_roster``.
        max_workers: Upper bound on worker threads; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        One DataFrame holding every roster, in the order of ``team_ids``,
        with a fresh 0..n-1 index. An empty DataFrame is returned when
        ``team_ids`` is empty.
    """
    team_ids = list(team_ids)
    if not team_ids:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()

    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        player_list = list(executor.map(get_team_roster, team_ids))
    return pd.concat(player_list).reset_index(drop=True)

In [11]:
# Fetch the roster of every scraped team and stack them into one table.
players = get_all_players(team_ids=teams["Team_ID"].to_list())

1610612738
1610612751
1610612752
1610612755
1610612761
1610612741
1610612739
1610612765
1610612754
1610612749
1610612737
1610612766
1610612748
1610612753
1610612764
1610612743
1610612750
1610612760
1610612757
1610612762
1610612744
1610612746
1610612747
1610612756
1610612758
1610612742
1610612745
1610612763
1610612740
1610612759
There was an issue getting team id=1610612745!!
Reattempting! Iteration 1


In [12]:
def create_player_query(player_id: int) -> str:
    """Build the nba.com career stats URL (``PerMode=Totals``) for one player.

    Args:
        player_id: Identifier placed in the URL path.

    Returns:
        The complete URL as a string.
    """
    player_page = f"https://www.nba.com/stats/player/{player_id}"
    return f"{player_page}/career?PerMode=Totals"

In [13]:
# Player ids taken from the combined roster table; used below to build career-stats URLs.
player_ids = players["PLAYER_ID"].to_list()

In [17]:
def loading_completed(driver):
    """Wait condition: report whether the page's loading overlay is hidden.

    Looks up the first ``div`` whose class contains ``LoadingOverlay``.

    Args:
        driver: Selenium WebDriver whose current page is inspected.

    Returns:
        ``True`` when the overlay exists and its ``data-hidden`` attribute is
        ``"true"``; ``False`` when the overlay is not present or not hidden.
    """
    try:
        overlay = driver.find_element(By.CSS_SELECTOR, "div[class*=LoadingOverlay]")
    except NoSuchElementException:
        # Overlay not in the DOM (yet): treat as still loading
        return False
    else:
        hidden_flag = overlay.get_attribute("data-hidden")
        return hidden_flag == "true"

def get_page_source(url: str, timeout: float = 60) -> str:
    """Render a page in Chrome and return its HTML once loading has finished.

    A new Chrome session is started for every call, and it is always shut
    down again, even when navigation or the wait fails.

    Args:
        url: Address of the page to open.
        timeout: Maximum number of seconds to wait for ``loading_completed``
            to report that the page is ready.

    Returns:
        The page source as reported by the driver after the wait succeeded.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page is not
            ready within ``timeout`` seconds.
    """
    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)
        WebDriverWait(driver, timeout=timeout).until(loading_completed)
        return driver.page_source
    finally:
        # Without this, a timeout or navigation error would leave the
        # browser process running.
        driver.quit()

def get_player_info(player_id: int) -> pd.DataFrame:
    """Scrape the career stats table from one player's page.

    The page is rendered through ``get_page_source``. The first table whose
    class matches ``Crom_table`` is read: the header cells after the first
    three become the column names, and the ``<td>`` cells are split into rows
    of that many values.

    Args:
        player_id: Identifier used to build the player's career URL.

    Returns:
        A DataFrame with one row per group of cells, including the final
        group (earlier versions dropped the last row of the table). If no
        matching table or no usable header cells are found, a message is
        printed and an empty DataFrame is returned.
    """
    url = create_player_query(player_id=player_id)
    print(url)
    page_content = get_page_source(url)
    soup = BeautifulSoup(page_content, "html.parser")

    # Plain "Crom_table": the previous pattern "Crom_table*" was glob-style,
    # where "*" only made the trailing "e" optional/repeatable.
    table = soup.find("table", {"class": re.compile("Crom_table")})
    if table is None:
        print(f"No stats table found for player id={player_id}")
        return pd.DataFrame()

    # NOTE(review): the first three header cells are skipped, as in the original
    # code; presumably they are not column labels. Confirm against the page.
    cols = [elem.text for elem in table.find_all("th")[3:]]
    if not cols:
        # Without column labels the cells cannot be grouped into rows
        print(f"No stats columns found for player id={player_id}")
        return pd.DataFrame()

    cells = [cell.text for cell in table.find_all("td")]
    width = len(cols)

    # Slice the flat cell list into consecutive rows; unlike the previous
    # append-on-next-row loop, this keeps the final row as well.
    output = [cells[start:start + width] for start in range(0, len(cells), width)]

    return pd.DataFrame(output, columns=cols)

In [18]:
# Example: scrape the career table for a single player (id 1630178).
a = get_player_info(player_id=1630178)

https://www.nba.com/stats/player/1630178/career?PerMode=Totals


In [20]:
def get_all_player_info(player_ids: List[int], max_workers: Optional[int] = None) -> pd.DataFrame:
    """Scrape the career tables of several players concurrently and stack them.

    Every worker thread calls ``get_player_info``, which opens its own Chrome
    session through ``get_page_source``, so ``max_workers`` also bounds the
    number of browsers running at the same time.

    Args:
        player_ids: Player identifiers, each passed to ``get_player_info``.
        max_workers: Upper bound on worker threads; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        One DataFrame holding every player's rows, in the order of
        ``player_ids``, with a fresh 0..n-1 index. An empty DataFrame is
        returned when ``player_ids`` is empty.
    """
    # NOTE(review): the per-player frames are stacked without adding the player
    # id; confirm the scraped columns are enough to tell players apart.
    player_ids = list(player_ids)
    if not player_ids:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()

    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        player_list = list(executor.map(get_player_info, player_ids))
    return pd.concat(player_list).reset_index(drop=True)

In [21]:
# Scrape career tables for the first ten player ids only.
# NOTE(review): this rebinds `players`, which until now held the combined roster
# table that `player_ids` was derived from; re-running the `player_ids` cell
# after this one would read this stats table instead of the rosters.
players = get_all_player_info(player_ids=player_ids[:10])

https://www.nba.com/stats/player/1628369/career?PerMode=Totals
https://www.nba.com/stats/player/203943/career?PerMode=Totals
https://www.nba.com/stats/player/1627759/career?PerMode=Totals
https://www.nba.com/stats/player/201568/career?PerMode=Totals
https://www.nba.com/stats/player/1628401/career?PerMode=Totals
https://www.nba.com/stats/player/1630202/career?PerMode=Totals
https://www.nba.com/stats/player/1629684/career?PerMode=Totals
https://www.nba.com/stats/player/1627763/career?PerMode=Totals
https://www.nba.com/stats/player/1631120/career?PerMode=Totals
https://www.nba.com/stats/player/1629662/career?PerMode=Totals
ERROR! Session/line number was not unique in database. History logging moved to new session 243


KeyboardInterrupt: 