# DSCI 511: Data Aquistion and Preprocessing

## NBA API

### Members:
* Dara Kasrovi
* Ao Wang

## Import Libraries

In [1]:
import re
import json
from concurrent import futures
from typing import List
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

## Constants

In [2]:
TEAMS_URL = "https://www.nba.com/stats/teams"

## Get NBA Teams 

After running the function, the output should look like this:

|   | Division | Team               |    Team ID |
|--:|---------:|-------------------:|-----------:|
| 0 | Atlantic |     Boston Celtics | 1610612738 |
| 1 | Atlantic |      Brooklyn Nets | 1610612751 |
| 2 | Atlantic |    New York Knicks | 1610612752 |
| 3 | Atlantic | Philadelphia 76ers | 1610612755 |
| 4 | Atlantic |    Toronto Raptors | 1610612761 |


In [3]:
def get_nba_teams() -> pd.DataFrame:
    """The function goes to the NBA team stats page and scrapes all the team data,
    including division, team, and team ID"""
    
    response = requests.get(TEAMS_URL)
    
    # Check that the response was successful, i.e.  200 - good, 401, 404, etc - bad
    if not response.ok:
        print("Something went wrong in getting team listings")
        return pd.DataFrame()
    
    # Use bs4 and regex to get the table of teams
    soup = BeautifulSoup(response.content, "html.parser")
    regex = re.compile("^StatsTeamsList_divContent")
    table = soup.find("div", {"class": regex})
    
    # Atlantic, Central, Southeast, etc are basketball divisions
    chart = list(table.children)
    output = []
    
    # Get the division, team, and team id in the HTML
    for html in chart:
        division = html.find("h2").text
        teams = html.find_all("a")
        for team in teams:
            href = team["href"].replace("/stats/team/", "")
            team_name = team.text
            output.append([division, team_name, href])
    return pd.DataFrame(output, columns=["Division", "Team", "Team_ID"])

In [4]:
teams = get_nba_teams()

In [5]:
teams

Unnamed: 0,Division,Team,Team_ID
0,Atlantic,Boston Celtics,1610612738
1,Atlantic,Brooklyn Nets,1610612751
2,Atlantic,New York Knicks,1610612752
3,Atlantic,Philadelphia 76ers,1610612755
4,Atlantic,Toronto Raptors,1610612761
5,Central,Chicago Bulls,1610612741
6,Central,Cleveland Cavaliers,1610612739
7,Central,Detroit Pistons,1610612765
8,Central,Indiana Pacers,1610612754
9,Central,Milwaukee Bucks,1610612749


In [6]:
def create_team_query(team_id: int, season: str = "2022-23") -> str:
    return f"https://www.nba.com/stats/team/{team_id}?Season={season}"

In [7]:
create_team_query(1610612755)

'https://www.nba.com/stats/team/1610612755?Season=2022-23'

In [8]:
def feet_to_meter(height: str) -> float:
    """Converts feet'inches to meters"""
    feet, inches = list(map(int, height.split("-")))
    return (feet * 12 + inches) * 2.54

def process_team_roster(df: pd.DataFrame) -> pd.DataFrame:
    """Cleans the team roster data"""
    output = df.copy(deep=True)
    
    # Drop columns we don't need
    output.drop(columns=["LeagueID", "NICKNAME", "PLAYER_SLUG", "HOW_ACQUIRED"], inplace=True)
    
    # Format columns to have title case and reformat columns with ID
    output.columns = output.columns.str.title()
    output.rename(columns={
        "Teamid": "Team_ID",
        "Player_Id": "Player_ID"
    }, inplace=True)
    
    # Convert feet and inches to meter
    output["Height"] = output["Height"].map(feet_to_meter)
    output["Age"] = output["Age"].astype(int)
    
    # Convert pounds to kilograms
    output["Weight"] = output["Weight"].astype(int)
    output["Weight"] = output["Weight"].map(lambda weight: weight * 0.45359237)
    
    # Convert to DateTime
    output["Birth_Date"] = pd.to_datetime(output["Birth_Date"])
    return output

In [9]:
# def get_team_roster(team_id: int, season: str = "2022-23") -> pd.DataFrame:
#     print(team_id)
#     url = create_team_query(team_id=team_id, season=season)
#     response = requests.get(url, headers=HEADERS)
#     if not response.ok:
#         print(f"There was an issue getting team id={team_id}!!")
#         return pd.DataFrame()
    
#     json_data = response.json()
#     players, _ = json_data["resultSets"]
#     players_headers, player_row_data = players["headers"], players["rowSet"]
#     return pd.DataFrame(player_row_data, columns=players_headers)

def get_team_roster(team_id: int, season: str = "2022-23") -> pd.DataFrame:
    print(team_id)
    url = create_team_query(team_id=team_id, season=season)
    response = requests.get(url)
    if not response.ok:
        print(f"There was an issue getting team id={team_id}!!")
        return pd.DataFrame()
    
    soup = BeautifulSoup(response.content, "html.parser")
    output = json.loads(soup.find("script", {"id": "__NEXT_DATA__"}).text)
    return pd.DataFrame(output["props"]["pageProps"]["team"]["roster"])

In [10]:
philly_team = get_team_roster(1610612755)

1610612755


In [11]:
process_team_roster(philly_team)

Unnamed: 0,Team_ID,Season,Player,Num,Position,Height,Weight,Birth_Date,Age,Exp,School,Player_ID
0,1610612755,2022,Tyrese Maxey,0,G,187.96,90.718474,2000-11-04,22,2,Kentucky,1630178
1,1610612755,2022,James Harden,1,G,195.58,99.790321,1989-08-26,33,13,Arizona State,201935
2,1610612755,2022,Montrezl Harrell,5,F-C,200.66,108.862169,1994-01-26,28,7,Louisville,1626149
3,1610612755,2022,De'Anthony Melton,8,G,187.96,90.718474,1998-05-28,24,4,Southern California,1629001
4,1610612755,2022,Jaden Springer,11,G,193.04,91.625659,2002-09-25,20,1,Tennessee,1630531
5,1610612755,2022,Tobias Harris,12,F,200.66,102.511876,1992-07-15,30,11,Tennessee,202699
6,1610612755,2022,P.J. Tucker,17,F,195.58,111.130131,1985-05-05,37,11,Texas,200782
7,1610612755,2022,Shake Milton,18,G-F,195.58,92.986436,1996-09-26,26,4,Southern Methodist,1629003
8,1610612755,2022,Georges Niang,20,F,200.66,104.326245,1993-06-17,29,6,Iowa State,1627777
9,1610612755,2022,Joel Embiid,21,C-F,213.36,127.005864,1994-03-16,28,6,Kansas,203954


In [12]:
def get_all_players(team_ids: List[int]) -> pd.DataFrame:
    with futures.ThreadPoolExecutor() as executor:
        player_list = list(executor.map(get_team_roster, team_ids))
    return pd.concat(player_list).reset_index(drop=True)

In [13]:
players = get_all_players(team_ids=teams["Team_ID"].to_list())

1610612738
1610612751
1610612752
1610612755
1610612761
1610612741
1610612739
1610612765
1610612754
1610612749
1610612737
1610612766
16106127481610612753

1610612764
1610612743
1610612750
1610612760
1610612757
1610612762
1610612744
1610612746
1610612747
1610612756
1610612758
1610612742
1610612745
1610612763
1610612740
1610612759


In [16]:
players[players["PLAYER"].str.startswith("Tyrese")]

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED
51,1610612755,2022,0,Tyrese Maxey,Tyrese,tyrese-maxey,0,G,6-2,200,"NOV 04, 2000",22,2,Kentucky,1630178,#21 Pick in 2020 Draft
134,1610612754,2022,0,Tyrese Haliburton,Tyrese,tyrese-haliburton,0,G,6-5,185,"FEB 29, 2000",22,2,Iowa State,1630169,Traded from SAC on 02/08/22
182,1610612737,2022,0,Tyrese Martin,Tyrese,tyrese-martin,22,G,6-6,215,"MAR 07, 1999",23,R,Connecticut,1631213,Draft Rights Traded from GSW on 06/24/22


In [20]:
def create_player_query(player_id: int) -> str:
    return f"https://stats.nba.com/stats/playercareerstats?LeagueID=00&PerMode=Totals&PlayerID={player_id}"

In [28]:
player_url = create_player_query(1610612755)

In [None]:
response