Using an API service to analyze NBA data.

In this one, we're going to use the [balldontlie.io](https://www.balldontlie.io/) API. It's a free API that's pretty well documented.

In [None]:
import requests
import datetime
import pathlib
import pandas as pd
import utils

# Project paths, all derived from the parent of the current working directory
BASE_DIR = pathlib.Path().resolve().parent
COURSE_DIR = BASE_DIR / "course"
DATASET_DIR = COURSE_DIR / "datasets"
SAMPLES_DIR = COURSE_DIR / "samples"
# CSV of player salaries that is loaded later in this notebook
INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'

In the [balldontlie.io](https://www.balldontlie.io/) docs, we see a few different URLs, such as `https://www.balldontlie.io/api/v1/players`. This URL returns NBA players as a paginated JSON response: a `data` list of player dictionaries plus a `meta` dictionary that describes the pagination.

Let's look at a basic example by using [Python Requests](https://docs.python-requests.org/en/latest/) to perform a quick lookup:

In [None]:
url = "https://www.balldontlie.io/api/v1/players"
# a timeout keeps the notebook from hanging forever if the API is unreachable
r = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of with a confusing KeyError below
r.raise_for_status()
json_data = r.json()
# `data` holds this page's records; `meta` holds the pagination details
data = json_data['data']
meta_data = json_data['meta']
total_pages = meta_data.get('total_pages')

Based on the information above, we can run a loop until we reach the `total_pages` so we can extract each page's response `data`.

In [None]:
# show the reported page count and the number of records on the first page
print(total_pages, len(data))

As we see here, the API exposes `150` pages with about `25` records each, which will net us about `3,750` players.

Let's create a simple function that will loop through all pages and extract all data:

In [None]:
def get_players_dataset(per_page=100, timeout=30):
    """
    Download every page of the players endpoint and return
    all records as one flat list of dictionaries.

    per_page: number of records requested per page
    timeout: seconds to wait for each HTTP response

    The first request (page 0) is reused for its data as well as
    for `total_pages`, so that page is only downloaded once.
    Raises `requests.HTTPError` if that first request fails; later
    pages that fail are reported with "skipping" and left out.
    """
    # set the base url used for every page lookup
    base_url = "https://www.balldontlie.io/api/v1/players"

    def fetch_page(page):
        # lookup a single page, printing the url to show progress
        endpoint = f"{base_url}?page={page}&per_page={per_page}"
        print(endpoint)
        return requests.get(endpoint, timeout=timeout)

    # perform first lookup with the goal to find `total_pages`
    first_response = fetch_page(0)
    first_response.raise_for_status()
    first_json = first_response.json()
    # a missing `meta` / `total_pages` means there is nothing more to fetch
    total_pages = (first_json.get('meta') or {}).get('total_pages') or 0
    print(total_pages)
    # initialize our dataset with the page we already have
    dataset = list(first_json.get('data') or [])
    for page in range(1, total_pages + 1):
        response = fetch_page(page)
        if not 200 <= response.status_code < 300:
            # if the page is not successful, skip the loop
            print("skipping")
            continue
        # add all data items on this page to our dataset list;
        # a missing or null `data` contributes nothing
        dataset += response.json().get('data') or []
    return dataset

In [None]:
# download all player records, requesting 100 per page
players_dataset = get_players_dataset(per_page=100)

In [None]:
# number of player records collected
len(players_dataset)

In [None]:
# keep only the identifying columns, one row per player id,
# and derive a single "first last" name column for lookups
players_df = (
    pd.DataFrame(players_dataset)[['id', 'first_name', 'last_name']]
    .drop_duplicates(subset='id')
    .assign(full_name=lambda frame: frame['first_name'] + " " + frame['last_name'])
)
players_df.head()

In [None]:
# (rows, columns) after de-duplicating on `id`
players_df.shape

In [None]:
name = 'Michael Jordan'
# every row whose full name matches exactly
player = players_df[players_df['full_name'] == name]

# 0 is the placeholder used when no player matches `name`
player_id = 0
if not player.empty:
    if len(player) > 1:
        # several players can share a name; `.item()` would raise
        # ValueError here, so report it and use the first match
        print(f"{len(player)} players named {name!r}; using the first match")
    player_id = int(player['id'].iloc[0])

In [None]:
# display the id found above (0 means no player matched `name`)
player_id

In [None]:
def get_stats(player_id, postseason=False, timeout=30):
    """
    Download every page of the stats endpoint for a single
    player and return all records as one flat list.

    player_id: id sent as the `player_ids[]` filter
    postseason: True for postseason stats, False for regular season
    timeout: seconds to wait for each HTTP response

    The first request (page 0) is reused for its data as well as
    for `total_pages`, so that page is only downloaded once.
    Raises `requests.HTTPError` if that first request fails; later
    pages that fail are reported with "skipping" and left out.
    """
    base_url = 'https://www.balldontlie.io/api/v1/stats'
    # the query string carries the flag as lowercase "true" / "false"
    postseason_str = "true" if postseason else "false"

    def fetch_page(page):
        # lookup a single page of this player's stats
        endpoint = (
            f"{base_url}?player_ids[]={player_id}&page={page}"
            f"&per_page=100&postseason={postseason_str}"
        )
        return requests.get(endpoint, timeout=timeout)

    # perform first lookup with the goal to find `total_pages`
    first_response = fetch_page(0)
    first_response.raise_for_status()
    first_json = first_response.json()
    # a missing `meta` / `total_pages` means there is nothing more to fetch
    total_pages = (first_json.get('meta') or {}).get('total_pages') or 0
    # initialize our dataset with the page we already have
    stats_dataset = list(first_json.get('data') or [])
    for page in range(1, total_pages + 1):
        response = fetch_page(page)
        if not 200 <= response.status_code < 300:
            print("skipping")
            continue
        # a missing or null `data` contributes nothing
        stats_dataset += response.json().get('data') or []
    return stats_dataset

In [None]:
# fetch the selected player's stats twice: once for the
# postseason and once for the regular season
post_season_stats = get_stats(player_id, postseason=True)
reg_season_stats = get_stats(player_id, postseason=False)

In [None]:
def unpack_nested_dicts(row):
    """
    Copy the entries of the nested `game`, `team` and `player`
    dictionaries onto the row itself as prefixed keys
    (for example `game_id`, `team_name`), then return the row.

    A value that is not a dictionary is left as it is.
    """
    for prefix in ('game', 'team', 'player'):
        nested = row[prefix]
        if not isinstance(nested, dict):
            continue
        for field, value in nested.items():
            row[f'{prefix}_{field}'] = value
    return row

def get_seconds_played(val):
    """
    Convert the time `min` string into
    total seconds played
    Such as:
    "32:04" is 32 minutes, 4 seconds or
    1924 total seconds

    Accepted layouts are "H:M:S", "M:S" and "M" (minutes only).
    Anything that is not a string (None, NaN from pandas, ...)
    and any layout with more than three parts counts as 0 seconds.
    A part that is not a plain number (such as "DNP") counts as 0.
    Always returns a float.
    """
    # missing values are not strings and cannot be split
    if not isinstance(val, str):
        return 0.0
    parts = [part.strip() for part in val.split(":")]
    if len(parts) == 3:
        h, m, s = parts
    elif len(parts) == 2:
        h = "0"
        m, s = parts
    elif len(parts) == 1:
        # a lone number is read as whole minutes
        h, m, s = "0", parts[0], "0"
    else:
        return 0.0
    # `isdecimal` guarantees `int()` succeeds, so a non-numeric part can
    # never reach `timedelta` as a string
    h, m, s = (int(part) if part.isdecimal() else 0 for part in (h, m, s))
    return datetime.timedelta(hours=h, minutes=m, seconds=s).total_seconds()


def get_stats_df(stats_dataset):
    """
    Convert our stats_dataset into a
    cleaned Pandas DataFrame

    Nested `game`, `team` and `player` dictionaries are flattened
    into prefixed columns, `seconds` / `did_play` are derived from
    `min`, and `date` / `year` are derived from `game_date`.
    An empty stats_dataset returns an empty DataFrame.
    """
    df = pd.DataFrame(stats_dataset)
    if df.empty:
        # nothing to flatten or clean; the column operations below
        # would raise KeyError on a frame without columns
        return df
    df = df.apply(unpack_nested_dicts, axis=1)
    if 'min' in df.columns:
        df['seconds'] = df['min'].apply(get_seconds_played)
        df['did_play'] = df['seconds'] != 0
    if 'game_date' in df.columns:
        # use the built-in `pd.to_datetime` to parse
        # our df['game_date']
        df['date'] = pd.to_datetime(df['game_date'])
        # add the year from the above date
        df['year'] = df['date'].dt.year
    # the nested columns are now flattened; ignore any that are absent
    df = df.drop(columns=['game', 'team', 'player'], errors='ignore')
    # de-duplicate only on the key columns that actually exist
    dedupe_keys = [key for key in ('id', 'game_id') if key in df.columns]
    if dedupe_keys:
        df = df.drop_duplicates(subset=dedupe_keys)
    return df

In [None]:
# turn both raw stat lists into cleaned DataFrames
post_season_df = get_stats_df(post_season_stats)
reg_season_df = get_stats_df(reg_season_stats)

In [None]:
# (rows, columns) of the regular season stats
reg_season_df.shape

In [None]:
# (rows, columns) of the postseason stats
post_season_df.shape

In [None]:
# preview the first rows of the regular season stats
reg_season_df.head()

In [None]:
# mean of `pts` for each `year`, using the postseason rows only
average_pts_per_year = post_season_df.groupby('year')['pts'].mean()
average_pts_per_year

In [None]:
# turn the per-year series into a two-column frame:
# the `year` index becomes a regular column next to `pts`
mean_df = average_pts_per_year.to_frame().reset_index()
mean_df.head()

In [None]:
# load the salaries CSV located at INPUT_PATH
salary_df = pd.read_csv(INPUT_PATH)
salary_df.head()

In [None]:
# rows for the selected player, keeping only the salary and the
# year used to join against the stats; copied so that later
# changes never touch salary_df
is_selected_player = salary_df['player'] == name
player_salary_df = salary_df.loc[is_selected_player, ['adj_salary', 'year_start']].copy()
player_salary_df.head()

In [None]:
# join yearly scoring with that year's salary, then drop the
# join key from the salary side since `year` already carries it
merged_df = mean_df.merge(
    player_salary_df,
    left_on='year',
    right_on='year_start',
).drop(columns='year_start')

In [None]:
# add a display column by passing each adjusted salary through
# utils.float_to_dollars (presumably formats a dollar string; confirm in utils)
merged_df['adj_salary_$'] = merged_df['adj_salary'].apply(utils.float_to_dollars)
# show the rows ordered by year; this returns a sorted copy and
# leaves merged_df itself in its current order
merged_df.sort_values(by=['year'])