Let's use the free [balldontlie.io](https://www.balldontlie.io/) API for extracting a new dataset!

In [None]:
import requests
import datetime
import pathlib
import pandas as pd
import time
import utils

# Project directories are resolved relative to the parent of the current
# working directory.
BASE_DIR = pathlib.Path().resolve().parent
COURSE_DIR = BASE_DIR / "course"
DATASET_DIR = COURSE_DIR / "datasets"
SAMPLES_DIR = COURSE_DIR / "samples"
# Salary CSV loaded into `salary_df`; it is filtered by player name further down.
INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'
salary_df = pd.read_csv(INPUT_PATH)

In [None]:
# Reference URLs for the two balldontlie v1 resources explored in this
# notebook (the fetch helpers build their own URLs from the same base).
players_endpoint = "https://www.balldontlie.io/api/v1/players?per_page=100&page=0"
# Plain string literal: there is nothing to interpolate, so no f-prefix.
stats_endpoint = 'https://www.balldontlie.io/api/v1/stats'

In [None]:
# !curl "https://www.balldontlie.io/api/v1/players?per_page=100"

In [None]:
def get_players_dataset(per_page=100, session=None, delay=0.25, timeout=10):
    """Download every player record from the balldontlie ``/players`` endpoint.

    The API paginates with 1-based page numbers, so page 1 is requested
    first (its ``meta.total_pages`` value says how many pages exist) and
    pages ``2..total_pages`` are fetched afterwards. Every page is requested
    exactly once, so no page is downloaded twice.

    Args:
        per_page: Number of records requested per page.
        session: Optional object exposing a ``requests``-compatible
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.
        delay: Seconds to sleep before each follow-up request, to avoid
            hammering the free API.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A list of player dicts. The list is empty when the first request
        does not return a 2xx status; later pages that fail are skipped
        (and ``'skipping'`` is printed) so a partial result is still
        returned.
    """
    http = requests if session is None else session
    base_url = "https://www.balldontlie.io/api/v1/players"

    first = http.get(f"{base_url}?per_page={per_page}&page=1", timeout=timeout)
    if not 200 <= first.status_code < 300:
        return []
    payload = first.json()
    dataset = list(payload.get('data') or [])
    # A missing or empty ``total_pages`` is treated as a single page.
    total_pages = int((payload.get('meta') or {}).get('total_pages') or 1)

    for page in range(2, total_pages + 1):
        time.sleep(delay)
        r = http.get(f"{base_url}?per_page={per_page}&page={page}", timeout=timeout)
        if not 200 <= r.status_code < 300:
            print('skipping')
            continue
        dataset += r.json().get('data') or []
    return dataset

In [None]:
# Download the full player list (paginated HTTP requests, so this cell is slow).
players_dataset = get_players_dataset()

In [None]:
# Keep only the identifying columns. Passing ``columns=`` to the constructor
# (instead of indexing afterwards) still yields a usable, empty frame when the
# download returned no rows, rather than raising ``KeyError``.
player_df = pd.DataFrame(players_dataset, columns=['id', 'first_name', 'last_name'])
player_df['full_name'] = player_df['first_name'] + " " + player_df['last_name']
# The same player id may appear more than once in the download; keep the first.
player_df = player_df.drop_duplicates(subset=['id'])
player_df.head()

In [None]:
# (rows, columns) of the de-duplicated player table.
player_df.shape

In [None]:
def get_stats(player_id=1, postseason=False, per_page=100, session=None, delay=0.25, timeout=10):
    """Download every per-game stat line for one player from balldontlie.

    The API paginates with 1-based page numbers, so page 1 is requested
    first (its ``meta.total_pages`` value says how many pages exist) and
    pages ``2..total_pages`` are fetched afterwards. Every page is requested
    exactly once, so no page is downloaded twice.

    Args:
        player_id: balldontlie player id to query.
        postseason: When true, request playoff games; otherwise regular
            season games.
        per_page: Number of records requested per page.
        session: Optional object exposing a ``requests``-compatible
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.
        delay: Seconds to sleep before each follow-up request, to avoid
            hammering the free API.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A list of stat dicts. The list is empty when the first request does
        not return a 2xx status; later pages that fail are skipped (and
        ``'skipping'`` is printed) so a partial result is still returned.
    """
    http = requests if session is None else session
    postseason_param = "true" if postseason else "false"
    base_url = f"https://www.balldontlie.io/api/v1/stats?player_ids[]={player_id}&postseason={postseason_param}"

    first = http.get(f"{base_url}&per_page={per_page}&page=1", timeout=timeout)
    if not 200 <= first.status_code < 300:
        return []
    payload = first.json()
    dataset = list(payload.get('data') or [])
    # A missing or empty ``total_pages`` is treated as a single page.
    total_pages = int((payload.get('meta') or {}).get('total_pages') or 1)

    for page in range(2, total_pages + 1):
        time.sleep(delay)
        r = http.get(f"{base_url}&per_page={per_page}&page={page}", timeout=timeout)
        if not 200 <= r.status_code < 300:
            print('skipping')
            continue
        dataset += r.json().get('data') or []
    return dataset

In [None]:
# Columns of a stats record whose values are nested dicts to be flattened.
NESTED_STATS_COLS = ['game', 'team', 'player']

def unpack_nested_dict(row):
    """Flatten the nested dict columns of one stats row, in place.

    For each column listed in ``NESTED_STATS_COLS`` whose value is a dict,
    every ``key: value`` pair is copied onto the row under the name
    ``"<column>_<key>"`` (e.g. ``game_id``, ``game_date``). Non-dict values
    are left untouched. The original nested columns are not removed here.

    Returns the same ``row`` object so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    for prefix in NESTED_STATS_COLS:
        nested = row[prefix]
        if not isinstance(nested, dict):
            continue
        for field in nested:
            row[f"{prefix}_{field}"] = nested[field]
    return row

def get_second_played(val):
    """Convert a playing-time value into a number of seconds.

    Accepted layouts, split on ``":"``:

    * ``"MM"``        - minutes only
    * ``"MM:SS"``     - minutes and seconds
    * ``"HH:MM:SS"``  - hours, minutes and seconds

    Any part that is not a plain non-negative integer (for example ``"DNP"``
    or an empty string) counts as 0 instead of raising, and values with more
    than three parts yield 0. Falsy input (``None``, ``""``, ``0``) also
    yields 0. Non-string input is converted with ``str()`` first.

    Returns:
        The duration in seconds as a ``float``.
    """
    if not val:
        return 0.0

    parts = [part.strip() for part in str(val).split(":")]
    # Non-numeric parts fall back to 0 so one malformed cell cannot abort a
    # whole DataFrame.apply() run.
    numbers = [int(part) if part.isdecimal() else 0 for part in parts]

    h, m, s = 0, 0, 0
    if len(numbers) == 1:
        m = numbers[0]
    elif len(numbers) == 2:
        m, s = numbers
    elif len(numbers) == 3:
        h, m, s = numbers
    return datetime.timedelta(hours=h, minutes=m, seconds=s).total_seconds()

def get_stats_df(stats_dataset):
    """Build a flat DataFrame from a list of raw stat records.

    Steps, in order:
      1. An empty input returns an empty ``DataFrame`` immediately.
      2. Nested dict columns are expanded via ``unpack_nested_dict`` and the
         original nested columns (``NESTED_STATS_COLS``) are dropped.
      3. If ``game_date`` exists, ``date`` (parsed datetime) and ``year``
         are added.
      4. If ``min`` exists, ``seconds`` (via ``get_second_played``) and the
         boolean ``did_play`` (seconds > 0) are added.
      5. Rows sharing the same ``id`` are de-duplicated.
    """
    if not len(stats_dataset):
        return pd.DataFrame()

    frame = pd.DataFrame(stats_dataset).apply(unpack_nested_dict, axis=1)
    frame = frame.drop(columns=NESTED_STATS_COLS)

    if "game_date" in frame.columns:
        frame['date'] = pd.to_datetime(frame['game_date'])
        frame['year'] = frame['date'].apply(lambda stamp: stamp.year)

    if "min" in frame.columns:
        frame['seconds'] = frame['min'].apply(get_second_played)
        frame['did_play'] = frame['seconds'].apply(lambda secs: secs > 0)

    return frame.drop_duplicates(subset=['id'])

In [None]:
# player_id = player_df.sample(n=1)['id'].item()
# player_id

In [None]:
# Look up the player id for a given full name in `player_df`.
name = 'Michael Jordan'
player = player_df[player_df["full_name"] == name]
# Fallback id used when no row matches `name`.
player_id = 0

if not player.empty:
    # NOTE(review): Series.item() raises ValueError unless exactly one row
    # matched — confirm full names are unique in player_df.
    player_id = player['id'].item()

player_id

In [None]:
# Fetch the selected player's raw stat records, regular season and playoffs separately.
reg_season_stats = get_stats(player_id=player_id, postseason=False)
post_season_stats = get_stats(player_id=player_id, postseason=True)

In [None]:
# Flatten the raw records into DataFrames (an empty DataFrame if nothing was fetched).
reg_season_df = get_stats_df(reg_season_stats)
post_season_df = get_stats_df(post_season_stats)

In [None]:
# Preview the first regular-season rows.
reg_season_df.head()

In [None]:
# (rows, columns) of the playoff stats table.
post_season_df.shape

In [None]:
# Mean of `pts` per `year` (the year taken from each game's date) for regular-season rows.
# NOTE(review): raises KeyError if reg_season_df is empty and has no 'year' column.
avg_pts_per_year = reg_season_df.groupby('year')['pts'].mean()
# avg_pts_per_year

In [None]:
# Mean of `pts` per `year` (the year taken from each game's date) for playoff rows.
# NOTE(review): raises KeyError if post_season_df is empty and has no 'year' column.
avg_pts_per_year_per_postseason = post_season_df.groupby('year')['pts'].mean()
# avg_pts_per_year_per_postseason

In [None]:
# Salary rows for the selected player, reduced to the adjusted salary and the
# season start year; copied so later edits never touch `salary_df`.
player_salary_df = salary_df.loc[
    salary_df['player'] == name,
    ['adj_salary', 'year_start'],
].copy()
player_salary_df.head(n=20)

In [None]:
# Turn the per-year averages into a two-column frame (`year`, `pts`) so it
# can be merged on `year`.
mean_df = avg_pts_per_year.to_frame().reset_index(drop=False)
mean_df.head()

In [None]:
# Join yearly scoring averages with the salary rows whose `year_start` equals
# the stats `year`, then drop the now-redundant join column.
merged_df = (
    mean_df
    .merge(player_salary_df, left_on='year', right_on='year_start')
    .drop(columns=['year_start'])
)
# Display-friendly salary column; presumably utils.float_to_dollars formats a
# number as a dollar string — confirm in utils.
merged_df['adj_salary_$'] = merged_df['adj_salary'].apply(utils.float_to_dollars)
merged_df.head(n=100)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=07445f6e-2e90-46a0-bcba-e6988a45d7c3' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>