# Basketball Anomaly Dataset

For ADMERCS paper

Based on our earliest idea for a demo dataset: one that contains typical contextual anomalies which should be easy for MERCS to detect.



# Preliminaries

In [17]:
# (Optional) Black code formatter (`pip install nb_black`). Load `lab_black` in JupyterLab; in the classic Jupyter Notebook, load `nb_black` instead.
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


## Imports

In [18]:
from pathlib import Path

import nba_api
from nba_api.stats import endpoints
from nba_api.stats.static import players, teams

import pandas as pd
import numpy as np

# show all columns
pd.set_option("display.max_columns", None)

In [19]:
from nba_anomaly_generator.data import (
    get_team_roster_dataframe,
    get_team_plyr_stats_dataframe,
)

# Static Data

In [4]:
# Static player table from nba_api.
player_dict = players.get_players()

# TODO: wrap this lookup in a helper function.
# The match on "full_name" is exact and case sensitive; the first hit is used.
bron = [p for p in player_dict if p["full_name"] == "LeBron James"][0]
bron_id = bron["id"]

# Same kind of lookup to find the team ID.
teams_dict = teams.get_teams()
LAL = [team for team in teams_dict if team["full_name"] == "Los Angeles Lakers"][0]
LAL_id = LAL["id"]

In [5]:
def get_team_id(full_name="Los Angeles Lakers"):
    """Return the ID of the team whose full name equals ``full_name``.

    The team list is fetched with ``teams.get_teams()`` on every call and
    searched for an exact, case-sensitive match on the ``"full_name"`` key.

    Parameters
    ----------
    full_name : str
        Full team name, e.g. ``"Los Angeles Lakers"``.

    Returns
    -------
    The ``"id"`` value of the first matching team entry.

    Raises
    ------
    IndexError
        If no team entry has exactly this full name.
    """
    teams_dict_list = teams.get_teams()
    # Search the list fetched above, not the notebook-level `teams_dict`:
    # relying on that global made the function fail with NameError whenever
    # the static-data cell had not been run first.
    matches = [x for x in teams_dict_list if x["full_name"] == full_name]
    if not matches:
        raise IndexError("no team with full name {!r}".format(full_name))
    return matches[0]["id"]

# Testing Data Functions

## Data Retrieval

Retrieving data from API.

In [6]:
# Parameters for a small smoke test of the data functions: the Lakers team ID
# and LeBron James's player ID looked up above, plus a single season.
team_id = LAL_id
plyr_id = bron_id
season_id = "2018-19"

In [7]:
# Fetch the roster for the team and season selected in the parameter cell.
# `team_id` is passed (instead of `LAL_id` directly) so that changing the
# team in that cell also changes this request, as it does for the stats cell.
df_team_roster = get_team_roster_dataframe(
    team_id=team_id, season_id=season_id, timeout_s=50
)

df_team_roster.head()

Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,SEASON_ID
0,1610612747,2018,0,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,1,Utah,1628398,2018-19
1,1610612747,2018,0,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,5,Georgia,203484,2018-19
2,1610612747,2018,0,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,1,UCLA,1628366,2018-19
3,1610612747,2018,0,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,1,Villanova,1628404,2018-19
4,1610612747,2018,0,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,1,Texas A&M,1627936,2018-19


In [8]:
# Fetch the stats frame for the selected team and season and preview its
# first rows.
# NOTE(review): the recorded run of this cell ended in a ReadTimeout with the
# 30 s timeout; the cells below need `df`, so re-run this cell until it succeeds.
df = get_team_plyr_stats_dataframe(team_id=team_id, season_id=season_id, timeout_s=30)
df.head()

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

## Filter Columns

In [None]:
# Columns kept when the stats frame is filtered in the next cell.
# Entries that are commented out are deliberately excluded from the selection.
COLUMNS_OF_INTEREST = [
    "SEASON_ID",
    # "TEAM_ID",
    "TEAM_ABBREVIATION",
    "PLAYER_ID",
    "PLAYER",
    "NUM",
    "POSITION",
    "HEIGHT",
    "WEIGHT",
    "BIRTH_DATE",
    "PLAYER_AGE",
    # "EXP",
    "GP",
    "GS",
    "MIN",
    "FGM",
    "FGA",
    "FG_PCT",
    "FG3M",
    "FG3A",
    "FG3_PCT",
    "FTM",
    "FTA",
    "FT_PCT",
    # "OREB",
    # "DREB",
    "REB",
    "AST",
    "STL",
    "BLK",
    # "TOV",
    # "PF",
    "PTS",
]

In [None]:
# Keep only the columns of interest, in the order given by the list.
df = df.loc[:, COLUMNS_OF_INTEREST]
df.head()

In [None]:
# Write `df` to lal.csv (a relative path). `index=False` is not passed, so
# the DataFrame index is written as the first, unnamed column.
df.to_csv("lal.csv")

In [None]:
# Sanity check: (number of rows, number of columns) of `df`.
df.shape

# Full Extraction

In [15]:
def get_data_and_save_data_for_season(
    team_id="Los Angeles Lakers", team_abbrev="lal", season_id=2017, timeout_s=60
):
    """Fetch one season of stats for a team and write it to a CSV file.

    The frame returned by ``get_team_plyr_stats_dataframe`` is saved under the
    relative path ``<team_abbrev>-<season_id>.csv``.

    Parameters
    ----------
    team_id
        Forwarded unchanged to ``get_team_plyr_stats_dataframe``.
    team_abbrev
        Prefix of the output file name.
    season_id
        Forwarded unchanged to ``get_team_plyr_stats_dataframe`` and used as
        the suffix of the output file name.
    timeout_s
        Forwarded unchanged to ``get_team_plyr_stats_dataframe``.

    Returns
    -------
    None
    """
    season_stats = get_team_plyr_stats_dataframe(
        team_id=team_id, season_id=season_id, timeout_s=timeout_s
    )

    csv_name = "{}-{}.csv".format(team_abbrev, season_id)
    season_stats.to_csv(csv_name)

In [16]:
# Download and save one CSV per year from 2000 through 2017.
# Years whose CSV already exists are skipped, so re-running this cell after a
# ReadTimeout resumes where it stopped instead of re-downloading every
# finished season. The file name mirrors the "<team_abbrev>-<season_id>.csv"
# pattern written by get_data_and_save_data_for_season.
for year in range(2000, 2018):
    if Path("{}-{}.csv".format("lal", year)).exists():
        print("{} already saved, skipping".format(year))
        continue
    print(year)
    get_data_and_save_data_for_season(
        team_id="Los Angeles Lakers", team_abbrev="lal", season_id=year, timeout_s=60
    )
    print("{} done".format(year))

2000
2000 done
2001


ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60)