# Basketball Anomaly Dataset

For both Synth and ADMERCS.

Based on our earliest idea of a demo dataset that would contain typical contextual anomalies which should be easy for MERCS to detect.



# Preliminaries

In [1]:
# (Optional) Black codeformatter (`pip install nb_black`) for jupyterlab. In jupyter notebook, this changes slightly.
%load_ext lab_black

## Imports

In [2]:
import nba_api
from nba_api.stats import endpoints
from nba_api.stats.static import players, teams

import pandas as pd
import numpy as np

# show all columns
pd.set_option("display.max_columns", None)

In [28]:
from nba_anomaly_generator.data import load_lal

from nba_anomaly_generator.anom import (
    insert_dependency_anomaly,
    insert_contextual_anomaly,
    insert_swap_anomaly,
    insert_transformation_anomaly,
    ft_to_m,
    lb_to_kg,
)

# Load Data

In [4]:
# loading dump
df = load_lal()
df.head()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.456,128,422,0.303,188,250,0.752,382,178,41,26,1308
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.347,137,158,0.867,238,110,73,13,938
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.406,75,228,0.329,20,48,0.417,251,255,69,19,465
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.407,92,274,0.336,55,80,0.688,248,93,64,40,525
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.445,24,50,0.48,51,64,0.797,67,77,24,9,229


# Kinds Of Columns

Highlighting the interesting columns for in the presentation.

## Metadata

In [20]:
metadata_columns = [
    "SEASON_ID",
    "TEAM_ABBREVIATION",
    "PLAYER_ID",
    "PLAYER",
    "NUM",
    "POSITION",
    "HEIGHT",
    "WEIGHT",
    "BIRTH_DATE",
    "PLAYER_AGE",
]

df.head().style.applymap(
    lambda x: "background-color:orange", subset=pd.IndexSlice[:, metadata_columns]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


## Stats

In [19]:
stats_columns = [
    "GP",
    "GS",
    "MIN",
    "FGM",
    "FGA",
    # "FG_PCT",
    "FG3M",
    "FG3A",
    # "FG3_PCT",
    "FTM",
    "FTA",
    # "FT_PCT",
    # "OREB",
    # "DREB",
    "REB",
    "AST",
    "STL",
    "BLK",
    # "TOV",
    # "PF",
    "PTS",
]

df.head().style.applymap(
    lambda x: "background-color:orange", subset=pd.IndexSlice[:, stats_columns]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


## Derived Stats

In [18]:
derived_stats_columns = [
    "FG_PCT",
    "FG3_PCT",
    "FT_PCT",
]

df.head().style.applymap(
    lambda x: "background-color:orange", subset=pd.IndexSlice[:, derived_stats_columns]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


## All together

In [17]:
pd.set_option("precision", 2)

df.head().style.applymap(
    lambda x: "background-color:lightblue", subset=pd.IndexSlice[:, metadata_columns]
).applymap(
    lambda x: "background-color:orange", subset=pd.IndexSlice[:, stats_columns]
).applymap(
    lambda x: "background-color:yellow", subset=pd.IndexSlice[:, derived_stats_columns],
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


# Insert Anomalies

## Dependency Anomaly

In [9]:
df, md = insert_dependency_anomaly(
    df,
    row=None,
    col="PLAYER_AGE",
    val=None,
    val_list=None,
    val_dist=None,
    random_state=420,
    rng=None,
)

df.head()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


In [10]:
md

{'iloc': (10, 9), 'loc': (10, 'PLAYER_AGE'), 'old': 21.0, 'new': 28.0}

In [22]:
df.tail(7).style.applymap(
    lambda x: "background-color:red", subset=pd.IndexSlice[md["loc"]]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
10,2018-19,LAL,1627742,Brandon Ingram,14,F,6-9,190,"SEP 02, 1997",28.0,52,52,1761.0,362,729,0.5,31,94,0.33,195,289,0.68,267,154,28,31,950,1
11,2018-19,LAL,1629021,Moritz Wagner,15,C,6-11,245,"APR 26, 1997",22.0,43,5,446.0,71,171,0.41,22,77,0.29,43,53,0.81,85,24,11,13,207,0
12,2018-19,LAL,1629067,Isaac Bonga,17,G,6-8,180,"NOV 08, 1999",19.0,22,0,120.0,5,33,0.15,0,8,0.0,9,15,0.6,25,15,9,4,19,0
13,2018-19,LAL,1629140,Johnathan Williams,19,F,6-9,228,"MAY 22, 1995",24.0,24,0,372.0,65,110,0.59,0,2,0.0,27,48,0.56,99,13,8,7,157,0
14,2018-19,LAL,2544,LeBron James,23,F,6-8,250,"DEC 30, 1984",34.0,55,55,1937.0,558,1095,0.51,111,327,0.34,278,418,0.67,465,454,72,33,1505,0
15,2018-19,LAL,203488,Mike Muscala,31,F-C,6-11,240,"JUL 01, 1991",27.0,17,4,265.0,36,83,0.43,21,57,0.37,7,8,0.88,44,14,4,11,100,0
16,2018-19,LAL,203493,Reggie Bullock,35,G-F,6-7,205,"MAR 16, 1991",28.0,19,16,524.0,63,153,0.41,34,99,0.34,17,21,0.81,50,20,16,7,177,0


lgtm

## Swap Anomaly

Swapping two values in a row (or in a column).

The row based one basically introduces global anomalies. The column based one is another way to introduce dependency anomalies in two places at once.

In [23]:
df, md = insert_swap_anomaly(
    df,
    row=None,
    l_col=None,
    r_col=None,
    val=None,
    val_list=None,
    val_dist=None,
    rng=None,
    random_state=42,
)

In [25]:
md

[{'loc': (1, 21), 'old': 0.34700000000000003, 'new': 0.867},
 {'loc': (1, 18), 'old': 0.867, 'new': 0.34700000000000003}]

In [26]:
df.head()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-5,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


## Transformation Anomaly

Apply a transformation to a value.

This is rather general.
- Change units (global anomaly)
- Shift a value (for instance edit a string), can be a global (if you hit a non-existing value) or local (if you end up in a familiar subspace)
    

In [29]:
df, md = insert_transformation_anomaly(
    df,
    row=None,
    col="HEIGHT",
    transformation=ft_to_m,
    rng=None,
    random_state=420,
    return_anomaly_metadata=True,
)

In [30]:
md

{'iloc': (10, 6),
 'loc': (10, 'HEIGHT'),
 'old': '6-9',
 'new': 2.0574000000000003}

In [38]:
df.tail(7).style.applymap(
    lambda x: "background-color:red", subset=pd.IndexSlice[md["loc"]]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
10,2018-19,LAL,1627742,Brandon Ingram,14,F,2.06,190,"SEP 02, 1997",28.0,52,52,1761.0,362,729,0.5,31,94,0.33,195,289,0.68,267,154,28,31,950,1
11,2018-19,LAL,1629021,Moritz Wagner,15,C,6-11,245,"APR 26, 1997",22.0,43,5,446.0,71,171,0.41,22,77,0.29,43,53,0.81,85,24,11,13,207,0
12,2018-19,LAL,1629067,Isaac Bonga,17,G,6-8,180,"NOV 08, 1999",19.0,22,0,120.0,5,33,0.15,0,8,0.0,9,15,0.6,25,15,9,4,19,0
13,2018-19,LAL,1629140,Johnathan Williams,19,F,6-9,228,"MAY 22, 1995",24.0,24,0,372.0,65,110,0.59,0,2,0.0,27,48,0.56,99,13,8,7,157,0
14,2018-19,LAL,2544,LeBron James,23,F,6-8,250,"DEC 30, 1984",34.0,55,55,1937.0,558,1095,0.51,111,327,0.34,278,418,0.67,465,454,72,33,1505,0
15,2018-19,LAL,203488,Mike Muscala,31,F-C,6-11,240,"JUL 01, 1991",27.0,17,4,265.0,36,83,0.43,21,57,0.37,7,8,0.88,44,14,4,11,100,0
16,2018-19,LAL,203493,Reggie Bullock,35,G-F,6-7,205,"MAR 16, 1991",28.0,19,16,524.0,63,153,0.41,34,99,0.34,17,21,0.81,50,20,16,7,177,0


## Contextual Anomaly

Introduce an anomaly in a subpopulation. This is not a global one, and also not really a local one (if it is situated in a single feature, even local metrics may fail to detect.)

This is a higher level anomaly, for instance you can replace a guard's height with a height sampled from the centers. In that way, it may actually be difficult to detect for global methods, but in the subpopulation of guards, it should really be a big outlier.

Re-use previous methods which you derived above. Finalize this asap!

Also, write your demo-DataFrame to disk. 

In [43]:
df, md = insert_contextual_anomaly(
    df,
    row=None,
    col="HEIGHT",
    src_subpop_filter=lambda r: r.POSITION == "C",
    tgt_subpop_filter=lambda r: r.POSITION == "G",
    rng=None,
    random_state=40,
    return_anomaly_metadata=True,
)

['7-1' '7-0' '6-11']


In [40]:
md

{'iloc': (8, 6), 'loc': (8, 'HEIGHT'), 'old': '6-1', 'new': '7-0'}

In [45]:
df.head(5).style.applymap(
    lambda x: "background-color:orange", subset=pd.IndexSlice[md["loc"]]
)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23.0,70,68,2314.0,496,1087,0.46,128,422,0.3,188,250,0.75,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26.0,82,23,2035.0,325,756,0.43,151,435,0.35,137,158,0.87,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21.0,47,45,1423.0,185,456,0.41,75,228,0.33,20,48,0.42,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24.0,67,22,1715.0,189,464,0.41,92,274,0.34,55,80,0.69,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-11,186,"FEB 28, 1994",25.0,25,4,531.0,77,173,0.45,24,50,0.48,51,64,0.8,67,77,24,9,229,0


So this looks rather good to me, and all the power of the subpopulations is in the filers so there you can in principle go unlimited. Now the point is that you need to document your efforts and get this into actual datasets.

- Datasets
- Spreadsheets
- Demos
    - Does ADMERCS work
    - Can you get ADMERCS in Synth?
    - Can Synth do other cool stuff on these datasets?

# Create Datasets

Apply all the above the generate some actual datasets.

# SpreadSheet

For Synth, convert to spreadsheets useable in the Synth framework.

In [46]:
import openpyxl

In [51]:
with pd.ExcelWriter("output.xlsx") as writer:
    df.to_excel(
        writer, startrow=0, startcol=0, index=False, sheet_name="anomaly_detection"
    )