# Generate Clean Dataset 01

Generate a globally useful, nba dataset from the small datasets that were collected.

# Preliminaries

In [1]:
# Black Codeformatter
%load_ext lab_black

## Imports 

In [2]:
import numpy as np
import pandas as pd
import altair as alt

from pathlib import Path

In [3]:
from nba_anomaly_generator.data import load_lal
from nba_anomaly_generator.anom import (
    insert_dependency_anomaly,
    insert_contextual_anomaly,
    insert_swap_anomaly,
    insert_transformation_anomaly,
    ft_to_m,
    lb_to_kg,
)
from nba_anomaly_generator.anom.utils import init_rng, init_row_idx

## Constants

Naming conventions etc.

In [4]:
N_JOBS = 4
VERBOSE = 51

In [31]:
NBA_DATA_DIR = Path().resolve().parent.parent / "data"

PLYR_DIR = NBA_DATA_DIR / "players"
TEAM_DIR = NBA_DATA_DIR / "rosters"
CLEAN_DIR = NBA_DATA_DIR / "clean"

FP = CLEAN_DIR / "nba-clean-01.csv"

FP

PosixPath('/home/zissou/repos/nba-anomaly-generator/data/clean/nba-clean-01.csv')

In [6]:
NORMALIZE = True
CONTAMINATION = 5
DROP_NA = True

# Row filter
MAX_AGE = 40
BEGIN_SEASON = 2006  # First season that we want in the eventual data (season filter)

# Column Filter
NUMERIC_COLUMNS_ONLY = True

## Attribute Types

Also a kind of constant, but data-specific for obvious reasons...

In [7]:
NOMINAL_ATTRIBUTES = [
    "PLAYER_ID",
    "SEASON_ID",
    "TEAM_ID",
    "TEAM_ABBREVIATION",
]

NUMERIC_ATTRIBUTES = [
    "PLAYER_AGE",
    "GP",
    "GS",
    "MIN",
    "FGM",
    "FGA",
    "FG_PCT",
    "FG3M",
    "FG3A",
    "FG3_PCT",
    "FTM",
    "FTA",
    "FT_PCT",
    "OREB",
    "DREB",
    "REB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
]

# Functions

In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


def normalize(df, scaler=MinMaxScaler):

    columns_to_scale = [c for c in df.columns if "label" not in c]

    for c in columns_to_scale:
        df[c] = scaler().fit_transform(df[c].values.reshape(-1, 1))
    return df


def dataframe_to_dataset(df, copy=True):
    if copy:
        df = df.copy()
    relevant_cols = [c for c in df.columns if c not in {"a_lbl", "i", "cluster_label"}]
    return df[relevant_cols].values

# Create DataSet

Create the standard NBA-dataset.

## Collect

In [9]:
dfs = []
for idx, fn in enumerate(PLYR_DIR.glob("*.csv")):
    df = pd.read_csv(fn, index_col=0)
    dfs.append(df)

df = pd.concat(dfs)

df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76823,1949-50,0,1610612747,MNL,22.0,1,,,1,...,,,,,0,,,,1,2
1,77524,1985-86,0,1610612739,CLE,23.0,21,0.0,266.0,28,...,0.625,15.0,23.0,38.0,9,7.0,1.0,10.0,30,61
2,77524,1986-87,0,1610612744,GOS,24.0,63,34.0,1284.0,164,...,0.632,63.0,120.0,183.0,84,27.0,8.0,43.0,200,353
3,77524,1987-88,0,1610612744,GOS,25.0,81,41.0,2039.0,258,...,0.784,133.0,202.0,335.0,138,39.0,8.0,93.0,246,612
4,77524,1988-89,0,1610612744,GOS,26.0,11,0.0,103.0,13,...,0.6,4.0,8.0,12.0,5,4.0,0.0,3.0,11,35


In [10]:
df.shape

(26951, 27)

## Types and `None`

Let us take a look at which attributes are in the dataset.

In [11]:
df.columns.tolist()

['PLAYER_ID',
 'SEASON_ID',
 'LEAGUE_ID',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'PLAYER_AGE',
 'GP',
 'GS',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

Now let us see which one of these attributes are accounted for in the beginning of this notebook.

In [12]:
set(df.columns) - set(NOMINAL_ATTRIBUTES) - set(NUMERIC_ATTRIBUTES)

{'LEAGUE_ID'}

Please verify that the attributes that are unaccounted for a truly not of interest, since they will not be included in the final dataset.

In [13]:
df = df[NOMINAL_ATTRIBUTES + NUMERIC_ATTRIBUTES]

In [14]:
if DROP_NA:
    print("Number of rows BEFORE dropping NA: {}".format(df.shape[0]))

    df = df.dropna()

    print("Number of rows AFTER dropping NA: {}".format(df.shape[0]))

Number of rows BEFORE dropping NA: 26951
Number of rows AFTER dropping NA: 20225


In [15]:
for attribute in NOMINAL_ATTRIBUTES:
    df[attribute] = df[attribute].astype("category")

for attribute in NUMERIC_ATTRIBUTES:
    df[attribute] = df[attribute].astype(float)

In [16]:
df.dtypes, df.shape

(PLAYER_ID            category
 SEASON_ID            category
 TEAM_ID              category
 TEAM_ABBREVIATION    category
 PLAYER_AGE            float64
 GP                    float64
 GS                    float64
 MIN                   float64
 FGM                   float64
 FGA                   float64
 FG_PCT                float64
 FG3M                  float64
 FG3A                  float64
 FG3_PCT               float64
 FTM                   float64
 FTA                   float64
 FT_PCT                float64
 OREB                  float64
 DREB                  float64
 REB                   float64
 AST                   float64
 STL                   float64
 BLK                   float64
 TOV                   float64
 PF                    float64
 PTS                   float64
 dtype: object,
 (20225, 26))

## Column Generation

### Season Column

In [17]:
def _season_id_to_season(season_id):
    return float(season_id.split("-")[0])

In [18]:
df["SEASON"] = df.apply(lambda r: _season_id_to_season(r.SEASON_ID), axis=1)
df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,SEASON
1,77524,1985-86,1610612739,CLE,23.0,21.0,0.0,266.0,28.0,58.0,...,15.0,23.0,38.0,9.0,7.0,1.0,10.0,30.0,61.0,1985.0
2,77524,1986-87,1610612744,GOS,24.0,63.0,34.0,1284.0,164.0,360.0,...,63.0,120.0,183.0,84.0,27.0,8.0,43.0,200.0,353.0,1986.0
3,77524,1987-88,1610612744,GOS,25.0,81.0,41.0,2039.0,258.0,552.0,...,133.0,202.0,335.0,138.0,39.0,8.0,93.0,246.0,612.0,1987.0
4,77524,1988-89,1610612744,GOS,26.0,11.0,0.0,103.0,13.0,19.0,...,4.0,8.0,12.0,5.0,4.0,0.0,3.0,11.0,35.0,1988.0
5,2776,2004-05,1610612761,TOR,23.0,27.0,4.0,255.0,23.0,58.0,...,18.0,39.0,57.0,2.0,12.0,4.0,10.0,46.0,62.0,2004.0


## Row Filters

### Filter out the non-active players

Filtering on "Games Played" `GP` will help to get comparable data. Players that have none or very little play time will have deviant statistics by default and that is not really what we are interested in.

So, a `GP` filter will be an easy way to get this fixed.

In [19]:
mean_gs = np.mean(df.GS.values)

# Only players that started an above average amount of games
df = df[df.GS > mean_gs]

df.shape

(7156, 27)

### Season Filter

Convenient subsampling to only get relatively recent data. Otherwise the dataset becomes huge, and  also the very old data shows some every strange patterns.

In [20]:
df = df[df.SEASON > BEGIN_SEASON]

df.shape

(2575, 27)

### Age Outlier Filter

Not the most necessary filter but nevertheless just to be sure. Depends a bit on the parameters you choose before this stage whether or not it is useful.

In [21]:
df = df[df.PLAYER_AGE < MAX_AGE]
df.PLAYER_AGE.max()

39.0

In [22]:
df.shape

(2570, 27)

## Column Filters

### Numeric only

For now, this is a limitation of `ADMERCS` and also most other methods.

In [23]:
if NUMERIC_COLUMNS_ONLY:
    print("Dropping all nominal columns")
    df = df[NUMERIC_ATTRIBUTES]

Dropping all nominal columns


In [24]:
pd.set_option("display.max_columns", 22)
df.head(15)

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
64,23.0,77.0,42.0,1939.0,243.0,540.0,0.45,82.0,203.0,0.404,78.0,94.0,0.83,18.0,160.0,178.0,91.0,76.0,14.0,69.0,157.0,646.0
65,24.0,71.0,66.0,2375.0,338.0,775.0,0.436,76.0,225.0,0.338,133.0,153.0,0.869,60.0,192.0,252.0,121.0,93.0,20.0,76.0,130.0,885.0
67,26.0,58.0,26.0,1757.0,251.0,580.0,0.433,87.0,217.0,0.401,71.0,86.0,0.826,26.0,133.0,159.0,88.0,68.0,22.0,64.0,106.0,660.0
68,27.0,78.0,39.0,1941.0,246.0,530.0,0.464,58.0,156.0,0.372,62.0,72.0,0.861,31.0,159.0,190.0,143.0,89.0,21.0,84.0,139.0,612.0
70,28.0,49.0,47.0,1469.0,205.0,431.0,0.476,49.0,142.0,0.345,81.0,90.0,0.9,21.0,117.0,138.0,83.0,45.0,19.0,48.0,90.0,540.0
71,28.0,79.0,47.0,1973.0,295.0,614.0,0.48,72.0,194.0,0.371,99.0,112.0,0.884,29.0,158.0,187.0,115.0,65.0,28.0,73.0,127.0,761.0
72,29.0,77.0,74.0,2354.0,288.0,643.0,0.448,90.0,224.0,0.402,111.0,129.0,0.86,21.0,157.0,178.0,151.0,75.0,12.0,78.0,129.0,777.0
73,30.0,51.0,37.0,1489.0,192.0,419.0,0.458,51.0,138.0,0.37,76.0,92.0,0.826,19.0,99.0,118.0,76.0,52.0,16.0,49.0,79.0,511.0
74,30.0,28.0,28.0,845.0,98.0,220.0,0.445,31.0,79.0,0.392,23.0,26.0,0.885,10.0,77.0,87.0,60.0,33.0,12.0,23.0,55.0,250.0
75,30.0,79.0,65.0,2334.0,290.0,639.0,0.454,82.0,217.0,0.378,99.0,118.0,0.839,29.0,176.0,205.0,136.0,85.0,28.0,72.0,134.0,761.0


## Normalize

Standard datascience practice.

In [25]:
if NORMALIZE:
    print("normalizing...")
    ndf = normalize(df)
    print("normalizing done...")

normalizing...
normalizing done...


In [26]:
ndf.shape

(2570, 22)

In [27]:
ndf.head(10)

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
64,0.2,0.898305,0.310345,0.492828,0.253949,0.253596,0.384615,0.20398,0.197471,0.404,0.101987,0.093716,0.808774,0.03211,0.141498,0.107679,0.092492,0.331754,0.049123,0.122222,0.42623,0.20701
65,0.25,0.79661,0.724138,0.641735,0.36938,0.378796,0.351981,0.189055,0.218872,0.338,0.174834,0.158765,0.852643,0.12844,0.179548,0.169449,0.125136,0.412322,0.070175,0.137778,0.337705,0.294268
67,0.35,0.576271,0.034483,0.430669,0.26367,0.274907,0.344988,0.216418,0.211089,0.401,0.092715,0.084895,0.804274,0.050459,0.109394,0.09182,0.089227,0.293839,0.077193,0.111111,0.259016,0.212121
68,0.4,0.915254,0.258621,0.493511,0.257594,0.248269,0.417249,0.144279,0.151751,0.372,0.080795,0.06946,0.843645,0.061927,0.140309,0.117696,0.149075,0.393365,0.073684,0.155556,0.367213,0.194597
70,0.45,0.423729,0.396552,0.332309,0.207776,0.195525,0.445221,0.121891,0.138132,0.345,0.10596,0.089305,0.887514,0.038991,0.090369,0.07429,0.083787,0.184834,0.066667,0.075556,0.206557,0.16831
71,0.45,0.932203,0.396552,0.50444,0.317132,0.293021,0.454545,0.179104,0.188716,0.371,0.129801,0.113561,0.869516,0.057339,0.13912,0.115192,0.118607,0.279621,0.098246,0.131111,0.327869,0.248996
72,0.5,0.898305,0.862069,0.634563,0.308627,0.308471,0.379953,0.223881,0.217899,0.402,0.145695,0.132304,0.84252,0.038991,0.137931,0.107679,0.15778,0.327014,0.042105,0.142222,0.334426,0.254838
73,0.55,0.457627,0.224138,0.339139,0.191981,0.189132,0.403263,0.126866,0.134241,0.37,0.099338,0.09151,0.804274,0.034404,0.068966,0.057596,0.07617,0.218009,0.05614,0.077778,0.170492,0.157722
74,0.55,0.067797,0.068966,0.119194,0.077764,0.083111,0.37296,0.077114,0.076848,0.392,0.029139,0.018743,0.870641,0.013761,0.042806,0.03172,0.05876,0.127962,0.042105,0.02,0.091803,0.062432
75,0.55,0.932203,0.706897,0.627732,0.311057,0.30634,0.393939,0.20398,0.211089,0.378,0.129801,0.120176,0.818898,0.057339,0.160523,0.130217,0.141458,0.374408,0.098246,0.128889,0.35082,0.248996


## Reindex

This needs to be done before applying any algorithmic filters.

In [28]:
ndf = ndf.reset_index(drop=True)
ndf.head()

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,0.2,0.898305,0.310345,0.492828,0.253949,0.253596,0.384615,0.20398,0.197471,0.404,0.101987,0.093716,0.808774,0.03211,0.141498,0.107679,0.092492,0.331754,0.049123,0.122222,0.42623,0.20701
1,0.25,0.79661,0.724138,0.641735,0.36938,0.378796,0.351981,0.189055,0.218872,0.338,0.174834,0.158765,0.852643,0.12844,0.179548,0.169449,0.125136,0.412322,0.070175,0.137778,0.337705,0.294268
2,0.35,0.576271,0.034483,0.430669,0.26367,0.274907,0.344988,0.216418,0.211089,0.401,0.092715,0.084895,0.804274,0.050459,0.109394,0.09182,0.089227,0.293839,0.077193,0.111111,0.259016,0.212121
3,0.4,0.915254,0.258621,0.493511,0.257594,0.248269,0.417249,0.144279,0.151751,0.372,0.080795,0.06946,0.843645,0.061927,0.140309,0.117696,0.149075,0.393365,0.073684,0.155556,0.367213,0.194597
4,0.45,0.423729,0.396552,0.332309,0.207776,0.195525,0.445221,0.121891,0.138132,0.345,0.10596,0.089305,0.887514,0.038991,0.090369,0.07429,0.083787,0.184834,0.066667,0.075556,0.206557,0.16831


# Save

Retain what you have created

In [39]:
ndf.to_csv(FP, index=False)
FP.exists()

True

In [40]:
pd.read_csv(FP).head()

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,0.2,0.898305,0.310345,0.492828,0.253949,0.253596,0.384615,0.20398,0.197471,0.404,0.101987,0.093716,0.808774,0.03211,0.141498,0.107679,0.092492,0.331754,0.049123,0.122222,0.42623,0.20701
1,0.25,0.79661,0.724138,0.641735,0.36938,0.378796,0.351981,0.189055,0.218872,0.338,0.174834,0.158765,0.852643,0.12844,0.179548,0.169449,0.125136,0.412322,0.070175,0.137778,0.337705,0.294268
2,0.35,0.576271,0.034483,0.430669,0.26367,0.274907,0.344988,0.216418,0.211089,0.401,0.092715,0.084895,0.804274,0.050459,0.109394,0.09182,0.089227,0.293839,0.077193,0.111111,0.259016,0.212121
3,0.4,0.915254,0.258621,0.493511,0.257594,0.248269,0.417249,0.144279,0.151751,0.372,0.080795,0.06946,0.843645,0.061927,0.140309,0.117696,0.149075,0.393365,0.073684,0.155556,0.367213,0.194597
4,0.45,0.423729,0.396552,0.332309,0.207776,0.195525,0.445221,0.121891,0.138132,0.345,0.10596,0.089305,0.887514,0.038991,0.090369,0.07429,0.083787,0.184834,0.066667,0.075556,0.206557,0.16831


In [42]:
ndf.head()

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,0.2,0.898305,0.310345,0.492828,0.253949,0.253596,0.384615,0.20398,0.197471,0.404,0.101987,0.093716,0.808774,0.03211,0.141498,0.107679,0.092492,0.331754,0.049123,0.122222,0.42623,0.20701
1,0.25,0.79661,0.724138,0.641735,0.36938,0.378796,0.351981,0.189055,0.218872,0.338,0.174834,0.158765,0.852643,0.12844,0.179548,0.169449,0.125136,0.412322,0.070175,0.137778,0.337705,0.294268
2,0.35,0.576271,0.034483,0.430669,0.26367,0.274907,0.344988,0.216418,0.211089,0.401,0.092715,0.084895,0.804274,0.050459,0.109394,0.09182,0.089227,0.293839,0.077193,0.111111,0.259016,0.212121
3,0.4,0.915254,0.258621,0.493511,0.257594,0.248269,0.417249,0.144279,0.151751,0.372,0.080795,0.06946,0.843645,0.061927,0.140309,0.117696,0.149075,0.393365,0.073684,0.155556,0.367213,0.194597
4,0.45,0.423729,0.396552,0.332309,0.207776,0.195525,0.445221,0.121891,0.138132,0.345,0.10596,0.089305,0.887514,0.038991,0.090369,0.07429,0.083787,0.184834,0.066667,0.075556,0.206557,0.16831
