# Anomaly Toy Datasets

Generating toy datasets for ADMERCS with very particular anomalies.

# Preliminaries

In [1]:
# Black Codeformatter
%load_ext lab_black

## Imports

In [2]:
import numpy as np
import pandas as pd
import os

from time import sleep
from pathlib import Path

# Custom imports
from nba_api.stats.static import players, teams

from nba_anomaly_generator.data import (
    get_team_roster_dataframe,
    get_plyr_stats_dataframe,
)

In [3]:
pd.set_option("display.max_columns", None)

## Constants

In [4]:
# Data directory: "data" two levels above the resolved current working
# directory, so the result depends on where the kernel was started.
DATA_DIR = Path().resolve().parent.parent / "data"

# Sub-directories of DATA_DIR; PLYR_DIR is the one globbed for per-player CSVs.
PLYR_DIR = DATA_DIR / "players"
TEAM_DIR = DATA_DIR / "rosters"

In [5]:
# Base seed from which the per-anomaly random_state values are derived.
RANDOM_SEED = 42
# Share of rows to turn into anomalies, in percent (it is multiplied by 0.01).
ANOMALY_FRACTION = 5

# Version number embedded in the output file names.
VERSION = 0
# Truthy -> generated datasets are passed through normalize() before saving.
NORMALIZE = 1

# Data

## Collection

In [6]:
# Read every per-player CSV and stack them into one frame with a fresh
# 0..n-1 index.
# The paths are sorted because glob yields files in a filesystem-dependent
# order; a stable row order is needed for the seeded anomaly insertion further
# down to be reproducible across machines (it appears to pick rows by position).
dfs = [pd.read_csv(fn, index_col=0) for fn in sorted(PLYR_DIR.glob("*.csv"))]

df = pd.concat(dfs, ignore_index=True)

In [7]:
df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1626170,2015-16,0,1610612752,NYK,23.0,76,6.0,1265.0,154,391,0.394,22.0,100.0,0.22,96,123,0.78,23.0,120.0,143.0,177,50.0,10.0,87.0,97,426
1,1626170,2016-17,0,1610612741,CHI,24.0,63,28.0,1028.0,128,301,0.425,49.0,134.0,0.366,65,73,0.89,17.0,94.0,111.0,121,47.0,8.0,44.0,93,370
2,1626170,2017-18,0,1610612741,CHI,25.0,74,26.0,1686.0,209,503,0.416,61.0,187.0,0.326,140,188,0.745,31.0,140.0,171.0,342,63.0,8.0,89.0,132,619
3,1626170,2018-19,0,1610612753,ORL,26.0,60,1.0,939.0,92,220,0.418,40.0,110.0,0.364,26,40,0.65,19.0,79.0,98.0,156,44.0,6.0,51.0,78,250
4,202371,2010-11,0,1610612756,PHX,22.0,1,0.0,2.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1,0


## Types and Nones

In [8]:
df.columns

Index(['PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [9]:
# Identifier-like columns; these are cast to the pandas "category" dtype.
nominal_attributes = [
    "PLAYER_ID",
    "SEASON_ID",
    "TEAM_ID",
    "TEAM_ABBREVIATION",
]

# Statistic columns; these are cast to float and missing values become 0.
# LEAGUE_ID appears in neither list, so it is dropped by the column selection.
numeric_attributes = [
    "PLAYER_AGE",
    "GP",
    "GS",
    "MIN",
    "FGM",
    "FGA",
    "FG_PCT",
    "FG3M",
    "FG3A",
    "FG3_PCT",
    "FTM",
    "FTA",
    "FT_PCT",
    "OREB",
    "DREB",
    "REB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
]

In [10]:
# Select the columns of interest and fix their dtypes in one chained
# expression. `astype` returns a new frame, so no column is ever assigned on a
# slice of the original (which is what triggers SettingWithCopyWarning).
dtype_by_column = {
    **{attribute: "category" for attribute in nominal_attributes},
    **{attribute: float for attribute in numeric_attributes},
}

df = (
    df[nominal_attributes + numeric_attributes]
    .astype(dtype_by_column)
    # Missing statistics are treated as zero; nominal columns are left alone.
    .fillna({attribute: 0 for attribute in numeric_attributes})
)

In [11]:
df.dtypes, df.shape

(PLAYER_ID            category
 SEASON_ID            category
 TEAM_ID              category
 TEAM_ABBREVIATION    category
 PLAYER_AGE            float64
 GP                    float64
 GS                    float64
 MIN                   float64
 FGM                   float64
 FGA                   float64
 FG_PCT                float64
 FG3M                  float64
 FG3A                  float64
 FG3_PCT               float64
 FTM                   float64
 FTA                   float64
 FT_PCT                float64
 OREB                  float64
 DREB                  float64
 REB                   float64
 AST                   float64
 STL                   float64
 BLK                   float64
 TOV                   float64
 PF                    float64
 PTS                   float64
 dtype: object,
 (26951, 26))

## Add Columns

### Season Column

In [12]:
def _season_id_to_season(season_id):
    return float(season_id.split("-")[0])

In [13]:
# Map the SEASON_ID column directly instead of using a row-wise
# DataFrame.apply, which builds a Series object for every single row.
# The cast to object matters: SEASON_ID has category dtype, and mapping a
# categorical could yield a categorical result instead of a float column.
df["SEASON"] = df["SEASON_ID"].astype(object).map(_season_id_to_season)

In [14]:
df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,SEASON
0,1626170,2015-16,1610612752,NYK,23.0,76.0,6.0,1265.0,154.0,391.0,0.394,22.0,100.0,0.22,96.0,123.0,0.78,23.0,120.0,143.0,177.0,50.0,10.0,87.0,97.0,426.0,2015.0
1,1626170,2016-17,1610612741,CHI,24.0,63.0,28.0,1028.0,128.0,301.0,0.425,49.0,134.0,0.366,65.0,73.0,0.89,17.0,94.0,111.0,121.0,47.0,8.0,44.0,93.0,370.0,2016.0
2,1626170,2017-18,1610612741,CHI,25.0,74.0,26.0,1686.0,209.0,503.0,0.416,61.0,187.0,0.326,140.0,188.0,0.745,31.0,140.0,171.0,342.0,63.0,8.0,89.0,132.0,619.0,2017.0
3,1626170,2018-19,1610612753,ORL,26.0,60.0,1.0,939.0,92.0,220.0,0.418,40.0,110.0,0.364,26.0,40.0,0.65,19.0,79.0,98.0,156.0,44.0,6.0,51.0,78.0,250.0,2018.0
4,202371,2010-11,1610612756,PHX,22.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.0


## Filter Rows

### Key Players

Filtering on "Games Played" `GP` will help to get comparable data. Players that have no or very little playing time will have deviant statistics by default, and that is not really what we are interested in.

So, a `GP`-style filter is an easy way to get this fixed. Note that the cell below actually filters on "Games Started" (`GS`): it keeps only the rows whose `GS` is above the mean `GS`.

In [15]:
# Keep only the rows whose games-started count (GS) exceeds the overall mean.
mean_gs = np.mean(df.GS.values)

df = df.loc[df["GS"].gt(mean_gs)]

df.shape

(8166, 27)

### Season Filter

This is convenient for subsampling

In [16]:
# Keep only seasons whose starting year is later than 2010.
df = df.loc[df["SEASON"].gt(2010)]

In [17]:
df.shape

(2008, 27)

### Filter - Age Issues

Depending on the filters applied earlier, you may not run into implausible ages at all, but we filter them out here just to be on the safe side.

In [18]:
# Drop rows with an age of 50 or more, then show the remaining maximum age.
df = df.loc[df["PLAYER_AGE"].lt(50)]
df["PLAYER_AGE"].max()

41.0

### Other Filters

## Filter Columns

In [19]:
# From here on only the numeric statistics are kept; the nominal columns and
# the helper SEASON column are dropped.
df = df.loc[:, numeric_attributes]
df.head()

Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
1,24.0,63.0,28.0,1028.0,128.0,301.0,0.425,49.0,134.0,0.366,65.0,73.0,0.89,17.0,94.0,111.0,121.0,47.0,8.0,44.0,93.0,370.0
2,25.0,74.0,26.0,1686.0,209.0,503.0,0.416,61.0,187.0,0.326,140.0,188.0,0.745,31.0,140.0,171.0,342.0,63.0,8.0,89.0,132.0,619.0
93,32.0,46.0,32.0,1250.0,141.0,345.0,0.409,33.0,112.0,0.295,29.0,45.0,0.644,20.0,114.0,134.0,93.0,37.0,7.0,45.0,77.0,344.0
94,33.0,76.0,72.0,2278.0,244.0,612.0,0.399,99.0,267.0,0.371,85.0,110.0,0.773,33.0,174.0,207.0,226.0,51.0,25.0,85.0,143.0,672.0
133,24.0,52.0,43.0,1525.0,286.0,666.0,0.429,43.0,134.0,0.321,84.0,114.0,0.737,24.0,197.0,221.0,310.0,104.0,27.0,150.0,174.0,699.0


## Normalize

Normalization of the numerical columns. This will screw up some connections that are present in the data!

In [20]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


def normalize(df):
    """Min-max scale the columns of ``df`` in place and return ``df``.

    Every column whose name does not contain the substring ``"label"`` is
    replaced by the output of a freshly fitted ``MinMaxScaler``. Columns such
    as ``cluster_label`` are therefore skipped, whereas ``a_lbl`` (no "label"
    in its name) is scaled like any other column.

    The passed frame itself is modified; the return value is the same object.
    """
    for column_name in df.columns:
        if "label" in column_name:
            # Label-like columns keep their original values.
            continue

        # The scaler expects a 2D array, hence the single-column reshape.
        column_as_matrix = df[column_name].values.reshape(-1, 1)
        df[column_name] = MinMaxScaler().fit_transform(column_as_matrix)

    return df

In [21]:
# Snapshot of the cleaned, un-normalized data. The relative path resolves
# against the current working directory, and the index written here is the one
# left over from the row filters (it is only reset in a later cell).
df.to_csv("clean.csv")

## Reindex DataFrame

Never forget this step. Our index must really be consecutive for our anomaly-generation procedures.

In [22]:
# Replace the filtered index by a consecutive 0..n-1 index.
df = df.reset_index(drop=True)

# Global Anomaly Parameters

In the procedure executed below, some things are left constant.

In [23]:
from nba_anomaly_generator.data import load_lal

from nba_anomaly_generator.anom import (
    insert_dependency_anomaly,
    insert_contextual_anomaly,
    insert_swap_anomaly,
    insert_transformation_anomaly,
    ft_to_m,
    lb_to_kg,
)

In [24]:
df.shape

(2008, 22)

In [25]:
# Number of anomalies per dataset: ANOMALY_FRACTION percent of the row count,
# truncated to an integer.
NB_OF_ANOMALIES = int(df.shape[0] * ANOMALY_FRACTION * 0.01)

# Anomalies of the First Kind: Point Anomaly

**Procedure**: Introduce fake values by manually changing an existing value by means of a transformation.

Here we would expect to see any anomaly detection system perform rather well.

In [26]:
# Work on a copy so that `df` stays untouched; md_01 collects one metadata
# record per inserted anomaly.
df_01 = df.copy()
md_01 = []
for anomaly_idx in range(NB_OF_ANOMALIES):
    df_01, md = insert_transformation_anomaly(
        df_01,
        row=None,  # presumably lets the helper choose the row - TODO confirm
        col="PLAYER_AGE",
        transformation=lambda x: 10 * x,  # anomalous value = 10x the original
        rng=None,
        # A different seed for every insertion: 0, 42, 84, ...
        random_state=RANDOM_SEED * anomaly_idx,
        return_anomaly_metadata=True,
    )
    md_01.append(md)

In [27]:
df_01.PLAYER_AGE.max()

390.0

In [28]:
np.sum(df_01.a_lbl)

100

In [29]:
# Optionally min-max scale the dataset, then write it to the current working
# directory. The file name encodes ANOMALY_FRACTION and VERSION, both
# zero-padded to two digits.
if NORMALIZE:
    df_01 = normalize(df_01)

df_01.to_csv("anompoint_n01_a{0:02d}_v{1:02d}.csv".format(ANOMALY_FRACTION, VERSION))

# Anomalies of the Second Kind: Dependency Anomaly

**Procedure**: Introduce fake values by randomly sampling another value of the same column.

Here, we would expect global methods to fail. Local methods should be doing alright in the end.

In [30]:
# Work on a copy so that `df` stays untouched; md_02 collects the second value
# returned by every insertion.
df_02 = df.copy()
md_02 = []

for anomaly_idx in range(NB_OF_ANOMALIES):
    # NOTE(review): unlike the other insertion loops, return_anomaly_metadata
    # is not passed here although two values are unpacked - presumably the
    # helper returns the metadata by default; confirm against its signature.
    df_02, md = insert_dependency_anomaly(
        df_02,
        row=None,
        col="FGM",
        val=None,
        val_list=None,
        val_dist=None,
        # A different seed for every insertion: 0, 42, 84, ...
        random_state=RANDOM_SEED * anomaly_idx,
        rng=None,
    )

    md_02.append(md)

In [31]:
np.sum(df_02.a_lbl)

100

In [32]:
df_02.shape

(2008, 23)

In [33]:
# Optionally min-max scale the dataset, then write it to the current working
# directory. The file name encodes ANOMALY_FRACTION and VERSION, both
# zero-padded to two digits.
if NORMALIZE:
    df_02 = normalize(df_02)


df_02.to_csv("anomdeps_n01_a{0:02d}_v{1:02d}.csv".format(ANOMALY_FRACTION, VERSION))

# Anomalies of the Third Kind: Contextual Anomaly

**Procedure**: Introduce fake values by randomly sampling another value of the same column, but that value comes from a different subpopulation.

Here we would like to see ADMERCS shine, and everyone else struggle.

## Context 01: rebounds and threes

The big guys in the game are not good in long-distance shooting.

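So, the context is defined by `REB` and the attribute we modify is `FG3M`. Note that the cell below currently defines the sub-populations via `FG3M` and modifies `FT_PCT` instead.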

In [34]:
# Work on a copy so that `df` stays untouched; md_03 collects one metadata
# record per inserted anomaly.
df_03 = df.copy()
md_03 = []

# NOTE(review): in the cells shown above `df` itself is never passed through
# normalize(), so the 0.8 / 0.2 thresholds below are compared against raw FG3M
# values - confirm that this is the intended split of the sub-populations.
for anomaly_idx in range(NB_OF_ANOMALIES):
    df_03, md = insert_contextual_anomaly(
        df_03,
        row=None,
        col="FT_PCT",
        src_subpop_filter=lambda r: r.FG3M > 0.8,  # Labelled "best shooters"
        tgt_subpop_filter=lambda r: r.FG3M < 0.2,  # Labelled "regular shooters"
        rng=None,
        # A different seed for every insertion: 42, 43, 44, ...
        random_state=RANDOM_SEED + anomaly_idx,
        return_anomaly_metadata=True,
        swap=False,
    )

    md_03.append(md)

In [35]:
# Optionally min-max scale the dataset, then write it to the current working
# directory. The file name encodes ANOMALY_FRACTION and VERSION, both
# zero-padded to two digits.
if NORMALIZE:
    df_03 = normalize(df_03)

df_03.to_csv("anomcontext_n01_a{0:02d}_v{1:02d}.csv".format(ANOMALY_FRACTION, VERSION))

## Context 01: Rebounds and Threes - SWAP version

This may be a better implementation of our context-anomaly.

In [36]:
# Work on a copy so that `df` stays untouched; md_04 collects the metadata
# records of all insertions.
df_04 = df.copy()
md_04 = []

# NOTE(review): in the cells shown above `df` itself is never passed through
# normalize(), so the 0.8 / 0.2 thresholds below are compared against raw FG3M
# values - confirm that this is the intended split of the sub-populations.
for anomaly_idx in range(NB_OF_ANOMALIES):
    df_04, md = insert_contextual_anomaly(
        df_04,
        row=None,
        col="FT_PCT",
        src_subpop_filter=lambda r: r.FG3M > 0.8,  # Labelled "best shooters"
        tgt_subpop_filter=lambda r: r.FG3M < 0.2,  # Labelled "regular shooters"
        rng=None,
        # A different seed for every insertion: 42, 43, 44, ...
        random_state=RANDOM_SEED + anomaly_idx,
        return_anomaly_metadata=True,
        swap=True,
    )

    md_04.extend(md)  # With swap=True the helper already returns a list

In [37]:
# Optionally min-max scale the dataset, then write it to the current working
# directory. The file name encodes ANOMALY_FRACTION and VERSION, both
# zero-padded to two digits.
if NORMALIZE:
    df_04 = normalize(df_04)

df_04.to_csv(
    "anomswapcontext_n01_a{0:02d}_v{1:02d}.csv".format(ANOMALY_FRACTION, VERSION)
)

# More Advanced Contextual Anomalies

First I perform some clustering, and then I swap between those clusters.

## Clustering

In [38]:
from sklearn.cluster import KMeans

# Intended number of KMeans clusters (several calls further down pass
# n_clusters=10 explicitly).
N_CLUSTERS = 10

In [39]:
from nba_anomaly_generator.anom.utils import init_rng, init_row_idx

In [40]:
def add_cluster_column(df, n_clusters=10, **kmeans_kwargs):
    """Cluster the rows of ``df`` with KMeans and store the labels in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster. KMeans is fit on ``df.values``, i.e. on every column.
    n_clusters : int
        Number of clusters to form.
    **kmeans_kwargs
        Extra keyword arguments forwarded to ``KMeans``. When ``random_state``
        is not given it defaults to ``RANDOM_SEED`` so that repeated runs
        produce the same clustering.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a ``cluster_label`` column added (or
        overwritten). The frame is modified in place.
    """
    # An unseeded KMeans gives different clusters on every run, which would make
    # every dataset derived from the clusters irreproducible.
    kmeans_kwargs.setdefault("random_state", RANDOM_SEED)

    c = KMeans(n_clusters=n_clusters, **kmeans_kwargs)
    c.fit(df.values)

    print("Clustering succesful.")

    df["cluster_label"] = c.labels_
    return df


def find_an_interesting_switch(df, field="FGM", must_include=None):
    """Rank pairs of clusters whose ``field`` values are clearly separated.

    For every cluster the interval ``[mean - std, mean + std]`` of ``field`` is
    computed. Two clusters form an interesting combination when their intervals
    do not overlap; ``delta`` is the size of the gap between the intervals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``field`` and a ``cluster_label`` column.
    field : str
        Column on which the clusters are compared.
    must_include : int or None
        Cluster label that has to be part of every returned combination.
        ``None`` (default) disables this constraint.

    Returns
    -------
    pandas.DataFrame
        One row per interesting combination with the columns ``delta``,
        ``combo`` (a ``(lower_label, higher_label)`` tuple) and ``field``,
        sorted by decreasing ``delta`` with a fresh 0..n-1 index. The frame is
        empty when no pair of clusters is separated.

    Raises
    ------
    AssertionError
        If ``must_include`` is given but is not an integer.
    """
    # Validate once up front; numpy integers (e.g. taken from a label array)
    # are accepted as well as plain Python ints.
    if must_include is not None:
        assert isinstance(
            must_include, (int, np.integer)
        ), "If you have this kind of preference, it must be an integer"

    cluster_labels = np.sort(df["cluster_label"].unique())

    # One (lower bound, upper bound, label) triple per cluster.
    bounds = []
    for label in cluster_labels:
        values = df.loc[df["cluster_label"] == label, field]
        center, spread = values.mean(), values.std()
        bounds.append((center - spread, center + spread, label))

    # All interesting combos
    combos = []
    deltas = []
    for position, (l1, u1, label_1) in enumerate(bounds):
        for l2, u2, label_2 in bounds[position + 1 :]:
            # The constraint refers to cluster LABELS, not to positions in the
            # sorted label list; the two differ as soon as a label is missing.
            if must_include is not None and must_include not in (label_1, label_2):
                continue

            if u1 < l2:
                # Cluster 1 lies entirely below cluster 2.
                deltas.append(l2 - u1)
                combos.append((label_1, label_2))
            elif l1 > u2:
                # Cluster 2 lies entirely below cluster 1.
                deltas.append(l1 - u2)
                combos.append((label_2, label_1))
            # Otherwise the intervals overlap: no interesting combo.

    # Sort according to the delta associated with each combo.
    df_switch = pd.DataFrame({"delta": deltas, "combo": combos})
    df_switch["field"] = field
    df_switch = df_switch.sort_values(by="delta", ascending=False, ignore_index=True)

    return df_switch


def pick_interesting_switch(df_switch, rng=None, random_state=42, k=None):
    """Pick one entry of ``df_switch`` among its first ``k`` rows.

    Parameters
    ----------
    df_switch : pandas.DataFrame
        Candidate switches; the first ``k`` rows are the eligible ones.
    rng, random_state
        Forwarded to the project helper ``init_rng``.
    k : int or None
        Number of leading rows that are eligible. ``None``, or a value larger
        than the number of rows, makes every row eligible.

    Returns
    -------
    The result of ``df_switch.iloc[...]`` for the position(s) returned by the
    project helper ``init_row_idx`` (presumably a single randomly chosen row -
    TODO confirm against the helper).
    """
    rng = init_rng(rng=rng, random_state=random_state)

    n_candidates = df_switch.shape[0]
    if k is None or k > n_candidates:
        top_k = None  # no upper limit: every row is eligible
    else:
        top_k = k

    eligible = df_switch.iloc[:top_k, :]
    chosen_position = init_row_idx(rng, eligible)

    return df_switch.iloc[chosen_position]

In [41]:
# Use the N_CLUSTERS constant instead of repeating the literal 10, so the
# number of clusters is configured in a single place.
df = add_cluster_column(df, n_clusters=N_CLUSTERS)
df.head()

Clustering succesful.


Unnamed: 0,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,cluster_label
0,24.0,63.0,28.0,1028.0,128.0,301.0,0.425,49.0,134.0,0.366,65.0,73.0,0.89,17.0,94.0,111.0,121.0,47.0,8.0,44.0,93.0,370.0,5
1,25.0,74.0,26.0,1686.0,209.0,503.0,0.416,61.0,187.0,0.326,140.0,188.0,0.745,31.0,140.0,171.0,342.0,63.0,8.0,89.0,132.0,619.0,7
2,32.0,46.0,32.0,1250.0,141.0,345.0,0.409,33.0,112.0,0.295,29.0,45.0,0.644,20.0,114.0,134.0,93.0,37.0,7.0,45.0,77.0,344.0,0
3,33.0,76.0,72.0,2278.0,244.0,612.0,0.399,99.0,267.0,0.371,85.0,110.0,0.773,33.0,174.0,207.0,226.0,51.0,25.0,85.0,143.0,672.0,3
4,24.0,52.0,43.0,1525.0,286.0,666.0,0.429,43.0,134.0,0.321,84.0,114.0,0.737,24.0,197.0,221.0,310.0,104.0,27.0,150.0,174.0,699.0,7


In [42]:
# Rank all pairs of clusters whose FGM intervals (mean +/- std) do not overlap.
df_switch = find_an_interesting_switch(df, field="FGM")
df_switch.head()

Unnamed: 0,delta,combo,field
0,468.141445,"(5, 6)",FGM
1,396.764834,"(0, 6)",FGM
2,359.63209,"(5, 4)",FGM
3,322.976897,"(7, 6)",FGM
4,288.255478,"(0, 4)",FGM


In [43]:
# k=None keeps every ranked candidate eligible; the actual choice is made by
# the project helper init_row_idx (presumably at random, seeded via
# random_state - TODO confirm).
switch = pick_interesting_switch(df_switch, rng=None, random_state=42, k=None)
switch

delta    359.632
combo     (5, 4)
field        FGM
Name: 2, dtype: object

## Swapping between Clusters

Make 10 versions. Do it in a single dimension or in multiple ones.

Run the algorithms and observe whether you can see the patterns. If you switch things around in more dimensions, you should be able to witness some stuff happening.

It really cannot be too hard now. Start writing the entire thing down. Go at it with fury, and you will have yet another paper for your phd. That brings the grand total to 3. 

And then this summer you can spend with Gust on ddw, with Sam on clean autocomplete and with yourself on the CERN data for reals. So that would mean 2 or 3 submissions to AAAI, which would be a pretty insane total. Add to that Koralp's work _and_ maybe a paper with Jannes. Then, no-one can argue that you deserve a PhD.

In [44]:
def insert_switch_anomaly(
    df,
    field="FGM",
    n_clusters=10,
    rng=None,
    random_state=42,
    k=None,
    n_anomalies=10,
    normalize_df=NORMALIZE,
):
    """Insert contextual anomalies in ``field`` between two separated clusters.

    A switch (ordered pair of cluster labels) is picked among the ``k`` best
    candidates for ``field``; then ``n_anomalies`` contextual anomalies are
    inserted with the first cluster of the pair as source sub-population and
    the second one as target sub-population.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. If it has no ``cluster_label`` column, one is added to this
        very frame via ``add_cluster_column``.
    field : str
        Column that receives the anomalies.
    n_clusters : int
        Number of clusters; only used when the clustering still has to be done.
    rng, random_state, k
        Forwarded to ``pick_interesting_switch``; ``rng`` and a per-anomaly
        ``random_state + i`` are also forwarded to ``insert_contextual_anomaly``.
    n_anomalies : int
        Number of insertions.
    normalize_df : bool-like
        If truthy, the result is passed through ``normalize`` before returning.

    Returns
    -------
    (pandas.DataFrame, list)
        The modified copy of ``df`` and the list of per-anomaly metadata.
    """
    # Cluster on demand only; an existing clustering is reused as is.
    needs_clustering = "cluster_label" not in df.columns
    if needs_clustering:
        df = add_cluster_column(df, n_clusters=n_clusters)

    candidate_switches = find_an_interesting_switch(df, field=field)
    chosen_switch = pick_interesting_switch(
        candidate_switches, rng=rng, random_state=random_state, k=k
    )
    src_label, tgt_label = chosen_switch.combo

    def _in_source_cluster(r):
        return r.cluster_label == src_label

    def _in_target_cluster(r):
        return r.cluster_label == tgt_label

    # Anomalies go into a copy; every insertion gets its own seed.
    ndf = df.copy()
    amd = []
    for anomaly_idx in range(n_anomalies):
        ndf, md = insert_contextual_anomaly(
            ndf,
            row=None,
            col=field,
            src_subpop_filter=_in_source_cluster,
            tgt_subpop_filter=_in_target_cluster,
            rng=rng,
            random_state=random_state + anomaly_idx,
            return_anomaly_metadata=True,
            swap=False,
        )
        amd.append(md)

    if normalize_df:
        ndf = normalize(ndf)

    return ndf, amd

In [45]:
def _get_rows_with_anoms(amd):
    return [md["loc"][0] for md in amd]


def _get_cols_with_anoms(amd):
    return [md["loc"][1] for md in amd]


def _get_clus_with_anoms(df):
    return df[df.a_lbl == 1].cluster_label.unique().tolist()


def add_switch_anomaly(
    df,
    amd,
    rng=None,
    random_state=42,
    k=None,
    fields_to_include=["GS"],
    fields_to_exclude=["PTS"],
    normalize_df=NORMALIZE,
):
    """Add a contextual anomaly in one more field to the already anomalous rows.

    The rows recorded in ``amd`` receive an additional anomaly in a field that
    has not been modified yet. The field and the source cluster are taken from
    a switch picked among the ``k`` best candidates that involve the cluster of
    the existing anomalies.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``cluster_label`` and ``a_lbl`` columns. It is not modified;
        the anomalies are inserted into a copy.
    amd : list of dict
        Metadata of the anomalies inserted so far; every record needs a
        ``"loc"`` entry holding ``(row, column)``. The list is not modified.
    rng, random_state, k
        Forwarded to ``pick_interesting_switch``; ``rng`` and a per-row
        ``random_state + i`` are also forwarded to ``insert_contextual_anomaly``.
    fields_to_include : list of str
        Fields that are candidates even if they already contain anomalies.
    fields_to_exclude : list of str
        Fields that are never candidates.
    normalize_df : bool-like
        If truthy, the result is passed through ``normalize`` before returning.

    Returns
    -------
    (pandas.DataFrame, list)
        The modified copy of ``df`` and a new metadata list: the records of
        ``amd`` followed by those of the anomalies added here.

    Notes
    -----
    Only the first cluster that contains anomalies is considered; an
    ``IndexError`` is raised when ``df`` has no row with ``a_lbl == 1``.
    """
    # After a previous pass the metadata lists every anomalous row once per
    # pass. Keep the first occurrence only, so that each row is modified exactly
    # once here instead of being overwritten several times in the same field.
    rows_with_anoms = list(dict.fromkeys(_get_rows_with_anoms(amd)))
    cols_with_anoms = _get_cols_with_anoms(amd)
    clus_with_anoms = _get_clus_with_anoms(df)

    # Determine relevant fields. The order follows the column order of `df`
    # (plus the explicitly included fields) instead of set iteration order,
    # which for strings changes from one Python process to the next.
    reserved_columns = ("cluster_label", "a_lbl")
    cols_without_anoms = [
        c
        for c in df.columns
        if c not in cols_with_anoms and c not in reserved_columns
    ]
    excluded_fields = set(fields_to_exclude)
    relevant_fields = [
        f
        for f in dict.fromkeys(cols_without_anoms + list(fields_to_include))
        if f not in excluded_fields
    ]

    # Determine all possible switches that involve the anomalous cluster
    tgt_label = clus_with_anoms[0]
    dfs = [
        find_an_interesting_switch(df, field=field, must_include=tgt_label)
        for field in relevant_fields
    ]

    df_switch = pd.concat(dfs)
    df_switch = df_switch.sort_values(by="delta", ascending=False, ignore_index=True)

    # Pick Switch - Insert Anomaly
    switch = pick_interesting_switch(df_switch, rng=rng, random_state=random_state, k=k)

    field = switch.field
    # The anomalous cluster is the target; the other label of the combo is the
    # cluster from which the replacement values are drawn.
    src_label = [l for l in switch.combo if l != tgt_label][0]

    # Insert anomalies
    ndf = df.copy()  # New DataFrame
    amd = list(amd)  # Do not touch the caller's metadata list

    for anomaly_idx, row_idx in enumerate(rows_with_anoms):
        ndf, md = insert_contextual_anomaly(
            ndf,
            row=row_idx,
            col=field,
            src_subpop_filter=lambda r: r.cluster_label == src_label,  # Source Cluster
            tgt_subpop_filter=lambda r: r.cluster_label == tgt_label,  # Target Cluster
            rng=rng,
            random_state=random_state + anomaly_idx,
            return_anomaly_metadata=True,
            swap=False,
        )

        amd.append(md)

    if normalize_df:
        ndf = normalize(ndf)

    return ndf, amd

In [46]:
def make_trio_of_switch_anomalies(
    df,
    first_field="FGM",
    n_clusters=10,
    rng=None,
    random_state=42,
    k=None,
    n_anomalies=10,
    basename="anomswitch",
    fields_to_include=["GS"],
    fields_to_exclude=["PTS"],
    version_start=VERSION,
    normalize_df=NORMALIZE,
):
    """Write three datasets whose anomalies span one, two and three fields.

    The first dataset comes from ``insert_switch_anomaly`` on ``first_field``.
    The second and third each result from one further ``add_switch_anomaly``
    pass on the previous dataset. The files are named
    ``{basename}_n01_a{ANOMALY_FRACTION:02d}_v{version:02d}.csv`` with versions
    ``version_start``, ``version_start + 1`` and ``version_start + 2``, and are
    written to the current working directory without the ``cluster_label``
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Clean input data.
    first_field, n_clusters, n_anomalies
        Forwarded to ``insert_switch_anomaly``.
    rng, random_state, k, normalize_df
        Forwarded to both ``insert_switch_anomaly`` and ``add_switch_anomaly``.
    basename : str
        Prefix of the output file names.
    fields_to_include, fields_to_exclude
        Forwarded to ``add_switch_anomaly``.
    version_start : int
        Version number of the first file.

    Returns
    -------
    None
    """
    filename_template = "{0}_n01_a{1:02d}_v{2:02d}.csv"

    # First dataset: anomalies in a single field.
    frame, metadata = insert_switch_anomaly(
        df,
        field=first_field,
        n_clusters=n_clusters,
        rng=rng,
        random_state=random_state,
        k=k,
        n_anomalies=n_anomalies,
        normalize_df=normalize_df,
    )

    # The column selection is fixed once, based on the first dataset.
    columns_to_write = [c for c in frame.columns if c != "cluster_label"]

    def _save(dataset, version):
        dataset.to_csv(
            filename_template.format(basename, ANOMALY_FRACTION, version),
            columns=columns_to_write,
        )

    _save(frame, version_start)

    # Second and third dataset: one additional anomalous field per pass.
    for extra_pass in (1, 2):
        frame, metadata = add_switch_anomaly(
            frame,
            metadata,
            rng=rng,
            random_state=random_state,
            k=k,
            fields_to_include=fields_to_include,
            fields_to_exclude=fields_to_exclude,
            normalize_df=normalize_df,
        )
        _save(frame, version_start + extra_pass)

    return

In [47]:
df.columns

Index(['PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'cluster_label'],
      dtype='object')

## Generate Trios

In [48]:
# The most different ones
# The most different ones: k=1 makes only the top-ranked switch (largest
# delta) eligible. n_clusters is only used when `df` has no cluster_label
# column yet.
make_trio_of_switch_anomalies(
    df,
    first_field="FGM",
    n_clusters=10,
    rng=None,
    random_state=RANDOM_SEED,
    k=1,
    basename="anomswitchbig",
    n_anomalies=NB_OF_ANOMALIES,
    fields_to_include=["GS"],
    fields_to_exclude=["PTS"],
    version_start=0,
)

In [49]:
# The completely random ones
# The completely random ones: k=None makes every ranked switch eligible.
# n_clusters is only used when `df` has no cluster_label column yet.
make_trio_of_switch_anomalies(
    df,
    first_field="FGM",
    n_clusters=10,
    rng=None,
    random_state=RANDOM_SEED,
    k=None,
    basename="anomswitchrnd",
    n_anomalies=NB_OF_ANOMALIES,
    fields_to_include=["GS"],
    fields_to_exclude=["PTS"],
    version_start=0,
)

In [50]:
# In between
# In between: k=15 makes the 15 top-ranked switches eligible.
# n_clusters is only used when `df` has no cluster_label column yet.
make_trio_of_switch_anomalies(
    df,
    first_field="FGM",
    n_clusters=10,
    rng=None,
    random_state=RANDOM_SEED,
    k=15,
    basename="anomswitchmid",
    n_anomalies=NB_OF_ANOMALIES,
    fields_to_include=["GS"],
    fields_to_exclude=["PTS"],
    version_start=0,
)