# Generate Synth Dataset 01

Generate a dataset useful for the synth project.

# Preliminaries

In [1]:
%load_ext lab_black

## Imports 

In [2]:
import numpy as np
import pandas as pd
import altair as alt

from pathlib import Path

In [3]:
from nba_anomaly_generator.data import load_lal
from nba_anomaly_generator.anom import (
    insert_dependency_anomaly,
    insert_contextual_anomaly,
    insert_swap_anomaly,
    insert_transformation_anomaly,
    ft_to_m,
    lb_to_kg,
)
from nba_anomaly_generator.anom.utils import init_rng, init_row_idx

## Constants

Naming conventions etc.

In [4]:
N_JOBS = 4
VERBOSE = 51

In [5]:
NBA_DATA_DIR = Path().resolve().parent.parent / "data"

PLYR_DIR = NBA_DATA_DIR / "players"
TEAM_DIR = NBA_DATA_DIR / "rosters"
CLEAN_DIR = NBA_DATA_DIR / "clean"

FP1 = CLEAN_DIR / "nba-synth-01-season-data.csv"
FP2 = CLEAN_DIR / "nba-synth-01-aggregate-sal.csv"

FP1

PosixPath('/home/zissou/repos/nba-anomaly-generator/data/clean/nba-synth-01-season-data.csv')

In [6]:
NORMALIZE = False
CONTAMINATION = 5
DROP_NA = True

# Row filter
N_YEARS = 5
MAX_AGE = 40
BEGIN_SEASON = (
    2020 - N_YEARS
)  # First season that we want in the eventual data (season filter)

# Column Filter
NUMERIC_COLUMNS_ONLY = False

In [7]:
POI_LOOKUP = {"LeBron", "Carmelo", "Kawhi", "Harden"}

SALARIES = dict(
    LeBron=37.44 * 10 ** 6,
    Harden=38.2 * 10 ** 6,
    Carmelo=2.159 * 10 ** 6,
    Kawhi=32.37 * 10 ** 6,
)

### Attribute Types

Also a kind of constant, but data-specific for obvious reasons...

In [8]:
NOMINAL_ATTRIBUTES = [
    "PLAYER_ID",
    "PLAYER_NAME",
    "TEAM_ABBREVIATION",
]

NUMERIC_ATTRIBUTES = [
    "SEASON",
    "PTS",
    "FGM",
    "FGA",
    "FG3M",
    "FG3A",
    "FTM",
    "FTA",
    "OREB",
    "DREB",
    "REB",
    "AST",
    "STL",
    "BLK",
]

In [9]:
AGGREGATE_ATTRIBUTES = [
    "PLAYER_ID",
    "PLAYER_NAME",
    "PTS",
    "REB",
    "AST",
    "STL",
    "BLK",
]

# Functions

In [10]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


def normalize(df, scaler=MinMaxScaler):

    columns_to_scale = [c for c in df.columns if "label" not in c]

    for c in columns_to_scale:
        df[c] = scaler().fit_transform(df[c].values.reshape(-1, 1))
    return df


def dataframe_to_dataset(df, copy=True):
    if copy:
        df = df.copy()
    relevant_cols = [c for c in df.columns if c not in {"a_lbl", "i", "cluster_label"}]
    return df[relevant_cols].values

# Create DataSet

Create the standard NBA-dataset.

## Collect

In [11]:
dfs = []
for idx, fn in enumerate(PLYR_DIR.glob("*.csv")):
    df = pd.read_csv(fn, index_col=0)
    dfs.append(df)

df = pd.concat(dfs)

df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76823,1949-50,0,1610612747,MNL,22.0,1,,,1,...,,,,,0,,,,1,2
1,77524,1985-86,0,1610612739,CLE,23.0,21,0.0,266.0,28,...,0.625,15.0,23.0,38.0,9,7.0,1.0,10.0,30,61
2,77524,1986-87,0,1610612744,GOS,24.0,63,34.0,1284.0,164,...,0.632,63.0,120.0,183.0,84,27.0,8.0,43.0,200,353
3,77524,1987-88,0,1610612744,GOS,25.0,81,41.0,2039.0,258,...,0.784,133.0,202.0,335.0,138,39.0,8.0,93.0,246,612
4,77524,1988-89,0,1610612744,GOS,26.0,11,0.0,103.0,13,...,0.6,4.0,8.0,12.0,5,4.0,0.0,3.0,11,35


## Filter Player of Interest

In [12]:
from nba_api.stats.static import players, teams

player_dict = players.get_players()
poi = [p for p in player_dict if any([l in p["full_name"] for l in POI_LOOKUP])]

for p in poi:
    for l in POI_LOOKUP:
        if l in p["full_name"]:
            p["salary"] = SALARIES[l]


poi_ids = {p["id"]: p["full_name"] for p in poi}
poi_sal = {p["id"]: np.round(p["salary"], decimals=0) for p in poi}

In [13]:
poi_sal

{2546: 2159000.0, 201935: 38200000.0, 2544: 37440000.0, 202695: 32370000.0}

In [14]:
poi_ids

{2546: 'Carmelo Anthony',
 201935: 'James Harden',
 2544: 'LeBron James',
 202695: 'Kawhi Leonard'}

In [15]:
df = df[df["PLAYER_ID"].isin(poi_ids)]
df["PLAYER_NAME"] = df.apply(lambda r: poi_ids[r.PLAYER_ID], axis=1)
df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLAYER_NAME
10197,201935,2009-10,0,1610612760,OKC,20.0,76,0.0,1738.0,233,...,47.0,197.0,244.0,137,80.0,20.0,106.0,200,753,James Harden
10198,201935,2010-11,0,1610612760,OKC,21.0,82,5.0,2189.0,298,...,42.0,213.0,255.0,176,92.0,24.0,106.0,207,998,James Harden
10199,201935,2011-12,0,1610612760,OKC,22.0,62,2.0,1946.0,309,...,30.0,222.0,252.0,229,62.0,15.0,137.0,150,1044,James Harden
10200,201935,2012-13,0,1610612745,HOU,23.0,78,78.0,2985.0,585,...,62.0,317.0,379.0,455,142.0,38.0,295.0,178,2023,James Harden
10201,201935,2013-14,0,1610612745,HOU,24.0,73,73.0,2777.0,549,...,61.0,283.0,344.0,446,115.0,29.0,265.0,177,1851,James Harden


## Types and `None`

Let us take a look at which attributes are in the dataset.

In [16]:
df.columns.tolist()

['PLAYER_ID',
 'SEASON_ID',
 'LEAGUE_ID',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'PLAYER_AGE',
 'GP',
 'GS',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'PLAYER_NAME']

Now let us see which one of these attributes are accounted for in the beginning of this notebook.

In [17]:
set(df.columns) - set(NOMINAL_ATTRIBUTES) - set(NUMERIC_ATTRIBUTES)

{'FG3_PCT',
 'FG_PCT',
 'FT_PCT',
 'GP',
 'GS',
 'LEAGUE_ID',
 'MIN',
 'PF',
 'PLAYER_AGE',
 'SEASON_ID',
 'TEAM_ID',
 'TOV'}

Please verify that the attributes that are unaccounted for a truly not of interest, since they will not be included in the final dataset.

In [18]:
if DROP_NA:
    print("Number of rows BEFORE dropping NA: {}".format(df.shape[0]))

    df = df.dropna()

    print("Number of rows AFTER dropping NA: {}".format(df.shape[0]))

Number of rows BEFORE dropping NA: 56
Number of rows AFTER dropping NA: 56


In [19]:
for attribute in NOMINAL_ATTRIBUTES + NUMERIC_ATTRIBUTES:
    if attribute in df.columns:
        if attribute in NOMINAL_ATTRIBUTES:
            df[attribute] = df[attribute].astype("category")

        if attribute in NUMERIC_ATTRIBUTES:
            df[attribute] = df[attribute].astype(float)

In [20]:
df.dtypes, df.shape

(PLAYER_ID            category
 SEASON_ID              object
 LEAGUE_ID              object
 TEAM_ID                object
 TEAM_ABBREVIATION    category
 PLAYER_AGE            float64
 GP                     object
 GS                    float64
 MIN                   float64
 FGM                   float64
 FGA                   float64
 FG_PCT                float64
 FG3M                  float64
 FG3A                  float64
 FG3_PCT               float64
 FTM                   float64
 FTA                   float64
 FT_PCT                float64
 OREB                  float64
 DREB                  float64
 REB                   float64
 AST                   float64
 STL                   float64
 BLK                   float64
 TOV                   float64
 PF                     object
 PTS                   float64
 PLAYER_NAME          category
 dtype: object,
 (56, 28))

## Column Generation

### Season Column

In [21]:
def _season_id_to_season(season_id):
    return float(season_id.split("-")[0])

In [22]:
df["SEASON"] = df.apply(lambda r: _season_id_to_season(r.SEASON_ID), axis=1)
df.head()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLAYER_NAME,SEASON
10197,201935,2009-10,0,1610612760,OKC,20.0,76,0.0,1738.0,233.0,...,197.0,244.0,137.0,80.0,20.0,106.0,200,753.0,James Harden,2009.0
10198,201935,2010-11,0,1610612760,OKC,21.0,82,5.0,2189.0,298.0,...,213.0,255.0,176.0,92.0,24.0,106.0,207,998.0,James Harden,2010.0
10199,201935,2011-12,0,1610612760,OKC,22.0,62,2.0,1946.0,309.0,...,222.0,252.0,229.0,62.0,15.0,137.0,150,1044.0,James Harden,2011.0
10200,201935,2012-13,0,1610612745,HOU,23.0,78,78.0,2985.0,585.0,...,317.0,379.0,455.0,142.0,38.0,295.0,178,2023.0,James Harden,2012.0
10201,201935,2013-14,0,1610612745,HOU,24.0,73,73.0,2777.0,549.0,...,283.0,344.0,446.0,115.0,29.0,265.0,177,1851.0,James Harden,2013.0


## Row Filters

### Season Filter

Convenient subsampling to only get relatively recent data. Otherwise the dataset becomes huge, and  also the very old data shows some every strange patterns.

In [23]:
df = df[df.SEASON > BEGIN_SEASON]

df

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLAYER_NAME,SEASON
10204,201935,2016-17,0,1610612745,HOU,27.0,81,81.0,2947.0,674.0,...,564.0,659.0,907.0,121.0,38.0,464.0,215,2356.0,James Harden,2016.0
10205,201935,2017-18,0,1610612745,HOU,28.0,72,72.0,2551.0,651.0,...,348.0,389.0,630.0,126.0,50.0,315.0,169,2191.0,James Harden,2017.0
10206,201935,2018-19,0,1610612745,HOU,29.0,78,78.0,2867.0,843.0,...,452.0,518.0,586.0,158.0,58.0,387.0,244,2818.0,James Harden,2018.0
10207,201935,2019-20,0,1610612745,HOU,30.0,61,61.0,2241.0,603.0,...,324.0,388.0,450.0,106.0,53.0,273.0,206,2096.0,James Harden,2019.0
16254,202695,2016-17,0,1610612759,SAS,26.0,74,74.0,2475.0,636.0,...,350.0,430.0,260.0,133.0,55.0,154.0,122,1888.0,Kawhi Leonard,2016.0
16255,202695,2017-18,0,1610612759,SAS,27.0,9,9.0,210.0,52.0,...,36.0,42.0,21.0,18.0,9.0,16.0,9,146.0,Kawhi Leonard,2017.0
16256,202695,2018-19,0,1610612761,TOR,28.0,60,60.0,2040.0,560.0,...,361.0,439.0,199.0,106.0,24.0,121.0,87,1596.0,Kawhi Leonard,2018.0
16257,202695,2019-20,0,1610612746,LAC,28.0,51,51.0,1643.0,476.0,...,321.0,373.0,254.0,92.0,31.0,140.0,98,1370.0,Kawhi Leonard,2019.0
19156,2544,2016-17,0,1610612739,CLE,32.0,74,74.0,2795.0,736.0,...,542.0,639.0,646.0,92.0,44.0,303.0,134,1954.0,LeBron James,2016.0
19157,2544,2017-18,0,1610612739,CLE,33.0,82,82.0,3026.0,857.0,...,612.0,709.0,747.0,116.0,71.0,347.0,136,2251.0,LeBron James,2017.0


## Column Filters

### Desired Columns

In [24]:
set(df.columns) - set(NOMINAL_ATTRIBUTES) - set(NUMERIC_ATTRIBUTES)

{'FG3_PCT',
 'FG_PCT',
 'FT_PCT',
 'GP',
 'GS',
 'LEAGUE_ID',
 'MIN',
 'PF',
 'PLAYER_AGE',
 'SEASON_ID',
 'TEAM_ID',
 'TOV'}

In [25]:
df = df[NOMINAL_ATTRIBUTES + NUMERIC_ATTRIBUTES]

In [26]:
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
10204,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
10205,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
10206,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
10207,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
16254,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0
16255,202695,Kawhi Leonard,SAS,2017.0,146.0,52.0,111.0,11.0,35.0,31.0,38.0,6.0,36.0,42.0,21.0,18.0,9.0
16256,202695,Kawhi Leonard,TOR,2018.0,1596.0,560.0,1129.0,112.0,302.0,364.0,426.0,78.0,361.0,439.0,199.0,106.0,24.0
16257,202695,Kawhi Leonard,LAC,2019.0,1370.0,476.0,1016.0,107.0,292.0,311.0,350.0,52.0,321.0,373.0,254.0,92.0,31.0
19156,2544,LeBron James,CLE,2016.0,1954.0,736.0,1344.0,124.0,342.0,358.0,531.0,97.0,542.0,639.0,646.0,92.0,44.0
19157,2544,LeBron James,CLE,2017.0,2251.0,857.0,1580.0,149.0,406.0,388.0,531.0,97.0,612.0,709.0,747.0,116.0,71.0


### Numeric only

For now, this is a limitation of `ADMERCS` and also most other methods.

In [27]:
if NUMERIC_COLUMNS_ONLY:
    print("Dropping all nominal columns")
    df = df[NUMERIC_ATTRIBUTES]

In [28]:
pd.set_option("display.max_columns", 22)
df.head(15)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
10204,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
10205,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
10206,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
10207,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
16254,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0
16255,202695,Kawhi Leonard,SAS,2017.0,146.0,52.0,111.0,11.0,35.0,31.0,38.0,6.0,36.0,42.0,21.0,18.0,9.0
16256,202695,Kawhi Leonard,TOR,2018.0,1596.0,560.0,1129.0,112.0,302.0,364.0,426.0,78.0,361.0,439.0,199.0,106.0,24.0
16257,202695,Kawhi Leonard,LAC,2019.0,1370.0,476.0,1016.0,107.0,292.0,311.0,350.0,52.0,321.0,373.0,254.0,92.0,31.0
19156,2544,LeBron James,CLE,2016.0,1954.0,736.0,1344.0,124.0,342.0,358.0,531.0,97.0,542.0,639.0,646.0,92.0,44.0
19157,2544,LeBron James,CLE,2017.0,2251.0,857.0,1580.0,149.0,406.0,388.0,531.0,97.0,612.0,709.0,747.0,116.0,71.0


## Normalize

Standard datascience practice.

In [29]:
if NORMALIZE:
    print("normalizing...")
    df = normalize(df)
    print("normalizing done...")

In [30]:
df.shape

(16, 17)

In [31]:
df.head(10)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
10204,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
10205,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
10206,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
10207,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
16254,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0
16255,202695,Kawhi Leonard,SAS,2017.0,146.0,52.0,111.0,11.0,35.0,31.0,38.0,6.0,36.0,42.0,21.0,18.0,9.0
16256,202695,Kawhi Leonard,TOR,2018.0,1596.0,560.0,1129.0,112.0,302.0,364.0,426.0,78.0,361.0,439.0,199.0,106.0,24.0
16257,202695,Kawhi Leonard,LAC,2019.0,1370.0,476.0,1016.0,107.0,292.0,311.0,350.0,52.0,321.0,373.0,254.0,92.0,31.0
19156,2544,LeBron James,CLE,2016.0,1954.0,736.0,1344.0,124.0,342.0,358.0,531.0,97.0,542.0,639.0,646.0,92.0,44.0
19157,2544,LeBron James,CLE,2017.0,2251.0,857.0,1580.0,149.0,406.0,388.0,531.0,97.0,612.0,709.0,747.0,116.0,71.0


## Reindex

This needs to be done before applying any algorithmic filters.

In [32]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
0,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
1,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
2,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
3,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
4,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0


# Split In Two Tables

We need an aggregated table as well.


In [33]:
df1 = df
df1

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
0,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
1,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
2,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
3,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
4,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0
5,202695,Kawhi Leonard,SAS,2017.0,146.0,52.0,111.0,11.0,35.0,31.0,38.0,6.0,36.0,42.0,21.0,18.0,9.0
6,202695,Kawhi Leonard,TOR,2018.0,1596.0,560.0,1129.0,112.0,302.0,364.0,426.0,78.0,361.0,439.0,199.0,106.0,24.0
7,202695,Kawhi Leonard,LAC,2019.0,1370.0,476.0,1016.0,107.0,292.0,311.0,350.0,52.0,321.0,373.0,254.0,92.0,31.0
8,2544,LeBron James,CLE,2016.0,1954.0,736.0,1344.0,124.0,342.0,358.0,531.0,97.0,542.0,639.0,646.0,92.0,44.0
9,2544,LeBron James,CLE,2017.0,2251.0,857.0,1580.0,149.0,406.0,388.0,531.0,97.0,612.0,709.0,747.0,116.0,71.0


In [34]:
df2 = df1.groupby(by=["PLAYER_ID"])[NUMERIC_ATTRIBUTES].mean()
df2 = df2.drop(columns="SEASON", errors="ignore").reset_index()
df2["PLAYER_NAME"] = df2.apply(lambda r: poi_ids[r.PLAYER_ID], axis=1)

df2 = df2[AGGREGATE_ATTRIBUTES]
df2["SALARY"] = df2.apply(lambda r: poi_sal[r.PLAYER_ID], axis=1)
df2

Unnamed: 0,PLAYER_ID,PLAYER_NAME,PTS,REB,AST,STL,BLK,SALARY
0,2544,LeBron James,1813.5,571.5,620.75,88.5,44.5,37440000.0
1,2546,Carmelo Anthony,954.25,314.5,100.0,37.75,28.25,2159000.0
2,201935,James Harden,2365.25,488.5,643.25,127.75,49.75,38200000.0
3,202695,Kawhi Leonard,1250.0,321.0,183.5,87.25,29.75,32370000.0


# Save

Retain what you have created

In [35]:
df1.to_csv(FP1, index=False)
FP1.exists()

True

In [36]:
df2.to_csv(FP2, index=False)
FP2.exists()

True

In [37]:
pd.read_csv(FP1).head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,SEASON,PTS,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK
0,201935,James Harden,HOU,2016.0,2356.0,674.0,1533.0,262.0,756.0,746.0,881.0,95.0,564.0,659.0,907.0,121.0,38.0
1,201935,James Harden,HOU,2017.0,2191.0,651.0,1449.0,265.0,722.0,624.0,727.0,41.0,348.0,389.0,630.0,126.0,50.0
2,201935,James Harden,HOU,2018.0,2818.0,843.0,1909.0,378.0,1028.0,754.0,858.0,66.0,452.0,518.0,586.0,158.0,58.0
3,201935,James Harden,HOU,2019.0,2096.0,603.0,1386.0,271.0,769.0,619.0,719.0,64.0,324.0,388.0,450.0,106.0,53.0
4,202695,Kawhi Leonard,SAS,2016.0,1888.0,636.0,1312.0,147.0,387.0,469.0,533.0,80.0,350.0,430.0,260.0,133.0,55.0


In [38]:
pd.read_csv(FP2).head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,PTS,REB,AST,STL,BLK,SALARY
0,2544,LeBron James,1813.5,571.5,620.75,88.5,44.5,37440000.0
1,2546,Carmelo Anthony,954.25,314.5,100.0,37.75,28.25,2159000.0
2,201935,James Harden,2365.25,488.5,643.25,127.75,49.75,38200000.0
3,202695,Kawhi Leonard,1250.0,321.0,183.5,87.25,29.75,32370000.0
