# Prepare Master DataFrame
In this notebook, I am merging the player stats data frames.

Goals:
- Merge the data.
- Determine which players to remove from the model based on playing time (lack of data).

Results:
- All data sets were included unless they had irrelevant (to offense) or overlapping data.
- Players with 1 year of data (nothing to predict from) were removed.
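- Players whose total minutes never reached the cutoff in any season were removed. The cutoff is the average, across seasons, of the total minutes of the player ranked at roughly the 300-player mark (about the top 10 players per team).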
- Years of a player with too few minutes will be removed later (they will still be used for history, but not for predicting).

## Importing Libraries and Data

In [1]:
# Importing libraries
import pandas as pd
from df_functions import add_season

In [2]:
# Reading every data set through one small loader
def _load(path):
    """Read one CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# General and tracking data
advanced = _load('./data/general_advanced')
touches = _load('./data/tracking_touches')
drives = _load('./data/tracking_drives')
defensive_impact = _load('./data/tracking_defensive_impact')
passing = _load('./data/tracking_passing')
shooting_efficiency = _load('./data/tracking_shooting_efficiency')
speed_distance = _load('./data/tracking_speed_distance')
rebounding = _load('./data/tracking_rebounding')
catch_shoot = _load('./data/tracking_catch_shoot')
pullup_shooting = _load('./data/tracking_pullup_shooting')
elbow_touches = _load('./data/tracking_elbow_touches')
post_ups = _load('./data/tracking_post_ups')
paint_touches = _load('./data/tracking_paint_touches')
hustle = _load('./data/hustle')

# Shooting splits by defender distance and by number of dribbles
shooting_tight = _load('./data/shooting_tight')
shooting_very_tight = _load('./data/shooting_very_tight')
shooting_open = _load('./data/shooting_open')
shooting_very_open = _load('./data/shooting_very_open')
shooting_1_dribble = _load('./data/shooting_1_dribble')
shooting_2_dribbles = _load('./data/shooting_2_dribbles')
shooting_3_6_dribbles = _load('./data/shooting_3_6_dribbles')
shooting_7_dribbles = _load('./data/shooting_7_dribbles')

# Bios and traditional box-score data
bios = _load('./data/bios')
traditional = _load('./data/traditional')

## Merging the Data Frames
#### Making it possible to view the data frames

In [3]:
# Showing up to 100 columns when a data frame is displayed
pd.set_option('display.max_columns', 100)

#### Adding season to all of the data frames

In [4]:
# Running add_season over every data set, in the same order as before,
# and binding each result back to its original name
(
    advanced,
    touches,
    drives,
    defensive_impact,
    passing,
    shooting_efficiency,
    speed_distance,
    rebounding,
    catch_shoot,
    pullup_shooting,
    elbow_touches,
    post_ups,
    paint_touches,
    hustle,
    shooting_tight,
    shooting_very_tight,
    shooting_open,
    shooting_very_open,
    shooting_1_dribble,
    shooting_2_dribbles,
    shooting_3_6_dribbles,
    shooting_7_dribbles,
    bios,
    traditional,
) = [
    add_season(frame)
    for frame in (
        advanced,
        touches,
        drives,
        defensive_impact,
        passing,
        shooting_efficiency,
        speed_distance,
        rebounding,
        catch_shoot,
        pullup_shooting,
        elbow_touches,
        post_ups,
        paint_touches,
        hustle,
        shooting_tight,
        shooting_very_tight,
        shooting_open,
        shooting_very_open,
        shooting_1_dribble,
        shooting_2_dribbles,
        shooting_3_6_dribbles,
        shooting_7_dribbles,
        bios,
        traditional,
    )
]

#### Updating the different shooting columns to have different names

In [5]:
# Capturing the column names the shooting data frames have in common
columns = [column for column in shooting_tight.columns]

In [6]:
# Function to change the column names based on a prefix and condition
def update_columns(df, columns, prefix, condition):
    """Prefix the columns of ``df`` whose name contains ``condition``.

    Columns are matched by name, not by position, so a data frame whose
    columns are ordered differently from ``columns`` is still labelled
    correctly. Columns of ``df`` that are not listed in ``columns`` are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame to relabel. It is modified in place.
    columns : iterable of str
        Names of the columns that are eligible for renaming.
    prefix : str
        Text put in front of every renamed column.
    condition : str
        Substring a column name must contain to be renamed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns relabelled.
    """
    eligible = set(columns)

    # Building the new labels from the data frame's own columns so that each
    # label always stays attached to the data it describes
    df.columns = [
        prefix + name if name in eligible and condition in name else name
        for name in df.columns
    ]

    return df

In [7]:
# Pairing each shooting data frame with the prefix its FG columns receive
_shooting_frames_and_prefixes = (
    (shooting_open, 'OPEN_'),
    (shooting_very_open, 'VERY_OPEN_'),
    (shooting_tight, 'TIGHT_'),
    (shooting_very_tight, 'VERY_TIGHT_'),
    (shooting_1_dribble, '1_DRIBBLE_'),
    (shooting_2_dribbles, '2_DRIBBLES_'),
    (shooting_3_6_dribbles, '3_6_DRIBBLES_'),
    (shooting_7_dribbles, '7_DRIBBLES_'),
)

# Renaming in that order and binding each result back to its original name
(
    shooting_open,
    shooting_very_open,
    shooting_tight,
    shooting_very_tight,
    shooting_1_dribble,
    shooting_2_dribbles,
    shooting_3_6_dribbles,
    shooting_7_dribbles,
) = [
    update_columns(frame, columns, prefix, 'FG')
    for frame, prefix in _shooting_frames_and_prefixes
]

#### Saving columns to use

In [22]:
# Putting in the potentially relevant non-overlapping columns
# Defensive impact not used because it did not have useful information
# Shooting efficiency not included since it is in other data sets

# Identifier columns that start every column list
_id_columns = ['SEASON', 'PLAYER_ID', 'PLAYER_NAME']

# Stat names shared by all of the shooting splits, in their final order
_shooting_stats = [
    'FGA_FREQUENCY', 'FGM', 'FGA', 'FG_PCT', 'EFG_PCT',
    'FG2A_FREQUENCY', 'FG2M', 'FG2A', 'FG2_PCT',
    'FG3A_FREQUENCY', 'FG3M', 'FG3A', 'FG3_PCT',
]


def _shooting_columns(prefix):
    """Return the identifier columns followed by each shooting stat with `prefix` in front."""
    return _id_columns + [prefix + stat for stat in _shooting_stats]


columns_dict = {
    'advanced': _id_columns + [
        'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'GP', 'W', 'L', 'W_PCT',
        'MIN', 'OFF_RATING', 'DEF_RATING', 'NET_RATING',
        'AST_PCT', 'AST_TO', 'AST_RATIO', 'TM_TOV_PCT',
        'EFG_PCT', 'TS_PCT', 'USG_PCT', 'PACE',
        'FGM', 'FGA', 'FGM_PG', 'FGA_PG', 'FG_PCT',
    ],
    'touches': _id_columns + [
        'POINTS', 'TOUCHES', 'FRONT_CT_TOUCHES', 'TIME_OF_POSS',
        'AVG_SEC_PER_TOUCH', 'AVG_DRIB_PER_TOUCH', 'PTS_PER_TOUCH',
        'PTS_PER_ELBOW_TOUCH', 'PTS_PER_POST_TOUCH', 'PTS_PER_PAINT_TOUCH',
    ],
    'drives': _id_columns + [
        'DRIVES', 'DRIVE_FGM', 'DRIVE_FGA', 'DRIVE_FG_PCT',
        'DRIVE_FTM', 'DRIVE_FTA', 'DRIVE_FT_PCT',
        'DRIVE_PTS', 'DRIVE_PTS_PCT',
        'DRIVE_PASSES', 'DRIVE_PASSES_PCT',
        'DRIVE_AST', 'DRIVE_AST_PCT',
        'DRIVE_TOV', 'DRIVE_TOV_PCT',
        'DRIVE_PF', 'DRIVE_PF_PCT',
    ],
    'passing': _id_columns + [
        'PASSES_MADE', 'PASSES_RECEIVED', 'AST', 'FT_AST',
        'SECONDARY_AST', 'POTENTIAL_AST', 'AST_POINTS_CREATED', 'AST_ADJ',
        'AST_TO_PASS_PCT', 'AST_TO_PASS_PCT_ADJ',
    ],
    'speed_distance': _id_columns + [
        'DIST_FEET', 'DIST_MILES', 'DIST_MILES_OFF', 'DIST_MILES_DEF',
        'AVG_SPEED', 'AVG_SPEED_OFF', 'AVG_SPEED_DEF',
    ],
    'rebounding': _id_columns + [
        'OREB', 'OREB_CONTEST', 'OREB_UNCONTEST', 'OREB_CONTEST_PCT',
        'OREB_CHANCES', 'OREB_CHANCE_PCT', 'OREB_CHANCE_DEFER',
        'OREB_CHANCE_PCT_ADJ', 'AVG_OREB_DIST',
        'DREB', 'DREB_CONTEST', 'DREB_UNCONTEST', 'DREB_CONTEST_PCT',
        'DREB_CHANCES', 'DREB_CHANCE_PCT', 'DREB_CHANCE_DEFER',
        'DREB_CHANCE_PCT_ADJ', 'AVG_DREB_DIST',
        'REB', 'REB_CONTEST', 'REB_UNCONTEST', 'REB_CONTEST_PCT',
        'REB_CHANCES', 'REB_CHANCE_PCT', 'REB_CHANCE_DEFER',
        'REB_CHANCE_PCT_ADJ', 'AVG_REB_DIST',
    ],
    'catch_shoot': _id_columns + [
        'CATCH_SHOOT_FGM', 'CATCH_SHOOT_FGA', 'CATCH_SHOOT_FG_PCT',
        'CATCH_SHOOT_PTS',
        'CATCH_SHOOT_FG3M', 'CATCH_SHOOT_FG3A', 'CATCH_SHOOT_FG3_PCT',
        'CATCH_SHOOT_EFG_PCT',
    ],
    'pullup_shooting': _id_columns + [
        'PULL_UP_FGM', 'PULL_UP_FGA', 'PULL_UP_FG_PCT',
        'PULL_UP_FG3M', 'PULL_UP_FG3A', 'PULL_UP_FG3_PCT',
        'PULL_UP_PTS', 'PULL_UP_EFG_PCT',
    ],
    'elbow_touches': _id_columns + [
        'ELBOW_TOUCHES',
        'ELBOW_TOUCH_FGM', 'ELBOW_TOUCH_FGA', 'ELBOW_TOUCH_FG_PCT',
        'ELBOW_TOUCH_FTM', 'ELBOW_TOUCH_FTA', 'ELBOW_TOUCH_FT_PCT',
        'ELBOW_TOUCH_PTS', 'ELBOW_TOUCH_PASSES',
        'ELBOW_TOUCH_AST', 'ELBOW_TOUCH_AST_PCT',
        'ELBOW_TOUCH_TOV', 'ELBOW_TOUCH_TOV_PCT',
        'ELBOW_TOUCH_FOULS',
    ],
    'post_ups': _id_columns + [
        'POST_TOUCHES',
        'POST_TOUCH_FGM', 'POST_TOUCH_FGA', 'POST_TOUCH_FG_PCT',
        'POST_TOUCH_FTM', 'POST_TOUCH_FTA', 'POST_TOUCH_FT_PCT',
        'POST_TOUCH_PTS', 'POST_TOUCH_PTS_PCT',
        'POST_TOUCH_PASSES', 'POST_TOUCH_PASSES_PCT',
        'POST_TOUCH_AST', 'POST_TOUCH_AST_PCT',
        'POST_TOUCH_TOV', 'POST_TOUCH_TOV_PCT',
        'POST_TOUCH_FOULS', 'POST_TOUCH_FOULS_PCT',
    ],
    'paint_touches': _id_columns + [
        'PAINT_TOUCHES',
        'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA', 'PAINT_TOUCH_FG_PCT',
        'PAINT_TOUCH_FTM', 'PAINT_TOUCH_FTA', 'PAINT_TOUCH_FT_PCT',
        'PAINT_TOUCH_PTS', 'PAINT_TOUCH_PTS_PCT',
        'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_PASSES_PCT',
        'PAINT_TOUCH_AST', 'PAINT_TOUCH_AST_PCT',
        'PAINT_TOUCH_TOV', 'PAINT_TOUCH_TOV_PCT',
        'PAINT_TOUCH_FOULS', 'PAINT_TOUCH_FOULS_PCT',
    ],
    'hustle': _id_columns + [
        'CONTESTED_SHOTS', 'CONTESTED_SHOTS_2PT', 'CONTESTED_SHOTS_3PT',
        'CHARGES_DRAWN', 'DEFLECTIONS', 'LOOSE_BALLS_RECOVERED',
        'SCREEN_ASSISTS', 'BOX_OUTS',
    ],
    'shooting_tight': _shooting_columns('TIGHT_'),
    'shooting_very_tight': _shooting_columns('VERY_TIGHT_'),
    'shooting_open': _shooting_columns('OPEN_'),
    'shooting_very_open': _shooting_columns('VERY_OPEN_'),
    'shooting_1_dribble': _shooting_columns('1_DRIBBLE_'),
    'shooting_2_dribbles': _shooting_columns('2_DRIBBLES_'),
    'shooting_3_6_dribbles': _shooting_columns('3_6_DRIBBLES_'),
    'shooting_7_dribbles': _shooting_columns('7_DRIBBLES_'),
    'bios': _id_columns + ['PLAYER_HEIGHT_INCHES', 'PLAYER_WEIGHT'],
    'traditional': _id_columns + ['FTM', 'FTA', 'FT_PCT'],
}

#### Merging dataframes

In [23]:
# Mapping each data set's name to its data frame (same keys and order as columns_dict)
df_dict = dict(
    advanced=advanced,
    touches=touches,
    drives=drives,
    passing=passing,
    speed_distance=speed_distance,
    rebounding=rebounding,
    catch_shoot=catch_shoot,
    pullup_shooting=pullup_shooting,
    elbow_touches=elbow_touches,
    post_ups=post_ups,
    paint_touches=paint_touches,
    hustle=hustle,
    shooting_tight=shooting_tight,
    shooting_very_tight=shooting_very_tight,
    shooting_open=shooting_open,
    shooting_very_open=shooting_very_open,
    shooting_1_dribble=shooting_1_dribble,
    shooting_2_dribbles=shooting_2_dribbles,
    shooting_3_6_dribbles=shooting_3_6_dribbles,
    shooting_7_dribbles=shooting_7_dribbles,
    bios=bios,
    traditional=traditional,
)

In [26]:
# Function to merge a bunch of dataframes together
def merge_df(df_dict, columns_dict, how, on):
    """Merge the selected columns of several data frames into one.

    The data frames are merged one after another, in the iteration order of
    ``df_dict``; the first one seeds the result.

    Parameters
    ----------
    df_dict : dict
        Maps a data set name to its pandas.DataFrame.
    columns_dict : dict
        Maps the same names to the list of columns to keep from that data
        frame. Each list must include the ``on`` columns.
    how : str
        Merge type passed to ``pd.merge`` (for example ``'outer'``).
    on : str or list of str
        Column name(s) to merge on, passed to ``pd.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged data frame.

    Raises
    ------
    ValueError
        If ``df_dict`` is empty, since there is nothing to merge.
    KeyError
        If a name in ``df_dict`` has no entry in ``columns_dict``.
    """
    if not df_dict:
        raise ValueError("df_dict must contain at least one dataframe to merge")

    # None marks that no data frame has seeded the result yet
    master_df = None

    # Looping over dataframes
    for key, df in df_dict.items():

        # Keeping only the requested columns of this dataframe
        selected = df.loc[:, columns_dict[key]]

        if master_df is None:
            master_df = selected
        else:
            master_df = pd.merge(master_df, selected, how=how, on=on)

    return master_df

In [27]:
# Key columns shared by every data set, used to line the rows up when merging
on_columns = [
    'PLAYER_ID',
    'SEASON',
    'PLAYER_NAME',
]

In [28]:
# Outer-merging every data set on the key columns to build the master dataframe
master_df = merge_df(
    df_dict,
    columns_dict,
    how='outer',
    on=on_columns,
)

In [29]:
# Spot-checking the merge by showing one player's rows
master_df.loc[master_df['PLAYER_NAME'].eq('Al Horford')]

Unnamed: 0,SEASON,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,OFF_RATING,DEF_RATING,NET_RATING,AST_PCT,AST_TO,AST_RATIO,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,PACE,FGM,FGA,FGM_PG,FGA_PG,FG_PCT,POINTS,TOUCHES,FRONT_CT_TOUCHES,TIME_OF_POSS,AVG_SEC_PER_TOUCH,AVG_DRIB_PER_TOUCH,PTS_PER_TOUCH,PTS_PER_ELBOW_TOUCH,PTS_PER_POST_TOUCH,PTS_PER_PAINT_TOUCH,DRIVES,DRIVE_FGM,DRIVE_FGA,DRIVE_FG_PCT,DRIVE_FTM,DRIVE_FTA,DRIVE_FT_PCT,DRIVE_PTS,DRIVE_PTS_PCT,DRIVE_PASSES,DRIVE_PASSES_PCT,DRIVE_AST,DRIVE_AST_PCT,...,1_DRIBBLE_FG2A,1_DRIBBLE_FG2_PCT,1_DRIBBLE_FG3A_FREQUENCY,1_DRIBBLE_FG3M,1_DRIBBLE_FG3A,1_DRIBBLE_FG3_PCT,2_DRIBBLES_FGA_FREQUENCY,2_DRIBBLES_FGM,2_DRIBBLES_FGA,2_DRIBBLES_FG_PCT,2_DRIBBLES_EFG_PCT,2_DRIBBLES_FG2A_FREQUENCY,2_DRIBBLES_FG2M,2_DRIBBLES_FG2A,2_DRIBBLES_FG2_PCT,2_DRIBBLES_FG3A_FREQUENCY,2_DRIBBLES_FG3M,2_DRIBBLES_FG3A,2_DRIBBLES_FG3_PCT,3_6_DRIBBLES_FGA_FREQUENCY,3_6_DRIBBLES_FGM,3_6_DRIBBLES_FGA,3_6_DRIBBLES_FG_PCT,3_6_DRIBBLES_EFG_PCT,3_6_DRIBBLES_FG2A_FREQUENCY,3_6_DRIBBLES_FG2M,3_6_DRIBBLES_FG2A,3_6_DRIBBLES_FG2_PCT,3_6_DRIBBLES_FG3A_FREQUENCY,3_6_DRIBBLES_FG3M,3_6_DRIBBLES_FG3A,3_6_DRIBBLES_FG3_PCT,7_DRIBBLES_FGA_FREQUENCY,7_DRIBBLES_FGM,7_DRIBBLES_FGA,7_DRIBBLES_FG_PCT,7_DRIBBLES_EFG_PCT,7_DRIBBLES_FG2A_FREQUENCY,7_DRIBBLES_FG2M,7_DRIBBLES_FG2A,7_DRIBBLES_FG2_PCT,7_DRIBBLES_FG3A_FREQUENCY,7_DRIBBLES_FG3M,7_DRIBBLES_FG3A,7_DRIBBLES_FG3_PCT,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,FTM,FTA,FT_PCT
6,2018,201143,Al Horford,1610612738,BOS,32.0,72,47,25,0.653,31.6,108.2,101.1,7.1,0.225,2.57,26.6,10.3,0.553,0.575,0.187,97.74,368,753,5.1,10.5,0.489,12.8,63.5,40.0,2.3,2.2,1.06,0.201,0.409,0.476,0.693,2.9,0.7,1.3,0.545,0.1,0.2,0.833,1.6,0.527,0.9,0.32,0.3,0.118,...,1.43,0.576,0.007,0.06,0.07,0.8,0.11,0.52,1.1,0.474,0.474,0.11,0.52,1.1,0.474,0.0,0.0,0.0,,0.166,0.74,1.67,0.443,0.443,0.166,0.74,1.67,0.443,0.0,0.0,0.0,,0.029,0.14,0.29,0.5,0.525,0.027,0.13,0.28,0.474,0.001,0.01,0.01,1.0,82.0,245.0,1.3,1.7,0.783
545,2017,201143,Al Horford,1610612738,BOS,31.0,68,46,22,0.676,32.3,110.7,105.8,5.0,0.239,2.93,25.7,8.8,0.527,0.553,0.199,98.96,379,801,5.6,11.8,0.473,14.0,67.0,45.7,2.1,1.92,0.83,0.209,0.385,0.481,0.688,2.7,0.7,1.2,0.549,0.2,0.2,0.688,1.6,0.591,0.9,0.32,0.3,0.11,...,1.35,0.62,0.003,0.0,0.03,0.0,0.093,0.5,1.06,0.472,0.472,0.093,0.5,1.06,0.472,0.0,0.0,0.0,,0.119,0.62,1.35,0.457,0.457,0.119,0.62,1.35,0.457,0.0,0.0,0.0,,0.027,0.19,0.31,0.619,0.643,0.026,0.18,0.29,0.6,0.001,0.01,0.01,1.0,82.0,245.0,1.6,2.0,0.8
1030,2016,201143,Al Horford,1610612737,ATL,30.0,82,48,34,0.585,32.1,103.1,98.2,4.9,0.165,2.46,17.8,7.3,0.547,0.565,0.206,99.75,529,1048,6.5,12.8,0.505,15.2,63.5,42.1,1.7,1.6,0.57,0.24,0.322,0.484,0.779,2.1,0.5,0.9,0.535,0.2,0.3,0.75,1.2,0.563,0.8,0.352,0.2,0.108,...,1.06,0.586,0.0,0.0,0.0,,0.05,0.29,0.62,0.471,0.471,0.048,0.29,0.6,0.49,0.002,0.0,0.02,0.0,0.047,0.24,0.59,0.417,0.417,0.047,0.24,0.59,0.417,0.0,0.0,0.0,,0.005,0.01,0.06,0.2,0.2,0.005,0.01,0.06,0.2,0.0,0.0,0.0,,82.0,245.0,1.3,1.6,0.798
1506,2015,201143,Al Horford,1610612737,ATL,29.0,76,56,20,0.737,30.5,107.6,101.0,6.6,0.178,2.44,17.8,7.3,0.544,0.563,0.225,95.85,519,965,6.8,12.7,0.538,15.2,58.2,39.6,1.7,1.7,0.59,0.261,0.405,0.457,0.775,1.9,0.5,0.9,0.567,0.3,0.3,0.846,1.3,0.685,0.6,0.315,0.2,0.084,...,1.61,0.598,0.001,0.0,0.01,0.0,0.066,0.39,0.8,0.492,0.492,0.065,0.39,0.79,0.5,0.001,0.0,0.01,0.0,0.066,0.46,0.8,0.574,0.574,0.065,0.46,0.79,0.583,0.001,0.0,0.01,0.0,0.009,0.03,0.11,0.25,0.25,0.009,0.03,0.11,0.25,0.0,0.0,0.0,,82.0,250.0,1.4,1.9,0.759
1999,2014,201143,Al Horford,1610612737,ATL,28.0,29,16,13,0.552,33.0,104.4,100.7,3.7,0.139,1.19,12.7,10.7,0.571,0.588,0.246,97.04,238,420,8.2,14.5,0.567,18.6,63.0,40.6,1.8,1.75,0.49,0.295,0.436,0.51,0.869,1.5,0.4,0.7,0.55,0.2,0.3,0.7,1.0,0.674,0.3,0.209,0.1,0.047,...,1.72,0.58,0.0,0.0,0.0,,0.071,0.48,1.0,0.483,0.483,0.069,0.48,0.97,0.5,0.002,0.0,0.03,0.0,0.052,0.28,0.72,0.381,0.381,0.049,0.28,0.69,0.4,0.002,0.0,0.03,0.0,0.007,0.0,0.1,0.0,0.0,0.007,0.0,0.1,0.0,0.0,0.0,0.0,,82.0,250.0,2.0,2.9,0.682
2480,2013,201143,Al Horford,1610612737,ATL,27.0,74,42,32,0.568,37.3,104.8,101.5,3.3,0.149,1.63,15.6,9.6,0.545,0.56,0.221,93.61,576,1060,7.8,14.3,0.543,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,250.0,1.8,2.8,0.644
2948,2012,201143,Al Horford,1610612737,ATL,26.0,11,7,4,0.636,31.6,106.1,99.3,6.8,0.113,1.5,15.4,10.2,0.553,0.585,0.178,91.56,57,103,5.2,9.4,0.553,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,250.0,2.0,2.7,0.733
3428,2011,201143,Al Horford,1610612737,ATL,25.0,77,41,36,0.532,35.1,104.5,104.4,0.2,0.169,2.24,19.2,8.6,0.558,0.587,0.199,90.93,513,921,6.7,12.0,0.557,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,245.0,1.9,2.4,0.798


## Removing Players
#### Removing players who played only one year

In [30]:
# Counting the rows per player and keeping the IDs that appear more than once
rows_per_player = master_df['PLAYER_ID'].value_counts()
vet_id_list = rows_per_player[rows_per_player > 1].index.tolist()

In [31]:
# Restricting the master dataframe to the "veteran" players and renumbering the rows
is_veteran = master_df['PLAYER_ID'].isin(vet_id_list)
master_df = master_df.loc[is_veteran].reset_index(drop=True)

#### Removing players who played too few minutes over their career

In [32]:
# Total minutes = MIN times GP for each row
master_df['TOTAL_MIN'] = master_df['MIN'].mul(master_df['GP'])

In [33]:
# Finding the minutes played by the 301st player each year (ranked by total
# minutes, highest first) and averaging that value across the seasons
CUTOFF_POSITION = 300  # zero-based position of the 301st player

sum_total_min = 0
seasons = master_df['SEASON'].unique()
for season in seasons:
    # Dropping missing values before ranking: the outer merge can leave rows
    # without minutes, and NaN makes a plain sort order unreliable
    season_minutes = (
        master_df.loc[master_df['SEASON'] == season, 'TOTAL_MIN']
        .dropna()
        .sort_values(ascending=False)
    )

    # Failing with a clear message instead of a bare IndexError
    if len(season_minutes) <= CUTOFF_POSITION:
        raise ValueError(
            'Season {} has only {} players with minutes; at least {} are needed'.format(
                season, len(season_minutes), CUTOFF_POSITION + 1
            )
        )

    sum_total_min += season_minutes.iloc[CUTOFF_POSITION]
min_cutoff = sum_total_min / len(seasons)

In [34]:
# Splitting player IDs by whether a row falls below or reaches the cutoff
# (a player can appear in both lists if their seasons differ)
below_min_id_list = master_df.loc[master_df['TOTAL_MIN'].lt(min_cutoff), 'PLAYER_ID'].unique()
above_min_id_list = master_df.loc[master_df['TOTAL_MIN'].ge(min_cutoff), 'PLAYER_ID'].unique()

In [35]:
# Retaining every row of any player who reached the cutoff in at least one season
reached_cutoff = master_df['PLAYER_ID'].isin(above_min_id_list)
master_df = master_df.loc[reached_cutoff].reset_index(drop=True)

## Saving Data

In [36]:
# Writing the master dataframe (index included) to the data folder
master_df.to_csv(path_or_buf='./data/master_df')