# Prepare Master DataFrame
In this notebook, I am merging the player stats data frames.

Goals:
- Merge the data.
- Determine what players to remove from the model based on playing time (lack of data).

Results:
- All data sets were included unless they had irrelevant (to offense) or overlapping data.
- Players with 1 year of data (nothing to predict from) were removed.
- Players who never ranked within roughly the top 300 in total minutes in any season (about the top 10 players per team) were removed.
- Individual player-seasons with too few minutes will be removed later (they will still be used as history, but not for prediction).

## 0. Importing Libraries and Data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from df_functions import add_season, update_columns, merge_df, pts_generated, use_history
from dictionaries import columns_dict, stat_types

In [2]:
# Loading every saved stats table from ./data
def _read_table(path):
    """Read one saved table, using its first column as the index."""
    return pd.read_csv(path, index_col=0)

# General and player-tracking tables
advanced = _read_table('./data/general_advanced')
touches = _read_table('./data/tracking_touches')
drives = _read_table('./data/tracking_drives')
defensive_impact = _read_table('./data/tracking_defensive_impact')
passing = _read_table('./data/tracking_passing')
shooting_efficiency = _read_table('./data/tracking_shooting_efficiency')
speed_distance = _read_table('./data/tracking_speed_distance')
rebounding = _read_table('./data/tracking_rebounding')
catch_shoot = _read_table('./data/tracking_catch_shoot')
pullup_shooting = _read_table('./data/tracking_pullup_shooting')
elbow_touches = _read_table('./data/tracking_elbow_touches')
post_ups = _read_table('./data/tracking_post_ups')
paint_touches = _read_table('./data/tracking_paint_touches')
hustle = _read_table('./data/hustle')
# Shooting split tables
shooting_tight = _read_table('./data/shooting_tight')
shooting_very_tight = _read_table('./data/shooting_very_tight')
shooting_open = _read_table('./data/shooting_open')
shooting_very_open = _read_table('./data/shooting_very_open')
shooting_1_dribble = _read_table('./data/shooting_1_dribble')
shooting_2_dribbles = _read_table('./data/shooting_2_dribbles')
shooting_3_6_dribbles = _read_table('./data/shooting_3_6_dribbles')
shooting_7_dribbles = _read_table('./data/shooting_7_dribbles')
# Bios, traditional stats, and the league-level assist / turnover tables
bios = _read_table('./data/bios')
traditional = _read_table('./data/traditional')
league_assists = _read_table('./data/assists_value')
league_turnovers = _read_table('./data/turnovers')

## 1. Merging the Data Frames
#### Making it possible to view the data frames

In [3]:
# Letting pandas render up to 100 columns when a data frame is displayed
pd.set_option('display.max_columns', 100)

#### Adding season to all of the data frames

In [4]:
# Running add_season over every table (same tables, same order) and rebinding each name
(
    advanced, touches, drives, defensive_impact, passing, shooting_efficiency,
    speed_distance, rebounding, catch_shoot, pullup_shooting, elbow_touches,
    post_ups, paint_touches, hustle, shooting_tight, shooting_very_tight,
    shooting_open, shooting_very_open, shooting_1_dribble, shooting_2_dribbles,
    shooting_3_6_dribbles, shooting_7_dribbles, bios, traditional,
) = [
    add_season(table)
    for table in (
        advanced, touches, drives, defensive_impact, passing, shooting_efficiency,
        speed_distance, rebounding, catch_shoot, pullup_shooting, elbow_touches,
        post_ups, paint_touches, hustle, shooting_tight, shooting_very_tight,
        shooting_open, shooting_very_open, shooting_1_dribble, shooting_2_dribbles,
        shooting_3_6_dribbles, shooting_7_dribbles, bios, traditional,
    )
]

#### Updating the different shooting columns to have different names

In [5]:
# Capturing the shooting tables' column names (taken from shooting_tight before any renaming)
columns = shooting_tight.columns.tolist()

In [6]:
# Giving each shooting split its own column names by passing a split-specific prefix
# (and the shared 'FG' argument) to update_columns
(
    shooting_open,
    shooting_very_open,
    shooting_tight,
    shooting_very_tight,
    shooting_1_dribble,
    shooting_2_dribbles,
    shooting_3_6_dribbles,
    shooting_7_dribbles,
) = [
    update_columns(split, columns, prefix, 'FG')
    for split, prefix in (
        (shooting_open, 'OPEN_'),
        (shooting_very_open, 'VERY_OPEN_'),
        (shooting_tight, 'TIGHT_'),
        (shooting_very_tight, 'VERY_TIGHT_'),
        (shooting_1_dribble, '1_DRIBBLE_'),
        (shooting_2_dribbles, '2_DRIBBLES_'),
        (shooting_3_6_dribbles, '3_6_DRIBBLES_'),
        (shooting_7_dribbles, '7_DRIBBLES_'),
    )
]

#### Merging dataframes

In [7]:
# Collecting the tables to merge, keyed by name
# NOTE(review): defensive_impact and shooting_efficiency are loaded earlier but are not
# listed here - presumably the irrelevant/overlapping tables mentioned in the summary; confirm
df_dict = dict(
    advanced=advanced,
    touches=touches,
    drives=drives,
    passing=passing,
    speed_distance=speed_distance,
    rebounding=rebounding,
    catch_shoot=catch_shoot,
    pullup_shooting=pullup_shooting,
    elbow_touches=elbow_touches,
    post_ups=post_ups,
    paint_touches=paint_touches,
    hustle=hustle,
    shooting_tight=shooting_tight,
    shooting_very_tight=shooting_very_tight,
    shooting_open=shooting_open,
    shooting_very_open=shooting_very_open,
    shooting_1_dribble=shooting_1_dribble,
    shooting_2_dribbles=shooting_2_dribbles,
    shooting_3_6_dribbles=shooting_3_6_dribbles,
    shooting_7_dribbles=shooting_7_dribbles,
    bios=bios,
    traditional=traditional,
)

In [8]:
# Key columns handed to merge_df as the columns to merge on
on_columns = [
    "PLAYER_ID",
    "SEASON",
    "PLAYER_NAME",
]

In [9]:
# Building the master dataframe with merge_df
# ('outer' is presumably the join type and on_columns the join keys - see df_functions)
master_df = merge_df(
    df_dict,
    columns_dict,
    'outer',
    on_columns,
)

In [10]:
# Spot-checking the merge by showing every row for one player
master_df.loc[master_df['PLAYER_NAME'].eq('Al Horford')]

Unnamed: 0,SEASON,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,MIN,OFF_RATING,DEF_RATING,NET_RATING,AST_PCT,AST_TO,AST_RATIO,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,PACE,FGM,FGA,FGM_PG,FGA_PG,FG_PCT,POINTS,TOUCHES,FRONT_CT_TOUCHES,TIME_OF_POSS,AVG_SEC_PER_TOUCH,AVG_DRIB_PER_TOUCH,PTS_PER_TOUCH,PTS_PER_ELBOW_TOUCH,PTS_PER_POST_TOUCH,PTS_PER_PAINT_TOUCH,DRIVES,DRIVE_FGM,DRIVE_FGA,DRIVE_FG_PCT,DRIVE_FTM,DRIVE_FTA,DRIVE_FT_PCT,DRIVE_PTS,DRIVE_PTS_PCT,DRIVE_PASSES,DRIVE_PASSES_PCT,DRIVE_AST,DRIVE_AST_PCT,...,1_DRIBBLE_FG2A,1_DRIBBLE_FG2_PCT,1_DRIBBLE_FG3A_FREQUENCY,1_DRIBBLE_FG3M,1_DRIBBLE_FG3A,1_DRIBBLE_FG3_PCT,2_DRIBBLES_FGA_FREQUENCY,2_DRIBBLES_FGM,2_DRIBBLES_FGA,2_DRIBBLES_FG_PCT,2_DRIBBLES_EFG_PCT,2_DRIBBLES_FG2A_FREQUENCY,2_DRIBBLES_FG2M,2_DRIBBLES_FG2A,2_DRIBBLES_FG2_PCT,2_DRIBBLES_FG3A_FREQUENCY,2_DRIBBLES_FG3M,2_DRIBBLES_FG3A,2_DRIBBLES_FG3_PCT,3_6_DRIBBLES_FGA_FREQUENCY,3_6_DRIBBLES_FGM,3_6_DRIBBLES_FGA,3_6_DRIBBLES_FG_PCT,3_6_DRIBBLES_EFG_PCT,3_6_DRIBBLES_FG2A_FREQUENCY,3_6_DRIBBLES_FG2M,3_6_DRIBBLES_FG2A,3_6_DRIBBLES_FG2_PCT,3_6_DRIBBLES_FG3A_FREQUENCY,3_6_DRIBBLES_FG3M,3_6_DRIBBLES_FG3A,3_6_DRIBBLES_FG3_PCT,7_DRIBBLES_FGA_FREQUENCY,7_DRIBBLES_FGM,7_DRIBBLES_FGA,7_DRIBBLES_FG_PCT,7_DRIBBLES_EFG_PCT,7_DRIBBLES_FG2A_FREQUENCY,7_DRIBBLES_FG2M,7_DRIBBLES_FG2A,7_DRIBBLES_FG2_PCT,7_DRIBBLES_FG3A_FREQUENCY,7_DRIBBLES_FG3M,7_DRIBBLES_FG3A,7_DRIBBLES_FG3_PCT,PLAYER_HEIGHT_INCHES,PLAYER_WEIGHT,FTM,FTA,FT_PCT
6,2018,201143,Al Horford,1610612738,BOS,32.0,72,47,25,0.653,31.6,108.2,101.1,7.1,0.225,2.57,26.6,10.3,0.553,0.575,0.187,97.74,368,753,5.1,10.5,0.489,12.8,63.5,40.0,2.3,2.2,1.06,0.201,0.409,0.476,0.693,2.9,0.7,1.3,0.545,0.1,0.2,0.833,1.6,0.527,0.9,0.32,0.3,0.118,...,1.43,0.576,0.007,0.06,0.07,0.8,0.11,0.52,1.1,0.474,0.474,0.11,0.52,1.1,0.474,0.0,0.0,0.0,,0.166,0.74,1.67,0.443,0.443,0.166,0.74,1.67,0.443,0.0,0.0,0.0,,0.029,0.14,0.29,0.5,0.525,0.027,0.13,0.28,0.474,0.001,0.01,0.01,1.0,82.0,245.0,1.3,1.7,0.783
545,2017,201143,Al Horford,1610612738,BOS,31.0,68,46,22,0.676,32.3,110.7,105.8,5.0,0.239,2.93,25.7,8.8,0.527,0.553,0.199,98.96,379,801,5.6,11.8,0.473,14.0,67.0,45.7,2.1,1.92,0.83,0.209,0.385,0.481,0.688,2.7,0.7,1.2,0.549,0.2,0.2,0.688,1.6,0.591,0.9,0.32,0.3,0.11,...,1.35,0.62,0.003,0.0,0.03,0.0,0.093,0.5,1.06,0.472,0.472,0.093,0.5,1.06,0.472,0.0,0.0,0.0,,0.119,0.62,1.35,0.457,0.457,0.119,0.62,1.35,0.457,0.0,0.0,0.0,,0.027,0.19,0.31,0.619,0.643,0.026,0.18,0.29,0.6,0.001,0.01,0.01,1.0,82.0,245.0,1.6,2.0,0.8
1030,2016,201143,Al Horford,1610612737,ATL,30.0,82,48,34,0.585,32.1,103.1,98.2,4.9,0.165,2.46,17.8,7.3,0.547,0.565,0.206,99.75,529,1048,6.5,12.8,0.505,15.2,63.5,42.1,1.7,1.6,0.57,0.24,0.322,0.484,0.779,2.1,0.5,0.9,0.535,0.2,0.3,0.75,1.2,0.563,0.8,0.352,0.2,0.108,...,1.06,0.586,0.0,0.0,0.0,,0.05,0.29,0.62,0.471,0.471,0.048,0.29,0.6,0.49,0.002,0.0,0.02,0.0,0.047,0.24,0.59,0.417,0.417,0.047,0.24,0.59,0.417,0.0,0.0,0.0,,0.005,0.01,0.06,0.2,0.2,0.005,0.01,0.06,0.2,0.0,0.0,0.0,,82.0,245.0,1.3,1.6,0.798
1506,2015,201143,Al Horford,1610612737,ATL,29.0,76,56,20,0.737,30.5,107.6,101.0,6.6,0.178,2.44,17.8,7.3,0.544,0.563,0.225,95.85,519,965,6.8,12.7,0.538,15.2,58.2,39.6,1.7,1.7,0.59,0.261,0.405,0.457,0.775,1.9,0.5,0.9,0.567,0.3,0.3,0.846,1.3,0.685,0.6,0.315,0.2,0.084,...,1.61,0.598,0.001,0.0,0.01,0.0,0.066,0.39,0.8,0.492,0.492,0.065,0.39,0.79,0.5,0.001,0.0,0.01,0.0,0.066,0.46,0.8,0.574,0.574,0.065,0.46,0.79,0.583,0.001,0.0,0.01,0.0,0.009,0.03,0.11,0.25,0.25,0.009,0.03,0.11,0.25,0.0,0.0,0.0,,82.0,250.0,1.4,1.9,0.759
1999,2014,201143,Al Horford,1610612737,ATL,28.0,29,16,13,0.552,33.0,104.4,100.7,3.7,0.139,1.19,12.7,10.7,0.571,0.588,0.246,97.04,238,420,8.2,14.5,0.567,18.6,63.0,40.6,1.8,1.75,0.49,0.295,0.436,0.51,0.869,1.5,0.4,0.7,0.55,0.2,0.3,0.7,1.0,0.674,0.3,0.209,0.1,0.047,...,1.72,0.58,0.0,0.0,0.0,,0.071,0.48,1.0,0.483,0.483,0.069,0.48,0.97,0.5,0.002,0.0,0.03,0.0,0.052,0.28,0.72,0.381,0.381,0.049,0.28,0.69,0.4,0.002,0.0,0.03,0.0,0.007,0.0,0.1,0.0,0.0,0.007,0.0,0.1,0.0,0.0,0.0,0.0,,82.0,250.0,2.0,2.9,0.682
2480,2013,201143,Al Horford,1610612737,ATL,27.0,74,42,32,0.568,37.3,104.8,101.5,3.3,0.149,1.63,15.6,9.6,0.545,0.56,0.221,93.61,576,1060,7.8,14.3,0.543,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,250.0,1.8,2.8,0.644
2948,2012,201143,Al Horford,1610612737,ATL,26.0,11,7,4,0.636,31.6,106.1,99.3,6.8,0.113,1.5,15.4,10.2,0.553,0.585,0.178,91.56,57,103,5.2,9.4,0.553,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,250.0,2.0,2.7,0.733
3428,2011,201143,Al Horford,1610612737,ATL,25.0,77,41,36,0.532,35.1,104.5,104.4,0.2,0.169,2.24,19.2,8.6,0.558,0.587,0.199,90.93,513,921,6.7,12.0,0.557,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82.0,245.0,1.9,2.4,0.798


## 2. Removing Players
#### Removing players who played only one year

In [11]:
# Listing the player IDs that appear in more than one row of the master dataframe
id_counts = master_df['PLAYER_ID'].value_counts()
vet_id_list = id_counts[id_counts > 1].index.tolist()

In [12]:
# Keeping only rows whose player is in vet_id_list, then renumbering the index
is_veteran = master_df['PLAYER_ID'].isin(vet_id_list)
master_df = master_df.loc[is_veteran].reset_index(drop=True)

#### Removing players who played too few minutes over their career

In [13]:
# Total minutes for the season row: MIN multiplied by GP
master_df['TOTAL_MIN'] = master_df['MIN'].mul(master_df['GP'])

In [14]:
# Finding the total minutes of the 301st-ranked player (the first one outside the
# top 300) in each season, then averaging those values across seasons
CUTOFF_RANK = 301
season_cutoffs = []
for season in master_df['SEASON'].unique():
    season_minutes = master_df.loc[master_df['SEASON'] == season, 'TOTAL_MIN']
    # Descending sort with NaNs forced to the end, so missing minutes cannot disturb
    # the ranking (Python's sorted() gives no reliable order when NaNs are present)
    ranked_minutes = season_minutes.sort_values(ascending=False, na_position='last')
    # Rank r sits at zero-based position r - 1; the previous [301] index actually
    # selected the 302nd player. Still raises IndexError if a season has too few rows.
    season_cutoffs.append(ranked_minutes.iloc[CUTOFF_RANK - 1])
sum_total_min = sum(season_cutoffs)
min_cutoff = sum_total_min / len(season_cutoffs)

In [15]:
# Splitting player IDs by whether a season row falls below or at/above the cutoff
# (a player with seasons on both sides appears in both arrays)
total_minutes = master_df['TOTAL_MIN']
below_min_id_list = master_df.loc[total_minutes.lt(min_cutoff), 'PLAYER_ID'].unique()
above_min_id_list = master_df.loc[total_minutes.ge(min_cutoff), 'PLAYER_ID'].unique()

In [16]:
# Retaining every row of any player who met or exceeded the cutoff in at least one season
met_cutoff_once = master_df['PLAYER_ID'].isin(above_min_id_list)
master_df = master_df.loc[met_cutoff_once].reset_index(drop=True)

## 3. Creating Points Per Possession
### Calculating values of assists and turnovers
#### Calculating points per assist (expected value of an assist)

$ \dfrac{\text{3FGM}_{AST}}{\text{FGA}} = \text{3FG%} * \text{3FGA%} * \text{3FG}_{\%AST} $ 

In [17]:
# Assisted 3-pointer rate: the product of the three league_assists columns below
assisted_3 = (
    league_assists['3P.1']
    .mul(league_assists['3P'])
    .mul(league_assists['%Ast\'d.1'])
)

In [18]:
# Displaying the assisted 3-pointer rate for each season to inspect its trend
assisted_3

2018    0.101255
2017    0.094462
2016    0.084344
2015    0.079073
2014    0.078042
2013    0.074239
2012    0.066491
2011    0.068349
dtype: float64

$ \dfrac{\text{2FGM}_{AST}}{\text{FGA}} = \text{2FG%} * \text{2FGA%} * \text{2FG}_{\%AST} $  

In [19]:
# Assisted 2-pointer rate: the product of the three league_assists columns below
assisted_2 = (
    league_assists['2P.1']
    .mul(league_assists['2P'])
    .mul(league_assists['%Ast\'d'])
)

In [20]:
# Displaying the assisted 2-pointer rate for each season to inspect its trend
assisted_2

2018    0.168727
2017    0.170306
2016    0.179043
2015    0.184610
2014    0.187313
2013    0.195613
2012    0.191245
2011    0.196263
dtype: float64

$ \dfrac{\text{FGM}_{AST}}{\text{FGA}} = \dfrac{\text{3FGM}_{AST}}{\text{FGA}} + \dfrac{\text{2FGM}_{AST}}{\text{FGA}}$  

In [21]:
# Overall assisted field-goal rate: assisted 2s plus assisted 3s
assisted_fg = assisted_2.add(assisted_3)

$ EV(\text{AST}) = \dfrac{\text{3FGM}_{AST} * 3 + \text{2FGM}_{AST} * 2}{\text{FGM}_{AST}} = \dfrac{\frac{\text{3FGM}_{\text{AST}}}{\text{FGA}} * 3 + \frac{\text{2FGM}_{\text{AST}}}{\text{FGA}} * 2}{\frac{\text{FGM}_{\text{AST}}}{\text{FGA}}}$  

In [22]:
# Expected points per assist for each year: points from assisted makes
# (3 per assisted three, 2 per assisted two) divided by the assisted field-goal rate
assisted_points = assisted_3 * 3 + assisted_2 * 2
assist_value = assisted_points / assisted_fg

In [23]:
# Displaying the expected value of an assist for each season to inspect its trend
assist_value

2018    2.375044
2017    2.356773
2016    2.320228
2015    2.299880
2014    2.294104
2013    2.275110
2012    2.257981
2011    2.258300
dtype: float64

#### Calculating points per turnover (expected value of a turnover)

$ EV(\text{TO}) = \dfrac{\sum{PTS_{TO}}}{\sum{TO}} $

In [24]:
# Expected points per turnover: PTS_OFF_TOV divided by TOV
turnover_value = league_turnovers['PTS_OFF_TOV'].div(league_turnovers['TOV'])

In [25]:
# Displaying the expected value of a turnover for each season to inspect its trend
turnover_value

2018    1.153810
2017    1.154011
2016    1.132762
2015    1.126335
2014    1.136581
2013    1.139061
2012    1.122258
2011    1.137810
dtype: float64

### Creating points per play
#### Dropping years with NANs

In [26]:
# Keeping only the 2014 through 2018 seasons (the earlier rows in the Al Horford peek
# show the tracking columns as missing)
tracked_seasons = range(2014, 2019)
in_tracked_seasons = master_df['SEASON'].isin(tracked_seasons)
master_df = master_df.loc[in_tracked_seasons].reset_index(drop=True)

#### Shooting plays

$ EV(FGA) = \text{EFG%} * 2  $  

$ FGA_{rate} = \dfrac{FGA}{MIN} $

In [27]:
# For each shooting play type: expected points per attempt (EFG% * 2) and
# attempts per minute (FGA / MIN)
for shot_type in ('CATCH_SHOOT', 'PULL_UP'):
    master_df[shot_type + '_EV'] = master_df[shot_type + '_EFG_PCT'].mul(2)
    master_df[shot_type + '_RATE'] = master_df[shot_type + '_FGA'].div(master_df['MIN'])

#### Drives and post-ups

$ EV(\text{PLAY}) = \dfrac{\text{PTS}_{play} + \text{AST}_{play} * EV(\text{AST}) - \text{TO}_{play} * EV(\text{TO})}{(\text{FGA}_{play} + \text{FTA}_{play} - \text{PF}_{play}) + \text{AST}_{play} * \frac{\text{AST}_{potential}}{\text{AST}} + \text{TO}_{play}}$  
<br>
<br>
$ PLAY_{rate} = \dfrac{(\text{FGA}_{play} + \text{FTA}_{play} - \text{PF}_{play}) + \text{AST}_{play} * \frac{\text{AST}_{potential}}{\text{AST}} + \text{TO}_{play}}{MIN}$

For potential assists I had to choose between a player's own assist-to-potential-assist ratio and the league-wide ratio. I used the player's own ratio (`POTENTIAL_AST / AST`): this rewards the quality of a player's passes, at the cost of undervaluing players whose teammates shoot poorly.

<sub>* These are significantly more complicated because there are many more ways for the play to end.</sub>

In [28]:
# Points credited to drives: drive points plus assist points minus turnover points,
# where the assist and turnover points come from pts_generated applied row by row
drive_assist_points = master_df.apply(
    pts_generated, axis=1, values=assist_value, play='DRIVE', category='AST'
)
drive_turnover_points = master_df.apply(
    pts_generated, axis=1, values=turnover_value, play='DRIVE', category='TOV'
)
drive_total_points = (
    master_df['DRIVE_PTS'].add(drive_assist_points).sub(drive_turnover_points)
)

In [29]:
# Possessions used by drives = shots + estimated potential assists + turnovers
drive_shots = (
    master_df['DRIVE_FGA'].add(master_df['DRIVE_FTA']).sub(master_df['DRIVE_PF'])
)
# Drive assists scaled by the player's POTENTIAL_AST / AST ratio; missing results count as 0
drive_potential_assists = (
    master_df['DRIVE_AST']
    .mul(master_df['POTENTIAL_AST'])
    .div(master_df['AST'])
    .fillna(0)
)
drive_possessions = drive_shots.add(drive_potential_assists).add(master_df['DRIVE_TOV'])

In [30]:
# Calculating the expected value and rate of drives
drive_ev = drive_total_points / drive_possessions
# Zero possessions with non-zero points divides to +/-inf; store those as missing.
# The cleaned Series is assigned back (instead of calling replace(..., inplace=True) on
# a selected column, which does not update the frame under pandas Copy-on-Write), and
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
master_df['DRIVE_EV'] = drive_ev.replace([np.inf, -np.inf], np.nan)
master_df['DRIVE_RATE'] = drive_possessions / master_df['MIN']

In [31]:
# Points credited to post-ups: post-touch points plus assist points minus turnover points,
# where the assist and turnover points come from pts_generated applied row by row
post_assist_points = master_df.apply(
    pts_generated, axis=1, values=assist_value, play='POST_TOUCH', category='AST'
)
post_turnover_points = master_df.apply(
    pts_generated, axis=1, values=turnover_value, play='POST_TOUCH', category='TOV'
)
post_total_points = (
    master_df['POST_TOUCH_PTS'].add(post_assist_points).sub(post_turnover_points)
)

In [32]:
# Possessions used by post-ups = shots + estimated potential assists + turnovers
post_shots = (
    master_df['POST_TOUCH_FGA']
    .add(master_df['POST_TOUCH_FTA'])
    .sub(master_df['POST_TOUCH_FOULS'])
)
# Post assists scaled by the player's POTENTIAL_AST / AST ratio; missing results count as 0
post_potential_assists = (
    master_df['POST_TOUCH_AST']
    .mul(master_df['POTENTIAL_AST'])
    .div(master_df['AST'])
    .fillna(0)
)
post_possessions = post_shots.add(post_potential_assists).add(master_df['POST_TOUCH_TOV'])

In [33]:
# Calculating the expected value and rate of post-ups
post_ev = post_total_points / post_possessions
# Zero possessions with non-zero points divides to +/-inf; store those as missing.
# The cleaned Series is assigned back (instead of calling replace(..., inplace=True) on
# a selected column, which does not update the frame under pandas Copy-on-Write), and
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
master_df['POST_TOUCH_EV'] = post_ev.replace([np.inf, -np.inf], np.nan)
master_df['POST_TOUCH_RATE'] = post_possessions / master_df['MIN']

## 4. Creating New Features
#### Adding per minute rate categories

In [34]:
# Adding a <stat>_PER_MIN column for every stat listed in stat_types:
# 'totals' stats are divided by TOTAL_MIN, 'game_totals' stats by MIN
for stat in stat_types['totals']:
    master_df[stat + '_PER_MIN'] = master_df[stat].div(master_df['TOTAL_MIN'])

for stat in stat_types['game_totals']:
    master_df[stat + '_PER_MIN'] = master_df[stat].div(master_df['MIN'])

#### Creating a ratio of height to weight

In [35]:
# Height-to-weight ratio: PLAYER_HEIGHT_INCHES divided by PLAYER_WEIGHT
master_df['SKINNY'] = master_df['PLAYER_HEIGHT_INCHES'].div(master_df['PLAYER_WEIGHT'])

#### Creating percentage of play types that end in shots

In [36]:
# Share of post-ups that end in a field-goal attempt
master_df['POST_TOUCH_FGA_PCT'] = master_df['POST_TOUCH_FGA'].div(master_df['POST_TOUCHES'])

In [42]:
# Share of drives that end in a field-goal attempt
# NOTE(review): this cell's saved execution count (42) is out of sequence, so in the saved
# run this column may have been created after the weighted history features were built -
# confirm with Restart & Run All
master_df['DRIVE_FGA_PCT'] = master_df['DRIVE_FGA'].div(master_df['DRIVES'])

#### Creating weighted features using past season data

In [37]:
# Building history-based features from the float/int columns with use_history
# (keyed on PLAYER_ID and SEASON, with 'MIN' and 5 passed through), cast to float64
numeric_columns = master_df.select_dtypes(include=['float', 'int']).columns
weighted_df = use_history(
    master_df, 'PLAYER_ID', 'SEASON', numeric_columns, 'MIN', 5
).astype('float64')

In [38]:
# Appending the weighted feature columns alongside the existing master columns
frames_to_join = [master_df, weighted_df]
master_df = pd.concat(frames_to_join, axis=1)

#### Creating dummy season columns

In [39]:
# Treating SEASON as categorical and appending its dummy columns
# (drop_first=True leaves out the first season's column)
master_df['SEASON'] = master_df['SEASON'].astype('object')
season_dummies = pd.get_dummies(master_df['SEASON'], drop_first=True)
master_df = pd.concat([master_df, season_dummies], axis=1)

## Saving Data

In [43]:
# Writing the finished master dataframe to disk as CSV
master_df.to_csv(path_or_buf='./data/master_df')