In [1]:
import numpy as np
import pandas as pd

In [187]:
df = pd.read_csv('data/shot_logs.csv')

In [188]:
list(df.columns)

['GAME_ID',
 'MATCHUP',
 'LOCATION',
 'W',
 'FINAL_MARGIN',
 'SHOT_NUMBER',
 'PERIOD',
 'GAME_CLOCK',
 'SHOT_CLOCK',
 'DRIBBLES',
 'TOUCH_TIME',
 'SHOT_DIST',
 'PTS_TYPE',
 'SHOT_RESULT',
 'CLOSEST_DEFENDER',
 'CLOSEST_DEFENDER_PLAYER_ID',
 'CLOSE_DEF_DIST',
 'FGM',
 'PTS',
 'player_name',
 'player_id']

In [189]:
df.head(2)

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148


In [190]:
# Select columns of interest
df = df[['player_name', 'GAME_ID', 'FGM']]

In [191]:
df.head(2)

Unnamed: 0,player_name,GAME_ID,FGM
0,brian roberts,21400899,1
1,brian roberts,21400899,0


## Shot History

In [192]:
df['shot_m1'] = df.groupby(['player_name', 'GAME_ID'])['FGM'].apply(lambda x: x.shift(periods=1))
df['shot_m2'] = df.groupby(['player_name', 'GAME_ID'])['FGM'].apply(lambda x: x.shift(periods=2))
df['shot_m3'] = df.groupby(['player_name', 'GAME_ID'])['FGM'].apply(lambda x: x.shift(periods=3))
df['shot_m4'] = df.groupby(['player_name', 'GAME_ID'])['FGM'].apply(lambda x: x.shift(periods=4))
df['shot_m5'] = df.groupby(['player_name', 'GAME_ID'])['FGM'].apply(lambda x: x.shift(periods=5))

In [193]:
df.head()

Unnamed: 0,player_name,GAME_ID,FGM,shot_m1,shot_m2,shot_m3,shot_m4,shot_m5
0,brian roberts,21400899,1,,,,,
1,brian roberts,21400899,0,1.0,,,,
2,brian roberts,21400899,0,0.0,1.0,,,
3,brian roberts,21400899,0,0.0,0.0,1.0,,
4,brian roberts,21400899,0,0.0,0.0,0.0,1.0,


## Min Shot Count

In [194]:
df.shape

(128069, 8)

In [195]:
grouped = df.groupby(['player_name', 'GAME_ID'])
df = grouped.filter(lambda x: x['FGM'].count()>9).reset_index(drop=True)

In [196]:
df.shape

(81588, 8)

In [197]:
# filter for min shots/game (10/15/20+)

# group by player and game
# min 10 shots
# was shot-1 good?
# player average shooting rate

# taking previous shot into account
    # column['shot_m1'] = duplicate the column and move up one
    # remove the first shot, then look at each shot
    
# player average for next shot if previous a hit    
    # hit_prev_miss = sum of hits after misses/sum of shots after misses
    # hit_prev_hit = sum of hits after hits/sum of shots after hits
    
# player average if previous shot a miss
    # miss_prev_miss = sum of misses after misses/sum of shots after misses
    # miss_prev_hit = sum of misses after hits/sum of shots after hits
    

# repeat for 2, 3, 4 shots

# check for 3 out of 4 previous shots


## Player p_stats

In [198]:
# get shooting percentage
p_stats = pd.DataFrame(df.groupby(['player_name'])['FGM'].mean()).reset_index()
p_stats = p_stats.rename(columns={'FGM' : 'FG_pct'})
p_stats.head()

# maybe add mean number of shots by player from original df?

Unnamed: 0,player_name,FG_pct
0,aaron brooks,0.420904
1,al farouq aminu,0.5
2,al horford,0.519544
3,al jefferson,0.479592
4,alan anderson,0.487805


### Hit Previous 1 Shot

In [199]:
hot_1 = df.loc[df.shot_m1==1]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_1_make'})

In [200]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='right', on='player_name')

In [201]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make
0,aaron brooks,0.420904,0.45
1,al farouq aminu,0.5,0.454545


### Missed Previous 1 Shot

In [202]:
hot_1 = df.loc[df.shot_m1==0]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_1_miss'})

In [203]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [204]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss
0,aaron brooks,0.420904,0.45,0.392473
1,al farouq aminu,0.5,0.454545,0.555556


### Hit Previous 2 Shots

In [205]:
hot_1 = df.loc[(df.shot_m1==1) & (df.shot_m2==1)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_2_make'})

In [206]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [207]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make
0,aaron brooks,0.420904,0.45,0.392473,0.403509
1,al farouq aminu,0.5,0.454545,0.555556,0.6


### Missed Previous 2 Shots

In [208]:
hot_1 = df.loc[(df.shot_m1==0) & (df.shot_m2==0)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_2_miss'})

In [209]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [210]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333


### Hit Previous 3 Shots

In [211]:
hot_1 = df.loc[(df.shot_m1==1) & (df.shot_m2==1) & (df.shot_m3==1)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_3_make'})

In [212]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [213]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333


### Missed Previous 3 Shots

In [214]:
hot_1 = df.loc[(df.shot_m1==0) & (df.shot_m2==0) & (df.shot_m3==0)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_3_miss'})

In [215]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [216]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0


### Hit Previous 4 Shots

In [217]:
hot_1 = df.loc[(df.shot_m1==1) & (df.shot_m2==1) & (df.shot_m3==1) & (df.shot_m4==1)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_4_make'})

In [218]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [219]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0


### Missed Previous 4 Shots

In [220]:
hot_1 = df.loc[(df.shot_m1==0) & (df.shot_m2==0) & (df.shot_m3==0) & (df.shot_m4==0)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_4_miss'})

In [221]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [222]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333,0.317073
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0,


### Hit Previous 5 Shots

In [223]:
hot_1 = df.loc[(df.shot_m1==1) & (df.shot_m2==1) & (df.shot_m3==1) & (df.shot_m4==1) & (df.shot_m5==1)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_5_make'})

In [224]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [225]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss,FG_pct_5_make
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333,0.317073,0.0
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0,,


### Missed Previous 5 Shots

In [226]:
hot_1 = df.loc[(df.shot_m1==0) & (df.shot_m2==0) & (df.shot_m3==0) & (df.shot_m4==0)& (df.shot_m5==0)]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_5_miss'})

In [227]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [228]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss,FG_pct_5_make,FG_pct_5_miss
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333,0.317073,0.0,0.307692
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0,,,


### Hit 4 of 5 previous

In [229]:
df['prev5_count'] = df.loc[:,'shot_m1':'shot_m5'].sum(axis=1)

In [230]:
hot_1 = df.loc[df.prev5_count==4]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_80_pct'})

In [231]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [232]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss,FG_pct_5_make,FG_pct_5_miss,FG_pct_80_pct
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333,0.317073,0.0,0.307692,0.304348
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0,,,,0.0


### Missed 4 of 5 previous

In [233]:
hot_1 = df.loc[df.prev5_count==1]
hot_1 = pd.DataFrame(hot_1.groupby(['player_name'])['FGM'].mean()).reset_index()
hot_1 = hot_1.rename(columns={'FGM':'FG_pct_20_pct'})

In [234]:
# add to p_stats df
p_stats = p_stats.merge(hot_1, how='left', on='player_name')

In [235]:
p_stats.head(2)

Unnamed: 0,player_name,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss,FG_pct_5_make,FG_pct_5_miss,FG_pct_80_pct,FG_pct_20_pct
0,aaron brooks,0.420904,0.45,0.392473,0.403509,0.31068,0.35,0.338462,0.333333,0.317073,0.0,0.307692,0.304348,0.430233
1,al farouq aminu,0.5,0.454545,0.555556,0.6,0.333333,0.333333,0.0,0.0,,,,0.0,0.666667


# Summary Stats

In [236]:
p_stats.describe()

Unnamed: 0,FG_pct,FG_pct_1_make,FG_pct_1_miss,FG_pct_2_make,FG_pct_2_miss,FG_pct_3_make,FG_pct_3_miss,FG_pct_4_make,FG_pct_4_miss,FG_pct_5_make,FG_pct_5_miss,FG_pct_80_pct,FG_pct_20_pct
count,265.0,265.0,265.0,261.0,263.0,249.0,252.0,221.0,224.0,180.0,195.0,249.0,265.0
mean,0.470284,0.451124,0.486868,0.428287,0.508544,0.404352,0.53035,0.36087,0.504246,0.343285,0.515047,0.423965,0.504096
std,0.069348,0.100537,0.098687,0.141569,0.146259,0.192458,0.196799,0.227441,0.217662,0.260191,0.266116,0.194142,0.125185
min,0.3,0.0,0.166667,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
25%,0.423611,0.4,0.428571,0.375,0.421565,0.333333,0.417691,0.222222,0.387821,0.0,0.373214,0.333333,0.4375
50%,0.462019,0.447368,0.468391,0.428571,0.486301,0.421053,0.5,0.388889,0.5,0.333333,0.5,0.431818,0.482759
75%,0.50056,0.5,0.530686,0.5,0.55719,0.5,0.591487,0.5,0.583333,0.5,0.666667,0.510638,0.538462
max,0.7,0.833333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [183]:
from scipy import stats

In [237]:
p_stats.FG_pct_1_miss

0      0.392473
1      0.555556
2      0.530686
3      0.500000
4      0.555556
         ...   
260    0.504644
261    0.449438
262    0.400000
263    0.483221
264    0.447761
Name: FG_pct_1_miss, Length: 265, dtype: float64

In [238]:
stats.ttest_ind(p_stats.FG_pct_1_miss, p_stats.FG_pct_1_make)

Ttest_indResult(statistic=4.130319757211996, pvalue=4.210867135540211e-05)

In [241]:
stats.ttest_ind(p_stats.FG_pct_2_miss, p_stats.FG_pct_2_make, nan_policy='omit')

Ttest_indResult(statistic=6.381549674551054, pvalue=3.871966995700222e-10)

In [242]:
stats.ttest_ind(p_stats.FG_pct_3_miss, p_stats.FG_pct_3_make, nan_policy='omit')

Ttest_indResult(statistic=7.2440874226294385, pvalue=1.661896827670226e-12)

In [243]:
stats.ttest_ind(p_stats.FG_pct_4_miss, p_stats.FG_pct_4_make, nan_policy='omit')

Ttest_indResult(statistic=6.794306516662065, pvalue=3.5212469774236385e-11)

In [244]:
stats.ttest_ind(p_stats.FG_pct_5_miss, p_stats.FG_pct_5_make, nan_policy='omit')

Ttest_indResult(statistic=6.311479455326923, pvalue=7.84752123418275e-10)

In [245]:
stats.ttest_ind(p_stats.FG_pct_80_pct, p_stats.FG_pct_20_pct, nan_policy='omit')

Ttest_indResult(statistic=-5.594408209699447, pvalue=3.6096620244384676e-08)