In [55]:
#Import Libraries

import pandas as pd
import datetime as dt
import scipy.stats as sp
import numpy as np
import statsmodels.formula.api as sm 

In [56]:
# Import Shotlog_14_15 and Player_Stats Datasets

Shotlog_1415=pd.read_csv("Data/Shotlog_14_15.csv")
Player_Stats=pd.read_csv("Data/Player_Stats_14_15.csv")
display(Shotlog_1415)

Unnamed: 0,game_id,date,match,home_team,away_team,home_away,result,final_margin,shot_number,quarter,...,closest_defender,closest_defender_id,closest_def_dist,current_shot_hit,points_earned,shoot_player,player_id,average_hit,shot_count,shot_per_game
0,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,1,1,...,"Lopez, Brook",201572,6.6,1,2,al horford,201143,0.541259,715,10
1,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,2,1,...,"Lopez, Brook",201572,5.6,0,0,al horford,201143,0.541259,715,10
2,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,3,1,...,"Lopez, Brook",201572,4.7,0,0,al horford,201143,0.541259,715,10
3,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,4,1,...,"Lopez, Brook",201572,5.8,0,0,al horford,201143,0.541259,715,10
4,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,5,2,...,"Lopez, Brook",201572,6.4,0,0,al horford,201143,0.541259,715,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128064,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,11,3,...,"Burke, Trey",203504,4.7,1,2,john wall,202322,0.448513,874,15
128065,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,12,3,...,"Exum, Dante",203957,3.4,1,2,john wall,202322,0.448513,874,15
128066,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,13,4,...,"Kanter, Enes",202683,1.2,0,0,john wall,202322,0.448513,874,15
128067,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,14,4,...,"Kanter, Enes",202683,1.4,1,2,john wall,202322,0.448513,874,15


## I. Data Preparation and Exploration

1. Import the “Shotlog_14_15.csv” data file as “Shotlog_1415” into Jupyter Notebook. Import “Player_Stats_1415.csv” data file as “Player_Stats” into Jupyter Notebook.

_Descriptions of the datasets and selected variables_

 - In the dataset “Shotlog_14_15,” each observation represents an attempt of a shot. In the dataset “Player_Stats_14_15,” each observation represents a player.
 - The “average_hit” variable in both dataframes indicate the average success rate of a player making a shot over the season. It is defined and calculated the same way in both dataframes.
 - The variable “home_away” indicates whether the team that the player belongs to played at home or away.
 - The variable “result” indicates whether the team that the player belongs to won or lost the game. The variable “final_margin” represents the difference in final score between the team the player belongs to and their opponent’s.
 - The variable “shot_number” is the order of the shot the given player attempted at the given game.
 - “game_clock” is the countdown clock for each quarter. The game clock starts at 12 minutes. “shot_clock” refers to the display of a countdown clock of the time within which the team possessing the ball must attempt a field goal. The shot clock starts at 24 seconds.

 2. Convert the “date” variable to a date type variable and calculate summary statistics for the “shot_clock” variable.

In [57]:
#Shotlog['date']=pd.to_datetime(Shotlog['date'])
Shotlog_1415["date"] = pd.to_datetime(Shotlog_1415["date"])
Shotlog_1415["date"]

0        2014-12-05
1        2014-12-05
2        2014-12-05
3        2014-12-05
4        2014-12-05
            ...    
128064   2014-12-14
128065   2014-12-14
128066   2014-12-14
128067   2014-12-14
128068   2014-12-14
Name: date, Length: 128069, dtype: datetime64[ns]

 
 3. Create a lagged variable “lag_shot_hit” to indicate the result of the previous shot by the same player at the same game.
 - Hint: In this dataset, the variable “match” may not be able to uniquely identify each game; you can use “game_id” instead. You can sort the data by shot number for each player to create the lagged variable.

In [58]:
#Shotlog['lag_shot_hit']=Shotlog.sort_values(by=["quarter", "time"],ascending=[True, True]).groupby(["shoot_player", "date"])["current_shot_hit"].shift(1)

Shotlog_1415["lag_shot_hit"] = Shotlog_1415.sort_values(by=["game_id", "shot_number"], ascending=[True, True]).groupby(["shoot_player", "game_id"])["current_shot_hit"].shift(1)

In [59]:
pd.set_option('display.max_rows', 500)
Shotlog_1415.loc[:100, ["game_id", "shoot_player", "shot_number", "current_shot_hit", "lag_shot_hit", "average_hit"]]

Unnamed: 0,game_id,shoot_player,shot_number,current_shot_hit,lag_shot_hit,average_hit
0,21400280,al horford,1,1,,0.541259
1,21400280,al horford,2,0,1.0,0.541259
2,21400280,al horford,3,0,0.0,0.541259
3,21400280,al horford,4,0,0.0,0.541259
4,21400280,al horford,5,0,0.0,0.541259
5,21400280,al horford,6,0,0.0,0.541259
6,21400280,al horford,7,1,0.0,0.541259
7,21400280,al horford,8,0,1.0,0.541259
8,21400280,al horford,9,0,0.0,0.541259
9,21400280,al horford,10,1,0.0,0.541259


 4. Create a variable “error” to indicate the prediction error for each shot and a variable “lagerror” for the prediction error for the previous shot. The “error” variable is defined as the difference between the outcome of the current shot and the average success rate (“average_hit”) and the “lagerror” variable is defined as the difference between the outcome of the previous shot and the average success rate.

In [60]:
Shotlog_1415["error"] = Shotlog_1415["current_shot_hit"]-Shotlog_1415["average_hit"]
Shotlog_1415["lagerror"] = Shotlog_1415["lag_shot_hit"]-Shotlog_1415["average_hit"]
Shotlog_1415.loc[:100, ["game_id", "shoot_player", "shot_number", "current_shot_hit", "lag_shot_hit", "average_hit", "error", "lagerror"]]

Unnamed: 0,game_id,shoot_player,shot_number,current_shot_hit,lag_shot_hit,average_hit,error,lagerror
0,21400280,al horford,1,1,,0.541259,0.458741,
1,21400280,al horford,2,0,1.0,0.541259,-0.541259,0.458741
2,21400280,al horford,3,0,0.0,0.541259,-0.541259,-0.541259
3,21400280,al horford,4,0,0.0,0.541259,-0.541259,-0.541259
4,21400280,al horford,5,0,0.0,0.541259,-0.541259,-0.541259
5,21400280,al horford,6,0,0.0,0.541259,-0.541259,-0.541259
6,21400280,al horford,7,1,0.0,0.541259,0.458741,-0.541259
7,21400280,al horford,8,0,1.0,0.541259,-0.541259,0.458741
8,21400280,al horford,9,0,0.0,0.541259,-0.541259,-0.541259
9,21400280,al horford,10,1,0.0,0.541259,0.458741,-0.541259


5. Calculate summary statistics for the “error” and “lagerror” variables. 

In [61]:
Shotlog_1415[["error", "lagerror"]].describe()

Unnamed: 0,error,lagerror
count,128069.0,113726.0
mean,1.862089e-17,0.006303
std,0.494964,0.496035
min,-0.7124682,-0.712468
25%,-0.4491979,-0.449198
50%,-0.3850837,-0.382143
75%,0.5395973,0.542254
max,0.6914894,0.691489


In [62]:
len(Shotlog_1415.game_id.value_counts())

904

## II. Conditional Probability and Autocorrelation

1. Create a dummy variable “conse_shot” that indicates a player made consecutive shots.

In [63]:
Shotlog_1415["conse_shot"] = np.where((Shotlog_1415['current_shot_hit']==1)&(Shotlog_1415['lag_shot_hit']==1), 1, 0)

2. Create a dataframe “Player_Prob” for the probability of making the previous shot and the joint probability for making both the previous and current shots. Name the probability of making the previous shot “average_lag_hit” and the probability of making both shots “conse_shot_hit.” 

In [64]:
Player_Prob=Shotlog_1415.groupby(['shoot_player'])['conse_shot','lag_shot_hit'].mean()
Player_Prob=Player_Prob.reset_index()
Player_Prob.rename(columns={'conse_shot':'conse_shot_hit', 'lag_shot_hit':'average_lag_hit'}, inplace=True)
Player_Prob.head()

  Player_Prob=Shotlog_1415.groupby(['shoot_player'])['conse_shot','lag_shot_hit'].mean()


Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit
0,aaron brooks,0.153298,0.418
1,aaron gordon,0.201923,0.532468
2,al farouq aminu,0.162791,0.465686
3,al horford,0.262937,0.537994
4,al jefferson,0.2075,0.48


3. In the “Player_Prob” dataframe, calculate the conditional probability “conditional_prob” for a player to make a shot given that he made the previous shot. 

In [65]:
Player_Prob['conditional_prob']=Player_Prob['conse_shot_hit']/Player_Prob['average_lag_hit']
Player_Prob.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob
0,aaron brooks,0.153298,0.418,0.366741
1,aaron gordon,0.201923,0.532468,0.379221
2,al farouq aminu,0.162791,0.465686,0.349572
3,al horford,0.262937,0.537994,0.488736
4,al jefferson,0.2075,0.48,0.432292


4. Merge the “Player_Prob” dataframe into the “Player_Stats” dataframe.

In [66]:
Player_Stats=pd.merge(Player_Prob, Player_Stats, on=['shoot_player'])
Player_Stats.head(10)

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob,average_hit
0,aaron brooks,0.153298,0.418,0.366741,0.41533
1,aaron gordon,0.201923,0.532468,0.379221,0.528846
2,al farouq aminu,0.162791,0.465686,0.349572,0.430233
3,al horford,0.262937,0.537994,0.488736,0.541259
4,al jefferson,0.2075,0.48,0.432292,0.4775
5,alan anderson,0.15727,0.462366,0.340142,0.433234
6,alan crabbe,0.138298,0.52381,0.264023,0.425532
7,alex len,0.247492,0.539419,0.458811,0.528428
8,alexis ajinca,0.28436,0.598802,0.474882,0.597156
9,alonzo gee,0.137681,0.460784,0.298797,0.478261


5. Calculate summary statistics for the probability for a player to make a shot (“average_hit”) and the conditional probability for a player to make a shot given that he made the previous one (“conditional_prob”) and the probability of players making consecutive shots (“conse_shot_hit”).

In [67]:
Player_Stats[["average_hit", "conditional_prob", "conse_shot_hit"]].describe()

Unnamed: 0,average_hit,conditional_prob,conse_shot_hit
count,281.0,281.0,281.0
mean,0.451545,0.380233,0.176987
std,0.059392,0.06232,0.047943
min,0.308511,0.225801,0.07619
25%,0.413223,0.336689,0.144543
50%,0.446078,0.38157,0.171625
75%,0.48048,0.422801,0.203512
max,0.712468,0.613209,0.422392


6. Perform a t-test for the statistical significance on the difference between conditional probability and unconditional probability of making a shot.

In [68]:
sp.stats.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])

  sp.stats.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])


Ttest_indResult(statistic=-13.885932802814914, pvalue=6.925846314604593e-38)

7. Calculate the first order autocorrelation coefficient on making a shot (correlation coefficient between making the current shot and the previous shot) for the entire shotlog dataset.

In [69]:
Shotlog_1415['current_shot_hit'].corr(Shotlog_1415['lag_shot_hit'])

-0.010502388301693177

8. Calculate the first order autocorrelation coefficient on making a shot for each player. Display the top ten players with the highest first order autocorrelation coefficient.

In [70]:
Autocorr_Hit=Shotlog_1415.groupby('shoot_player')[['current_shot_hit','lag_shot_hit']].corr().unstack().iloc[:,1].reset_index()
Autocorr_Hit.columns=Autocorr_Hit.columns.get_level_values(0)
Autocorr_Hit.rename(columns={'current_shot_hit':'autocorr'}, inplace=True)
Autocorr_Hit.sort_values(by="autocorr", ascending=False).head(10)

Unnamed: 0,shoot_player,autocorr
131,joey dorsey,0.334252
54,cole aldrich,0.174666
200,nate robinson,0.122107
267,tyler hansbrough,0.120608
7,alex len,0.118461
50,cj mccollum,0.115949
114,jason smith,0.105903
190,matt bonner,0.098577
143,jusuf nurkic,0.097465
195,mike miller,0.089366


## III.  Regression Analysis  

In this section, you will run several regressions to investigate the “hot hand.” In all the regressions, the dependent variable is “error” and the independent variable of interest is “lagerror.” 

Reg1: Run a linear least squares regression using the entire shotlog dataframe. Include the following control variables:
- Shot distance

- Number of dribbles

- Touch time

- Type of shot (“points” variable)

- Quarter of the game (as a categorical variable)

- Home or away game

- Shoot_player

- Closest defender

- Closest defender distance

In [17]:
Shotlog_1415.columns

Index(['game_id', 'date', 'match', 'home_team', 'away_team', 'home_away',
       'result', 'final_margin', 'shot_number', 'quarter', 'game_clock',
       'shot_clock', 'dribbles', 'touch_time', 'shot_dist', 'points',
       'current_shot_outcome', 'closest_defender', 'closest_defender_id',
       'closest_def_dist', 'current_shot_hit', 'points_earned', 'shoot_player',
       'player_id', 'average_hit', 'shot_count', 'shot_per_game',
       'lag_shot_hit', 'error', 'lagerror', 'conse_shot'],
      dtype='object')

In [18]:
reg1 = sm.ols(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+shoot_player+closest_defender+closest_def_dist', data= Shotlog_1415).fit()
print(reg1.summary())

                            OLS Regression Results                            
Dep. Variable:                  error   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.051
Method:                 Least Squares   F-statistic:                     9.043
Date:                Mon, 09 May 2022   Prob (F-statistic):               0.00
Time:                        19:24:07   Log-Likelihood:                -78072.
No. Observations:              113726   AIC:                         1.577e+05
Df Residuals:                  112965   BIC:                         1.650e+05
Df Model:                         760                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

Reg2: Run a weighted least squares regression using the entire shotlog dataframe. Include the same set of control variables as in Reg1. The regression should be weighted by the number of shot per game (weight=1/shot_per_game).

In [19]:
reg2 = sm.wls(formula = 'error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+shoot_player+closest_defender+closest_def_dist',  weights=1/Shotlog_1415['shot_per_game'] , data= Shotlog_1415).fit()
print(reg2.summary())

                            WLS Regression Results                            
Dep. Variable:                  error   R-squared:                       0.062
Model:                            WLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     9.792
Date:                Mon, 09 May 2022   Prob (F-statistic):               0.00
Time:                        19:24:49   Log-Likelihood:                -86952.
No. Observations:              113726   AIC:                         1.754e+05
Df Residuals:                  112965   BIC:                         1.828e+05
Df Model:                         760                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

Reg3_player: Run linear least squares regressions on individual players. Include the following control variables:

- Shot distance

- Number of dribbles

- Touch time

- Type of shot (“points” variable)

- Quarter of the game (as a categorical variable)

- Home or away game

- Closest defender distance

In [78]:
def regression_all_players(shotlog, for_variable=None, reg_type=None):
    player_list = np.array(shotlog['shoot_player'])
    player_list = np.unique(player_list)
    result_array = list()
    
    for_variable = for_variable if for_variable else "lagerror"
    
    for player in player_list:
        player_df = shotlog[shotlog.shoot_player == player]
        if reg_type == "linear":
            reg = sm.ols(formula ='error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+closest_def_dist', data=player_df).fit()
        elif reg_type == "weighted":
            reg = sm.wls(formula ='error ~ lagerror+shot_dist+dribbles+touch_time+points+quarter+home_away+closest_def_dist', weights=1/player_df['shot_per_game'], data=player_df).fit()
        
        RegParams = pd.DataFrame(reg.params).reset_index()
        RegTvals = pd.DataFrame(reg.tvalues).reset_index()
        RegPvals = pd.DataFrame(reg.pvalues).reset_index()
    
        RegOutput = pd.merge(RegParams, RegTvals, on=['index'])
        RegOutput = pd.merge(RegOutput, RegPvals, on=['index'])
        
        LagErr = RegOutput[RegOutput['index'] == for_variable]
        LagErr = LagErr.drop(columns=['index'])
        LagErr = LagErr.rename(columns={"0_x":"Coef", "0_y":"T_Statistics", 0:"P_Value"})
        LagErr['shoot_player'] = player
        Headers = ['shoot_player', 'Coef', 'T_Statistics', 'P_Value']
        result_array.append(LagErr.to_dict('records')[0])
    return pd.DataFrame(result_array)
    

shot_dist_df = regression_all_players(Shotlog_1415, for_variable="shot_dist", reg_type="linear")
shot_dist_df = shot_dist_df.loc[shot_dist_df['P_Value']<=0.05]
shot_dist_df = shot_dist_df[shot_dist_df["shoot_player"].isin(["russell westbrook", "andrew wiggins", "stephen curry", "james harden"])]
shot_dist_df.sort_values(by="Coef")

Unnamed: 0,Coef,T_Statistics,P_Value,shoot_player
17,-0.021179,-6.793315,2.277021e-11,andrew wiggins
247,-0.018133,-6.151447,1.1533e-09,stephen curry
108,-0.015618,-4.83246,1.563118e-06,james harden
236,-0.014769,-5.352016,1.100274e-07,russell westbrook


Reg4_wls_player: Run weighted least squares regressions on individual players. Include the same set of control variables as in Reg3. The regression should be weighted by the number of shot per game (weight=1/shot_per_game).

In [90]:
hot_hand_df = regression_all_players(Shotlog_1415, reg_type="weighted")
hot_hand_df = hot_hand_df.loc[hot_hand_df['P_Value']<=0.05]
hot_hand_df = hot_hand_df[hot_hand_df["shoot_player"].isin([
    "reggie jackson", "stephen curry", "cole aldrich", "alonzo gee"
])]
hot_hand_df.sort_values(by="Coef", ascending=False)

Unnamed: 0,Coef,T_Statistics,P_Value,shoot_player
226,0.130516,3.302942,0.001014,reggie jackson
247,-0.082684,-2.532599,0.011491,stephen curry
9,-0.309617,-3.235532,0.001682,alonzo gee
