## Quiz 1 - Data Preparation and Exploration 
1.Import useful libraries

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import scipy.stats as sp
import statsmodels.formula.api as sm

2.Import the "Shotlog_14_15" and "Player_Stats_14_15" datasets into Jupyter Notebook.

In [None]:
# Load the shot log and the per-player season statistics.
# Both paths are relative to the directory the notebook is run from.
Shotlog_1415=pd.read_csv("Assignment Data/Week 6/Shotlog_14_15.csv")
Player_Stats=pd.read_csv("Assignment Data/Week 6/Player_Stats_14_15.csv")
# Preview only the first rows, like the other cells in this notebook,
# instead of handing the whole frame to the renderer.
Shotlog_1415.head()

3.Find the total number of players.

In [None]:
# .shape is (n_rows, n_columns); the row count is the number of players,
# assuming Player_Stats holds one row per player - TODO confirm.
Player_Stats.shape

4.Find the total number of shots.

In [None]:
# .shape is (n_rows, n_columns); the row count is the number of shots,
# assuming Shotlog_1415 holds one row per shot - TODO confirm.
Shotlog_1415.shape

5.Store "date" as a date type variable and produce summary statistics for the variable.

In [None]:
# Overwrite the 'date' column with its datetime-parsed version, then
# summarise the converted column.
Shotlog_1415['date']=pd.to_datetime(Shotlog_1415['date'])
Shotlog_1415['date'].describe()

6.Calculate summary statistics for the "shot_clock" variable.

In [None]:
# Summary statistics for shot_clock; describe() leaves missing values out
# of the count and of the statistics.
Shotlog_1415['shot_clock'].describe()

7.Create a lagged variable to indicate the result of the previous shot by the same player in the same game.
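- We first sort the shots by "shot_number" so that each player's shots within a game are in order;
- We then group the data by player and game and use the "shift" command to create the lag variable. The first shot of each player in each game has no previous shot, so its lag is missing (NaN).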

In [None]:
# Previous-shot outcome for the same player in the same game:
#   1. order the rows by shot_number,
#   2. group by (player_id, game_id),
#   3. shift current_shot_hit down one row inside each group.
# The shifted Series keeps the original row labels, so the assignment lines
# up with the unsorted frame; the first shot of each group gets NaN.
Shotlog_1415['lag_shot_hit'] = (
    Shotlog_1415
    .sort_values(by=['shot_number'], ascending=[True])
    .groupby(['player_id','game_id'])['current_shot_hit']
    .shift(1)
)
Shotlog_1415.head()

8.Create a variable “error” to indicate the prediction error for each shot and a variable “lagerror” for the prediction error for the previous shot.

In [None]:
# Prediction error = observed outcome minus average_hit, which serves as the
# prediction. 'lagerror' is the same quantity for the previous shot and is
# NaN wherever lag_shot_hit is NaN.
Shotlog_1415['error'] = Shotlog_1415['current_shot_hit'].sub(Shotlog_1415['average_hit'])
Shotlog_1415['lagerror'] = Shotlog_1415['lag_shot_hit'].sub(Shotlog_1415['average_hit'])
Shotlog_1415.head()

9.Create summary statistics for "error" and "lagerror" variables.

In [None]:
# Distribution of the prediction error (current_shot_hit - average_hit).
Shotlog_1415['error'].describe()

In [None]:
# Distribution of the previous shot's prediction error. Rows whose
# lag_shot_hit is NaN are also NaN here, and describe() leaves them out.
Shotlog_1415['lagerror'].describe()

## Quiz 2 - Conditional Probability and Autocorrelation
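10.Create a dummy variable “conse_shot” to indicate that a player made both the current shot and the previous shot (it is renamed to “conse_shot_hit” when it is averaged by player in step 11).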

In [None]:
# 1 when both the current shot and the previous shot were made, else 0.
# NOTE(review): a NaN lag_shot_hit compares unequal to 1, so rows with no
# previous shot are coded 0 rather than missing. A per-player mean of this
# column is therefore taken over all rows, while a mean of lag_shot_hit
# skips the NaN rows - confirm this difference in denominators is intended.
Shotlog_1415['conse_shot'] = np.where((Shotlog_1415['current_shot_hit']==1)&(Shotlog_1415['lag_shot_hit']==1), 1, 0) 
Shotlog_1415.head()

11.Create a dataframe for the probability of making the previous shot and the joint probability for making both the previous and current shots. 

In [None]:
# Per-player averages:
#   conse_shot_hit  - mean of conse_shot (current and previous shot both made)
#   average_lag_hit - mean of lag_shot_hit (previous shot made; NaN lags are skipped)
# The two columns are selected with a list. Indexing a groupby with a bare
# tuple of column names is deprecated and raises in pandas 2.x.
Player_Prob = (
    Shotlog_1415
    .groupby(['shoot_player'])[['conse_shot','lag_shot_hit']]
    .mean()
    .reset_index()
    .rename(columns={'lag_shot_hit':'average_lag_hit', 'conse_shot':'conse_shot_hit'})
)
Player_Prob.head()

12.Calculate the conditional probability for a player to make a shot given that he made the previous shot. 

In [None]:
# P(make current | made previous) = P(made both) / P(made previous).
# A player whose average_lag_hit is 0 also has conse_shot_hit 0, so the
# ratio is 0/0 and comes out as NaN for that player.
Player_Prob['conditional_prob']=Player_Prob['conse_shot_hit']/Player_Prob['average_lag_hit']
Player_Prob.head()

13.Merge the “Player_Prob” dataframe into the “Player_Stats” dataframe.

In [None]:
# Attach the per-player probabilities to the season statistics with a
# default (inner) join on shoot_player, keeping only players present in
# both frames.
# This cell overwrites Player_Stats, so running it a second time merges the
# probability columns in again; restart and run from the top instead.
Player_Stats = Player_Prob.merge(Player_Stats, on=['shoot_player'])
Player_Stats.head()

14.Calculate summary statistics for the unconditional probability of players making a shot, the conditional probability of players making a shot given they make the previous one, and the probability of players making consecutive shots. 

In [None]:
# Unconditional probability of making a shot, one value per player.
Player_Stats['average_hit'].describe()

In [None]:
# Probability of making a shot given the previous shot was made.
Player_Stats['conditional_prob'].describe()

In [None]:
# Joint probability of making both the previous and the current shot.
Player_Stats['conse_shot_hit'].describe()

15.Perform a t-test on the difference between conditional and unconditional probabilities.

In [None]:
# Two-sample t-test comparing players' conditional and unconditional hit
# probabilities. `sp` is already scipy.stats (see the import cell), so
# ttest_ind is called on it directly. The extra `.stats` hop resolved to the
# deprecated scipy.stats.stats shim.
sp.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])

16.Calculate the first order autocorrelation coefficient on making a shot for the entire shotlog dataset.

In [None]:
# Correlation between each shot and the previous shot, pooled over every
# row of the shot log; Series.corr leaves out rows where either value is NaN.
Shotlog_1415['current_shot_hit'].corr(Shotlog_1415['lag_shot_hit'])

17.Calculate the first order autocorrelation coefficient on making a shot for each player.

In [None]:
# Per-player 2x2 correlation matrix of (current_shot_hit, lag_shot_hit),
# unstacked to one row per player. .iloc[:,1] then takes the second column,
# presumably the (current_shot_hit, lag_shot_hit) entry - this relies on the
# column order after unstack(), so verify if the column list changes.
Autocorr_Hit=Shotlog_1415.groupby('shoot_player')[['current_shot_hit','lag_shot_hit']].corr().unstack().iloc[:,1].reset_index()
# Keep only the first level of the column labels left over from unstack().
Autocorr_Hit.columns=Autocorr_Hit.columns.get_level_values(0)
# After flattening, the correlation column is labelled 'current_shot_hit'.
Autocorr_Hit.rename(columns={'current_shot_hit':'autocorr'}, inplace=True)
# Ten players with the largest first-order autocorrelation.
Autocorr_Hit.sort_values(by=['autocorr'], ascending=[False]).head(10)

## Regression Analyses

18.Reg1: linear least squares regression using the entire shotlog dataframe
- Dependent variable: error
- Independent variables:lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, shoot_player, closest_defender, and closest_def_dist

In [None]:
# Pooled OLS of the prediction error on the previous shot's error and the
# shot controls. points and quarter are wrapped in C() so they enter as
# categorical factors.
reg1 = sm.ols(
    formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+shoot_player+closest_defender+closest_def_dist',
    data=Shotlog_1415,
).fit()
print(reg1.summary())

19.Reg2: Weighted least squares regression using the entire shotlog dataframe, with weights equal to the inverse of shot_per_game (1/shot_per_game).

- Dependent variable: error
- Independent variables:lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, shoot_player, closest_defender, and closest_def_dist

In [None]:
# Same specification as the pooled OLS, estimated by weighted least squares.
# Each row's weight is 1/shot_per_game.
reg2 = sm.wls(
    formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+shoot_player+closest_defender+closest_def_dist',
    weights=1/Shotlog_1415['shot_per_game'],
    data=Shotlog_1415,
).fit()
print(reg2.summary())

20.Reg3_player: linear least squares regressions on individual players

- Dependent variable: error
- Independent variables:lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, and closest_def_dist

In [None]:
def Reg3_player(player):
    """Fit and print an OLS regression of ``error`` for one player's shots.

    Parameters
    ----------
    player : str
        Value matched exactly (``==``) against the ``shoot_player`` column
        of ``Shotlog_1415``.

    Returns
    -------
    None
        The regression summary is printed, not returned.

    Raises
    ------
    ValueError
        If no row of ``Shotlog_1415`` has ``shoot_player == player``.
    """
    player_shots = Shotlog_1415[Shotlog_1415.shoot_player == player]
    # Fail early with a readable message: a misspelled name would otherwise
    # surface as an opaque error raised while fitting on an empty frame.
    if player_shots.empty:
        raise ValueError("no shots found for shoot_player == %r" % (player,))
    player_fit = sm.ols(
        formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+closest_def_dist',
        data=player_shots,
    ).fit()
    print(player_fit.summary())

21.Show regression results for given players.

In [None]:
# Per-player OLS summary; the name must equal the shoot_player value exactly.
Reg3_player('andrew wiggins')

In [None]:
# Per-player OLS summary for another shooter.
Reg3_player('stephen curry')

In [None]:
# Per-player OLS summary for another shooter.
Reg3_player('james harden')

In [None]:
# Per-player OLS summary for another shooter.
Reg3_player('russell westbrook')

22.Reg4_wls_player: weighted least squares regressions on individual players, with weights equal to the inverse of shot_per_game (1/shot_per_game).

- Dependent variable: error
- Independent variables:lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, and closest_def_dist

In [None]:
def Reg4_wls_player(player):
    """Fit and print a WLS regression of ``error`` for one player's shots.

    Each of the player's rows is weighted by ``1/shot_per_game``.

    Parameters
    ----------
    player : str
        Value matched exactly (``==``) against the ``shoot_player`` column
        of ``Shotlog_1415``.

    Returns
    -------
    None
        The regression summary is printed, not returned.

    Raises
    ------
    ValueError
        If no row of ``Shotlog_1415`` has ``shoot_player == player``.
    """
    player_shots = Shotlog_1415[Shotlog_1415.shoot_player == player]
    # Fail early with a readable message: a misspelled name would otherwise
    # surface as an opaque error raised while fitting on an empty frame.
    if player_shots.empty:
        raise ValueError("no shots found for shoot_player == %r" % (player,))
    player_fit = sm.wls(
        formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+closest_def_dist',
        weights=1/player_shots['shot_per_game'],
        data=player_shots,
    ).fit()
    print(player_fit.summary())

23.Show regression results for given players.

In [None]:
# Per-player WLS summary; the name must equal the shoot_player value exactly.
Reg4_wls_player('reggie jackson')

In [None]:
# Per-player WLS summary for another shooter.
Reg4_wls_player('alonzo gee')

In [None]:
# Per-player WLS summary for another shooter.
Reg4_wls_player('cole aldrich')

In [None]:
# Per-player WLS summary for another shooter.
Reg4_wls_player('stephen curry')