In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from scipy.stats import chi2_contingency


np.random.seed(1)

shaq = pd.read_csv("shaq.csv")

## Part (a)
## COMPUTE each of the following:

## 1. average accuracy of Shaq's free throw
print("\n")
print("=====1. Avg Accuracy of Shaq Free throw=====")
# select the rows where Shaq attempted a free throw
free_throws = shaq[shaq["play"].str.contains("free throw")]

# calculate the average accuracy of Shaq's free throw
avg_accuracy = free_throws["shot_made"].mean()

print("Average accuracy of Shaq's free throw: {:.2f}%".format(avg_accuracy * 100))

## 2. average accuracy of Shaq's free throw during a home game
print("\n")
print("=====2. Avg Accuracy of Shaq Free throw during home game=====")
# select the rows where Shaq attempted a free throw and home game
free_throws_hg = shaq[
    (shaq["play"].str.contains("free throw")) & (shaq["home_game"] == 1)
]

# calculate the average accuracy of Shaq's free throw
avg_accuracy_hg = free_throws_hg["shot_made"].mean()

print(
    "Average accuracy of Shaq's free throw during a home game: {:.2f}%".format(
        avg_accuracy_hg * 100
    )
)

## 3. average accuracy of Shaq's free throw when the free throw
##    is the first of the two free throws.
print("\n")
print("=====3. Avg Accuracy of Shaq Free throw when it is first shot=====")
# select the rows where Shaq attempted a free throw and is the first shot
free_throws_hg = shaq[
    (shaq["play"].str.contains("free throw")) & (shaq["first_shot"] == 1)
]

# calculate the average accuracy of Shaq's free throw
avg_accuracy_fs = free_throws_hg["shot_made"].mean()

print(
    "Average accuracy of Shaq's free throw when the free throw is the first of the two free throws: {:.2f}%".format(
        avg_accuracy_fs * 100
    )
)
## 4. perform a chi-squared test for association between 
##    a free throw result and whether the game is a home game or not.

print("\n")
print("=====4. Chi-Square Test for association b/w a free throw and whether the game is home game or not=====")

# Create a contingency table of the counts of free throws made and missed for home and away games
contingency_table = pd.crosstab(index=free_throws['shot_made'], columns=free_throws['home_game'])

# Print the contingency table
print("Contingency Table for Referance: \n", contingency_table)

# Perform a chi-squared test for association between free throw result and home game
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the chi-squared test statistic and p-value
print("\n")
print("Chi-squared test statistic:", chi2)
print("P-value:", p_value)

## 5. perform a chi-squared test for association between
##    a free throw result and whether the free throw is the first of the 
##    two free throws.

print("\n")
print("=====5. Chi-Square Test for association b/w a free throw and whether it is first shot=====")
# Create a contingency table of the counts of free throws made and missed for first shot
contingency_table = pd.crosstab(index=free_throws['shot_made'], columns=free_throws['first_shot'])

# Print the contingency table
print("Contingency Table for Referance: \n", contingency_table)

# Perform a chi-squared test for association between free throw result and home game
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the chi-squared test statistic and p-value
print("\n")
print("Chi-squared test statistic:", chi2)
print("P-value:", p_value)


## end of part (a)



=====1. Avg Accuracy of Shaq Free throw=====
Average accuracy of Shaq's free throw: 52.27%


=====2. Avg Accuracy of Shaq Free throw during home game=====
Average accuracy of Shaq's free throw during a home game: 54.94%


=====3. Avg Accuracy of Shaq Free throw when it is first shot=====
Average accuracy of Shaq's free throw when the free throw is the first of the two free throws: 48.58%


=====4. Chi-Square Test for association b/w a free throw and whether the game is home game or not=====
Contingency Table for Referance: 
 home_game    0    1
shot_made          
0          501  278
1          514  339


Chi-squared test statistic: 2.6779010107677776
P-value: 0.1017497611642021


=====5. Chi-Square Test for association b/w a free throw and whether it is first shot=====
Contingency Table for Referance: 
 first_shot    0    1
shot_made           
0           309  470
1           409  444


Chi-squared test statistic: 11.001452096828773
P-value: 0.0009104053382288787


In [12]:
features = ["first_shot", "missed_first", "home_game", "cur_score",
"opp_score", "cur_time", "score_ratio", "made_first", "losing"]

ntrial = 100
err1 = np.zeros(ntrial)
err2 = np.zeros(ntrial)

for it in range(ntrial):
    X = shaq[features].values
    Y = shaq['shot_made']

    n = X.shape[0]
    ntrain = 1500
    learn_ixs = np.random.choice(n, ntrain, replace=False)
    test_ixs = [j for j in range(n) if j not in learn_ixs]
    
    X1 = X[learn_ixs, ]
    Y1 = Y[learn_ixs]

    X2 = X[test_ixs, ]
    Y2 = Y[test_ixs]
    
    clf = LogisticRegressionCV(Cs=50, cv=5, penalty='l2', solver='lbfgs', max_iter=1000)## FILL IN: use LogisticRegressionCV function 
    clf.fit(X1, Y1)

    Y2hat = clf.predict(X2)
    myerr = np.mean( abs(Y2 - Y2hat) )

    Y2baseline = np.ones(Y2.shape) ## FILL IN: compute the baseline prediction. The baseline predicts all 1 if 
                 ## the average of Y1 is at least 0.5, otherwise all 0.
    if np.mean(Y1) < 0.5:
        Y2baseline = np.zeros(Y2.shape)

    baseline_err = np.mean( abs(Y2 - Y2baseline) )

    err1[it] = myerr
    err2[it] = baseline_err
    
print("Ridge error: %.4f +/- %.4f    Baseline error: %.4f +/- %.4f" % (np.mean(err1), 2*np.std(err1), np.mean(err2), 2*np.std(err2)))


Ridge error: 0.4695 +/- 0.0821    Baseline error: 0.4786 +/- 0.0763


Q.Problem 3 Part (c) Choose the statement below which you agree with the most. Justify your answer in a sentence or two.
1. The features are strongly predictive of Shaquille O'Neal's free throws.
2. The features are weakly predictive of Shaquille O'Neal's free throws.
3. There is no evidence that the features are at all predictive of Shaquille O'Neal's free throws.

In [13]:
import statsmodels.api as sm

# convert boolean values to integers (0 or 1)
X1 = X1.astype(int)

# calculate the mean across columns
mean = np.nanmean(X1, axis=0)

# replace non-finite values with the mean
X1 = np.nan_to_num(X1, nan=mean)

model = sm.OLS(np.asarray(Y1), X1).fit()
res = model.resid
print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.525
Model:                            OLS   Adj. R-squared (uncentered):              0.522
Method:                 Least Squares   F-statistic:                              183.0
Date:                Sun, 12 Mar 2023   Prob (F-statistic):                   1.12e-233
Time:                        22:33:37   Log-Likelihood:                         -1083.6
No. Observations:                1500   AIC:                                      2185.
Df Residuals:                    1491   BIC:                                      2233.
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Ans. The features are weakly predictive of Shaquille O'Neal's free throws.

Reason: baseline OLS model has 5/9 features higher p value and also the adjusted R squared is very low, which indicates that the features are weakly predictive of Shaquille O'Neal's free throws.



Q.Problem 3 Part (d) (free response)
Give a two or three sentences answer to each of the following questions.

1. Each row of the original dataset also contains the final score of the game in which the free throw was made. Why is it that we cannot use the final score of the game as a feature in making the prediction

Ans. Final score of the game has no direct relation with the free throw as free throws come into picture when penalty shots are given to the team that a foul was committed against. Success of a free throw can be predicted using the free throw attempts, time remaining, 3 pointer records or shooting position of a player but total points is not very indicative of the same.

2. Why is it that, ideally, when holding out samples for the test set, we should choose a set of games and hold out ALL samples from those games?


Ans. To ensure accurate evaluation of the model, it is best to select a set of games and exclude all of their samples when creating the test set. If we randomly choose test samples from the same games as the training samples, the model may learn information that is specific to the test set and lead to overly optimistic results. It also makes sure the the model is tested on completely new data that it has not been trained on, which gives us a better picture of the models performance and create a more generalized model which can be utilized across all the variations.