# Modeling

## Import Libraries

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## Read In Data

In [76]:
rockets_log_num = pd.read_csv('./data/rockets_log_num.csv')

In [77]:
rockets_log_num.head()

Unnamed: 0,HOME/AWAY,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,TRB,AST,PTS,GAME_SCORE,+/-
0,0,5,12,0.417,3,8,0.375,0,0,0.0,8,1,13,11.4,8
1,0,2,7,0.286,2,7,0.286,4,4,1.0,5,0,10,7.4,5
2,1,1,4,0.25,1,4,0.25,0,0,0.0,10,1,3,4.7,19
3,1,2,7,0.286,1,4,0.25,0,0,0.0,6,2,5,3.7,12
4,0,5,11,0.455,4,9,0.444,0,0,0.0,1,0,14,9.3,-1


In [78]:
sixers_log_num = pd.read_csv('./data/sixers_log_num.csv')

In [79]:
sixers_log_num.head()

Unnamed: 0,HOME/AWAY,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,TRB,AST,PTS,GAME_SCORE,+/-
0,1,18,33,0.545,3,8,0.375,0,0,0.0,7,6,39,28.6,-17
1,0,7,14,0.5,0,1,0.0,1,1,1.0,6,4,15,11.6,-24
2,1,7,18,0.389,1,6,0.167,1,3,0.333,3,8,16,9.9,-14
3,1,6,10,0.6,1,3,0.333,2,2,1.0,1,1,15,10.0,-2
4,0,5,13,0.385,0,4,0.0,2,2,1.0,3,0,12,6.2,2


## Baseline Model

In [4]:
rockets_log_num['3P'].value_counts(normalize=True)

0    0.362069
3    0.150862
1    0.146552
2    0.118534
4    0.103448
5    0.058190
6    0.047414
7    0.008621
8    0.004310
Name: 3P, dtype: float64

## Split Data

In [5]:
rockets_log_num.columns

Index(['HOME/AWAY', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
       'TRB', 'AST', 'PTS', 'GAME_SCORE', '+/-'],
      dtype='object')

In [80]:
features = ['HOME/AWAY', 'FG', 'FGA', 'FG%', '3PA', 'FT', 'FTA', 'FT%',
       'TRB', 'AST', 'PTS', 'GAME_SCORE', '+/-']
X = rockets_log_num[features]
y = rockets_log_num['3P']

sixers_test = sixers_log_num[features]

## Instantiate PolynomialFeatures

In [83]:
# Referred to lesson 4.02

# Instantiate our PolynomialFeatures object to create all two-way terms.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit and transform our X data.
#X_overfit = poly.fit_transform(X)

In [84]:
# Create X_poly
X_poly = poly.fit_transform(X)

In [85]:
poly.get_feature_names(features)

['HOME/AWAY',
 'FG',
 'FGA',
 'FG%',
 '3PA',
 'FT',
 'FTA',
 'FT%',
 'TRB',
 'AST',
 'PTS',
 'GAME_SCORE',
 '+/-',
 'HOME/AWAY^2',
 'HOME/AWAY FG',
 'HOME/AWAY FGA',
 'HOME/AWAY FG%',
 'HOME/AWAY 3PA',
 'HOME/AWAY FT',
 'HOME/AWAY FTA',
 'HOME/AWAY FT%',
 'HOME/AWAY TRB',
 'HOME/AWAY AST',
 'HOME/AWAY PTS',
 'HOME/AWAY GAME_SCORE',
 'HOME/AWAY +/-',
 'FG^2',
 'FG FGA',
 'FG FG%',
 'FG 3PA',
 'FG FT',
 'FG FTA',
 'FG FT%',
 'FG TRB',
 'FG AST',
 'FG PTS',
 'FG GAME_SCORE',
 'FG +/-',
 'FGA^2',
 'FGA FG%',
 'FGA 3PA',
 'FGA FT',
 'FGA FTA',
 'FGA FT%',
 'FGA TRB',
 'FGA AST',
 'FGA PTS',
 'FGA GAME_SCORE',
 'FGA +/-',
 'FG%^2',
 'FG% 3PA',
 'FG% FT',
 'FG% FTA',
 'FG% FT%',
 'FG% TRB',
 'FG% AST',
 'FG% PTS',
 'FG% GAME_SCORE',
 'FG% +/-',
 '3PA^2',
 '3PA FT',
 '3PA FTA',
 '3PA FT%',
 '3PA TRB',
 '3PA AST',
 '3PA PTS',
 '3PA GAME_SCORE',
 '3PA +/-',
 'FT^2',
 'FT FTA',
 'FT FT%',
 'FT TRB',
 'FT AST',
 'FT PTS',
 'FT GAME_SCORE',
 'FT +/-',
 'FTA^2',
 'FTA FT%',
 'FTA TRB',
 'FTA AST',
 

In [86]:
# View X_poly in a DataFrame
pd.DataFrame(X_poly, columns=poly.get_feature_names(features)).head()

Unnamed: 0,HOME/AWAY,FG,FGA,FG%,3PA,FT,FTA,FT%,TRB,AST,...,AST^2,AST PTS,AST GAME_SCORE,AST +/-,PTS^2,PTS GAME_SCORE,PTS +/-,GAME_SCORE^2,GAME_SCORE +/-,+/-^2
0,0.0,5.0,12.0,0.417,8.0,0.0,0.0,0.0,8.0,1.0,...,1.0,13.0,11.4,8.0,169.0,148.2,104.0,129.96,91.2,64.0
1,0.0,2.0,7.0,0.286,7.0,4.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,100.0,74.0,50.0,54.76,37.0,25.0
2,1.0,1.0,4.0,0.25,4.0,0.0,0.0,0.0,10.0,1.0,...,1.0,3.0,4.7,19.0,9.0,14.1,57.0,22.09,89.3,361.0
3,1.0,2.0,7.0,0.286,4.0,0.0,0.0,0.0,6.0,2.0,...,4.0,10.0,7.4,24.0,25.0,18.5,60.0,13.69,44.4,144.0
4,0.0,5.0,11.0,0.455,9.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,-0.0,196.0,130.2,-14.0,86.49,-9.3,1.0


## Train/Test Split Data

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.7, random_state=42, stratify=y)

## Scale Data

In [97]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

## Linear Regression

In [98]:
X_poly.shape

(464, 104)

In [99]:
print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (139, 104)
y_train shape is: (139,)
Z_test shape is: (325, 104)
y_test shape is: (325,)


In [100]:
lr = LinearRegression()

In [101]:
cross_val_score(lr, X_poly, y).mean()

0.8

In [102]:
lr.fit(Z_train, y_train)

LinearRegression()

In [103]:
# How does the model score on the training and test data?
print(f"Training R-Squared: {lr.score(Z_train, y_train)}")
print(f"Testing R-Squared: {lr.score(Z_test, y_test)}")

Training R-Squared: 1.0
Testing R-Squared: 1.0


## Ridge Regression

In [104]:
# Instantiate Ridge Regression
ridge = Ridge(alpha=1)

# Fit.
ridge.fit(Z_train, y_train)

Ridge(alpha=1)

In [105]:
cross_val_score(ridge, X_poly, y).mean()

0.7986219534819743

In [106]:
# Evaluate model using R2.
print(f"Ridge Training R-Squared: {ridge.score(Z_train, y_train)}")
print(f"Ridge Testing R-Squared: {ridge.score(Z_test, y_test)}")

Ridge Training R-Squared: 0.9853491701556611
Ridge Testing R-Squared: 0.9499582568057752


## RidgeCV

In [107]:
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.
r_alphas = np.logspace(0, 5, 100)

# Cross-validate over our list of ridge alphas
ridge_cv = RidgeCV(alphas=r_alphas, scoring="r2", cv=5)

# Fit model using best ridge alpha
ridge_cv.fit(Z_train, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5, scoring='r2')

In [108]:
# Here is the optimal value of alpha
ridge_cv.alpha_

1.0

In [109]:
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))

0.9853491701556611
0.9499582568057752


In [110]:
cross_val_score(ridge_cv, X_poly, y).mean()

0.7986219534819743

## LASSO Regression

In [111]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0 , 100)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=5000)

# Fit model using best ridge alpha
lasso_cv.fit(Z_train, y_train);

  model = cd_fast.enet_coordinate_descent(


In [112]:
# An array of values of alphas used in the model 
lasso_cv.alphas

array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.00811131, 0.00869749, 0.00932603, 0.01      , 0.01072267,
       0.01149757, 0.01232847, 0.01321941, 0.01417474, 0.01519911,
       0.01629751, 0.01747528, 0.01873817, 0.02009233, 0.02154435,
       0.0231013 , 0.02477076, 0.02656088, 0.02848036, 0.03053856,
       0.03274549, 0.03511192, 0.03764936, 0.04037017, 0.04328761,
       0.04641589, 0.04977024, 0.05336699, 0.05722368, 0.06135907,
       0.06579332, 0.07054802, 0.07564633, 0.08111308, 0.0869749 ,
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475

In [113]:
# Here is the optimal value of alpha
lasso_cv.alpha_

0.001

In [114]:
print(f"LASSO Training R-Squared: {lasso_cv.score(Z_train, y_train)} ")
print(f"LASSO Testing R-Squared: {lasso_cv.score(Z_test, y_test)} ")

LASSO Training R-Squared: 0.999307049505475 
LASSO Testing R-Squared: 0.9983531346368631 


In [116]:
cross_val_score(lasso_cv, X_poly, y).mean();

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

## Make Predictions

#### Lasso Regression model returned the best score

In [117]:
sixers_poly = poly.fit_transform(sixers_test)

sixers_Z_test = sc.fit_transform(sixers_poly)

In [118]:
# Predictions dataframe

pred = lasso_cv.predict(sixers_Z_test)
pred_df = pd.DataFrame(pred)
pred_df.rename(columns={0: '3-Pointers'}, inplace=True)
pred_df

Unnamed: 0,3-Pointers
0,6.672887
1,0.405364
2,1.519987
3,1.292730
4,0.138874
...,...
78,3.044065
79,0.355355
80,2.822822
81,-0.889974


In [119]:
pred_df.sum()

3-Pointers    161.223022
dtype: float64

In [121]:
rockets_log_num['3P'].head(83).sum()

194

In [122]:
pred_df.to_csv('./data/sixers_three_point_pred_df.csv', index=False)

## Conclusion

- The Lasso model turned out very accurate, with a score of over 99%
- Using data from the Sixers’ first 18 games, the model was able to predict about 161 three-pointers
- I'm hopeful that this data and model can be useful for purposes like training, fantasy basketball, etc.

## Next Steps

- Find a way to gather data that shows the number of field goals made from right under the basket
- Regularly update the testing data as the 2020-2021 NBA season continues
- Feature engineer more advanced stats(3-point attempt rate,  turnover percentage, etc.)
- Increase the size of the training data (combine two or three years of Houston Rockets game logs)
- Explore more models like ElasticNet