## <font color=purple> **NBL DATA FROM 2016/2017**
##### **Eric Nesi**
##### All Code in Python 3

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegressionCV, LogisticRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV, cross_val_score


plt.style.use('fivethirtyeight')

from ipywidgets import *
from IPython.display import display

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [25]:
#Set my X_features and append to dataframe
X_features = kings[['Court_Loc','Quarter']]
X_features = pd.concat([pd.get_dummies(X_features[col]) for col in X_features], axis=1)
X_features['distance_m'] = kings['distance_m']

In [26]:
X_features.head()

Unnamed: 0,In_the_Paint,Left_Corner,Left_Corner_3pt,Left_Wing,Left_Wing_3pt,Restricted_Area,Right_Corner,Right_Corner_3pt,Right_Wing,Right_Wing_3pt,Top_of_Key,Top_of_Key_3pt,1,2,3,4,distance_m
882,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,10.240632
302,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,9.994555
290,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,9.616913
1047,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,9.370499
1087,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,9.09037


#### **Random Forest:**

In [27]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

Xs = ss.fit_transform(X_features)

Y = kings['Result']

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Xtrain, Xtest, ytrain, ytest = train_test_split(Xs, Y, test_size=0.33)

dtc = DecisionTreeClassifier(max_depth=None, max_features='auto')
dtc.fit(Xtrain, ytrain)
print ('dtc acc:', dtc.score(Xtest, ytest))

rf = RandomForestClassifier(n_estimators=1000, max_depth=None, max_features='auto')
rf.fit(Xtrain, ytrain)
yhat = rf.predict(Xtest)
print ('rf acc:', accuracy_score(ytest, yhat))


dtc acc: 0.542517006803
rf acc: 0.544217687075


At this point, I realized predicting whether or not a shot is going to be made is potentially not a great option, but while I am doing it, I decided to try logistic regression to see if I could improve accuracy.  Not a great prediction score by RF/DTC.

#### **Logistic Regression:**

In [29]:
# Set parameters for gridsearching
# Taken from 6.3.3. and 6.3.4

gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params, cv=5, verbose=1)
print (lr_gridsearch)


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'solver': ['liblinear'], 'C': array([  1.00000e-05,   1.12332e-05, ...,   8.90215e-01,   1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)


In [30]:
# TTS our data.
# We will have a holdout set to test on at the end.
X_train, X_test, y_train, y_test = train_test_split(Xs, Y, test_size=0.3, random_state=12)

In [31]:
lr_gridsearch.fit(X_train, y_train)

print (lr_gridsearch.best_score_)
print (lr_gridsearch.best_params_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
0.626003210273
{'C': 0.0095454845666183372, 'penalty': 'l1', 'solver': 'liblinear'}


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    4.6s finished


In [34]:
bestfit = {'C': 0.0095454845666183372, 'penalty':'l1', 'solver': 'liblinear'}

logreg = LogisticRegression(C=0.0095454845666183372, penalty='l1', solver='liblinear')

logreg.fit(X_train, y_train)

LogisticRegression(C=0.009545484566618337, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [35]:
Y_pred = logreg.predict(X_test)

# stores confusion matrix for Y Test and Y Pred  
conmat = confusion_matrix(y_test, Y_pred, labels=logreg.classes_)
# converts np.matrix format matrix to a dataframe and adds index and column names
conmat = pd.DataFrame(conmat, columns=logreg.classes_, index=logreg.classes_)

print ('best params for Result:')
print (conmat)

best params for Result:
     0   1
0  248  32
1  173  82


In [36]:
print(classification_report(y_test, Y_pred))

             precision    recall  f1-score   support

          0       0.59      0.89      0.71       280
          1       0.72      0.32      0.44       255

avg / total       0.65      0.62      0.58       535



In [37]:
best_lr = lr_gridsearch.best_estimator_

print (best_lr.score(Xs, Y))


0.623245367771


In [38]:
coef_df = pd.DataFrame({
        'coef':best_lr.coef_[0],
        'feature':X_features.columns
    })


coef_df['abs_coef'] = np.abs(coef_df.coef)

# sort by absolute value of coefficient (magnitude)
coef_df.sort_values('abs_coef', ascending=False, inplace=True)

coef_df.head(12)

Unnamed: 0,coef,feature,abs_coef
5,0.15422,Restricted_Area,0.15422
0,0.0,In_the_Paint,0.0
9,0.0,Right_Wing_3pt,0.0
15,0.0,4,0.0
14,0.0,3,0.0
13,0.0,2,0.0
12,0.0,1,0.0
11,0.0,Top_of_Key_3pt,0.0
10,0.0,Top_of_Key,0.0
8,0.0,Right_Wing,0.0


#### Discussion:
This is not surprising that the closer to the basket you are the more likely you are to make the shot. That said, the prediction score isn't great.  Furthermore, it is really not what I am trying to accomplish with the project.  I will explain more in Summary.

In [43]:
print(kings.shape)
print(kings.Court_Loc.unique())

(1781, 19)
['Left_Wing_3pt' 'Right_Wing_3pt' 'Top_of_Key_3pt' 'Left_Corner_3pt'
 'Right_Corner_3pt' 'Right_Wing' 'Top_of_Key' 'Left_Wing' 'Right_Corner'
 'Left_Corner' 'In_the_Paint' 'Restricted_Area']


In [45]:
print(np.sum(kings.Court_Loc =="Restricted_Area")/1781)
print(np.sum(kings.Court_Loc =='Left_Wing_3pt')/1781)
print(np.sum(kings.Court_Loc =='Right_Wing_3pt')/1781)
print(np.sum(kings.Court_Loc =='Top_of_Key_3pt')/1781)
print(np.sum(kings.Court_Loc =='Left_Corner_3pt')/1781)
print(np.sum(kings.Court_Loc =='Right_Corner_3pt')/1781)
print(np.sum(kings.Court_Loc =='Top_of_Key')/1781)
print(np.sum(kings.Court_Loc =='Left_Wing')/1781)
print(np.sum(kings.Court_Loc =='Right_Corner')/1781)
print(np.sum(kings.Court_Loc =='Left_Corner')/1781)
print(np.sum(kings.Court_Loc =='In_the_Paint')/1781)

0.212801796743
0.111173498035
0.115665356541
0.0606400898372
0.0151600224593
0.0157215047726
0.0224592925323
0.0359348680517
0.0230207748456
0.0252667040988
0.329028635598
