In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Use the fully-qualified option key: the short form 'max_columns' matches more
# than one pandas option in current releases and raises OptionError.
pd.set_option('display.max_columns', 100)

### Let's grab data from [basketball reference](https://www.basketball-reference.com/leagues/NBA_2020_per_game.html)

In [4]:
# Year that appears in the page URL; change it to scrape a different season.
SEASON = 2020
url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'.format(SEASON)

### How can we specify just the table using beautiful soup? 

In [5]:
# Fetch the page. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid the bs4 warning).
soup = BeautifulSoup(res.content, 'html.parser')

In [6]:
# first <thead> / <tbody> elements found in the parsed page
thead = soup.find('thead')
table = soup.find('tbody')

## column names from the header cells, chopping off the leading Rk column
headers = [th.text for th in thead.find_all('th')][1:]

In [8]:
## scraper: one {column name: cell text} dict per 'full_table' row
dcts = [
    {header: cell.text for header, cell in zip(headers, row.find_all('td'))}
    for row in table.find_all('tr', {'class': 'full_table'})
]

### Set up the dataframe

In [10]:
# build the frame from the scraped records and put the columns in header order
df = pd.DataFrame(dcts).loc[:, headers]

In [33]:
df['Pos'].value_counts()  # number of rows per listed position

SG    135
PF     99
C      90
PG     89
SF     87
Name: Pos, dtype: int64

In [32]:
## keep only the rows whose position is one of the 5 most frequent labels
pos_counts = df['Pos'].value_counts()
positions = pos_counts.index[:5]

df = df[df['Pos'].isin(positions)]

In [30]:
# keep rows whose position label is shorter than 3 characters
# (presumably this drops hyphenated multi-position labels - verify against the data)
filter_ = df['Pos'].str.len() < 3
df = df[filter_]

### Set up our Xs and ys

In [40]:
### target is the position label; features are every column from 'MP' onward
### (i.e. everything past the identifying columns, starting at Minutes Played)
y = df['Pos']
X = df.loc[:, 'MP':]

In [49]:
## data cleaning for X: empty-string cells become 0, then every column is cast to float
X = X.replace({'': 0}).astype('float64')

### Train test split 

In [50]:
# Split into train/test sets; the fixed random_state makes the split reproducible.
# (The original cell was missing the comma after `y`, which is a SyntaxError.)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=78701)

### Scale data

In [52]:
# fit the scaler on the training split only, then apply it to both splits
ss = StandardScaler().fit(X_train)

Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

### Train KNN

In [55]:
from sklearn.metrics import accuracy_score

## instantiate the knn with sklearn's default settings
knn = KNeighborsClassifier()

# fit the knn on the scaled training features
knn.fit(Z_train, y_train)

## predict a position for every row of the scaled test set
## (the accuracy is computed below; the recorded run shows 0.424, not 54%)
y_preds = knn.predict(Z_test)

### manual accuracy: share of predictions that equal the true label
print((y_preds == y_test).sum() / len(y_test))

### sklearn accuracy: the same quantity via the library helper
accuracy_score(y_test, y_preds)

0.424

In [62]:
# share of each position in the test labels (normalize=True returns proportions)
y_test.value_counts(normalize=True)

SG    0.320
PF    0.224
C     0.168
SF    0.168
PG    0.120
Name: Pos, dtype: float64

### Create confusion matrix 

In [14]:
## look at the potential classes - create y preds 


In [63]:
## make a confusion matrix
# Pass labels explicitly so the row/column order of the matrix is guaranteed to
# match the class names used to label the DataFrame below, even if some class
# never appears in y_test or y_preds.
conmat = confusion_matrix(y_test, y_preds, labels=knn.classes_)
# put confusion matrix in a dataframe (rows = true class, columns = predicted class)
pd.DataFrame(conmat,
             columns=['Pred: ' + cls for cls in knn.classes_],
             index=['True: ' + cls for cls in knn.classes_])

Unnamed: 0,Pred: C,Pred: PF,Pred: PG,Pred: SF,Pred: SG
True: C,15,6,0,0,0
True: PF,6,7,0,7,8
True: PG,0,1,11,1,2
True: SF,5,4,3,6,3
True: SG,2,4,12,8,14


### Use `.kneighbors` to grab the matrix of nearest points - can we make this into a function?

In [None]:
## create a scaled DF of all players 

In [67]:
# scale the full feature matrix (all rows, not just the training split)
sc = StandardScaler().fit(X)

X_sc = sc.transform(X)

# wrap the scaled array back into a frame with the original column names
X_sc_df = pd.DataFrame(data=X_sc, columns=X.columns)

In [74]:
# label each scaled row with the player's name so rows can be looked up by name
X_sc_df.index = df['Player'].to_numpy()

In [78]:
# second classifier, fitted on the scaled frame of every row in X (indexed by
# player name) rather than on the training split only
knn_final = KNeighborsClassifier()

knn_final.fit(X_sc_df, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [99]:
def player_comparer(name, n_neighbors=10):
    """Return the scaled stat rows of the players nearest to ``name``.

    Looks ``name`` up in the index of the module-level ``X_sc_df`` and asks the
    module-level ``knn_final`` for its nearest neighbors among the rows the
    model was fitted on.

    Parameters
    ----------
    name : str
        Player name exactly as it appears in ``X_sc_df.index``.
    n_neighbors : int, default 10
        Number of rows to return. The queried player is itself one of the
        fitted rows, so it normally appears in the result.

    Returns
    -------
    pandas.DataFrame
        The ``n_neighbors`` rows of ``X_sc_df`` selected by the neighbor
        indices, in the order returned by ``kneighbors``.

    Raises
    ------
    ValueError
        If ``name`` is not present in ``X_sc_df.index``.
    """
    one_player = X_sc_df[X_sc_df.index == name]
    if one_player.empty:
        # Fail with a clear message instead of passing an empty frame to sklearn.
        raise ValueError('Player {!r} not found in X_sc_df index'.format(name))

    # kneighbors returns (distances, indices) with one row per query row;
    # if several rows share the name, only the first one's neighbors are used.
    _, neighbor_idx = knn_final.kneighbors(one_player, n_neighbors=n_neighbors)
    return X_sc_df.iloc[neighbor_idx[0]]

player_comparer('LeBron James')

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
LeBron James,1.607954,2.747015,2.565771,0.415486,1.343358,1.561916,0.387947,2.72964,2.600538,0.398312,0.359561,1.639193,2.061448,0.010369,0.215628,2.062647,1.615511,4.896149,1.660248,-0.001899,3.386259,-0.007651,2.506074
Kyrie Irving,1.401186,2.91819,2.810347,0.326882,2.145978,1.959868,0.711919,2.623594,2.686734,0.167438,0.375232,2.33797,1.890949,0.962233,0.343673,0.675216,0.61108,2.475092,1.899064,0.235418,1.744493,1.151652,2.873726
Kawhi Leonard,1.318479,2.704222,2.70844,0.230224,1.343358,1.340832,0.612744,2.676617,2.974055,0.011259,0.16367,3.316259,2.857111,0.840718,0.215628,1.857102,1.460984,1.869827,2.854328,0.710051,2.213569,0.224209,2.82777
Luka Dončić,1.390848,2.704222,2.728822,0.222169,2.260638,2.844205,0.24249,2.305454,1.997165,0.554492,0.304711,3.945159,4.164271,0.326307,0.727808,2.730669,2.272255,3.740645,0.9438,-0.476532,3.620797,0.919791,3.103509
Nikola Jokić,1.359832,2.062316,1.709753,0.649077,0.311417,0.500712,0.28216,2.4115,2.112093,0.554492,0.469259,1.429559,1.379452,0.508579,2.00826,2.57651,2.542678,2.750212,1.421432,0.710051,2.213569,1.615373,1.832044
Russell Westbrook,1.721677,3.132159,3.197593,0.20606,0.082097,0.589145,-0.319503,3.896153,4.123337,0.099534,-0.110576,2.687359,2.743445,0.407317,0.983899,1.857102,1.692775,2.915284,2.376696,-0.001899,3.855335,2.195025,2.843088
De'Aaron Fox,1.256448,1.677172,1.66899,0.246334,0.082097,0.235411,0.116868,2.040337,2.198289,0.099534,0.014794,2.198215,2.743445,-0.013934,-0.168507,0.366898,0.224761,2.805236,2.13788,0.472735,2.565376,1.383513,1.694174
Bradley Beal,1.669985,2.91819,3.054924,0.133566,1.801998,2.313602,0.235878,2.72964,2.801662,0.208181,0.108821,3.665648,3.482274,0.642245,0.215628,0.315512,0.302025,2.365043,0.9438,-0.001899,2.448107,0.572,3.134147
Jrue Holiday,1.576939,1.891141,2.096999,0.028853,1.228698,1.385049,0.40117,1.775221,2.054629,-0.063436,0.006958,0.521148,0.640622,0.144035,0.471718,0.315512,0.417921,2.475092,2.615512,1.184685,2.0963,0.45607,1.648217
Brandon Ingram,1.525247,2.276284,2.239669,0.286608,1.801998,1.606133,0.751589,1.987314,2.112093,0.106324,0.351725,2.757237,2.459279,0.719204,0.087583,1.394625,1.074664,1.264563,0.9438,0.710051,2.213569,1.267582,2.490755


In [22]:
## getting this to work for a player  
