# Physical Dimensions

This notebook attempts to build an ML model for predicting the position of a player given their physical dimensions.

In [290]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pickle

In [291]:
player_attributes = pd.read_csv(os.path.join("data", "Players.csv"))
player_pos = pd.read_csv(os.path.join("data", "player_data.csv"))

In [292]:
player_attributes.head()

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [293]:
player_pos.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [294]:
print("Number of observations in player attributes:", len(player_attributes))
print("Number of observations in player positions:", len(player_pos))

Number of observations in player attributes: 3922
Number of observations in player positions: 4550


In [295]:
# Join player_attributes with player_data to obtain position (response)
player_data = pd.merge(player_attributes, player_pos, how = "inner", left_on = "Player", right_on = "name")

print("Number of observations in player data:", len(player_data))

Number of observations in player data: 3814


In [296]:
# Lets explore the players that didn't have a position
full_player_data = pd.merge(player_attributes, player_pos, how = "left", left_on = "Player", right_on = "name")

full_player_data[full_player_data['position'].isna()].tail()

Unnamed: 0.1,Unnamed: 0,Player,height_x,weight_x,collage,born,birth_city,birth_state,name,year_start,year_end,position,height_y,weight_y,birth_date,college
3351,3307,Luc Mbah,201.0,99.0,,1984.0,,,,,,,,,,
3591,3544,Nando De,206.0,97.0,,1968.0,,,,,,,,,,
3775,3727,James Michael,198.0,90.0,,1992.0,,,,,,,,,,
3870,3822,Walter Tavares,221.0,117.0,,1992.0,Maio,Cape Verde,,,,,,,,
3932,3884,Sheldon McClellan,196.0,90.0,University of Miami,1992.0,Houston,Texas,,,,,,,,


In [297]:
# Convert player position into string
player_data['position'] = player_data['position'].astype(str)

In [298]:
# Lets consider only the relevant columns
player_data = player_data[['Player', 'height_x', 'weight_x', 'year_start', 'year_end', 'position']]

# Lets rename columns
player_data.rename(columns = {"Player": "player", "height_x": "height", "weight_x": "weight"}, inplace=True)

In [299]:
# Check number of observatons before
print("The number of observations before removing NA values:", len(player_data))

# Lets remove NA values in relevant columns
player_data.dropna(subset = ['height', 'weight', 'position'], inplace = True)

# Check number of observatons after
print("The number of observations after removing NA values:", len(player_data))

The number of observations before removing NA values: 3814
The number of observations after removing NA values: 3814


Observing some of the positional data, it appears the first position prior to the "-" references the players primary position. We will use this. 

Example: By domain knowledge, I know that Karl-Anthony Towns, Myles Turner have played Center and Jonathon Simmons has played Guard for a majority of their careers

In [300]:
player_data[(player_data['year_start'] == 2016) & (player_data['position'].str.contains('-'))].tail()

Unnamed: 0,player,height,weight,year_start,year_end,position
3713,Jonathon Simmons,185.0,83.0,2016,2018,G-F
3715,Axel Toupane,201.0,89.0,2016,2017,G-F
3716,Karl-Anthony Towns,213.0,110.0,2016,2018,C-F
3717,Myles Turner,211.0,110.0,2016,2018,C-F
3720,Alan Williams,198.0,90.0,2016,2017,F-C


In [301]:
# We observe an nan value to filter out as it's not a real position
player_data['position'] = player_data['position'].astype(str)
player_data['position'].value_counts()

G      1322
F      1079
C       434
F-C     332
G-F     296
C-F     176
F-G     174
nan       1
Name: position, dtype: int64

In [302]:
# Filtering nan value out
player_data = player_data[player_data['position'] != "nan"]

# Checking it's filtered out
player_data['position'].value_counts()

G      1322
F      1079
C       434
F-C     332
G-F     296
C-F     176
F-G     174
Name: position, dtype: int64

In [303]:
# Extracting only primary position of each player
player_data['position'] = player_data['position'].apply(lambda row: row[0])

# Checking only primary positions
player_data['position'].value_counts()

G    1618
F    1585
C     610
Name: position, dtype: int64

In [304]:
player_data[player_data['year_start'] < 1960]

Unnamed: 0,player,height,weight,year_start,year_end,position
0,Curly Armstrong,180.0,77.0,1949,1951,G
1,Cliff Barker,188.0,83.0,1950,1952,G
2,Leo Barnhorst,193.0,86.0,1950,1954,F
3,Ed Bartels,196.0,88.0,1950,1951,F
4,Ralph Beard,178.0,79.0,1950,1951,G
...,...,...,...,...,...,...
441,Larry Staverman,201.0,92.0,1959,1964,F
442,Bennie Swain,203.0,99.0,1959,1959,F
515,Bob Duffy,193.0,79.0,1947,1947,F
613,Matt Guokas,190.0,88.0,1947,1947,F


In [305]:
player_data['BMI'] = player_data['weight'] / (player_data['height']/100)**2

player_data.head()

Unnamed: 0,player,height,weight,year_start,year_end,position,BMI
0,Curly Armstrong,180.0,77.0,1949,1951,G,23.765432
1,Cliff Barker,188.0,83.0,1950,1952,G,23.483477
2,Leo Barnhorst,193.0,86.0,1950,1954,F,23.087868
3,Ed Bartels,196.0,88.0,1950,1951,F,22.907122
4,Ralph Beard,178.0,79.0,1950,1951,G,24.93372


In [306]:
# Obtain predictors and response
player_predictors = player_data[['height', 'weight', 'BMI']]
player_response = player_data['position']

# Encode player response to be numeric
le = LabelEncoder()
enc_player_response = le.fit_transform(player_response).reshape(-1, 1)

ohe = OneHotEncoder(sparse=False, categories='auto', drop='first')
enc_player_response = ohe.fit_transform(enc_player_response)

# Obtain training/test split
X_train, X_test, y_train, y_test = train_test_split(player_predictors, enc_player_response, test_size=0.25)



In [307]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.750524109014675


In [308]:
y_pred = rf_clf.predict(X_test)
y_pred[0]

array([1., 0.])

In [309]:
# Export model
pickle.dump(rf_clf, open(os.path.join("models", "model.sav"), "wb"))

In [318]:
model = pickle.load(open(os.path.join("models", "model.sav"), 'rb'))

In [322]:
height = 2002
weight = 100

In [323]:
tmp_input = np.array([[height, weight, weight / (height/100)**2]])

tmp_input

array([[2.00200000e+03, 1.00000000e+02, 2.49500749e-01]])

In [325]:
model.predict(tmp_input)



array([[0., 0.]])

In [313]:
# [0, 1] looks like a Guard    - 1
# [0, 0] looks like a Center   - 2
# [1, 0] looks like a Forward  - 0
X_test.head(20)

Unnamed: 0,height,weight,BMI
1216,203.0,99.0,24.023878
2143,211.0,105.0,23.584376
112,198.0,83.0,21.171309
1755,203.0,108.0,26.207867
420,203.0,104.0,25.237205
601,188.0,81.0,22.91761
208,196.0,95.0,24.729279
2927,208.0,123.0,28.430104
3353,188.0,79.0,22.351743
795,206.0,99.0,23.329249


In [314]:
print(y_test[:20])

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 0.]
 [0. 1.]
 [1. 0.]]


In [315]:
int(np.nonzero(rf_clf.predict(tmp_input).flatten())[0])



TypeError: only size-1 arrays can be converted to Python scalars

In [None]:
probabilities = rf_clf.predict_proba(tmp_input)
print(probabilities)

[array([[0.53635827, 0.46364173]]), array([[0.84340864, 0.15659136]]), array([[0.62023308, 0.37976692]])]




In [None]:
X_test

Unnamed: 0,height,weight,BMI
95,201.0,96.0,23.761788
3289,211.0,111.0,24.932055
163,183.0,79.0,23.589835
811,188.0,72.0,20.371209
1078,206.0,97.0,22.857951
...,...,...,...
3787,211.0,111.0,24.932055
1104,208.0,97.0,22.420488
1537,193.0,83.0,22.282477
3038,201.0,102.0,25.246900


In [None]:
player_data.iloc[163:164, :]

Unnamed: 0,player,height,weight,year_start,year_end,position,BMI
163,Herm Schaefer,183.0,79.0,1949,1950,G,23.589835
