# Redefine NBA Player Positions
This Notebook explores possibility of classifying players into n types, instead of the conventional 5 positions

Data can be downloaded at https://www.basketball-reference.com

In [29]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [33]:
# load data
# 2017-18
playerPerGame18 = pd.read_csv('2017_18_PlayerPerGame.csv')
playerAdvanced18 = pd.read_csv('2017_18_PlayerAdvanced.csv')
playerTotal18 = pd.read_csv('2017_18_PlayerTotals.csv')
player36Mins18 = pd.read_csv('2017_18_PlayerPer36Minutes.csv')
player100Poss18 = pd.read_csv('2017_18_PlayerPer100Poss.csv')

# 2016-17
# playerPerGame17 = pd.read_csv('2016_17_PlayerPerGame.csv')
# playerAdvanced17 = pd.read_csv('2016_17_PlayerAdvanced.csv')
# playerTotal17 = pd.read_csv('2016_17_PlayerTotals.csv')
# player36Mins17 = pd.read_csv('2016_17_PlayerPer36Minutes.csv')
# player100Poss17 = pd.read_csv('2016_17_PlayerPer100Poss.csv')

# # 2015-16
# playerPerGame16 = pd.read_csv('2015_16_PlayerPerGame.csv')
# playerAdvanced16 = pd.read_csv('2015_16_PlayerAdvanced.csv')
# playerTotal16 = pd.read_csv('2015_16_PlayerTotals.csv')
# player36Mins16 = pd.read_csv('2015_16_PlayerPer36Minutes.csv')
# player100Poss16 = pd.read_csv('2015_16_PlayerPer100Poss.csv')

### Data Preparation
Before the actual analysis, we need to clean and prepare the data.  
There are 2 steps here:  
1. Remove duplicates
2. Remove players who played a total of less than 25 games in a season  
  
I am going to use data from the per game and advanced table

In [34]:
# a nice thing about these data from basektball-reference.com is that
# for players who played for more than one team, it already calculated 
# players overall (aggregated) statistics and placed at the first row of that player

playerPerGame18 = playerPerGame18.drop_duplicates(['Player'], keep="first")
playerAdvanced18 = playerAdvanced18.drop_duplicates(['Player'], keep="first")

In [35]:
# filter players who played at least 25 games and played at least 15 minutes a game
minimum_games = 25
minimum_minutes = 15
PlayerPerGame = playerPerGame18[playerPerGame18['G'] >= minimum_games]
PlayerPerGame = playerPerGame18[playerPerGame18['MP'] >= minimum_minutes]

In [36]:
PlayerPerGame.fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


### Principal Component Analysis

Principal Component Analysis is a dimension reduction method. 

In [38]:
# Here we see that we have 30 columns in this table, and most likely that not all of these columns will be
# important to our analysis
len(PlayerPerGame.columns)

30

In [30]:
# before PCA
PlayerPerGame.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
0,1,Alex Abrines\abrinal01,SG,24,OKC,75,8,15.1,1.5,3.9,...,0.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7
1,2,Quincy Acy\acyqu01,PF,27,BRK,70,8,19.4,1.9,5.2,...,0.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9
2,3,Steven Adams\adamsst01,C,24,OKC,76,76,32.7,5.9,9.4,...,0.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9
3,4,Bam Adebayo\adebaba01,C,20,MIA,69,19,19.8,2.5,4.9,...,0.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9
4,5,Arron Afflalo\afflaar01,SG,32,ORL,53,3,12.9,1.2,3.1,...,0.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4


In [None]:
# There is one more step before actually performing PCA, and that is to standardize the data. 
# This is a requirement for PCA.

In [None]:
# after PCA
PlayerPerGame.head()