# NBA Player Stats - Pre-processing and Training Data

Pre-process and prepare NBA Player stats data for model fitting. 

## Imports

In [1]:
# Import relevant libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline

# Suppress future warning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Render matplotlib figures inline, directly beneath the cell that produces them
%matplotlib inline

## Loading Data

In [2]:
# Read the cleaned stats file. The first CSV column only holds integer row
# labels (not a player statistic), so it becomes the DataFrame index instead
# of a data column.
cleaned_stats_path = 'NBA-Clustering/data/interim/nba_stats_cleaned.csv'
nba_df = pd.read_csv(cleaned_stats_path, index_col=0)

## Explore Data

In [3]:
# Preview the first 10 rows to sanity-check the loaded columns and values
nba_df.head(10)

Unnamed: 0,Player,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,73,23.6,3.6,8.3,0.439,0.8,2.1,0.359,2.9,...,0.595,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
1,Steven Adams,76,26.3,2.8,5.1,0.547,0.0,0.0,0.0,2.8,...,0.543,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9
2,Bam Adebayo,56,32.6,7.3,13.0,0.557,0.0,0.1,0.0,7.3,...,0.753,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1
3,Santi Aldama,32,11.3,1.7,4.1,0.402,0.2,1.5,0.125,1.5,...,0.625,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1
4,LaMarcus Aldridge,47,22.3,5.4,9.7,0.55,0.3,1.0,0.304,5.1,...,0.873,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9
5,Nickeil Alexander-Walker,65,22.6,3.9,10.5,0.372,1.6,5.2,0.311,2.3,...,0.743,0.6,2.3,2.9,2.4,0.7,0.4,1.4,1.6,10.6
8,Grayson Allen,66,27.3,3.9,8.6,0.448,2.4,5.9,0.409,1.5,...,0.865,0.5,2.9,3.4,1.5,0.7,0.3,0.7,1.5,11.1
9,Jarrett Allen,56,32.3,6.6,9.7,0.677,0.0,0.2,0.1,6.6,...,0.708,3.4,7.3,10.8,1.6,0.8,1.3,1.7,1.7,16.1
10,Jose Alvarado,54,15.4,2.4,5.4,0.446,0.6,2.0,0.291,1.8,...,0.679,0.5,1.4,1.9,2.8,1.3,0.1,0.7,1.4,6.1
11,Justin Anderson,16,19.8,2.3,5.9,0.379,0.9,3.7,0.254,1.3,...,0.789,0.3,2.6,2.9,2.1,0.5,0.4,0.5,1.4,6.4


In [4]:
# Check the DataFrame dimensions as (rows, columns)
nba_df.shape

(500, 25)

In [5]:
# List each column's dtype to spot any non-numeric (categorical) columns
nba_df.dtypes

Player     object
G           int64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

In [6]:
# Summarize the index, per-column non-null counts, and dtypes in one view
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 811
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  500 non-null    object 
 1   G       500 non-null    int64  
 2   MP      500 non-null    float64
 3   FG      500 non-null    float64
 4   FGA     500 non-null    float64
 5   FG%     500 non-null    float64
 6   3P      500 non-null    float64
 7   3PA     500 non-null    float64
 8   3P%     500 non-null    float64
 9   2P      500 non-null    float64
 10  2PA     500 non-null    float64
 11  2P%     500 non-null    float64
 12  eFG%    500 non-null    float64
 13  FT      500 non-null    float64
 14  FTA     500 non-null    float64
 15  FT%     500 non-null    float64
 16  ORB     500 non-null    float64
 17  DRB     500 non-null    float64
 18  TRB     500 non-null    float64
 19  AST     500 non-null    float64
 20  STL     500 non-null    float64
 21  BLK     500 non-null    float64
 22  TO

## Create Dummy/Indicator Features for any Categorical Variables

In [7]:
# List the nba_df column names so the feature columns can be chosen from them
nba_df.columns

Index(['Player', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P',
       '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [8]:
# Feature selection: use every column of nba_df except the 'Player' name column
features = [
    'G', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK',
    'TOV', 'PF', 'PTS',
]

In [9]:
# Separate the data: y keeps the player names (the only non-feature column),
# and X is the feature matrix built from the selected stat columns
y = nba_df['Player']
X = nba_df[features]

In [10]:
# One-hot encode any categorical (object/category dtype) columns in X.
# Every selected feature is already int64/float64 per the dtypes listing,
# so this is expected to leave X unchanged and acts only as a safeguard.
X = pd.get_dummies(X)

## Split Data into Training and Testing Subsets

In [11]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs
split_parts = train_test_split(X, y, test_size=0.30, random_state=24)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Confirm the shapes of the training and test feature matrices
X_train.shape, X_test.shape

((350, 24), (150, 24))

In [13]:
# Confirm the shapes of the training and test target (player name) series
y_train.shape, y_test.shape

((350,), (150,))

## Standardize the Magnitude of Numeric Features

In [14]:
# Create the scaler; StandardScaler rescales each feature to zero mean and unit variance
scaler = StandardScaler()

In [15]:
# Fit the scaler on the training data only, so test-set statistics do not
# leak into the learned per-feature mean and standard deviation
scaler.fit(X_train)

StandardScaler()

In [16]:
# Apply the training-fitted scaler to both the training and test features.
# The column names are re-attached in the following cell.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Wrap the scaled arrays back into DataFrames with their column names.
# The labels are taken from X_train.columns rather than the hand-written
# `features` list because X passed through pd.get_dummies: had any column been
# one-hot encoded, `features` would no longer match the array width and the
# DataFrame constructor would raise.
# NOTE: the original row index is not restored; rows get a fresh 0..n-1 index
# in the same order as X_train / X_test.
scaled_columns = X_train.columns
X_train_scaled = pd.DataFrame(X_train_scaled, columns=scaled_columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=scaled_columns)

In [19]:
# Preview the first rows of the scaled training features
X_train_scaled.head()

Unnamed: 0,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1.124903,0.168298,0.382729,0.543209,-0.283582,1.63622,1.452162,0.782967,-0.294463,-0.246067,...,0.661008,0.382602,1.592505,1.305994,0.027946,-0.68129,-0.570139,0.181687,-0.519574,0.622014
1,-0.676543,-1.262021,-1.125776,-1.106271,-0.928771,-0.402986,-0.526495,0.855719,-1.245316,-1.211771,...,-0.293823,-0.428978,-0.629561,-0.614417,-0.890705,-0.936046,-0.570139,-1.031871,-1.791258,-1.062625
2,-1.477186,-0.626324,-0.737874,-0.57019,-1.093001,-0.742854,-0.569509,-0.572038,-0.558589,-0.421649,...,0.467274,0.653128,-0.681237,-0.374366,-0.728591,-0.171778,-0.303006,-0.910515,-0.236977,-0.753517
3,-1.377105,-1.602572,-1.255076,-1.188746,-1.503576,-0.856143,-0.827595,-0.681166,-1.08684,-1.12398,...,0.05213,-1.105294,-1.353024,-1.37458,-0.458399,-1.445557,-1.104406,-1.031871,-1.791258,-1.232635
4,1.074863,1.371581,1.718833,1.677227,0.220839,0.390039,0.634891,0.073635,1.924195,1.948717,...,0.121321,1.329444,3.142784,2.826319,0.568328,0.847245,1.566929,0.909822,1.034707,1.24023


In [20]:
# Write the scaled training and test features to CSV files in the current
# working directory (the DataFrame index is written as the first column)
scaled_outputs = (
    ("nba_train_scaled.csv", X_train_scaled),
    ("nba_test_scaled.csv", X_test_scaled),
)
for csv_name, scaled_frame in scaled_outputs:
    scaled_frame.to_csv(csv_name)

## Summary

The NBA stats data frame did not appear to contain any categorical variables, but `pd.get_dummies` was still applied to the feature matrix in case any categorical features had been overlooked. The data was then split into training and test subsets with a 70/30 split. Finally, to standardize the magnitude of the numeric features, a standard scaler was fit on the training data only and then applied to both the training and test sets, and the scaled sets were saved to CSV files.