# Combining Triple Slash Batting Stats with PCA
This notebook uses principal component analysis (PCA) to collapse the three triple-slash batting statistics (AVG, OBP, SLG) into a single batting statistic, which is then compared with the advanced stat wOBA.

In [1]:
%matplotlib notebook
import pandas as pd
from pybaseball import batting_stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA as PCA
import numpy as np

## Pull Stats from Fangraphs

In [2]:
# Download the 2019 batting table (qual=1 lowers the plate-appearance qualifier
# to 1, so part-time players are included) and keep only the name,
# playing-time, triple-slash, and wOBA columns.
all_2019 = batting_stats('2019', qual=1).loc[
    :, ['Name', 'G', 'AB', 'PA', 'AVG', 'OBP', 'SLG', 'wOBA']
]
all_2019.describe()

Unnamed: 0,G,AB,PA,AVG,OBP,SLG,wOBA
count,990.0,990.0,990.0,990.0,990.0,990.0,990.0
mean,61.038384,168.334343,188.4,0.181799,0.237087,0.289111,0.226777
std,47.979252,192.57488,216.01781,0.135899,0.15969,0.209038,0.149439
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,22.0,7.0,8.0,0.0785,0.125,0.083,0.109
50%,43.0,64.0,70.0,0.215,0.282,0.3315,0.2715
75%,100.0,309.75,342.0,0.261,0.331,0.438,0.326
max,162.0,681.0,747.0,1.0,1.0,1.333,0.87


## Show Distribution of Stats

In [3]:
# Scatter-matrix of every rate stat (plus PA) against every other one;
# corner=True draws only the lower triangle, each panel 2 inches tall.
sns.pairplot(all_2019, vars=['AVG', 'OBP', 'SLG', 'wOBA', 'PA'], corner=True, height=2)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x1f4bf285220>

## Look Only at Players with 100+ PA

In [4]:
# Keep only batters with at least 100 plate appearances, which removes the
# tiny-sample rows from the full 2019 table.
has_100_pa = all_2019['PA'].ge(100)
over_100_PA_2019 = all_2019.loc[has_100_pa]
over_100_PA_2019.describe()

Unnamed: 0,G,AB,PA,AVG,OBP,SLG,wOBA
count,451.0,451.0,451.0,451.0,451.0,451.0,451.0
mean,103.898004,343.277162,384.274945,0.250299,0.321202,0.429253,0.317308
std,36.89105,156.398582,176.229408,0.036882,0.039665,0.085455,0.044183
min,27.0,86.0,101.0,0.124,0.178,0.144,0.15
25%,75.0,207.0,236.5,0.228,0.2985,0.374,0.291
50%,106.0,329.0,369.0,0.253,0.322,0.425,0.318
75%,136.0,481.5,532.0,0.275,0.346,0.488,0.346
max,162.0,681.0,747.0,0.344,0.438,0.671,0.442


In [5]:
# Same scatter-matrix as before, restricted to the 100+ PA subset.
sns.pairplot(over_100_PA_2019, vars=['AVG', 'OBP', 'SLG', 'wOBA', 'PA'], corner=True, height=2)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x1f4bde88b80>

## Do PCA
Fit PCA on the raw statistics without standardizing them first, so each variable keeps its original variance (and higher-variance statistics therefore carry more weight in the components).

In [6]:
# Known issue on the author's Windows machine: running all cells at once can
# raise `LinAlgError: SVD did not converge` here. Re-running this cell, or
# running the cells one at a time, works around it.
#
# `PCA` is the notebook's alias for sklearn's IncrementalPCA (see the imports).
# The model is fit on the raw, unstandardized AVG/OBP/SLG values.
arr = over_100_PA_2019[['AVG', 'OBP', 'SLG']].values
pca = PCA().fit(arr)
pca.explained_variance_ratio_

array([0.88441342, 0.08438219, 0.03120439])

In [7]:
# Project the triple-slash columns onto the fitted principal components.
PC = pca.transform(over_100_PA_2019[['AVG','OBP','SLG']].values)
PC = pd.DataFrame(PC, columns=['PC1','PC2','PC3'])

# PC has a fresh 0..n-1 index, while over_100_PA_2019 still carries the row
# labels of the unfiltered table. Assigning the Series directly would align on
# those labels and pair scores with the wrong players (or NaN). Rows of PC are
# in the same order as over_100_PA_2019, so attach wOBA by position instead.
PC['wOBA'] = over_100_PA_2019['wOBA'].to_numpy()

## Show PCA Distribution with wOBA

In [8]:
# Scatter-matrix of the principal-component scores against wOBA
# (lower triangle only, 2-inch panels).
sns.pairplot(PC, vars=['PC1', 'PC2', 'PC3', 'wOBA'], corner=True, height=2)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x1f4bead54f0>

## List PCA Components

In [9]:
# Loadings of the fitted model: one row per principal component (PC1, PC2, PC3),
# with columns in the order the features were passed to fit (AVG, OBP, SLG).
pca.components_

array([[ 0.31657687,  0.33681555,  0.88675497],
       [ 0.55149384,  0.6952472 , -0.4609619 ],
       [ 0.77177304, -0.63496978, -0.03434748]])