# Combining Triple Slash Batting Stats with PCA
This notebook uses principal component analysis (PCA) to reduce the three triple-slash statistics (AVG/OBP/SLG) to a single batting statistic, and then compares that statistic to the advanced stat wOBA with a linear regression.

In [1]:
%matplotlib notebook
import pandas as pd
from pybaseball import batting_stats
import seaborn as sns
from sklearn.decomposition import IncrementalPCA as PCA
from sklearn.linear_model import LinearRegression
# from sklearn.decomposition import PCA

In [2]:
def PCA_analysis(year, PA_threshold, league, figs=True):
    """Fit a PCA on AVG/OBP/SLG and regress wOBA on the scaled first component.

    Batting stats are downloaded with ``batting_stats``, reduced to the
    PA/AVG/OBP/SLG/wOBA columns and filtered to batters with at least
    ``PA_threshold`` plate appearances.  A PCA is fitted on the filtered
    triple-slash columns, the first principal component is rescaled so that
    the mean slash line of the filtered batters maps to 100, and a linear
    regression of wOBA on that index is fitted.  Summary tables and fit
    results are printed along the way.

    Input:
        year (str): year under consideration
        PA_threshold (float): minimum number of plate appearances to consider
        league (str): 'al', 'nl', or 'all' for AL only, NL only, or all MLB stats
        figs (bool): if True, display distribution figures

    Returns:
        all_stats (pd.DataFrame): unfiltered stats (PA, AVG, OBP, SLG, wOBA)
        filter_by_PA (pd.DataFrame): filtered stats (PA, AVG, OBP, SLG, wOBA)
        pca (sklearn.decomposition object): fitted PCA object
        LR (sklearn.linear_model object): fitted LinearRegression object

    Raises:
        ValueError: if no batter has at least ``PA_threshold`` plate appearances
    """
    slash_cols = ['AVG', 'OBP', 'SLG']
    stat_cols = ['PA'] + slash_cols + ['wOBA']

    # get all batting stats, then keep the baseball card stats + wOBA
    all_stats = batting_stats(year, qual=1, league=league)
    all_stats = all_stats[stat_cols]
    print(f'{year} All Stats:')
    print(all_stats.describe())

    # filter by PA
    filter_by_PA = all_stats[all_stats['PA'] >= PA_threshold]
    print(f'\n{year} PA >= {PA_threshold}')
    print(filter_by_PA.describe())
    if filter_by_PA.empty:
        raise ValueError(
            f'no batters with PA >= {PA_threshold} for year={year!r}, league={league!r}'
        )

    # Positional arrays: everything derived from the PCA below is row-aligned
    # with these, not with the DataFrame index.
    X = filter_by_PA[slash_cols].values
    y = filter_by_PA['wOBA'].values

    # PCA analysis (`PCA` is whatever estimator the notebook imported under that name)
    pca = PCA()
    pca.fit(X)
    # components_ has one principal axis per ROW, so the first component is
    # components_[0].  transform() subtracts mean_ before projecting, hence the
    # projection of the mean onto the first axis is the offset that turns a
    # centred score back into the raw projection X . components_[0].
    league_avg = pca.components_[0].dot(pca.mean_)
    print('\nPCA Analysis Results')
    print('====================')
    print(f'{year} Explained variance ratio: {pca.explained_variance_ratio_}')
    print(f'{year} Principal Components: \n {pca.components_}')
    print(f'{year} Mean AVG/OBP/SLG: {pca.mean_}')
    print(f'{year} PCA offset: {league_avg}')

    # Project once and reuse.  The index is scaled so the mean slash line
    # scores 100; the ratio is unaffected by the arbitrary sign of the axis.
    scores = pca.transform(X)
    pca1_index = (scores[:, 0] + league_avg) / league_avg * 100
    features = pca1_index.reshape(-1, 1)

    # fit linear regression model to PCA1 and wOBA
    LR = LinearRegression()
    LR.fit(features, y)
    print('\nOLS Result')
    print(f'coeff: {LR.coef_} intercept: {LR.intercept_}')
    print(f'score: {LR.score(features, y)}')

    if figs:
        stat_vars = slash_cols + ['wOBA', 'PA']

        # distribution of all stats
        g1 = sns.pairplot(all_stats, x_vars=stat_vars, y_vars=stat_vars,
                          corner=True, height=2)
        g1.fig.suptitle(f'{year} All Stats')

        # show distribution of stats filtered by PA
        g2 = sns.pairplot(filter_by_PA, x_vars=stat_vars, y_vars=stat_vars,
                          corner=True, height=2)
        g2.fig.suptitle(f'{year} PA >= {PA_threshold}')

        # show distribution of PCA transformed stats
        pc_cols = [f'PCA{i + 1}' for i in range(scores.shape[1])]
        PC = pd.DataFrame(scores, columns=pc_cols)
        # Assign raw values: PC has a fresh 0..n-1 index while filter_by_PA
        # keeps the original row labels, so assigning the Series itself would
        # align on index and pair wOBA with the wrong rows (or NaN).
        PC['wOBA'] = y
        # Same scaled index the regression was fitted on.
        PC['PCA1'] = pca1_index
        pc_vars = pc_cols + ['wOBA']

        g3 = sns.pairplot(PC, x_vars=pc_vars, y_vars=pc_vars,
                          corner=True, height=2)
        g3.fig.suptitle(f'{year} PCA Transformed')

    return all_stats, filter_by_PA, pca, LR

# 2020 Season Stats (MLB)

In [3]:
# 2020 season, all of MLB, batters with at least 25 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2020_mlb = PCA_analysis(year='2020', PA_threshold=25, league='all', figs=False)

2020 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  581.000000  581.000000  581.000000  581.000000  581.000000
mean   114.468158    0.222537    0.298253    0.369138    0.291864
std     79.894809    0.081991    0.094416    0.152548    0.093168
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%     41.000000    0.184000    0.253000    0.283000    0.252000
50%    108.000000    0.230000    0.308000    0.385000    0.304000
75%    184.000000    0.270000    0.352000    0.462000    0.348000
max    267.000000    0.667000    1.000000    1.333000    0.728000

2020 PA >= 25
               PA         AVG         OBP         SLG        wOBA
count  475.000000  475.000000  475.000000  475.000000  475.000000
mean   137.532632    0.234318    0.310194    0.396928    0.307101
std     69.840267    0.054763    0.058484    0.110853    0.061661
min     25.000000    0.040000    0.073000    0.074000    0.089000
25%     77.000000    0.198500    0.275000    

# 2020 Season Stats (AL)

In [4]:
# 2020 season, AL only, batters with at least 25 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2020_al = PCA_analysis(year='2020', PA_threshold=25, league='al', figs=False)

2020 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  300.000000  300.000000  300.000000  300.000000  300.000000
mean   111.013333    0.223107    0.297663    0.368940    0.291540
std     78.186552    0.083098    0.087932    0.147491    0.089489
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%     45.000000    0.182750    0.260500    0.281250    0.252250
50%    100.000000    0.230000    0.312500    0.383500    0.306500
75%    178.000000    0.271500    0.349250    0.462250    0.348250
max    266.000000    0.667000    0.667000    0.746000    0.588000

2020 PA >= 25
               PA         AVG         OBP         SLG        wOBA
count  247.000000  247.000000  247.000000  247.000000  247.000000
mean   132.287449    0.233530    0.309381    0.394802    0.306308
std     69.622986    0.056902    0.056726    0.115710    0.062374
min     25.000000    0.073000    0.073000    0.074000    0.089000
25%     68.500000    0.195000    0.276500    

# 2020 Season Stats (NL)

In [5]:
# 2020 season, NL only, batters with at least 25 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2020_nl = PCA_analysis(year='2020', PA_threshold=25, league='nl', figs=False)

2020 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  303.000000  303.000000  303.000000  303.000000  303.000000
mean   109.577558    0.220680    0.298581    0.366756    0.291515
std     81.251794    0.082731    0.104084    0.161355    0.098784
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%     36.000000    0.183500    0.248500    0.278000    0.248500
50%    100.000000    0.231000    0.305000    0.385000    0.303000
75%    183.500000    0.270000    0.352000    0.457500    0.347500
max    267.000000    0.500000    1.000000    1.333000    0.728000

2020 PA >= 25
               PA         AVG         OBP         SLG        wOBA
count  242.000000  242.000000  242.000000  242.000000  242.000000
mean   134.454545    0.234719    0.310545    0.397570    0.307182
std     71.929253    0.055032    0.061560    0.110885    0.063292
min     26.000000    0.040000    0.143000    0.080000    0.139000
25%     67.250000    0.200000    0.270000    

# 2019 Season Stats (MLB)

In [6]:
# 2019 season, all of MLB, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2019_mlb = PCA_analysis(year='2019', PA_threshold=75, league='all', figs=False)

2019 All Stats:
              PA         AVG         OBP         SLG        wOBA
count  990.00000  990.000000  990.000000  990.000000  990.000000
mean   188.40000    0.181799    0.237087    0.289111    0.226777
std    216.01781    0.135899    0.159690    0.209038    0.149439
min      1.00000    0.000000    0.000000    0.000000    0.000000
25%      8.00000    0.078500    0.125000    0.083000    0.109000
50%     70.00000    0.215000    0.282000    0.331500    0.271500
75%    342.00000    0.261000    0.331000    0.438000    0.326000
max    747.00000    1.000000    1.000000    1.333000    0.870000

2019 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  485.000000  485.000000  485.000000  485.000000  485.000000
mean   363.334021    0.247515    0.318144    0.423485    0.313942
std    186.300277    0.039269    0.041413    0.090064    0.046522
min     75.000000    0.107000    0.178000    0.144000    0.150000
25%    197.000000    0.225000    0.293000    0.364000 

# 2019 Season Stats (AL)

In [7]:
# 2019 season, AL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2019_al = PCA_analysis(year='2019', PA_threshold=75, league='al', figs=False)

2019 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  449.000000  449.000000  449.000000  449.000000  449.000000
mean   207.748330    0.187341    0.249087    0.306158    0.239577
std    216.344037    0.133630    0.155212    0.203174    0.144386
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      8.000000    0.091000    0.176000    0.128000    0.159000
50%    128.000000    0.220000    0.293000    0.359000    0.285000
75%    369.000000    0.264000    0.333000    0.455000    0.330000
max    747.000000    1.000000    1.000000    1.000000    0.870000

2019 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  255.000000  255.000000  255.000000  255.000000  255.000000
mean   352.733333    0.245694    0.315235    0.420180    0.311753
std    182.625190    0.040476    0.042499    0.092642    0.048082
min     75.000000    0.107000    0.178000    0.144000    0.150000
25%    192.500000    0.220500    0.290000    

# 2019 Season Stats (NL)

In [8]:
# 2019 season, NL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2019_nl = PCA_analysis(year='2019', PA_threshold=75, league='nl', figs=False)

2019 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  577.000000  577.000000  577.000000  577.000000  577.000000
mean   161.589255    0.176589    0.228386    0.274636    0.216858
std    207.311920    0.137011    0.161690    0.212755    0.151944
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      8.000000    0.067000    0.111000    0.073000    0.093000
50%     54.000000    0.206000    0.265000    0.305000    0.250000
75%    262.000000    0.259000    0.329000    0.423000    0.319000
max    715.000000    1.000000    1.000000    1.333000    0.870000

2019 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  237.000000  237.000000  237.000000  237.000000  237.000000
mean   361.088608    0.250089    0.321764    0.429498    0.317354
std    190.607560    0.037953    0.039972    0.088400    0.044773
min     76.000000    0.123000    0.196000    0.221000    0.186000
25%    190.000000    0.228000    0.299000    

# 2018 Season Stats (MLB)

In [9]:
# 2018 season, all of MLB, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2018_mlb = PCA_analysis(year='2018', PA_threshold=75, league='all', figs=False)

2018 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  990.000000  990.000000  990.000000  990.000000  990.000000
mean   187.009091    0.179841    0.233248    0.278902    0.225144
std    219.317235    0.134564    0.152775    0.210129    0.149493
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      6.000000    0.071250    0.120000    0.088250    0.112750
50%     67.000000    0.210500    0.275500    0.317000    0.268000
75%    346.750000    0.256000    0.328000    0.416000    0.323000
max    745.000000    1.000000    1.000000    2.000000    1.247000

2018 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  476.000000  476.000000  476.000000  476.000000  476.000000
mean   368.262605    0.244361    0.313368    0.399546    0.308872
std    190.309190    0.037133    0.042219    0.078749    0.044470
min     75.000000    0.117000    0.162000    0.165000    0.162000
25%    194.250000    0.223750    0.286000    

# 2018 Season Stats (AL)

In [10]:
# 2018 season, AL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2018_al = PCA_analysis(year='2018', PA_threshold=75, league='al', figs=False)

2018 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  444.000000  444.000000  444.000000  444.000000  444.000000
mean   207.779279    0.193563    0.248592    0.304971    0.242417
std    220.187599    0.123341    0.144498    0.188086    0.138438
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      6.750000    0.142500    0.200000    0.200000    0.187250
50%    117.500000    0.226000    0.287000    0.358000    0.288000
75%    375.000000    0.263000    0.332250    0.426250    0.326000
max    745.000000    1.000000    1.000000    1.000000    0.880000

2018 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  254.000000  254.000000  254.000000  254.000000  254.000000
mean   352.259843    0.244528    0.311886    0.402673    0.309756
std    188.738132    0.038557    0.044550    0.079434    0.045630
min     75.000000    0.117000    0.162000    0.200000    0.162000
25%    181.500000    0.222000    0.282000    

# 2018 Season Stats (NL)

In [11]:
# 2018 season, NL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2018_nl = PCA_analysis(year='2018', PA_threshold=75, league='nl', figs=False)

2018 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  582.000000  582.000000  582.000000  582.000000  582.000000
mean   159.596220    0.170517    0.223864    0.262241    0.214313
std    207.049397    0.140618    0.157682    0.224623    0.156161
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      6.000000    0.052250    0.094250    0.060000    0.086500
50%     50.000000    0.192000    0.265000    0.271000    0.246000
75%    273.500000    0.252750    0.327000    0.408000    0.319000
max    740.000000    1.000000    1.000000    2.000000    1.247000

2018 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  234.000000  234.000000  234.000000  234.000000  234.000000
mean   364.995726    0.245577    0.316910    0.398543    0.309709
std    187.889738    0.036102    0.039949    0.078276    0.043376
min     76.000000    0.134000    0.200000    0.165000    0.192000
25%    202.250000    0.226000    0.293250    

# 2017 Season Stats (MLB)

In [12]:
# 2017 season, all of MLB, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2017_mlb = PCA_analysis(year='2017', PA_threshold=75, league='all', figs=False)

2017 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  957.000000  957.000000  957.000000  957.000000  957.000000
mean   193.620690    0.192487    0.249176    0.299120    0.238892
std    221.789431    0.136430    0.158120    0.218016    0.152496
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      9.000000    0.104000    0.154000    0.128000    0.139000
50%     69.000000    0.222000    0.286000    0.336000    0.278000
75%    365.000000    0.265000    0.333000    0.432000    0.328000
max    725.000000    1.000000    1.000000    2.000000    1.232000

2017 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  468.000000  468.000000  468.000000  468.000000  468.000000
mean   374.279915    0.250506    0.319906    0.414024    0.314701
std    190.242977    0.037813    0.042677    0.083798    0.045309
min     75.000000    0.091000    0.143000    0.091000    0.115000
25%    197.000000    0.230000    0.291000    

# 2017 Season Stats (AL)

In [13]:
# 2017 season, AL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2017_al = PCA_analysis(year='2017', PA_threshold=75, league='al', figs=False)

2017 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  428.000000  428.000000  428.000000  428.000000  428.000000
mean   216.406542    0.208509    0.264423    0.326843    0.256334
std    232.125134    0.144097    0.154703    0.215336    0.150845
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      7.000000    0.165000    0.219250    0.200000    0.186500
50%    117.500000    0.232000    0.299500    0.365500    0.292000
75%    399.500000    0.268250    0.333000    0.445500    0.332500
max    723.000000    1.000000    1.000000    1.500000    1.054000

2017 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  237.000000  237.000000  237.000000  237.000000  237.000000
mean   378.324895    0.249384    0.317080    0.412781    0.313325
std    195.450826    0.036203    0.039425    0.080526    0.042788
min     75.000000    0.138000    0.176000    0.216000    0.178000
25%    192.000000    0.228000    0.291000    

# 2017 Season Stats (NL)

In [14]:
# 2017 season, NL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2017_nl = PCA_analysis(year='2017', PA_threshold=75, league='nl', figs=False)

2017 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  560.000000  560.000000  560.000000  560.000000  560.000000
mean   165.487500    0.181575    0.238725    0.279225    0.226641
std    204.647312    0.135791    0.164411    0.220072    0.155856
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%     12.000000    0.086750    0.125000    0.091000    0.113750
50%     59.000000    0.205500    0.269000    0.303000    0.255000
75%    293.000000    0.260500    0.333000    0.423000    0.324000
max    725.000000    1.000000    1.000000    2.000000    1.232000

2017 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  243.000000  243.000000  243.000000  243.000000  243.000000
mean   350.288066    0.252169    0.323012    0.416782    0.316687
std    188.397306    0.040137    0.046803    0.088584    0.048486
min     75.000000    0.091000    0.143000    0.091000    0.115000
25%    175.500000    0.231500    0.293000    

# 2016 Season Stats (MLB)

In [15]:
# 2016 season, all of MLB, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2016_mlb = PCA_analysis(year='2016', PA_threshold=75, league='all', figs=False)

2016 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  969.000000  969.000000  969.000000  969.000000  969.000000
mean   190.481940    0.184943    0.237940    0.285720    0.229673
std    223.051568    0.129423    0.153392    0.212966    0.149252
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      9.000000    0.089000    0.143000    0.097000    0.121000
50%     68.000000    0.219000    0.284000    0.325000    0.270000
75%    337.000000    0.263000    0.328000    0.425000    0.324000
max    744.000000    1.000000    1.000000    2.000000    1.242000

2016 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  471.000000  471.000000  471.000000  471.000000  471.000000
mean   370.830149    0.249484    0.316212    0.403662    0.310760
std    196.416332    0.037969    0.042047    0.080141    0.044567
min     75.000000    0.094000    0.105000    0.114000    0.111000
25%    198.500000    0.226500    0.295000    

# 2016 Season Stats (AL)

In [16]:
# 2016 season, AL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2016_al = PCA_analysis(year='2016', PA_threshold=75, league='al', figs=False)

2016 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  429.000000  429.000000  429.000000  429.000000  429.000000
mean   214.543124    0.199166    0.251681    0.311604    0.246044
std    232.895350    0.134345    0.144918    0.226905    0.150098
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%      7.000000    0.133000    0.200000    0.167000    0.178000
50%    129.000000    0.233000    0.295000    0.345000    0.282000
75%    376.000000    0.267000    0.332000    0.435000    0.326000
max    744.000000    1.000000    1.000000    2.000000    1.242000

2016 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  241.000000  241.000000  241.000000  241.000000  241.000000
mean   370.809129    0.247867    0.311411    0.402631    0.308432
std    201.278116    0.037092    0.040270    0.078607    0.043638
min     76.000000    0.146000    0.184000    0.197000    0.184000
25%    188.000000    0.225000    0.290000    

# 2016 Season Stats (NL)

In [17]:
# 2016 season, NL only, batters with at least 75 PA; pair plots are skipped.
# The name is bound to the returned (all_stats, filter_by_PA, pca, LR) tuple.
pca_2016_nl = PCA_analysis(year='2016', PA_threshold=75, league='nl', figs=False)

2016 All Stats:
               PA         AVG         OBP         SLG        wOBA
count  567.000000  567.000000  567.000000  567.000000  567.000000
mean   163.206349    0.175372    0.228245    0.267280    0.218032
std    207.515808    0.124911    0.158202    0.202043    0.147737
min      1.000000    0.000000    0.000000    0.000000    0.000000
25%     11.000000    0.077000    0.124000    0.085000    0.109500
50%     53.000000    0.208000    0.264000    0.293000    0.252000
75%    265.500000    0.259000    0.326000    0.417000    0.322000
max    705.000000    1.000000    1.000000    1.333000    0.878000

2016 PA >= 75
               PA         AVG         OBP         SLG        wOBA
count  237.000000  237.000000  237.000000  237.000000  237.000000
mean   358.350211    0.251937    0.321802    0.404992    0.313527
std    192.117504    0.038016    0.042272    0.080937    0.044456
min     75.000000    0.102000    0.137000    0.114000    0.115000
25%    200.000000    0.229000    0.302000    