In [69]:
import pandas as pd
import numpy as np
import pybaseball

### Read in BRef data & Calculate Height and Weight Percentiles

In [70]:
pitchers = pd.read_csv("pitchers_br.csv")

def height_to_inches(height_str):
    feet, inches = height_str.split("'")
    inches = inches.replace('"', '').strip()
    return int(feet) * 12 + int(inches)

pitchers['height_inches'] = pitchers['Ht'].apply(height_to_inches)
# pitchers["Wt"].apply(int)

pitchers['height_percentile'] = pitchers['Ht'].rank(pct=True).round(2)
pitchers['weight_percentile'] = pitchers['Wt'].rank(pct=True).round(2)

pitchers["height_zscore"] = (pitchers["height_inches"] - pitchers["height_inches"].mean()) / pitchers["height_inches"].std()
pitchers["weight_zscore"] = (pitchers["Wt"] - pitchers["Wt"].mean()) / pitchers["Wt"].std()


pitchers_sorted = pitchers.sort_values(by='height_zscore', ascending=False)
print(pitchers_sorted)


     Unnamed: 0                Name      Ht   Wt  height_inches  \
360         360        Paul Gervase  6' 10"  230             82   
416         416         Bailey Ober   6' 9"  260             81   
340         340       Tyler Glasnow   6' 8"  225             80   
372         372          Eury Pérez   6' 8"  220             80   
648         648           Joe Boyle   6' 8"  250             80   
..          ...                 ...     ...  ...            ...   
343         343  Yoshinobu Yamamoto  5' 10"  176             70   
677         677      Danny Coulombe  5' 10"  190             70   
171         171            Sam Moll   5' 9"  190             69   
63           63         Rico Garcia   5' 9"  201             69   
555         555         Yuki Matsui   5' 8"  165             68   

     height_percentile  weight_percentile  height_zscore  weight_zscore  
360               0.26               0.81       3.355894       0.819287  
416               1.00               0.98      

### Calculate Size Score

In [71]:
pitchers["size_score"] = 10 * pitchers["height_percentile"] + 90 * pitchers["weight_percentile"]
# pitchers_sorted = pitchers.sort_values(by='size_score', ascending=False)
# print(pitchers_sorted)

In [72]:
# Alternative Calculation of Size score: z-scores
pitchers["size_score"] = 10 * pitchers["height_zscore"] + 90 * pitchers["weight_zscore"]
# pitchers_sorted = pitchers.sort_values(by='size_score', ascending=False)
# print(pitchers_sorted)

### Big-Body Score as an alternative to Size Score

Capture a score somewhere between BMI and Corpulence Index (using Imperial units - percentiles will be calculated anyway)
Big-Body Score = $\frac{Weight (lbs)}{(Height (in)^{2.5})}$

In [92]:
pitchers["big_body_raw"] = pitchers["Wt"] / (pitchers["height_inches"])**2.5
# pitchers['big_body_score'] = pitchers['big_body_raw'].rank(pct=True).round(4) * 100
pitchers["big_body_score"] = (pitchers["big_body_raw"] - pitchers["big_body_raw"].mean()) / pitchers["big_body_raw"].std() # + 0.5 * pitchers["height_zscore"]
pitchers_sorted = pitchers.sort_values(by='big_body_score', ascending=False)
print(pitchers_sorted[["Name", "big_body_score"]])

                 Name  big_body_score
164    Tony Santillan        3.517482
348  Brusdar Graterol        3.445045
43        Joe Jiménez        3.124514
146     Dan Altavilla        2.756605
467      David Bednar        2.656721
..                ...             ...
527      Jean Cabrera       -2.594183
3        Juan Morillo       -2.598771
483     Michael Arias       -2.622646
252    Keider Montero       -2.861545
453  Huascar Brazobán       -2.868251

[742 rows x 2 columns]


### Calculate Dominance Score from stats
This year's stats from FanGraphs

In [74]:
pitching_stats = pd.read_csv("pitchers_fg.csv")

In [75]:
# set a 10-game qualification filter
pitching_stats = pitching_stats[pitching_stats["G"] > 10]

# determine starters/relievers
pitching_stats["Pos"] = np.where(
    pitching_stats["GS"] > 0.5 * pitching_stats["G"], 
    "SP", 
    "RP"
)

In [76]:
pitching_stats.columns



Index(['Unnamed: 0', 'Name', 'WAR', 'G', 'GS', 'SO', 'K/9', 'FIP', 'K%',
       'FB% 2', 'vFA (sc)', 'vSI (sc)', 'vFC (sc)', 'Zone%', 'SwStr%',
       'Stuff+', 'Pitching+', 'Pos'],
      dtype='object')

In [96]:
# pitching_stats_sp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "SP"])
# pitching_stats_sp['SO_percentile'] = pitching_stats_sp['SO'].rank(pct=True).round(2)
# pitching_stats_sp['SO9_percentile'] = pitching_stats_sp['K/9'].rank(pct=True).round(2)
# pitching_stats_rp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "RP"])
# pitching_stats_rp['SO_percentile'] = pitching_stats_rp['SO'].rank(pct=True).round(2)
# pitching_stats_rp['SO9_percentile'] = pitching_stats_rp['K/9'].rank(pct=True).round(2)

import pandas as pd

# SP pitchers
pitching_stats_sp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "SP"])
pitching_stats_sp['SO_z'] = (pitching_stats_sp['SO'] - pitching_stats_sp['SO'].mean()) / pitching_stats_sp['SO'].std()
pitching_stats_sp['SO9_z'] = (pitching_stats_sp['K/9'] - pitching_stats_sp['K/9'].mean()) / pitching_stats_sp['K/9'].std()

# RP pitchers
pitching_stats_rp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "RP"])
pitching_stats_rp['SO_z'] = (pitching_stats_rp['SO'] - pitching_stats_rp['SO'].mean()) / pitching_stats_rp['SO'].std()
pitching_stats_rp['SO9_z'] = (pitching_stats_rp['K/9'] - pitching_stats_rp['K/9'].mean()) / pitching_stats_rp['K/9'].std()


pitching_stats = pd.concat([pitching_stats_sp, pitching_stats_rp], ignore_index=True)

pitching_stats["fb_velo"] = pitching_stats[["vFA (sc)", "vSI (sc)", "vFC (sc)",]].max(axis=1, skipna=True)
pitching_stats['fbv_z'] = (pitching_stats['fb_velo'] - pitching_stats['fb_velo'].mean()) / pitching_stats['fb_velo'].std()

pitching_stats["Stuff+_z"] = (pitching_stats["Stuff+"] - pitching_stats["Stuff+"].mean()) / pitching_stats["Stuff+"].std()
pitching_stats["Pitching+_z"] = (pitching_stats["Pitching+"] - pitching_stats["Pitching+"].mean()) / pitching_stats["Pitching+"].std()
pitching_stats["WAR_z"] = (pitching_stats["WAR"] - pitching_stats["WAR"].mean()) / pitching_stats["WAR"].std()



# pitching_stats["dominance_score_raw"] = 10 * pitching_stats["SO_percentile"] + 80 * pitching_stats["SO9_percentile"] + 50 * pitching_stats["fbv_percentile"] + pitching_stats["Stuff+"]
# pitching_stats["dominance_score"] = pitching_stats['dominance_score_raw'].rank(pct=True).round(4) * 100
# pitching_stats["dominance_score_raw"] = 10 * pitching_stats["SO9_z"] + 20 * pitching_stats["fbv_z"] + 30 * (pitching_stats["Stuff+"] / 100)
pitching_stats["dominance_score_raw"] = pitching_stats["WAR_z"] + pitching_stats["fbv_z"]
pitching_stats["dominance_score"] = (pitching_stats['dominance_score_raw'] - pitching_stats['dominance_score_raw'].mean()) / pitching_stats['dominance_score_raw'].std()


print(pitching_stats.sort_values(by='dominance_score', ascending=False).head(5))

   Unnamed: 0             Name  WAR   G  GS   SO    K/9   FIP     K%  FB% 2  \
0          43     Tarik Skubal  5.5  24  24  190  11.23  2.26  0.327  0.532   
2          26      Paul Skenes  4.7  25  25  166  10.09  2.51  0.286  0.479   
1          51  Garrett Crochet  4.9  24  24  188  11.11  2.54  0.308  0.528   
4          45     Hunter Brown  4.0  24  24  164  10.32  2.92  0.294  0.605   
5          72     Zack Wheeler  4.0  24  24  195  11.73  2.98  0.333  0.581   

   ...  Pos      SO_z     SO9_z  fb_velo     fbv_z  dominance_score_raw  \
0  ...   SP  2.615515  1.917598     97.5  1.248272             5.831922   
2  ...   SP  1.939721  1.214898     98.2  1.542198             5.329021   
1  ...   SP  2.559199  1.843630     96.3  0.744399             4.730429   
4  ...   SP  1.883405  1.356671     96.8  0.954346             4.043945   
5  ...   SP  2.756306  2.225800     96.1  0.660420             3.750020   

   dominance_score  Stuff+_z  Pitching+_z     WAR_z  
0         3.942683  

### Calculate HOSS = Size Score + Dominance Score

In [97]:
pitchers_hoss = pd.merge(
    pitching_stats, 
    pitchers, 
    on="Name",
    how="left"
)

# pitchers["HOSS"] = pitchers["size_score"] + pitchers["dominance_score"]
pitchers_hoss["HOSS"] = pitchers_hoss["big_body_score"] + pitchers_hoss["dominance_score"]

pitchers_sorted = pitchers_hoss.sort_values(by='HOSS', ascending=False)
print(pitchers_sorted[["Name", "big_body_score", "dominance_score", "HOSS"]].head(10))

                Name  big_body_score  dominance_score      HOSS
0       Tarik Skubal        1.307036         3.942683  5.249719
2        Paul Skenes        1.096627         3.602696  4.699322
7     Framber Valdez        2.474933         1.985292  4.460225
165     David Bednar        2.656721         1.202989  3.859711
244   Tony Santillan        3.517482         0.330909  3.848391
22    Drew Rasmussen        2.131172         1.613616  3.744788
1    Garrett Crochet        0.428628         3.198016  3.626645
4       Hunter Brown        0.693413         2.733918  3.427331
189  Graham Ashcraft        2.115739         0.837917  2.953656
164      Bryan Abreu        1.605623         1.327100  2.932723
