In [11]:
import pandas as pd
import numpy as np

### Read in Data & Calculate Height and Weight Percentiles

In [3]:
pitchers = pd.read_csv("pitchers.csv")

def height_to_inches(height_str):
    feet, inches = height_str.split("'")
    inches = inches.replace('"', '').strip()
    return int(feet) * 12 + int(inches)

pitchers['height_inches'] = pitchers['Ht'].apply(height_to_inches)

pitchers['height_percentile'] = pitchers['Ht'].rank(pct=True).round(2)
pitchers['weight_percentile'] = pitchers['Wt'].rank(pct=True).round(2)

pitchers_sorted = pitchers.sort_values(by='height_percentile', ascending=False)
print(pitchers_sorted)

# pitchers_sorted = pitchers.sort_values(by='weight_percentile', ascending=False)
# print(pitchers_sorted)


     Unnamed: 0               Name      Ht   Wt  height_inches  \
419         419        Bailey Ober   6' 9"  260             81   
130         130        Luke Little   6' 8"  220             80   
70           70     Félix Bautista   6' 8"  285             80   
641         641     Chris Roycroft   6' 8"  230             80   
77           77        Tyler Wells   6' 8"  260             80   
..          ...                ...     ...  ...            ...   
112         112      Shota Imanaga  5' 10"  175             70   
681         681     Danny Coulombe  5' 10"  190             70   
124         124         Eli Morgan  5' 10"  190             70   
716         716     Lazaro Estrada  5' 10"  180             70   
35           35  Daysbel Hernández  5' 10"  220             70   

     height_percentile  weight_percentile  
419               1.00               0.98  
130               0.99               0.65  
70                0.99               1.00  
641               0.99         

### Calculate Size Score

In [5]:
pitchers["size_score"] = 10 * pitchers["height_percentile"] + 90 * pitchers["weight_percentile"]
pitchers_sorted = pitchers.sort_values(by='size_score', ascending=False)
print(pitchers_sorted)

     Unnamed: 0             Name      Ht   Wt  height_inches  \
70           70   Félix Bautista   6' 8"  285             80   
297         297   Carlos Estévez   6' 6"  277             78   
711         711      Alek Manoah   6' 6"  285             78   
558         558     Johan Oviedo   6' 6"  275             78   
419         419      Bailey Ober   6' 9"  260             81   
..          ...              ...     ...  ...            ...   
603         603     Blas Castano  5' 10"  162             70   
694         694   Winston Santos   6' 0"  160             72   
110         110  Daniel Palencia  5' 11"  160             71   
530         530     Jean Cabrera   6' 0"  145             72   
367         367  Ronny Henriquez  5' 10"  155             70   

     height_percentile  weight_percentile  size_score  
70                0.99               1.00        99.9  
297               0.93               1.00        99.3  
711               0.93               1.00        99.3  
558    

### Calculate Dominance Score from stats
This year's stats from Baseball Reference

In [50]:
from pybaseball import pitching_stats_bref

# get all of this season's batting data so far
pitching_stats = pitching_stats_bref()

# set some qualification parameters
pitching_stats = pitching_stats[pitching_stats["IP"] > 30]
pitching_stats = pitching_stats[pitching_stats["G"] > 10]
pitching_stats = pitching_stats[["Name", "mlbID", "G", "GS", "SO", "SO9", "WHIP"]]

# calculate SO9 percentile by position
pitching_stats["Pos"] = np.where(
    pitching_stats["GS"] > 0.5 * pitching_stats["G"], 
    "SP", 
    "RP"
)

In [41]:
pitching_stats.columns



Index(['Name', 'mlbID', 'G', 'GS', 'SO', 'SO9'], dtype='object')

In [58]:
pitching_stats_sp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "SP"])
pitching_stats_sp['SO_percentile'] = pitching_stats_sp['SO'].rank(pct=True).round(2)
pitching_stats_sp['SO9_percentile'] = pitching_stats_sp['SO9'].rank(pct=True).round(2)
pitching_stats_rp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "RP"])
pitching_stats_rp['SO_percentile'] = pitching_stats_rp['SO'].rank(pct=True).round(2)
pitching_stats_rp['SO9_percentile'] = pitching_stats_rp['SO9'].rank(pct=True).round(2)

pitching_stats = pd.concat([pitching_stats_sp, pitching_stats_rp], ignore_index=True)

pitching_stats['WHIP_percentile'] = pitching_stats['WHIP'].rank(pct=True).round(2)
pitching_stats['WHIP_percentile'] = 1 - pitching_stats['WHIP_percentile']

pitching_stats["dominance_score_raw"] = 10 * pitching_stats["SO_percentile"] + 80 * pitching_stats["SO9_percentile"] + 50 * pitching_stats["WHIP_percentile"]
pitching_stats["dominance_score"] = pitching_stats['dominance_score_raw'].rank(pct=True).round(4) * 100

print(pitching_stats.sort_values(by='dominance_score', ascending=False).head(5))

                  Name   mlbID   G  GS   SO   SO9   WHIP Pos  SO_percentile  \
237         Josh Hader  623352  48   0   76  13.0  0.854  RP           0.97   
136       Tarik Skubal  669373  24  24  190  11.2  0.873  SP           0.99   
209  Edwin D\xc3\xadaz  621242  46   0   68  13.3  0.913  RP           0.92   
152       Zack Wheeler  554430  24  24  195  11.7  0.935  SP           1.00   
198    Aroldis Chapman  547973  51   0   66  12.7  0.729  RP           0.90   

     SO9_percentile  WHIP_percentile  dominance_score_raw  dominance_score  
237            0.97             0.98                136.3           100.00  
136            0.97             0.97                136.0            99.73  
209            0.98             0.96                135.6            99.46  
152            0.98             0.94                135.4            99.18  
198            0.95             1.00                135.0            98.91  


### Calculate HOSS = Size Score + Dominance Score

In [63]:
pitchers = pd.merge(
    pitching_stats, 
    pitchers, 
    on="Name",
    how="left"
)

pitchers["HOSS"] = pitchers["size_score"] + pitchers["dominance_score"]

pitchers_sorted = pitchers.sort_values(by='HOSS', ascending=False)
print(pitchers_sorted[["Name", "size_score", "dominance_score", "HOSS"]].head(10))

                Name  size_score  dominance_score    HOSS
136      Paul Skenes        97.5            96.46  193.96
31   Garrett Crochet        93.0            95.10  188.10
56     Hunter Greene        91.3            95.64  186.94
137     Tarik Skubal        86.5            99.73  186.23
287    Trevor Megill        95.4            90.46  185.86
199  Aroldis Chapman        84.6            98.91  183.51
179     David Bednar        88.4            94.82  183.22
205    Fernando Cruz        83.8            98.09  181.89
253     Jeff Hoffman        85.9            95.91  181.81
140   Michael Soroka        94.9            84.20  179.10
