In [195]:
import pandas as pd
import numpy as np

### Read in BRef data & Calculate Height and Weight Z-Scores

In [196]:
# Load the BRef pitcher table (presumably a Baseball Reference export - confirm source/date).
# Later cells read its "Name", "Ht" and "Wt" columns.
pitchers = pd.read_csv("pitchers_br.csv")

def height_to_inches(height_str):
    """Convert a feet-and-inches height string to total inches.

    Parameters
    ----------
    height_str : str
        Height written as ``<feet>' <inches>"``, e.g. ``6' 2"``. The inches
        part (and the closing double quote) may be omitted, e.g. ``6'``.
        Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    int or float
        Total height in inches, or ``NaN`` when the input is missing.

    Raises
    ------
    ValueError
        If the string has no ``'`` separator or a part is not an integer.
    """
    # Missing heights propagate as NaN instead of crashing on `.split`.
    if pd.isna(height_str):
        return float("nan")

    feet, separator, inches = str(height_str).partition("'")
    if not separator:
        # Without the feet marker we cannot tell feet from inches; refuse to guess.
        raise ValueError(f"Unrecognized height format: {height_str!r}")

    inches = inches.replace('"', '').strip()
    # A bare feet value such as "6'" means zero additional inches.
    return int(feet) * 12 + (int(inches) if inches else 0)

# Numeric height in inches, parsed from the "Ht" strings.
pitchers['height_inches'] = pitchers['Ht'].apply(height_to_inches)
# "Wt" is used as-is below; presumably it is already numeric (lbs) - TODO confirm.

# Percentile ranks (fraction in (0, 1], rounded to 2 decimals).
# Height is ranked on the numeric inches column: ranking the raw "Ht" strings
# would sort them lexicographically and place e.g. 6' 10" below 6' 2".
pitchers['height_percentile'] = pitchers['height_inches'].rank(pct=True).round(2)
pitchers['weight_percentile'] = pitchers['Wt'].rank(pct=True).round(2)

# Z-scores: (value - column mean) / column standard deviation (pandas default, ddof=1).
pitchers["height_zscore"] = (pitchers["height_inches"] - pitchers["height_inches"].mean()) / pitchers["height_inches"].std()
pitchers["weight_zscore"] = (pitchers["Wt"] - pitchers["Wt"].mean()) / pitchers["Wt"].std()


# pitchers_sorted = pitchers.sort_values(by='height_zscore', ascending=False)
# print(pitchers_sorted)


### Calculate Size Score

In [197]:
# Percentile-based size score: 10 x height percentile + 90 x weight percentile.
# NOTE: the next cell assigns "size_score" again using z-scores, replacing this column.
pitchers["size_score"] = (
    pitchers["height_percentile"].mul(10)
    .add(pitchers["weight_percentile"].mul(90))
)
# pitchers_sorted = pitchers.sort_values(by='size_score', ascending=False)
# print(pitchers_sorted)

In [198]:
# Alternative size score built from z-scores instead of percentiles:
# 0.1 x height z-score + 0.9 x weight z-score. This reassigns the "size_score" column.
pitchers["size_score"] = (
    pitchers["height_zscore"].mul(0.1)
    .add(pitchers["weight_zscore"].mul(0.9))
)
# pitchers_sorted = pitchers.sort_values(by='size_score', ascending=False)
# print(pitchers_sorted)

### Big-Body Score (BMI) as an alternative to Size Score

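Big-body ratio (BMI-style) = $\frac{\text{Weight (lbs)}}{\text{Height (in)}^{2}}$. The raw ratio is then standardized to a z-score across all pitchers.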

In [199]:
# Raw BMI-style ratio: weight divided by height (in inches) squared.
pitchers["big_body_raw"] = pitchers["Wt"].div(pitchers["height_inches"].pow(2))

# Standardize the raw ratio to a z-score (mean 0, unit standard deviation).
# Earlier experiments: a 0-100 percentile rank of big_body_raw, and adding
# 0.5 * height_zscore on top of the z-score.
pitchers["big_body_score"] = (
    pitchers["big_body_raw"]
    .sub(pitchers["big_body_raw"].mean())
    .div(pitchers["big_body_raw"].std())
)

# Show every pitcher ordered from largest to smallest big-body score.
pitchers_sorted = pitchers.sort_values("big_body_score", ascending=False)
print(pitchers_sorted[["Name", "big_body_score"]])

                 Name  big_body_score
164    Tony Santillan        3.640947
348  Brusdar Graterol        3.373899
43        Joe Jiménez        3.236370
602         Matt Gage        2.629504
276      Ronel Blanco        2.629504
..                ...             ...
483     Michael Arias       -2.680574
3        Juan Morillo       -2.764940
527      Jean Cabrera       -2.815345
453  Huascar Brazobán       -2.933435
252    Keider Montero       -3.031846

[742 rows x 2 columns]


### Calculate Dominance Score from stats
This year's stats from FanGraphs

In [200]:
# Load the FanGraphs pitching stats table (current season, per the heading above).
# Later cells read columns including "Name", "G", "GS", "SO", "K/9", "WAR",
# "Stuff+", "Pitching+" and the "v?? (sc)" pitch-velocity columns.
pitching_stats = pd.read_csv("pitchers_fg.csv")

In [201]:
# Qualification filter: keep pitchers with MORE than 10 games (exactly 10 is excluded).
# .copy() makes the filtered frame independent, so adding the "Pos" column below
# does not trigger pandas' SettingWithCopyWarning.
pitching_stats = pitching_stats[pitching_stats["G"] > 10].copy()

# Label starters vs. relievers: "SP" when more than half of a pitcher's
# games were starts, otherwise "RP".
pitching_stats["Pos"] = np.where(
    pitching_stats["GS"] > 0.5 * pitching_stats["G"],
    "SP",
    "RP"
)

In [202]:
# pitching_stats_sp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "SP"])
# pitching_stats_sp['SO_percentile'] = pitching_stats_sp['SO'].rank(pct=True).round(2)
# pitching_stats_sp['SO9_percentile'] = pitching_stats_sp['K/9'].rank(pct=True).round(2)
# pitching_stats_rp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "RP"])
# pitching_stats_rp['SO_percentile'] = pitching_stats_rp['SO'].rank(pct=True).round(2)
# pitching_stats_rp['SO9_percentile'] = pitching_stats_rp['K/9'].rank(pct=True).round(2)

def _zscore(values):
    """Standardize a Series: (value - mean) / standard deviation (pandas default, ddof=1)."""
    return (values - values.mean()) / values.std()


# SP pitchers
# An explicit .copy() yields an independent frame that is safe to add columns to.
pitching_stats_sp = pitching_stats[pitching_stats["Pos"] == "SP"].copy()
pitching_stats_sp['SO_z'] = _zscore(pitching_stats_sp['SO'])
pitching_stats_sp['SO9_z'] = _zscore(pitching_stats_sp['K/9'])

# RP pitchers
# pitching_stats_rp = pd.DataFrame(pitching_stats[pitching_stats["Pos"] == "RP"])
# pitching_stats_rp['SO_z'] = (pitching_stats_rp['SO'] - pitching_stats_rp['SO'].mean()) / pitching_stats_rp['SO'].std()
# pitching_stats_rp['SO9_z'] = (pitching_stats_rp['K/9'] - pitching_stats_rp['K/9'].mean()) / pitching_stats_rp['K/9'].std()


# pitching_stats = pd.concat([pitching_stats_sp, pitching_stats_rp], ignore_index=True)

# From here on the analysis covers starting pitchers only: `pitching_stats`
# now refers to the same frame as `pitching_stats_sp`.
pitching_stats = pitching_stats_sp

# Fastball velocity: the row-wise maximum of the three velocity columns, ignoring
# missing values (presumably four-seam / sinker / cutter - confirm against the
# FanGraphs column glossary).
pitching_stats["fb_velo"] = pitching_stats[["vFA (sc)", "vSI (sc)", "vFC (sc)"]].max(axis=1, skipna=True)
pitching_stats['fbv_z'] = _zscore(pitching_stats['fb_velo'])

pitching_stats["Stuff+_z"] = _zscore(pitching_stats["Stuff+"])
pitching_stats["Pitching+_z"] = _zscore(pitching_stats["Pitching+"])
pitching_stats["WAR_z"] = _zscore(pitching_stats["WAR"])



# pitching_stats["dominance_score_raw"] = 10 * pitching_stats["SO_percentile"] + 80 * pitching_stats["SO9_percentile"] + 50 * pitching_stats["fbv_percentile"] + pitching_stats["Stuff+"]
# pitching_stats["dominance_score"] = pitching_stats['dominance_score_raw'].rank(pct=True).round(4) * 100
# pitching_stats["dominance_score_raw"] = 10 * pitching_stats["SO9_z"] + 20 * pitching_stats["fbv_z"] + 30 * (pitching_stats["Stuff+"] / 100)

# Dominance = WAR z-score + fastball-velocity z-score, then re-standardized so the
# final score is itself a z-score.
pitching_stats["dominance_score_raw"] = pitching_stats["WAR_z"] + pitching_stats["fbv_z"]
pitching_stats["dominance_score"] = _zscore(pitching_stats['dominance_score_raw'])


# print(pitching_stats.sort_values(by='dominance_score', ascending=False).head(5))

### Calculate HOSS = Big-Body Score + Dominance Score

In [203]:
# Attach the body-size columns from `pitchers` to every row of `pitching_stats`,
# matching on the exact "Name" string. With a left join, stats rows that have no
# match keep NaN size columns.
# NOTE(review): duplicate names or spelling/accent differences between the two
# sources would duplicate or drop matches - verify.
pitchers_hoss = pitching_stats.merge(pitchers, how="left", on="Name")

# HOSS = big-body z-score + dominance z-score
# (an earlier version summed size_score and dominance_score on `pitchers`).
pitchers_hoss["HOSS"] = pitchers_hoss["big_body_score"].add(pitchers_hoss["dominance_score"])

# Top 10 pitchers by HOSS, highest first.
pitchers_sorted = pitchers_hoss.sort_values("HOSS", ascending=False)
print(pitchers_sorted[["Name", "big_body_score", "WAR_z", "fbv_z", "HOSS"]].head(10))

                Name  big_body_score     WAR_z     fbv_z      HOSS
0       Tarik Skubal        1.365200  3.389206  1.748986  4.560917
2        Paul Skenes        1.384650  2.722889  2.090439  4.378318
7     Framber Valdez        2.206999  2.056572  0.236839  3.633394
22    Drew Rasmussen        2.039369  1.140387  0.870965  3.290336
1    Garrett Crochet        0.683298  2.889468  1.163639  3.204143
4       Hunter Brown        0.656487  2.139862  1.407534  2.862803
47     Hunter Greene        0.838836  0.390780  2.627007  2.715760
6         Logan Webb        1.131888  2.139862 -0.543625  2.124674
56     Adrian Houser        1.466344  0.140911  0.578291  1.913655
119    Corbin Burnes        1.618060 -0.608695  0.822186  1.750842


### Official HOSS Players

In [204]:
# "Official" HOSS pitchers must have a z-score strictly greater than 1 on all three
# components: big-body score, WAR and fastball velocity.
# Rows with NaN in any component fail the comparison and are dropped.
pitchers_hoss = pitchers_hoss[
    (pitchers_hoss["big_body_score"] > 1)
    & (pitchers_hoss["WAR_z"] > 1)
    & (pitchers_hoss["fbv_z"] > 1)
]

# Show the qualifying pitchers (at most 10), highest HOSS first.
pitchers_sorted = pitchers_hoss.sort_values("HOSS", ascending=False)
print(pitchers_sorted[["Name", "big_body_score", "WAR_z", "fbv_z", "HOSS"]].head(10))

           Name  big_body_score     WAR_z     fbv_z      HOSS
0  Tarik Skubal         1.36520  3.389206  1.748986  4.560917
2   Paul Skenes         1.38465  2.722889  2.090439  4.378318
