In [13]:
# load dataset
import pandas as pd

# Locations of the two pitch-level CSV exports.
# NOTE(review): these are absolute, machine-specific paths; anyone else running
# the notebook has to edit them.
pitcher_path = "/Users/shindasol/Downloads/2026 Associate Analyst Survey Data - Pitcher B.csv"
hitter_path = "/Users/shindasol/Downloads/2026 Associate Analyst Survey Data - Player A.csv"

# Read the pitcher file first, then the hitter file.
p, h = map(pd.read_csv, (pitcher_path, hitter_path))

# Quick sanity check: (rows, columns) of each frame, then a preview of the pitcher data.
print(p.shape, h.shape)
p.head()

(2764, 103) (3506, 103)


Unnamed: 0,pitcher,uuid,created,updated,pitch_number,game_date,game_time,inning_plate_appearance,plate_appearance_pitch_number,pitcher_throws,...,strikezone_bottom,pitch_effective_velocity_location,is_runner_on_first,is_runner_on_second,is_runner_on_third,is_swing_and_miss,is_chase,is_last_pitch_plate_appearance,year,savant_zone
0,Pitcher B,6f53088d-85da-4a0c-a90d-836c0a653e0f,11/22/2025 0:19:15,11/22/2025 0:19:15,104,7/26/2024,11:26:27 PM,1,1,left,...,1.531523,82.395752,False,False,False,False,False,False,2024,waste
1,Pitcher B,90bfa9e1-0d32-4788-ab30-3a4110fca159,11/22/2025 0:19:15,11/22/2025 0:19:15,33,6/28/2024,10:51:04 PM,2,3,left,...,1.488702,97.954193,False,False,False,False,False,False,2024,chase
2,Pitcher B,ea707603-36c2-4e7f-9965-bcde49ca5b47,11/22/2025 0:19:15,11/22/2025 0:19:15,178,8/1/2024,11:30:08 PM,4,4,left,...,1.528121,91.807708,False,False,True,False,False,True,2024,chase
3,Pitcher B,e7ff96aa-9fe2-4cb0-acef-7f4de52d30ca,11/22/2025 0:19:15,11/22/2025 0:19:15,152,7/21/2024,6:34:04 PM,2,1,left,...,1.479356,94.8095,False,False,False,False,False,False,2024,chase
4,Pitcher B,9250c58c-fe65-40e9-985b-07c5fe8a1701,11/22/2025 0:19:15,11/22/2025 0:19:15,253,7/21/2024,7:32:21 PM,1,2,left,...,1.479356,91.150895,False,False,False,False,False,False,2024,shadow


In [15]:
def add_metrics(df):
    """Return a copy of ``df`` with cleaned outcome flags and derived metric columns.

    The input frame is not modified. On the copy:

    * ``is_strike``, ``is_swing``, ``is_swing_and_miss`` and ``is_chase`` have
      missing values replaced by ``False`` and are cast to ``bool``.
    * ``called_strike`` is True where the pitch is a strike and was not swung at.
    * ``csw`` is True where the pitch is a called strike or a swing-and-miss.
    * ``in_zone`` is True where ``savant_zone`` is ``"heart"`` or ``"shadow"``.
    * ``edge`` is True where ``savant_zone`` is ``"shadow"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four flag columns listed above plus ``savant_zone``.

    Returns
    -------
    pandas.DataFrame
        The copy, with the four new columns appended in the order
        ``called_strike``, ``csw``, ``in_zone``, ``edge``.
    """
    out = df.copy()

    flag_columns = ("is_strike", "is_swing", "is_swing_and_miss", "is_chase")
    for name in flag_columns:
        # A missing flag is treated as "did not happen".
        out[name] = out[name].fillna(False).astype(bool)

    not_swung_at = ~out["is_swing"]
    out["called_strike"] = out["is_strike"] & not_swung_at
    out["csw"] = out["called_strike"] | out["is_swing_and_miss"]

    zone = out["savant_zone"]
    out["in_zone"] = zone.isin(["heart", "shadow"])
    out["edge"] = zone.isin(["shadow"])
    return out

# Attach the cleaned flags and derived metric columns to both frames.
p = add_metrics(p)
h = add_metrics(h)

# Pitch mix: number of pitches per (season, pitch type), plus each pitch type's
# share of all pitches thrown in that season.
mix = (
    p.groupby(["year", "pitch_type"])
     .size()
     .reset_index(name="pitches")
     .assign(usage=lambda t: t["pitches"] / t.groupby("year")["pitches"].transform("sum"))
)

# Performance: per (season, pitch type) counts of pitches and of each outcome flag.
perf = (
    p.groupby(["year", "pitch_type"])
     .agg(
         pitches=("pitch_type", "size"),
         swings=("is_swing", "sum"),
         whiffs=("is_swing_and_miss", "sum"),
         csw=("csw", "sum"),
         chases=("is_chase", "sum"),
         in_zone=("in_zone", "sum"),
         edge=("edge", "sum"),
     )
     .reset_index()
)

# Rates are stored as fractions (not multiplied by 100).
# Swing-based rates divide by swings; a group with zero swings gets a NaN
# denominator so its rate is NaN instead of a division by zero.
# Series.mask is used rather than .replace(0, np.nan) because no visible cell
# imports numpy, so `np` raised NameError when run on a fresh kernel.
perf["Whiff%"] = perf["whiffs"] / perf["swings"].mask(perf["swings"] == 0)
perf["CSW%"] = perf["csw"] / perf["pitches"]
perf["Chase/Swing"] = perf["chases"] / perf["swings"].mask(perf["swings"] == 0)
perf["Zone%"] = perf["in_zone"] / perf["pitches"]
perf["Edge%"] = perf["edge"] / perf["pitches"]

perf.head()

Unnamed: 0,year,pitch_type,pitches,swings,whiffs,csw,chases,in_zone,edge,Whiff%,CSW%,Chase/Swing,Zone%,Edge%
0,2024,changeup,59,26,7,11,8,32,25,0.269231,0.186441,0.307692,0.542373,0.423729
1,2024,curveball,101,30,10,32,17,46,23,0.333333,0.316832,0.566667,0.455446,0.227723
2,2024,cutter,101,50,17,32,21,60,31,0.34,0.316832,0.42,0.594059,0.306931
3,2024,sinker,230,108,15,54,14,174,100,0.138889,0.234783,0.12963,0.756522,0.434783
4,2025,changeup,336,155,42,56,60,177,119,0.270968,0.166667,0.387097,0.526786,0.354167


In [29]:
# Rebuild the pitch-mix table: pitch count per (season, pitch type) and the
# share of that season's pitches each type accounts for.
mix = (
    p.groupby(["year", "pitch_type"])
     .size()
     .reset_index(name="pitches")
     .assign(usage=lambda t: t["pitches"] / t.groupby("year")["pitches"].transform("sum"))
)

# Show the 2024 rows only.
mix.loc[mix["year"] == 2024]
 


Unnamed: 0,year,pitch_type,pitches,usage
0,2024,changeup,59,0.120163
1,2024,curveball,101,0.205703
2,2024,cutter,101,0.205703
3,2024,sinker,230,0.468432


In [31]:
# Show the 2025 rows of the pitch-mix table.
mix.loc[mix["year"] == 2025]

Unnamed: 0,year,pitch_type,pitches,usage
4,2025,changeup,336,0.147822
5,2025,curveball,133,0.058513
6,2025,cutter,523,0.230092
7,2025,four_seam,141,0.062033
8,2025,sinker,883,0.388473
9,2025,sweeper,257,0.113066


In [17]:
# pitcher mix
import matplotlib.pyplot as plt

# One row per season, one column per pitch type; a pitch type with no row for a
# season gets usage 0 so the stacked bars line up.
pivot = (
    mix.pivot(index="year", columns="pitch_type", values="usage")
       .fillna(0)
       .sort_index()
)

# Stacked bar chart of usage share by season, written to disk and then closed.
ax = pivot.plot.bar(stacked=True, figsize=(8, 4))
ax.set_title("Pitcher B Pitch Mix by Season")
ax.set_ylabel("Usage")
ax.figure.tight_layout()
ax.figure.savefig("/Users/shindasol/Downloads/pitcherB_pitch_mix.png", dpi=300)
plt.close(ax.figure)


In [19]:
# Pitcher B – Whiff% by Pitch (2025)
# Keep only 2025 pitch types with at least 60 pitches.
d = perf.loc[(perf["year"] == 2025) & (perf["pitches"] >= 60)]

# Whiff% is stored as a fraction; scale to percent for the chart.
whiff_pct = d.set_index("pitch_type")["Whiff%"] * 100

ax = whiff_pct.plot.bar(figsize=(8, 4))
ax.set_title("Pitcher B Whiff% per Swing by Pitch Type (2025)")
ax.set_ylabel("Whiff%")
ax.figure.tight_layout()
ax.figure.savefig("/Users/shindasol/Downloads/pitcherB_whiff_2025.png", dpi=300)
plt.close(ax.figure)


In [21]:
# Player A – Whiff% by Pitch (2025)
# Per (season, pitch type): pitches seen, swings, and swings-and-misses.
hperf = (
    h.groupby(["year", "pitch_type"])
     .agg(
         pitches=("pitch_type", "size"),
         swings=("is_swing", "sum"),
         whiffs=("is_swing_and_miss", "sum"),
     )
     .reset_index()
)

# Whiffs per swing, stored as a fraction. A group with zero swings gets a NaN
# denominator so its rate is NaN instead of a division by zero.
# Series.mask is used rather than .replace(0, np.nan) because no visible cell
# imports numpy, so `np` raised NameError when run on a fresh kernel.
hperf["Whiff%"] = hperf["whiffs"] / hperf["swings"].mask(hperf["swings"] == 0)

# Keep only 2025 pitch types with at least 60 pitches.
d = hperf.loc[(hperf["year"] == 2025) & (hperf["pitches"] >= 60)]

# Scale the stored fraction to percent and order bars from highest to lowest.
whiff_pct = (d.set_index("pitch_type")["Whiff%"] * 100).sort_values(ascending=False)

ax = whiff_pct.plot.bar(figsize=(8, 4))
ax.set_title("Position Player A Whiff% per Swing by Pitch Type (2025)")
ax.set_ylabel("Whiff%")
ax.figure.tight_layout()
ax.figure.savefig("/Users/shindasol/Downloads/playerA_whiff_2025.png", dpi=300)
plt.close(ax.figure)

In [23]:
# sweeper
# Select Pitcher B's 2025 sweepers once and reuse the subset for both axes.
sweepers_2025 = p.loc[(p["year"] == 2025) & (p["pitch_type"] == "sweeper")]

# Hexagonal-bin density of plate location (side vs. height), saved then closed.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hexbin(
    sweepers_2025["pitch_plate_location_side"],
    sweepers_2025["pitch_plate_location_height"],
    gridsize=30,
)
ax.set_title("Pitcher B Sweeper Location (2025)")
ax.set_xlabel("Plate Side")
ax.set_ylabel("Plate Height")
fig.tight_layout()
fig.savefig("/Users/shindasol/Downloads/pitcherB_sweeper_location.png", dpi=300)
plt.close(fig)