In [194]:


# This notebook requires sportsref, my Baseball-Reference scraping library.
# Uncomment the next line and run this cell to install it:
# !pip install git+https://github.com/double-dose-larry/sportsref



In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sportsref.baseball import Player

In [59]:
def get_starting_stats(pitcher):
    """Fetch the "pitching_starter" table for one pitcher.

    The pitcher is looked up through ``Player`` (from ``sportsref``), and the
    "pitching_starter" table is read from the advanced pitching pages. Rows
    whose ``Tm`` equals 'TOT' are removed (presumably combined totals for
    multi-team seasons -- confirm against the site), then any row containing
    a missing value is dropped.

    Parameters
    ----------
    pitcher : str
        Player name, passed straight to ``Player``.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned table with an extra ``name`` column holding ``pitcher``.
        ``None`` is returned when the lookup or the parsing fails; a warning
        is emitted in that case so the skipped player is visible.
    """
    import warnings  # local import keeps this cell runnable on its own

    try:
        starter_table = (
            Player(pitcher, verbose=False)
            .advanced_pages("pitch")
            .get_df("pitching_starter")
        )
        return (
            starter_table
            .query("Tm != 'TOT'")
            .dropna()
            .assign(name=pitcher)
        )
    except Exception as err:
        # Best-effort scrape: one bad page must not abort the whole run.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
        # still stop the long scraping loop.
        warnings.warn(f"skipping {pitcher!r}: {type(err).__name__}: {err}")
        return None

In [60]:
# download the active wins leaderboard page from baseball-reference;
# read_html returns a list of the tables it finds and we keep the first one
active_wins_url = "https://www.baseball-reference.com/leaders/W_active.shtml"
leaderboard_tables = pd.read_html(active_wins_url)
active_w = leaderboard_tables[0]

In [61]:
# clean the win list a bit: keep the text before the first "(" of the
# "Player (yrs, age)" column, turn non-breaking spaces into plain spaces,
# and trim surrounding whitespace
active_w["name"] = (
    active_w["Player (yrs, age)"]
    .str.split("(", n=1)                    # only the first "(" matters
    .str[0]                                 # vectorised; no per-row pd.Series
    .str.replace("\xa0", " ", regex=False)  # literal replacement
    .str.strip()
)

In [62]:
# the cleaned "name" column is the list of players to look up
names = active_w["name"]

In [None]:
%%time
# scrape the starting pitching statistics from bref
# this took me 3 and a half minutes. don't mind all the junk this will spit out, I gotta fix my library
df = pd.concat([
    get_starting_stats(p)
    for p in names
])

In [210]:
# collapse each pitcher's rows into career totals for the columns we need
career_columns = ["GS", "Wgs", "Lgs", "Wchp", "Ltuf"]
career_df = df.groupby("name")[career_columns].agg("sum")

In [211]:
# share of wins that were cheap, and share of losses that were tough
career_df["cheap_win_pct"] = career_df["Wchp"] / career_df["Wgs"]
career_df["tuff_loss_pct"] = career_df["Ltuf"] / career_df["Lgs"]

# label each pitcher; conditions are checked in order, so a pitcher who
# meets both thresholds is labelled "Tough Losers"
group_cutoff = .35
career_df["tuff_loser_cheap_winner"] = np.select(
    [
        career_df["tuff_loss_pct"] >= group_cutoff,
        career_df["cheap_win_pct"] >= group_cutoff,
    ],
    ["Tough Losers", "Cheap Winners"],
    default="Everyone Else",
)

In [212]:
# move the pitcher name out of the index into a regular column,
# then keep only pitchers with at least 100 games started
career_df = career_df.reset_index()
career_df = career_df[career_df["GS"] >= 100]

In [213]:
# pretty names for columns
# An explicit old -> new mapping is used instead of assigning a positional
# list, so a change in column order or count cannot silently mislabel data.
# The two percentage columns keep their names because the chart refers to them.
career_df = career_df.rename(columns={
    "name": "Name",
    "GS": "Games Started",
    "Wgs": "Wins from starts",
    "Lgs": "Losses from starts",
    "Wchp": "Cheap Wins",
    "Ltuf": "Tough Losses",
    "tuff_loser_cheap_winner": "Group",
})

In [248]:
# build the altair chart

# axes: both percentage columns are formatted as "%"
cheap_win_axis = alt.Axis(format="%", title="Percent of Wins in Not Quality Starts")
tough_loss_axis = alt.Axis(format="%", title="Percent of Losses in Quality Starts")

# columns shown when hovering over a point
hover_fields = [
    "Name", "Games Started",
    "Wins from starts", "Losses from starts",
    "Cheap Wins", "Tough Losses",
]

# scatter layer: one circle per row, coloured by group
chart = (
    alt.Chart(career_df)
    .mark_circle(size=100)
    .encode(
        alt.X("cheap_win_pct", axis=cheap_win_axis),
        alt.Y("tuff_loss_pct", axis=tough_loss_axis),
        color=alt.Color(
            "Group:N",
            legend=alt.Legend(orient="bottom-left", title=None),
        ),
        tooltip=hover_fields,
    )
)

# text layer: show the name only where either percentage is 40% or more
is_extreme = (alt.datum.cheap_win_pct >= .4) | (alt.datum.tuff_loss_pct >= .4)
annotation = (
    alt.Chart(career_df)
    .mark_text(align="left", baseline="middle", fontSize=12, dx=7, dy=2)
    .encode(x="cheap_win_pct", y="tuff_loss_pct", text="Name")
    .transform_filter(is_extreme)
)

# overlay the two layers and size the figure
# (append .save("cheap_winners_tough_losers.html") to export the chart)
(chart + annotation).properties(
    title="Active Starters with 100+ Games Started",
    width=600,
    height=600,
)