In [1]:
import os, sys, requests, io, json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
sys.path.append('/home/dcooper/rockies/RockiesAnalysis/')
from utils.scraping.safe_playerid_lookup import fangraphs_playerid_lookup



In [2]:
## Scraping current rosters
def scrape_roster(URL):

    session = requests.session()
    content = session.get(URL).content
    soup = BeautifulSoup(content)
    tables = soup.find_all('table')
    names = []
    for table in tables:
        html = str(table)
        if 'mlb' not in URL:
            html = html.replace("<tbody>", "<tbody><tr>").replace("</td>\n<tr>", "</td></tr><tr>")
        df = pd.read_html(io.StringIO(html))[0]
        if df.columns[0] == 'Manager/Coach':
            continue
        col = df.iloc[:,1]
        for r in col:
            name = ' '.join(r.split()[:2])
            names.append(name)

    return names

def scrape_all_rosters():
    MLB_URL = 'https://www.mlb.com/rockies/roster/40-man'
    AAA_URL = 'https://www.milb.com/albuquerque/roster'
    AA_URL = 'https://www.milb.com/hartford/roster'
    HighA_URL = 'https://www.milb.com/spokane/roster'
    A_URL = 'https://www.milb.com/fresno/roster'
    ACL_URL = 'https://www.milb.com/arizona-complex/roster/1994'
    DSL_URL = 'https://www.milb.com/dominican-summer/roster/629'

    names = []
    for URL in [MLB_URL, AAA_URL, AA_URL, HighA_URL, A_URL, ACL_URL, DSL_URL]:
        level_names = scrape_roster(URL)
        for name in level_names:
            if name == 'Jose De':
                name = 'Jose De La Cruz'
            names.append(name)

    return names

def split_name(name):
        try:
            first, last = name.split()
        except:
            first, last = name.split()[0], ' '.join(name.split()[1:])

        return last, first

def load_playerids(names):
    from utils.scraping.safe_playerid_lookup import fangraphs_playerid_lookup
    playerids = []
    for name in names:
        playerid = fangraphs_playerid_lookup(*split_name(name))
        playerids.append(playerid)

    return playerids

def get_player_fangraphs_props(last, first, playerid):
    URL = f'https://www.fangraphs.com/players/{first}-{last}/{playerid}/stats/pitching'
    session = requests.session()
    content = session.get(URL).content
    soup = BeautifulSoup(content)
    next_data = soup.find("script", id="__NEXT_DATA__")
    props = json.loads(next_data.string)

    return props

def get_all_player_fangraphs_props(names, playerids):
    fangraph_props = []
    for (name, playerid) in zip(names, playerids):
        props = get_player_fangraphs_props(*split_name(name), playerid)
        fangraph_props.append(props)

    return fangraph_props

def is_pitcher(props):

    pos = props['props']['pageProps']['dataStats']['playerInfo']['Position']
    if pos == 'P':
        return True
    else:
        return False

def get_prospect_data(props, verbose: bool=False):
    try:
        return props['props']['pageProps']['dataCommon']['prospect'][0]
    except:
        if verbose:
            print('Could not find propect info for', props['props']['pageProps']['dataStats']['playerInfo']['firstLastName'])
        return {}


def is_prospect(props):

    service_time = props['props']['pageProps']['dataContractStatus']['serviceTime']
    if service_time == 0 or service_time is None or service_time == '':
        return True
    elif float(service_time) > 0.08:
        return False
    
    if is_pitcher(props):
        if 'data' in props['props']['pageProps']['dataCommon'].keys():
            IP = float(props['props']['pageProps']['dataCommon']['data'][0]['IP'])
            if IP >= 50:
                return False
    else:
        raise NotImplementedError()

    return True


def get_specific_data(in_data, keys):
    out_data = {}
    for key in keys:
        if key in in_data.keys():
            out_data[key] = in_data[key]
        else:
            out_data[key] = np.nan

    return out_data
    

In [6]:
names = scrape_all_rosters()
playerids = load_playerids(names)
fangraph_props = get_all_player_fangraphs_props(names, playerids)

Gathering player lookup table. This may take a moment.


In [7]:
pitching_prospect_keys = [
    'playerName',
    'Age',
    'Range',
    'Touch',
    'FV_Current',
    'pFB',
    'fFB',
    'pSL',
    'fSL',
    'pCB',
    'fCB',
    'pCH',
    'fCH',
    'pCT',
    'fCT',
    'pCMD',
    'fCMD',
    'Draft_Rnd',
]

In [8]:
pitchers_df = pd.DataFrame(columns=pitching_prospect_keys)
for (name, playerid, props) in zip(names, playerids, fangraph_props):
    if is_pitcher(props) and is_prospect(props):
        prospect_data = get_prospect_data(props)   
        if prospect_data == {}:
            continue
        pitching_propsect_data = get_specific_data(prospect_data, pitching_prospect_keys)
        pitchers_df.loc[pitchers_df.shape[0]] = [item for _, item in pitching_propsect_data.items()]


In [9]:
# first, last = 'Mason', 'Albright'
# playerid = fangraphs_playerid_lookup(last, first)
# props = get_player_fangraphs_props(last, first, playerid)

In [10]:
# with open('test.json', 'w') as f:
#     json.dump(props, f, indent=4)

In [11]:
pitchers_df

Unnamed: 0,playerName,Age,Range,Touch,FV_Current,pFB,fFB,pSL,fSL,pCB,fCB,pCH,fCH,pCT,fCT,pCMD,fCMD,Draft_Rnd
0,McCade Brown,25.3777777,93-97,99,45,60,60,50.0,55.0,50.0,55.0,30.0,45.0,,,40,45,3.0
1,Pierson Ohl,26.0583333,89-92,93,40,35,35,45.0,45.0,,,55.0,60.0,,,60,70,14.0
2,Carson Palmquist,24.9555555,89-92,94,45,40,40,50.0,55.0,40.0,40.0,40.0,55.0,,,45,50,3.0
3,Mason Albright,21.9611111,91-92,94,40,40,45,,,55.0,60.0,30.0,45.0,,,40,55,12.0
4,Eiberson Castellano,24.3944444,93-95,96,40,50,55,,,60.0,60.0,40.0,50.0,,,30,40,
5,Brayan Castillo,23.0555555,93-96,98,40,50,55,45.0,50.0,,,40.0,50.0,,,35,50,
6,Victor Juarez,20.2833333,91-93,94,40,30,40,,,50.0,50.0,50.0,60.0,,,35,60,
7,Jack Mahoney,24.1333333,92-94,97,40,45,45,45.0,50.0,,,45.0,50.0,,,35,55,3.0
8,Chris McMahon,23.6583333,91-93,95,40,40,45,50.0,55.0,,,45.0,55.0,,,35,50,2.0
9,Michael Prosecky,24.5861111,90-93,95,37,45,50,20.0,50.0,55.0,60.0,30.0,45.0,,,30,45,6.0


In [12]:
# Drop NA
pitchers_df.dropna(axis=0, how='all')

# Change Range to Sits
pitcher_ranges = pitchers_df['Range'].to_numpy()
sits = np.zeros(pitcher_ranges.shape)
for i, pitcher_range in enumerate(pitcher_ranges):
    min, max = pitcher_range.split('-')
    sits[i] = np.mean([int(min), int(max)])
pitchers_df = pitchers_df.drop(['Range'], axis=1)
pitchers_df.insert(2, 'Sits', sits)

# Change Touch to Tops
pitchers_df.rename(columns = {'Touch' : 'Tops'}, inplace=True)
pitchers_df

Unnamed: 0,playerName,Age,Sits,Tops,FV_Current,pFB,fFB,pSL,fSL,pCB,fCB,pCH,fCH,pCT,fCT,pCMD,fCMD,Draft_Rnd
0,McCade Brown,25.3777777,95.0,99,45,60,60,50.0,55.0,50.0,55.0,30.0,45.0,,,40,45,3.0
1,Pierson Ohl,26.0583333,90.5,93,40,35,35,45.0,45.0,,,55.0,60.0,,,60,70,14.0
2,Carson Palmquist,24.9555555,90.5,94,45,40,40,50.0,55.0,40.0,40.0,40.0,55.0,,,45,50,3.0
3,Mason Albright,21.9611111,91.5,94,40,40,45,,,55.0,60.0,30.0,45.0,,,40,55,12.0
4,Eiberson Castellano,24.3944444,94.0,96,40,50,55,,,60.0,60.0,40.0,50.0,,,30,40,
5,Brayan Castillo,23.0555555,94.5,98,40,50,55,45.0,50.0,,,40.0,50.0,,,35,50,
6,Victor Juarez,20.2833333,92.0,94,40,30,40,,,50.0,50.0,50.0,60.0,,,35,60,
7,Jack Mahoney,24.1333333,93.0,97,40,45,45,45.0,50.0,,,45.0,50.0,,,35,55,3.0
8,Chris McMahon,23.6583333,92.0,95,40,40,45,50.0,55.0,,,45.0,55.0,,,35,50,2.0
9,Michael Prosecky,24.5861111,91.5,95,37,45,50,20.0,50.0,55.0,60.0,30.0,45.0,,,30,45,6.0


## Normalize

In [13]:
norm_pitchers_df = pd.DataFrame(pitchers_df['playerName'])
 
for col in pitchers_df.columns[1:]:
    vals = pitchers_df[col].to_numpy().astype(float)
    
    if col == 'Age':
        min, max = 40, 16
    elif col == 'Sits':
        min, max = 86.5, 98.2
    elif col == 'Tops':
        min, max = 89, 102
    elif col == 'Draft_Rnd':
        min, max = 10, 1
    else:
        min, max = 20, 80

    norm_pitchers_df[col] = 100 * (vals - min) / (max - min)

stat_cols = norm_pitchers_df.columns[1:].to_list()
norm_pitchers_df
    

Unnamed: 0,playerName,Age,Sits,Tops,FV_Current,pFB,fFB,pSL,fSL,pCB,fCB,pCH,fCH,pCT,fCT,pCMD,fCMD,Draft_Rnd
0,McCade Brown,60.925926,72.649573,76.923077,41.666667,66.666667,66.666667,50.0,58.333333,50.0,58.333333,16.666667,41.666667,,,33.333333,41.666667,77.777778
1,Pierson Ohl,58.090278,34.188034,30.769231,33.333333,25.0,25.0,41.666667,41.666667,,,58.333333,66.666667,,,66.666667,83.333333,-44.444444
2,Carson Palmquist,62.685185,34.188034,38.461538,41.666667,33.333333,33.333333,50.0,58.333333,33.333333,33.333333,33.333333,58.333333,,,41.666667,50.0,77.777778
3,Mason Albright,75.162037,42.735043,38.461538,33.333333,33.333333,41.666667,,,58.333333,66.666667,16.666667,41.666667,,,33.333333,58.333333,-22.222222
4,Eiberson Castellano,65.023148,64.102564,53.846154,33.333333,50.0,58.333333,,,66.666667,66.666667,33.333333,50.0,,,16.666667,33.333333,
5,Brayan Castillo,70.601852,68.376068,69.230769,33.333333,50.0,58.333333,41.666667,50.0,,,33.333333,50.0,,,25.0,50.0,
6,Victor Juarez,82.152778,47.008547,38.461538,33.333333,16.666667,33.333333,,,50.0,50.0,50.0,66.666667,,,25.0,66.666667,
7,Jack Mahoney,66.111111,55.555556,61.538462,33.333333,41.666667,41.666667,41.666667,50.0,,,41.666667,50.0,,,25.0,58.333333,77.777778
8,Chris McMahon,68.090278,47.008547,46.153846,33.333333,33.333333,41.666667,50.0,58.333333,,,41.666667,58.333333,,,25.0,50.0,88.888889
9,Michael Prosecky,64.224537,42.735043,46.153846,28.333333,41.666667,50.0,0.0,50.0,58.333333,66.666667,16.666667,41.666667,,,16.666667,41.666667,44.444444


## Test Scoring

In [14]:
# raw_weights = np.random.rand(norm_pitchers_df.shape[1]-1) # Would be from sliders
# weights = {c: w for c, w in zip(stat_cols, raw_weights)}

# def normalize_weights(raw_weights):
#     total = sum(raw_weights.values())
#     if total == 0:
#         # fallback: equal weights
#         n = len(raw_weights)
#         return {k: 1/n for k in raw_weights}
#     return {k: v / total for k, v in raw_weights.items()}


# def score(df, weights, stat_cols):

#     weights = normalize_weights(weights)
#     sum(weights.values())

#     scores = np.zeros(df.shape[0])
#     for i in range(df.shape[0]):
#         scores[i] = np.nansum([v*w for v, (_, w) in zip(df[stat_cols].iloc[i].to_numpy(), weights.items())])

#     df['scores'] = scores

#     return df.sort_values('scores', ascending=False)

# scored_df = score(norm_pitchers_df, weights, stat_cols)
# scored_df

## Test widgets

In [17]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display


# --- functions ---
def normalize_weights(raw_weights):
    total = sum(raw_weights.values())
    if total == 0:
        n = len(raw_weights)
        return {k: 1/n for k in raw_weights}
    return {k: v / total for k, v in raw_weights.items()}

def score_prospects(df, stat_cols, weights):
    w = np.array([weights[c] for c in stat_cols])
    df = df.copy()
    scores = np.zeros(df.shape[0])
    for i in range(df.shape[0]):
        scores[i] = np.nansum([v*w for v, (_, w) in zip(df[stat_cols].iloc[i].to_numpy(), weights.items())])
    df['score'] = scores    
    df["rank"] = df["score"].rank(ascending=False, method="dense").astype(int)
    return df.sort_values("score", ascending=False)

# --- sliders ---
sliders = {
    stat: widgets.FloatSlider(
        value=0,
        min=0,
        max=100,
        step=1,
        description=stat,
        continuous_update=True
    )
    for stat in stat_cols
}

out = widgets.Output()

# --- update function ---
def update_scores(change=None):
    raw = {stat: sliders[stat].value for stat in stat_cols}
    weights = normalize_weights(raw)
    scored_df = score_prospects(norm_pitchers_df, stat_cols, weights)
    
    with out:
        out.clear_output(wait=True)
        display(pd.DataFrame({
            "Player": scored_df["playerName"],
            "Score": scored_df["score"].round(3),
            "Rank": scored_df["rank"]
        }))

# --- connect sliders ---
for s in sliders.values():
    s.observe(update_scores, names="value")

# Sliders in a vertical column
sliders_box = widgets.VBox(list(sliders.values()))

# Output will be in its own box
output_box = widgets.VBox([out])

# Combine sliders and output horizontally
ui = widgets.HBox([sliders_box, output_box], layout=widgets.Layout(align_items='flex-start'))

display(ui)

# initial call
update_scores()


HBox(children=(VBox(children=(FloatSlider(value=0.0, description='Age', step=1.0), FloatSlider(value=0.0, descâ€¦