In [5]:
from bs4 import BeautifulSoup
from collections import defaultdict
import copy
import json
import requests
from transitions import Machine
from transitions.extensions import GraphMachine
from models import Pitch, Pitcher, init_db, SessionLocal

In [18]:
# Module-level database session for ad-hoc use in the notebook.
# NOTE(review): no matching db.close() is visible in these cells, and
# populate_pitchers_table opens and closes its own separate session.
db = SessionLocal()

In [207]:
# Statcast search URL filtered by hfSea=2024, hfTeam=NYM and player_type=pitcher,
# grouped by name-date. The `#results"` after the closing quote is a Python
# comment, so it is NOT part of the string.
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"#results"
# Fetch the search page and keep the response in a one-element list.
# NOTE(review): `lst` is not read by any other visible cell — confirm it is still needed.
lst = [requests.get(url)]

# Same search with `type=details` plus player_id / ep_game_date / ep_game_pk appended:
# the results for a certain pitcher from one game.
url2= "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&player_id=622663&ep_game_date=2024-08-17&ep_game_pk=745785"

# Download the single-game detail page addressed by `url2`.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
res = requests.get(url=url2, timeout=30)
res.raise_for_status()

parser_input = res.text

html_page = BeautifulSoup(parser_input, "html.parser")
# Every element styled "text-align: center;" is a candidate cell.
rows = html_page.find_all(attrs={"style": "text-align: center;"})
# Keep only the markup of cells that contain a "search-pitch-label"; each cell
# is stringified once instead of twice.
pitch_data = [markup for markup in map(str, rows) if "search-pitch-label" in markup]
# The label is the text that follows the second `">` in the cell markup, up to
# the next `<`.
pitch_sequence = [x.split("\">")[2].split("<")[0] for x in pitch_data]
# NOTE(review): presumably the page lists pitches newest-first, hence the
# in-place reversal — confirm against the page.
pitch_sequence.reverse()

In [111]:
# Function to parse out pitcher names and IDs from main page
def populate_pitchers_table(base_url: str, timeout: float = 30):
    """Scrape pitcher names and ids from the search page and store new ones.

    The page at ``base_url + "#results"`` is fetched and parsed through
    ``SoupFactory``. Every ``<td class="player_name">`` cell is then turned
    into a ``Pitcher`` row unless a row with the same ``pitcher_id`` already
    exists. Each new pitcher is committed individually.

    Args:
        base_url: Search URL of the main results page.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        None. Progress and the number of added pitchers are printed.
    """
    base_url_results = base_url + "#results"
    res = requests.get(url=base_url_results, timeout=timeout)

    # convert_to_soup() returns None when no response could be parsed; bail out
    # with a message instead of failing on `None[0]`.
    souped_pages = SoupFactory([res]).convert_to_soup()
    if not souped_pages:
        print("Warn: main page could not be parsed; no pitchers were added")
        return
    pitcher_tds = souped_pages[0].find_all("td", class_="player_name")

    count = 0
    db = SessionLocal()
    try:
        for entry in pitcher_tds:
            entry_html = str(entry)

            # The id is embedded in the cell markup as `id_<digits>"`.
            try:
                pitcher_id_int = int(entry_html.split("id_")[1].split("\"")[0])
            except (IndexError, ValueError):
                print("Warn: failed to parse pitcher id from entry")
                continue

            existing_pitcher = db.query(Pitcher).filter_by(pitcher_id=pitcher_id_int).first()
            if existing_pitcher:
                continue

            # The cell text is "Last, First"; flip it to "First Last".
            try:
                pitcher_name = entry_html.split(">\n")[1].split(" <")[0].split(",")
                stripped_name = [x.strip() for x in pitcher_name]
                pitcher_name_str = f"{stripped_name[1]} {stripped_name[0]}"
            except IndexError:
                print(f"Warn: failed to parse pitcher name for id {pitcher_id_int}")
                continue

            print(f"Adding pitcher {pitcher_name_str} with id {pitcher_id_int} to db")
            new_pitcher = Pitcher(pitcher_name=pitcher_name_str, pitcher_id=pitcher_id_int)
            db.add(new_pitcher)
            db.commit()
            count += 1
        print(f"Finished adding {count} players to db. Closing connection now")
    finally:
        # Always release the session, even if a query or commit raised.
        db.close()

    
    

# Class that takes HTTP responses from Baseball Savant and converts them into
# BeautifulSoup documents ready to be fed into the parser
class SoupFactory:
    """Turn HTTP responses into parsed BeautifulSoup documents.

    Attributes:
        requests: The responses passed in, in their original order.
        num_pages: How many responses were passed in.
        valid_requests: The subset of ``requests`` whose status code is 200.
    """

    # The hint is quoted so it is resolved lazily rather than evaluated when
    # the class body runs.
    def __init__(self, reqs: "list[requests.models.Response]"):
        self.requests = reqs
        self.num_pages = len(reqs)
        self.valid_requests = self.check_response_codes()

    def check_response_codes(self):
        """Return the responses with status code 200, warning about the rest.

        Responses with any other status code, or with no ``status_code``
        attribute at all, are skipped and reported on stdout instead of being
        dropped silently.
        """
        valid_requests = []
        for idx, req in enumerate(self.requests):
            status_code = getattr(req, "status_code", None)
            if status_code == 200:
                valid_requests.append(req)
            elif status_code is None:
                print(f"Warning: No status code found for request {idx}")
            else:
                print(f"Warning: Skipping request {idx} with status code {status_code}")
        return valid_requests

    def convert_to_soup(self):
        """Parse the text of every valid response with ``html.parser``.

        Returns:
            A list of BeautifulSoup documents, one per valid response that
            could be parsed, or ``None`` (after printing an error) when that
            list would be empty. Callers must check for ``None`` before
            indexing the result.
        """
        souped_requests = []
        for idx, req in enumerate(self.valid_requests):
            try:
                souped_requests.append(BeautifulSoup(req.text, "html.parser"))
            except ValueError:
                print(f"Warn: Cannot parse text for request {idx}")
        if not souped_requests:
            print("Error: No requests could be parsed. Please ensure list of requests is non-empty.")
            return None
        return souped_requests

# Expects the base search URL of the main pitch page as input
class GameUrlRetriever:
    """Build the per-pitcher, per-game detail URLs listed on the main search page.

    Attributes:
        base_url: Search URL of the main results page.
        timeout: Seconds to wait for the main page before giving up.
        main_page: Result of ``SoupFactory.convert_to_soup()`` for the main
            page: a one-element list of parsed documents, or ``None`` when
            the page could not be fetched or parsed.
        urls: Detail URLs derived from ``main_page``.
    """

    def __init__(self, base_url, timeout=30):
        self.base_url = base_url
        self.timeout = timeout
        self.main_page = self.get_main_page()
        self.urls = self.retrieve_urls(self.main_page)

    def get_main_page(self):
        """Fetch ``base_url + "#results"`` and return it parsed via SoupFactory."""
        base_url_results = self.base_url + "#results"
        # The timeout prevents a stalled connection from hanging the notebook.
        result = [requests.get(base_url_results, timeout=self.timeout)]
        main_page = SoupFactory(result).convert_to_soup()
        return main_page

    def retrieve_urls(self, main_page):
        """Derive one detail URL per result row of ``main_page``.

        Args:
            main_page: List whose first element is the parsed main page, as
                returned by ``get_main_page``. ``None`` or an empty list
                yields no URLs.

        Returns:
            List of detail URLs. Rows whose identifier cannot be parsed are
            skipped with a warning.
        """
        if not main_page:
            print("Warn: no main page available; returning no game urls")
            return []

        # Extract all trs which contain the information we need to make API calls
        # to the game pitch results
        pitcher_trs = main_page[0].find_all("tr", class_="search_row default-table-row")
        game_urls = []
        for entry in pitcher_trs:
            try:
                # The row markup embeds `player_name-date_<player>_<date>_<game>"`.
                player_name_date = str(entry).split("player_name-date_")[1].split("\"")[0]
                player_id, date, game_id = player_name_date.split("_")[:3]
            except (IndexError, ValueError):
                # IndexError: marker missing; ValueError: fewer than three parts.
                print("Warn: failed to parse player, date, and game information from entry")
                continue
            game_url = self.base_url + f"&type=details&player_id={player_id}&ep_game_date={date}&ep_game_pk={game_id}"
            game_urls.append(game_url)
        return game_urls

# TODO: add a class that takes the list of game URLs and processes them in parallel (not implemented yet)



In [109]:
class MarkovChain:
    """First-order transition model estimated from an observed state sequence.

    Attributes:
        sequence: The observed sequence, as passed in.
        states: Distinct states in order of first appearance.
        sequence_length: Number of observations in ``sequence``.
        transition_counts: ``counts[a][b]`` is how often ``b`` directly
            followed ``a`` in ``sequence``.
        transition_probabilities: ``probs[a][b]`` is ``counts[a][b]`` divided
            by the number of transitions leaving ``a``.
    """

    # to do, convert sequence input to list of ints
    def __init__(self, sequence: list[str]):
        self.sequence = sequence
        # dict.fromkeys de-duplicates while keeping first-appearance order, so
        # the state list is deterministic (a set gives an arbitrary order).
        self.states = list(dict.fromkeys(sequence))
        self.sequence_length = len(sequence)
        self.transition_counts = self.generate_counts()
        self.transition_probabilities = self.generate_probabilities()

    def generate_counts(self):
        """Count how often each state is directly followed by each other state.

        Returns:
            Nested defaultdict with an explicit (possibly zero) entry for every
            ordered pair of observed states. An empty sequence gives an empty
            mapping.
        """
        states = list(dict.fromkeys(self.sequence))
        state_dict = defaultdict(lambda: defaultdict(int))
        for state1 in states:
            for state2 in states:
                state_dict[state1][state2] = 0

        # Pair every observation with its successor; yields nothing for
        # sequences shorter than two, so no indexing into an empty sequence.
        for old_pitch, new_pitch in zip(self.sequence, self.sequence[1:]):
            state_dict[old_pitch][new_pitch] += 1
        return state_dict

    def generate_probabilities(self):
        """Normalise each row of ``transition_counts`` into probabilities.

        A state with no outgoing transitions (for example one that only
        occurs as the final observation) keeps a row of 0.0 instead of
        triggering a division by zero.
        """
        transition_probabilities = copy.deepcopy(self.transition_counts)

        for row in transition_probabilities.values():
            total_count = sum(row.values())
            for state2 in row:
                row[state2] = row[state2] / total_count if total_count else 0.0
        return transition_probabilities

    def generate_state_machine(self):
        """Build a GraphMachine with one edge per non-zero transition.

        Each edge's trigger name is the transition probability rounded to
        three decimals. The machine starts in the first observed state, so
        the sequence must be non-empty.
        """
        transitions = []
        for state1 in self.states:
            for state2 in self.states:
                probability = self.transition_probabilities[state1][state2]
                if probability != 0:
                    transitions.append({"trigger": str(round(probability, 3)), "source": state1, "dest": state2})

        pitch_machine = GraphMachine(states=self.states, transitions=transitions, initial=self.sequence[0])
        return pitch_machine

In [16]:
# Fit a MarkovChain to the scraped pitch_sequence.
# NOTE(review): `temp` is rebound to a list by the name-parsing scratch cell
# further down, so cells reading `temp` depend on execution order.
temp = MarkovChain(pitch_sequence)

In [17]:
# Build the GraphMachine whose edges are the non-zero transitions of `temp`.
test = temp.generate_state_machine()

In [18]:
# Write the state diagram to my_state_diagram.jpg in the working directory,
# using the "dot" layout program (presumably Graphviz — confirm it is installed).
test.get_graph().draw('my_state_diagram.jpg', prog="dot")

In [10]:
# Display the estimated transition probabilities: outer key is the current
# state, inner key is the next state.
temp.transition_probabilities

defaultdict(<function __main__.MarkovChain.generate_counts.<locals>.<lambda>()>,
            {'ST': defaultdict(int,
                         {'ST': 0.23529411764705882,
                          'CH': 0.058823529411764705,
                          'SI': 0.17647058823529413,
                          'SL': 0.0,
                          'FF': 0.29411764705882354,
                          'FC': 0.23529411764705882}),
             'CH': defaultdict(int,
                         {'ST': 0.0,
                          'CH': 0.0,
                          'SI': 0.0,
                          'SL': 0.0,
                          'FF': 1.0,
                          'FC': 0.0}),
             'SI': defaultdict(int,
                         {'ST': 0.21052631578947367,
                          'CH': 0.0,
                          'SI': 0.3684210526315789,
                          'SL': 0.0,
                          'FF': 0.2631578947368421,
                          'FC': 0.15789473684210525

In [72]:
# Rebinds `url` to the same search with hfTeam left empty (no team filter).
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"
# NOTE(review): the assignment below is commented out, yet a later cell reads
# `game_urls`; on a fresh kernel that cell raises NameError unless this line is
# re-enabled.
#game_urls = GameUrlRetriever(url)

In [13]:
# Preview the first three generated detail URLs.
# NOTE(review): `game_urls` is not assigned in any visible, active cell — this
# relies on leftover kernel state.
game_urls.urls[0:3]

['https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&player_id=622663&ep_game_date=2024-08-17&ep_game_pk=745785',
 'https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=

In [112]:
# Scrape the page at `url` and insert any pitchers not yet in the database;
# the function prints one line per pitcher added.
populate_pitchers_table(url)


Adding pitcher Kevin Gausman with id 592332 to db
Adding pitcher Bowden Francis with id 670102 to db
Adding pitcher Chris Bassitt with id 605135 to db
Adding pitcher Zack Wheeler with id 554430 to db
Adding pitcher Blake Snell with id 605483 to db
Adding pitcher JP Sears with id 676664 to db
Adding pitcher Dylan Cease with id 656302 to db
Adding pitcher Chris Sale with id 519242 to db
Adding pitcher Luis Severino with id 622663 to db
Adding pitcher Kyle Gibson with id 502043 to db
Adding pitcher Ranger Suárez with id 624133 to db
Adding pitcher Hunter Greene with id 668881 to db
Adding pitcher Tanner Houck with id 656557 to db
Adding pitcher Seth Lugo with id 607625 to db
Adding pitcher Joey Lucchesi with id 664192 to db
Adding pitcher Aaron Nola with id 605400 to db
Adding pitcher Freddy Peralta with id 642547 to db
Adding pitcher MacKenzie Gore with id 669022 to db
Adding pitcher Pablo López with id 641154 to db
Adding pitcher Jake Irvin with id 663623 to db
Adding pitcher Joe Ryan w

In [74]:
# Sample `player_name` cell markup used to prototype the name parsing below.
ww = "<td class=\"player_name tr-data align-left table-static-column-two\" id=\"id_663738\"> Lynch IV, Daniel <span class=\"search-label\">LHP </span> </td>"

In [90]:
# Take the text between the first "> " and the next " <", then split it on the
# comma into [last, first] and trim the surrounding whitespace.
# NOTE(review): this rebinds `temp`, which an earlier cell uses for a MarkovChain;
# populate_pitchers_table splits on ">\n" rather than "> ".
temp = ww.split("> ")[1].split(" <")[0].split(",")
tt = [x.strip() for x in temp]

In [94]:
# Reassemble the parts as "First Last".
f"{tt[1]} {tt[0]}" 

'Daniel Lynch IV'