In [31]:
from bs4 import BeautifulSoup
from collections import defaultdict
import copy
import json
import requests
from transitions import Machine
from transitions.extensions import GraphMachine

In [207]:
# Statcast search URL (query string selects hfSea=2024, hfTeam=NYM, player_type=pitcher,
# group_by=name-date). Note that the string literal ends at `desc"`; the trailing
# `#results"` is parsed by Python as a comment, not as part of the URL.
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"#results"
# NOTE(review): `lst` is not read anywhere else in the visible cells, yet building it
# performs a live HTTP request — confirm it is still needed.
lst = [requests.get(url)]

# Same search URL with `type=details`, `player_id`, `ep_game_date` and `ep_game_pk`
# appended: results for a certain pitcher from one game.
url2= "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&player_id=622663&ep_game_date=2024-08-17&ep_game_pk=745785"

# Live HTTP request; the status code is not checked before the body is used.
res = requests.get(url=url2)

parser_input = res.text

html_page = BeautifulSoup(parser_input, "html.parser")
# Every element whose inline style attribute is exactly "text-align: center;".
rows = html_page.find_all(attrs={"style": "text-align: center;"})
# Keep only the elements whose markup mentions "search-pitch-label".
pitch_data = [str(x) for x in rows if "search-pitch-label" in str(x)]
# Take the text that follows the second `">` up to the next `<`.
# Presumably this is the pitch-type abbreviation (e.g. 'FF') — TODO confirm against the page markup.
pitch_sequence = [x.split("\">")[2].split("<")[0] for x in pitch_data]
# In-place reversal; presumably the page lists pitches newest-first, so this yields
# chronological order — TODO confirm.
pitch_sequence.reverse()

In [219]:
# Class that takes responses fetched from Baseball Savant and prepares them
# (as BeautifulSoup documents) to be fed into the parser
class SoupFactory:
    """Filter HTTP responses and turn the usable ones into BeautifulSoup documents.

    Attributes:
        requests: The response objects passed to the constructor.
        num_pages: Number of responses passed in (valid or not).
        valid_requests: The subset of ``requests`` whose ``status_code`` is 200.
    """

    def __init__(self, reqs: "list[requests.models.Response]"):
        """Store the responses and immediately filter out the unusable ones.

        Args:
            reqs: Response objects, e.g. the return values of ``requests.get``.
        """
        self.requests = reqs
        self.num_pages = len(reqs)
        self.valid_requests = self.check_response_codes()

    def check_response_codes(self):
        """Return the responses whose status code is 200, warning about the rest.

        A response is skipped (with a printed warning) when it has no
        ``status_code`` attribute or when the code is anything other than 200.

        Returns:
            list: The accepted responses, in their original order.
        """
        valid_requests = []
        for idx, req in enumerate(self.requests):
            # getattr with a default: an object without a status code must be
            # reported, not crash the whole batch with AttributeError.
            status_code = getattr(req, "status_code", None)
            if status_code == 200:
                valid_requests.append(req)
            elif status_code is None:
                print(f"Warning: No status code found for request {idx}")
            else:
                print(f"Warning: Skipping request {idx} with status code {status_code}")
        return valid_requests

    def convert_to_soup(self):
        """Parse the text of every valid response with ``html.parser``.

        Returns:
            list | None: One BeautifulSoup document per successfully parsed
            response, or ``None`` (after printing an error) when nothing could
            be parsed. Callers must check for ``None`` before indexing.
        """
        souped_requests = []
        for idx, req in enumerate(self.valid_requests):
            try:
                souped_requests.append(BeautifulSoup(req.text, "html.parser"))
            except ValueError:
                print(f"Warn: Cannot parse text for request {idx}")
        if not souped_requests:
            print("Error: No requests could be parsed. Please ensure list of requests is non-empty.")
            return None
        return souped_requests

# Expects the base Statcast search URL as input; it fetches the main results page itself
class GameUrlRetriever:
    """Fetch the main search page for ``base_url`` and derive per-game detail URLs.

    Attributes:
        base_url: The search URL the detail URLs are built from.
        main_page: Result of ``SoupFactory.convert_to_soup`` for the main page
            (a list of parsed documents, or ``None`` if nothing could be parsed).
        urls: Detail URLs, one per table row that could be parsed.
    """

    def __init__(self, base_url):
        """Fetch the main page and build the list of game URLs.

        Args:
            base_url: Search URL without the ``#results`` fragment.
        """
        self.base_url = base_url
        self.main_page = self.get_main_page()
        # Pass the attribute explicitly: a bare `main_page` is not defined in
        # this scope and only worked when a notebook global of that name existed.
        self.urls = self.retrieve_urls(self.main_page)

    def get_main_page(self):
        """Request ``base_url + "#results"`` and parse it via ``SoupFactory``.

        Returns:
            list | None: Whatever ``SoupFactory.convert_to_soup`` returns.
        """
        base_url_results = self.base_url + "#results"
        result = [requests.get(base_url_results)]
        main_page = SoupFactory(result).convert_to_soup()
        return main_page

    def retrieve_urls(self, main_page):
        """Build one detail URL per ``search_row default-table-row`` table row.

        Args:
            main_page: Sequence of parsed documents; only the first is read.
                ``None`` or an empty sequence yields an empty result.

        Returns:
            list[str]: ``base_url`` extended with ``type=details``,
            ``player_id``, ``ep_game_date`` and ``ep_game_pk`` for every row
            that carries a ``player_name-date_<id>_<date>_<game>`` marker.
            Rows without a usable marker are skipped with a warning.
        """
        if not main_page:
            print("Warn: no main page available, no game urls could be retrieved")
            return []

        # Extract all trs which contain the information we need to make API calls
        # to the game pitch results
        pitcher_trs = main_page[0].find_all("tr", class_="search_row default-table-row")
        game_urls = []
        for entry in pitcher_trs:
            try:
                player_name_date = str(entry).split("player_name-date_")[1].split("\"")[0]
                # Missing marker -> IndexError above; fewer than three fields ->
                # ValueError from the unpacking below. Both mean "unparseable row".
                player_id, date, game_id = player_name_date.split("_")[:3]
            except (IndexError, ValueError):
                print("Warn: failed to parse player, date, and game information from entry")
                continue
            game_url = self.base_url + f"&type=details&player_id={player_id}&ep_game_date={date}&ep_game_pk={game_id}"
            game_urls.append(game_url)
        return game_urls

class Parser:
    """Unfinished stub: no attributes or methods are defined yet."""
    # TODO: implement. The cell previously ended in a bare `class Parser`
    # (no colon, no body), which is a SyntaxError and stopped every class in
    # this cell from being defined.


In [15]:
class MarkovChain:
    """First-order Markov chain estimated from an observed sequence of states.

    Attributes:
        sequence: The observed sequence, as passed in.
        states: Distinct states in first-seen order.
        sequence_length: ``len(sequence)``.
        transition_counts: ``counts[a][b]`` is how often ``b`` directly
            followed ``a`` in ``sequence``.
        transition_probabilities: ``counts`` with each row divided by its row
            total; a row with no observed outgoing transition is all ``0.0``.
    """

    # to do, convert sequence input to list of ints
    def __init__(self, sequence: list[str]):
        """Count transitions in ``sequence`` and normalise them to probabilities.

        Args:
            sequence: Observed states in order. May be empty, in which case
                all derived tables are empty.
        """
        self.sequence = sequence
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # state list is the same on every run (iteration order of a set of
        # strings is not).
        self.states = list(dict.fromkeys(sequence))
        self.sequence_length = len(sequence)
        self.transition_counts = self.generate_counts()
        self.transition_probabilities = self.generate_probabilities()

    def generate_counts(self):
        """Count how often each state is directly followed by each other state.

        Returns:
            defaultdict: Nested mapping ``source -> dest -> count`` with an
            explicit 0 entry for every pair of states seen in ``sequence``.
        """
        states = list(dict.fromkeys(self.sequence))
        state_dict = defaultdict(lambda: defaultdict(int))
        for state1 in states:
            for state2 in states:
                state_dict[state1][state2] = 0

        # Pairing the sequence with itself shifted by one visits every
        # consecutive (previous, next) pair and is a no-op for sequences of
        # length 0 or 1.
        for old_pitch, new_pitch in zip(self.sequence, self.sequence[1:]):
            state_dict[old_pitch][new_pitch] += 1
        return state_dict

    def generate_probabilities(self):
        """Normalise each row of ``transition_counts`` so that it sums to 1.

        Returns:
            defaultdict: A deep copy of ``transition_counts`` holding floats.
            Rows whose total count is 0 (a state that is never followed by
            anything, e.g. one that only occurs as the last element) are left
            as all ``0.0`` instead of dividing by zero.
        """
        transition_probabilities = copy.deepcopy(self.transition_counts)

        for row in transition_probabilities.values():
            total_count = sum(row.values())
            for state2 in row:
                row[state2] = row[state2] / total_count if total_count else 0.0
        return transition_probabilities

    def generate_state_machine(self):
        """Build a ``GraphMachine`` with one transition per non-zero probability.

        Each transition's trigger is the probability rounded to three decimals
        and rendered as a string; the machine starts in the first state of
        ``sequence``.

        Returns:
            GraphMachine: The constructed machine.

        Raises:
            ValueError: If ``sequence`` is empty (there is no initial state).
        """
        if not self.sequence:
            raise ValueError("Cannot build a state machine from an empty sequence")

        transitions = []
        for state1 in self.states:
            for state2 in self.states:
                probability = self.transition_probabilities[state1][state2]
                if probability != 0:
                    transitions.append({"trigger": str(round(probability, 3)), "source": state1, "dest": state2})

        pitch_machine = GraphMachine(states=self.states, transitions=transitions, initial=self.sequence[0])
        return pitch_machine

In [16]:
# Build the chain from `pitch_sequence`, which is defined in the fetch/parse cell above.
temp = MarkovChain(pitch_sequence)

In [17]:
# GraphMachine with one transition per non-zero transition probability.
test = temp.generate_state_machine()

In [18]:
# Write the machine's graph to my_state_diagram.jpg in the working directory, laid out with prog="dot".
test.get_graph().draw('my_state_diagram.jpg', prog="dot")

In [10]:
# Display the nested source -> destination -> probability table.
temp.transition_probabilities

defaultdict(<function __main__.MarkovChain.generate_counts.<locals>.<lambda>()>,
            {'ST': defaultdict(int,
                         {'ST': 0.23529411764705882,
                          'CH': 0.058823529411764705,
                          'SI': 0.17647058823529413,
                          'SL': 0.0,
                          'FF': 0.29411764705882354,
                          'FC': 0.23529411764705882}),
             'CH': defaultdict(int,
                         {'ST': 0.0,
                          'CH': 0.0,
                          'SI': 0.0,
                          'SL': 0.0,
                          'FF': 1.0,
                          'FC': 0.0}),
             'SI': defaultdict(int,
                         {'ST': 0.21052631578947367,
                          'CH': 0.0,
                          'SI': 0.3684210526315789,
                          'SL': 0.0,
                          'FF': 0.2631578947368421,
                          'FC': 0.15789473684210525

In [218]:
# Rebinds `url` to the search URL. The string literal ends at `desc"`; the trailing
# `#results"` is a Python comment. GameUrlRetriever.get_main_page appends "#results" itself.
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"#results"
# Constructing the retriever performs a live HTTP request for the main page.
game_urls = GameUrlRetriever(url)

https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc


In [220]:
# Display the per-game detail URLs built by GameUrlRetriever.
# NOTE(review): the saved output below is a bare number rather than a list — it looks stale; re-run to confirm.
game_urls.urls

677