In [13]:
from bs4 import BeautifulSoup
from collections import defaultdict
import copy
import json
import requests
from transitions import Machine
from transitions.extensions import GraphMachine
from models import Pitch, Pitcher, init_db, SessionLocal
import concurrent.futures

In [14]:
db = SessionLocal()

In [3]:
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"#results"
lst = [requests.get(url)]

# Url with results for a certain pitcher from one game
url2= "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=NYM%7C&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&player_id=622663&ep_game_date=2024-08-17&ep_game_pk=745785"

res = requests.get(url=url2)

parser_input = res.text

html_page = BeautifulSoup(parser_input, "html.parser")
rows = html_page.find_all(attrs={"style": "text-align: center;"})
pitch_data = [str(x) for x in rows if "search-pitch-label" in str(x)]
pitch_sequence = [x.split("\">")[2].split("<")[0] for x in pitch_data]
pitch_sequence.reverse()

In [65]:
# Function to parse out pitcher names and IDs from main page
def populate_pitchers_table(base_url: str):
    db = SessionLocal()
    
    base_url_results = base_url + "#results"
    res = requests.get(url=base_url_results)
    
    souped_page = SoupFactory([res]).convert_to_soup()[0]
    pitcher_tds = souped_page.find_all("td", class_="player_name")
    count = 0
    for entry in pitcher_tds:
        pitcher_id_str = str(entry).split("id_")[1].split("\"")[0]
        
        pitcher_id_int = int(pitcher_id_str)
        
        existing_pitcher = db.query(Pitcher).filter_by(pitcher_id=pitcher_id_int).first()
        if not existing_pitcher:
            pitcher_name = str(entry).split(">\n")[1].split(" <")[0].split(",")
            stripped_name = [x.strip() for x in pitcher_name]
            pitcher_name_str = f"{stripped_name[1]} {stripped_name[0]}"
            print(f"Adding pitcher {pitcher_name_str} with id {pitcher_id_int} to db")
            new_pitcher = Pitcher(pitcher_name=pitcher_name_str, pitcher_id=pitcher_id_int)
            db.add(new_pitcher)
            db.commit()
            count += 1

        else:
            continue
    print(f"Finished adding {count} players to db. Closing connection now")
    db.close()

def populate_pitches_table():
    print("asdf")
    
    

# Class that takes requests from baseball savant and preps them to be fed into
# parser
class SoupFactory:
    def __init__(self, reqs: list[requests.models.Response]):
        self.requests = reqs
        self.num_pages = len(reqs)
        self.valid_requests = self.check_response_codes()
        
    def check_response_codes(self):
        valid_requests = []
        for idx, req in enumerate(self.requests):
            try:
                if req.status_code == 200:
                    valid_requests.append(req)
            except ValueError:
                print(f"Warning: No status code found for request {idx} with status code {req.status_code}") 
        return valid_requests
        
    def convert_to_soup(self):
        souped_requests = []
        for idx, req in enumerate(self.valid_requests):
            try:
                text = req.text
                html_page = BeautifulSoup(text, "html.parser")
                souped_requests.append(html_page)
            except ValueError:
                print(f"Warn: Cannot parse text for request {idx}")
        if len(souped_requests) == 0:
            print("Error: No requests could be parsed. Please ensure list of requests is non-empty.")
            return
        else:
            return souped_requests

# This is expecting the main pitch page as an input 
class GameUrlRetriever:
    def __init__(self, base_url):
        self.base_url = base_url
        self.main_page = self.get_main_page()
        self.urls = self.retrieve_urls(self.main_page)
        
    def get_main_page(self):
        base_url_results = self.base_url + "#results"
        result = [requests.get(base_url_results)]
        main_page = SoupFactory(result).convert_to_soup()
        return main_page
        
    def retrieve_urls(self, main_page):
        # Exract all trs which contain the information we need to make API calls
        # to the game pitch results
        pitcher_trs = self.main_page[0].find_all("tr", class_="search_row default-table-row")
        game_urls = []
        for entry in pitcher_trs:
            try:
                player_name_date = str(entry).split("player_name-date_")[1].split("\"")[0]
                player_name_date = player_name_date.split("_")
                player_id, date, game_id = player_name_date[0], player_name_date[1], player_name_date[2]
                game_url = self.base_url + f"&type=details&player_id={player_id}&ep_game_date={date}&ep_game_pk={game_id}"
                game_urls.append({"game_url": game_url, "player_id": player_id, "game_id": game_id})
            except ValueError:
                print("Warn: failed to parse player, date, and game information from entry")
        return game_urls

# Class to take in the list of URLs and process them in parallel
class GameUrlProcessor:
    def __init__(self, game_dic_list: list):
        self.game_list = game_dic_list
    
    def process_urls(self):
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {executor.submit(self.fetch_url_and_extract_pitch, game): game for game in self.game_list}
            for future in concurrent.futures.as_completed(futures):
                print(future.result())
        
            
    def fetch_url_and_extract_pitch(self, game_info):
        try:
            print("hi")
            response = requests.get(game_info["game_url"])
            html_page = SoupFactory([response]).convert_to_soup()
            trs = page[0].find_all("tr")[1:-1]
            pitcher_id_int = int(game_info["player_id"])
            game_id_int = int(game_info["game_id"])
            for idx, tr in enumerate(trs):
                pitch_data = []
                for elem in tr:
                    inner_text = elem.text.strip()
                    if inner_text != "":
                        pitch_data.append(inner_text)
                new_pitch = self.parse_pitch_info(pitch_data)
                pitch_data.append(new_pitch)
            return pitch_data
            
        except Exception as e:
            return f"Error: {e}"
        
    def parse_pitch_info(self, pitch_data):
        try:
            pitch_date_str = pitch_data[0]
            pitch_type_str = pitch_data[1]
            pitch_mph_flo = float(pitch_data[2])
            spin_rate_int = int(pitch_data[3])
            pitch_number_int = idx
            pitch_result_str = pitch_data[-2]
            pa_result_str = pitch_data[-1]
            new_pitch = Pitch(
                pitcher_id=pitcher_id_int, 
                game_id=game_id_int,
                pitch_number=pitch_number_int,
                pitch_type=pitch_type_str,
                pitch_result=pitch_result_str,
                pa_result=pa_result_str,
                spin_rate=spin_rate_int,
                pitch_mph=pitch_mph_flo,
                pitch_date=pitch_date_str
            )
        except ValueError:
            print("Error parsing pitch data. Skipping row")
        return new_pitch

In [43]:
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"
game_urls = GameUrlRetriever(url)

In [66]:
qqq = GameUrlProcessor(game_urls.urls[0:3])

hi
hi
hi
['2024-04-09', 'SI', '92.6', '2071', 'Bassitt, Chris(R)', 'Crawford, J.P.(L)', '90.3', '27°', '324', '7', '1-0', 'b\n\t\t\t\tTop 1', 'Hit Into Play', 'J.P. Crawford flies out to left fielder Davis Schneider.', <models.Pitch object at 0x28b809580>]
['2024-04-09', 'SI', '92.6', '2071', 'Bassitt, Chris(R)', 'Crawford, J.P.(L)', '90.3', '27°', '324', '7', '1-0', 'b\n\t\t\t\tTop 1', 'Hit Into Play', 'J.P. Crawford flies out to left fielder Davis Schneider.', <models.Pitch object at 0x28b8d0f80>]
['2024-04-09', 'SI', '92.6', '2071', 'Bassitt, Chris(R)', 'Crawford, J.P.(L)', '90.3', '27°', '324', '7', '1-0', 'b\n\t\t\t\tTop 1', 'Hit Into Play', 'J.P. Crawford flies out to left fielder Davis Schneider.', <models.Pitch object at 0x1787b6540>]


In [4]:
class MarkovChain:
    # to do, convert sequence input to list of ints
    def __init__(self, sequence: list[str]):
        self.sequence = sequence
        self.states = [state for state in set(sequence)]
        self.sequence_length = len(sequence)
        self.transition_counts = self.generate_counts()
        self.transition_probabilities = self.generate_probabilities()
        
    def generate_counts(self):
        states = set(self.sequence)
        state_dict = defaultdict(lambda: defaultdict(int))
        for state1 in states:
            for state2 in states:
                state_dict[state1][state2] = 0

        old_pitch = self.sequence[0]
        
        for i in range(1,len(self.sequence)):
            new_pitch = self.sequence[i]
            state_dict[old_pitch][new_pitch] += 1
            old_pitch = new_pitch
        return state_dict
        
    def generate_probabilities(self):
        transition_probabilities = copy.deepcopy(self.transition_counts)
        
        for state1 in transition_probabilities:
            total_count = sum([x for x in transition_probabilities[state1].values()])
            for state2 in transition_probabilities:
                transition_probabilities[state1][state2] /= total_count
        return transition_probabilities

    def generate_state_machine(self):

        transitions = []
        for state1 in self.states:
            for state2 in self.states:
                if self.transition_probabilities[state1][state2] != 0:
                    transitions.append({"trigger": str(round(self.transition_probabilities[state1][state2],3)), "source": state1, "dest": state2})
        
        pitch_machine = GraphMachine(states=self.states, transitions=transitions, initial=self.sequence[0])
        return pitch_machine

In [16]:
temp = MarkovChain(pitch_sequence)

In [17]:
test = temp.generate_state_machine()

In [18]:
test.get_graph().draw('my_state_diagram.jpg', prog="dot")

In [10]:
temp.transition_probabilities

defaultdict(<function __main__.MarkovChain.generate_counts.<locals>.<lambda>()>,
            {'ST': defaultdict(int,
                         {'ST': 0.23529411764705882,
                          'CH': 0.058823529411764705,
                          'SI': 0.17647058823529413,
                          'SL': 0.0,
                          'FF': 0.29411764705882354,
                          'FC': 0.23529411764705882}),
             'CH': defaultdict(int,
                         {'ST': 0.0,
                          'CH': 0.0,
                          'SI': 0.0,
                          'SL': 0.0,
                          'FF': 1.0,
                          'FC': 0.0}),
             'SI': defaultdict(int,
                         {'ST': 0.21052631578947367,
                          'CH': 0.0,
                          'SI': 0.3684210526315789,
                          'SL': 0.0,
                          'FF': 0.2631578947368421,
                          'FC': 0.15789473684210525

In [5]:
url = "https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc"
game_urls = GameUrlRetriever(url)

In [14]:
game_urls.urls[2]

{'game_url': 'https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2024%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name-date&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&type=details&player_id=605135&ep_game_date=2024-04-09&ep_game_pk=744957',
 'player_id': '605135',
 'game_id': '744957'}

In [8]:
test = requests.get(game_urls.urls[2]["game_url"])


In [9]:
page = SoupFactory([test]).convert_to_soup()

In [18]:
rows = page[0].find_all(attrs={"style": "text-align: center;"})
trs = page[0].find_all("tr")
pitch_data = [str(x) for x in rows if "search-pitch-label" in str(x)]
pitch_sequence = [x.split("\">")[2].split("<")[0] for x in pitch_data]
pitch_sequence.reverse()

In [22]:

for idx, pitch in enumerate(pitch_sequence):
    pitch_instance = Pitch(pitch_type=pitch, pitcher_id

0 SI
1 SI
2 SI
3 FC
4 FC
5 SI
6 CU
7 SI
8 SI
9 SI
10 ST
11 SI
12 SI
13 ST
14 FS
15 SI
16 SI
17 FC
18 SI
19 SI
20 FC
21 SL
22 SI
23 SI
24 CU
25 FC
26 SL
27 FC
28 SI
29 CU
30 ST
31 FC
32 FC
33 SL
34 SI
35 SI
36 FC
37 ST
38 SI
39 SI
40 SI
41 FC
42 CH
43 CH
44 SI
45 SI
46 SL
47 SI
48 SI
49 SI
50 FS
51 CU
52 FS
53 SL
54 SI
55 FF
56 FC
57 FC
58 FS
59 SI
60 FF
61 SI
62 CU
63 SI
64 SI
65 ST
66 SI
67 FC
68 CU
69 ST
70 SI
71 FC
72 SI
73 FC
74 SI
75 CU
76 FC
77 FC
78 SI
79 ST
80 CU
81 SL
82 SL
83 SI
84 ST
85 SI
86 ST
87 SI
88 FC
89 FF
90 SI
91 SI
92 FC
93 FC
94 ST
95 SL
96 SL
97 SI
98 SI
99 CH
100 CH
101 SI
102 SI
103 SI
104 FS
105 FS
106 CH
107 CH
108 SI
109 SL
110 CU
111 CU
112 FC
113 SI
114 SI


In [23]:
page

[<script>
 	// AJAX options to hide show values
 	$('#chk_ajax_spin_rate_605135').on("click", function() {
 			$('.ajax_spin_rate_605135').toggle();
 	});
 	$('#chk_ajax_spin_axis_605135').on("click", function() {
 			$('.ajax_spin_axis_605135').toggle();
 	});
 
 	$('#chk_ajax_des_605135').on("click", function() {
 			$('.ajax_des_605135').toggle();
 	});
 	$('#chk_ajax_description_605135').on("click", function() {
 			$('.ajax_description_605135').toggle();
 	});
 	$('#chk_ajax_inning_605135').on("click", function() {
 			$('.ajax_inning_605135').toggle();
 	});
 
 	$('#chk_ajax_launch_angle_605135').on("click", function() {
 			$('.ajax_launch_angle_605135').toggle();
 	});
 	$('#chk_ajax_hang_time_605135').on("click", function() {
 			$('.ajax_hang_time_605135').toggle();
 	});
 	$('#chk_ajax_effective_speed_605135').on("click", function() {
 			$('.ajax_effective_speed_605135').toggle();
 	});
 
 </script>
 <!-- Show/Hide Table Values<br/>
 <div style="float: left; padding-right: 

In [10]:
trs = page[0].find_all("tr")[1:-1]
pitcher_id_int = int(game_urls.urls[2]["player_id"])
game_id_int = int(game_urls.urls[2]["game_id"])
for idx, tr in enumerate(trs):
    pitch_data = []
    for elem in tr:
        inner_text = elem.text.strip()
        if inner_text != "":
            pitch_data.append(inner_text)
    try:
        pitch_date_str = pitch_data[0]
        pitch_type_str = pitch_data[1]
        pitch_mph_flo = float(pitch_data[2])
        spin_rate_int = int(pitch_data[3])
        pitch_number_int = idx
        pitch_result_str = pitch_data[-2]
        pa_result_str = pitch_data[-1]
        new_pitch = Pitch(
            pitcher_id=pitcher_id_int, 
            game_id=game_id_int,
            pitch_number=pitch_number_int,
            pitch_type=pitch_type_str,
            pitch_result=pitch_result_str,
            pa_result=pa_result_str,
            spin_rate=spin_rate_int,
            pitch_mph=pitch_mph_flo,
            pitch_date=pitch_date_str
        )
        db.add(new_pitch)
        db.commit()
    except ValueError:
        print("Error parsing pitch data. Skipping row")

In [50]:
len(game_urls.urls)

10000

In [51]:
pitch_date_str

'2024-04-09'