In [97]:
import requests
import pandas as pd
from datetime import datetime,date
import numpy as np

In [98]:
def phi_correlation(a, b, c, d):
    """Return the phi coefficient of a 2x2 contingency table.

    The table is laid out as::

        a  b
        c  d

    and the coefficient is
    ``(a*d - b*c) / sqrt((a+b) * (a+c) * (b+d) * (c+d))``.

    Parameters
    ----------
    a, b, c, d : int, float or array-like
        Cell counts of the table. Array-likes are combined element-wise.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The phi coefficient. ``nan`` is returned when the denominator is zero
        (some row or column total is zero), because the coefficient is
        undefined in that case.
    """
    # Work in float64: the product of four integer marginals can exceed the
    # 64-bit integer range, which np.sqrt cannot take as a Python int and
    # which fixed-width NumPy integers would silently wrap.
    a, b, c, d = (np.asarray(cell, dtype=float) for cell in (a, b, c, d))
    numerator = a * d - b * c
    denominator = np.sqrt((a + b) * (a + c) * (b + d) * (c + d))
    # A zero denominator yields nan; suppress the divide/invalid warning since
    # nan is the documented result for that case.
    with np.errstate(divide='ignore', invalid='ignore'):
        return numerator / denominator
    
def agg_first_inning_data(startdate, enddate=None):
    """Collect first-inning summaries for every eligible game in a date range.

    Queries the MLB schedule endpoint for ``startdate``..``enddate``, keeps
    games that are neither 'Spring Training' nor 'Exhibition' and are
    scheduled for 9 innings, and calls ``get_first_inning_data`` on each.

    Parameters
    ----------
    startdate : str
        First date of the range, interpolated into the schedule URL.
    enddate : str, optional
        Last date of the range. Defaults to ``startdate`` when falsy.

    Returns
    -------
    pd.DataFrame
        One row per game for which ``get_first_inning_data`` produced a
        non-null ``game_id``. Row labels are each game's position within its
        date's schedule list, so labels repeat across dates. When no game
        qualifies, an empty frame with the expected columns is returned.

    Raises
    ------
    requests.HTTPError
        If the schedule endpoint answers with an error status.
    """
    if not enddate:
        enddate = startdate
    mlb_schedule = "http://statsapi.mlb.com/api/v1/schedule/games/?sportId=1&startDate={}&endDate={}".format(startdate, enddate)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(mlb_schedule, timeout=30)
    response.raise_for_status()
    json_data = response.json()

    excluded_series = ('Spring Training', 'Exhibition')
    games = []
    for dt in json_data.get('dates', []):
        day_games = pd.DataFrame(dt.get('games', []))
        if day_games.empty:
            continue
        eligible = day_games[
            ~day_games.seriesDescription.isin(excluded_series)
            & (day_games.scheduledInnings == 9)
        ]
        # Gather results row by row instead of via DataFrame.apply: apply
        # breaks when the helper returns None, and produces a frame without a
        # game_id column when every result for the date is empty.
        labels = []
        records = []
        for label, game in eligible.iterrows():
            result = get_first_inning_data(game)
            if result is None or len(result) == 0:
                continue
            record = dict(result)
            if pd.isna(record.get('game_id')):
                continue
            labels.append(label)
            records.append(record)
        if records:
            games.append(pd.DataFrame(records, index=labels))

    if not games:
        # pd.concat([]) raises ValueError; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=[
            'game_id', 'game_link', 'commence_time', 'startdate',
            'away_team', 'home_team', 'game_status', 'rhe', 'total',
        ])
    return pd.concat(games)


def get_first_inning_data(game):
    """Fetch one game's live feed and summarise its first inning.

    Parameters
    ----------
    game : mapping (e.g. a schedule row as a ``pd.Series``)
        Must provide ``status.detailedState`` and ``link``. The other fields
        read are ``gameGuid``, ``gameDate``, ``officialDate`` and, under
        ``teams.away`` / ``teams.home``, ``team.name`` and ``score``.

    Returns
    -------
    pd.Series
        On success, the fields ``game_id``, ``game_link``, ``commence_time``,
        ``startdate``, ``away_team``, ``home_team``, ``game_status``, ``rhe``
        and ``total``. ``rhe`` is the combined score after the last
        first-inning play, plus the number of first-inning hits, plus the
        number of first-inning plays whose description contains "error".
        ``total`` is the away score plus the home score from ``game``.
        An empty Series is returned when the game is postponed or its data
        cannot be processed, so callers always receive the same type.
    """
    if game['status']['detailedState'] == 'Postponed':
        # Explicit dtype avoids pandas' empty-Series default-dtype warning.
        return pd.Series(dtype=object)
    hits = ('Single', 'Double', 'Triple', 'Home Run')
    mlb_live_api = "https://ws.statsapi.mlb.com{}?language=en".format(game['link'])
    # A timeout keeps one stalled request from hanging the whole aggregation.
    response = requests.get(mlb_live_api, timeout=30)
    json_data = response.json()
    try:
        game_info = {
            'game_id': game['gameGuid'],
            'game_link': game['link'],
            'commence_time': game['gameDate'],
            'startdate': game['officialDate'],
            'away_team': game['teams']['away']['team']['name'],
            'home_team': game['teams']['home']['team']['name'],
            'game_status': game['status']['detailedState']
        }
        plays = json_data['liveData']['plays']['allPlays']
        fi_plays = [p for p in plays if p['about']['inning'] == 1]
        # The running score on the last first-inning play is the inning's run
        # total; an empty list raises IndexError and is handled below.
        last_result = fi_plays[-1]['result']
        runs_scored = last_result['awayScore'] + last_result['homeScore']
        hit_count = sum(1 for p in fi_plays if p['result']['event'] in hits)
        # Case-sensitive substring match on the play description.
        error_count = sum(1 for p in fi_plays if "error" in p['result']['description'])
        game_info['rhe'] = runs_scored + hit_count + error_count
        game_info['total'] = game['teams']['away']['score'] + game['teams']['home']['score']
        return pd.Series(game_info)
    except Exception as exc:
        # Best effort: report which game failed and why, then return the same
        # empty result as the postponed path instead of an implicit None.
        print(game['link'], repr(exc))
        return pd.Series(dtype=object)

In [None]:
# Pull first-inning summaries for each date window (one live-feed request per
# game, so this cell is slow).
mlb2024 = agg_first_inning_data(startdate='2024-03-01', enddate='2024-09-09')
mlb2023 = agg_first_inning_data(startdate='2023-03-01', enddate='2023-11-09')

In [100]:
# Show the aggregated 2024 frame as the cell's output for a quick visual check.
mlb2024

Unnamed: 0,game_id,game_link,commence_time,startdate,away_team,home_team,game_status,rhe,total
0,a3e8bd6d-f718-4643-a584-b034d18181d1,/api/v1.1/game/745444/feed/live,2024-03-20T10:05:00Z,2024-03-20,Los Angeles Dodgers,San Diego Padres,Final,0.0,7.0
0,b486e453-8c94-4cf6-b628-0947209ba6c9,/api/v1.1/game/746175/feed/live,2024-03-21T10:05:00Z,2024-03-21,San Diego Padres,Los Angeles Dodgers,Final,12.0,26.0
0,c138642c-8c26-46dc-a7b8-1e5f8d1739e2,/api/v1.1/game/747060/feed/live,2024-03-28T19:05:00Z,2024-03-28,Los Angeles Angels,Baltimore Orioles,Final,6.0,14.0
1,bec767a4-c174-4e4e-ba7e-a16759c76cd4,/api/v1.1/game/746737/feed/live,2024-03-28T20:10:00Z,2024-03-28,Washington Nationals,Cincinnati Reds,Final,1.0,10.0
2,36e1df7c-1e8c-406b-a4ac-0267d072fc96,/api/v1.1/game/745445/feed/live,2024-03-28T20:10:00Z,2024-03-28,San Francisco Giants,San Diego Padres,Final,0.0,10.0
...,...,...,...,...,...,...,...,...,...
4,38eced77-f8d5-4942-a888-c647facdacc2,/api/v1.1/game/744888/feed/live,2024-09-09T23:07:00Z,2024-09-09,New York Mets,Toronto Blue Jays,Final,1.0,5.0
5,fef70815-3e89-492f-98bd-3787763de782,/api/v1.1/game/746909/feed/live,2024-09-09T23:10:00Z,2024-09-09,Baltimore Orioles,Boston Red Sox,Final,7.0,15.0
6,a215b032-cb10-4fc9-b712-5820d7a0dc06,/api/v1.1/game/745860/feed/live,2024-09-09T23:40:00Z,2024-09-09,Los Angeles Angels,Minnesota Twins,Final,4.0,8.0
7,c6672aed-4a06-4d31-95a8-e77f60211246,/api/v1.1/game/746746/feed/live,2024-09-09T23:40:00Z,2024-09-09,Cleveland Guardians,Chicago White Sox,Final,5.0,8.0


In [99]:
# Build the 2x2 contingency table of first-inning activity (rhe > 2) against
# the game total exceeding the line (total > 8), then measure the association
# between the two with the phi coefficient.
total = 8
busy_first = mlb2024['rhe'] > 2
quiet_first = mlb2024['rhe'] <= 2
went_over = mlb2024['total'] > total
stayed_under = mlb2024['total'] <= total
a = len(mlb2024[busy_first & went_over])
b = len(mlb2024[busy_first & stayed_under])
c = len(mlb2024[quiet_first & went_over])
d = len(mlb2024[quiet_first & stayed_under])
phi_correlation(a, b, c, d)

0.23363406264074332