In [1]:
import os
import re
import json
import pandas as pd
from copy import deepcopy
from collections import defaultdict
from typing import Any, Dict, List

import sys
sys.path.insert(0, '../../pipelines/tasks/mlb/')

from common.helpers.extractors import get_pitch_events, calculate_total_outs, get_current_state_before_pitch, get_outs_from_event

In [2]:
## Load every play-by-play JSON document from the data directory into memory.
## Entries are visited in sorted order because os.listdir() returns them in
## arbitrary order, which would otherwise make the order of `games` (and of
## the trace printed while replaying them) differ from run to run.
PBP_DIR = '../../data/mlb/pbp/'

games = []
for file in sorted(os.listdir(PBP_DIR)):
    path = os.path.join(PBP_DIR, file)

    ## skip sub-directories (e.g. .ipynb_checkpoints); open() cannot read them
    if not os.path.isfile(path):
        continue

    with open(path, 'r', encoding='UTF8') as pbp_input:
        data = json.load(pbp_input)

    games.append(data)

In [3]:
def create_graph(teams):
    """Build an empty accumulator for every team in ``teams``.

    The result is nested as ``graph[team][area][outs][bases]`` where

    * ``area`` is ``'home'`` or ``'away'``,
    * ``outs`` is one of the strings ``'0'``, ``'1'``, ``'2'``,
    * ``bases`` is a three-character string of 0/1 flags such as ``'110'``
      (presumably first, second and third base in that order -- confirm
      against the code that produces the base lists).

    Every leaf is a fresh ``{'runs': 0, 'types': defaultdict(int)}`` so that
    callers can do ``leaf['types'][name] += 1`` without initialising keys.
    ``defaultdict(int)`` is used instead of a lambda factory so the graph can
    be pickled and has a readable repr.

    Args:
        teams: iterable of hashable team identifiers; duplicates collapse to
            a single entry.

    Returns:
        dict: the nested accumulator described above (empty if ``teams`` is
        empty).
    """
    areas = ('home', 'away')
    possible_outs = ('0', '1', '2')

    ## insertion order is kept identical to the original literal list so that
    ## tables derived from the graph keep the same column order
    possible_states = ('000', '100', '010', '110', '001', '101', '011', '111')

    return {
        team: {
            area: {
                out: {
                    state: {'runs': 0, 'types': defaultdict(int)}
                    for state in possible_states
                }
                for out in possible_outs
            }
            for area in areas
        }
        for team in teams
    }

In [4]:
## Build a fresh accumulator for the team(s) of interest, then walk every
## period of every loaded game and record, for each qualifying event, the
## outs/bases state it started from, the runs it produced and its play type.
graph = create_graph(['min'])
for game in games:
    ## translate this game's two team identifiers into the graph's area keys
    team_lookup = {
        game['home']: 'home',
        game['away']: 'away',
    }

    for period in game['periods']:
        ## periods carrying an 'issues' entry are reported and ignored
        if 'issues' in period:
            print(f'skipping period {period["id"]} in {game["id"]} due to {period["issues"]} issue(s)...')
            continue

        ## running state, reset at the start of every period
        outs = 0
        bases = [0, 0, 0]
        at_bat = period['atBat']

        ## only periods whose 'atBat' team is tracked in the graph are replayed
        if not at_bat in graph:
            continue

        ## project helper; its result is indexed further down by a pitch
        ## result's 'afterPitchEvent' value
        pitch_events = get_pitch_events(period['events'])

        ## debug trace: header line for this period
        print(at_bat)
        for event in period['events']:
            ## events flagged 'isInfoPlay' are not counted
            if 'isInfoPlay' in event:
                continue

            entities = event['entities']
            if 'premature' in entities:
                ## ie. player caught stealing to end the inning
                continue

            ## events without any pitch cannot be placed in a state; skip them
            pitches = event['pitches']
            if len(pitches) == 0:
                continue
            
            ## state prior to the last pitch: the helper returns a pair that is
            ## unpacked as (outs, bases).
            ## NOTE(review): presumably these are outs made during this event
            ## before its final pitch -- confirm against the helper.
            outs_before, bases_before = get_current_state_before_pitch(pitches, pitch_events)

            ## the returned outs are ADDED to the running count, while the
            ## returned bases REPLACE the tracked bases
            outs += outs_before
            bases = bases_before.copy()

            ## NOTE(review): the graph only has out keys '0', '1' and '2'; a
            ## running count of 3 or more would raise KeyError on the lookup.
            out_key = str(outs)
            state_key = ''.join(map(str, bases))

            ## record the event under the state it started from
            area = team_lookup[at_bat]
            item = graph[at_bat][area][out_key][state_key]
            item['runs'] += entities['runs'] if 'runs' in entities else 0
            item['types'][entities['type']] += 1

            ## debug trace: play type and the state it was recorded under
            print(entities['type'], outs, bases)

            ## advance the running state past this event: first any outs from
            ## the event referenced by the last pitch's 'afterPitchEvent' ...
            last_pitch = pitches[-1]
            result = last_pitch['result']
            if 'afterPitchEvent' in result:
                outs += get_outs_from_event(pitch_events[result['afterPitchEvent']])

            ## ... then the outs of the event itself, and the bases reported
            ## by the last pitch's result
            outs += get_outs_from_event(event)
            bases = last_pitch['result']['bases'].copy()

        ## debug trace: final state of the period, followed by a blank line
        print('END:', outs, bases)
        print('')

min
struck out 0 [0, 0, 0]
struck out 1 [0, 0, 0]
grounded out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
popped out 0 [0, 0, 0]
struck out 1 [0, 0, 0]
struck out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
flied out 0 [0, 0, 0]
flied out 1 [0, 0, 0]
struck out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
struck out 0 [0, 0, 0]
struck out 1 [0, 0, 0]
lined out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
struck out 0 [0, 0, 0]
struck out 1 [0, 0, 0]
flied out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
struck out 0 [0, 0, 0]
struck out 1 [0, 0, 0]
struck out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
struck out 0 [0, 0, 0]
popped out 1 [0, 0, 0]
grounded out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
grounded out 0 [0, 0, 0]
singled 1 [0, 0, 0]
walked 1 [1, 0, 0]
struck out 1 [1, 1, 0]
lined out 2 [1, 1, 0]
END: 3 [1, 1, 0]

min
lined out 0 [0, 0, 0]
flied out 1 [0, 0, 0]
flied out 2 [0, 0, 0]
END: 3 [0, 0, 0]

min
grounded out 0 [0, 0, 0]
singled 1 [0, 0, 0]
singled 1 [1, 0, 0]
struck out 1 [1, 0, 1]
walked 2 [1, 0, 1]
singled 2 [1, 1, 1]
singled 2 [1,

In [5]:
def slim_graph_down(graph_to_slim):
    """Drop every base state in which no event was ever recorded.

    A state counts as empty when the values of its ``'types'`` mapping sum to
    zero. The graph is modified IN PLACE and the very same object is returned;
    pass a copy if the original must stay intact. Out-count levels that end up
    with no states are kept as empty dicts.
    """
    ## phase 1: find the empty states without touching the dicts being iterated
    doomed = [
        (states, state_key)
        for areas in graph_to_slim.values()
        for outs_map in areas.values()
        for states in outs_map.values()
        for state_key, cell in states.items()
        if sum(cell['types'].values()) == 0
    ]

    ## phase 2: remove them
    for states, state_key in doomed:
        del states[state_key]

    return graph_to_slim

## NOTE: slim_graph_down deletes entries from the dict it is given, so this
## prunes the global `graph` in place; the returned (same) dict is displayed.
slim_graph_down(graph)

{'min': {'home': {'0': {'000': {'runs': 18,
     'types': defaultdict(<function __main__.create_graph.<locals>.<dictcomp>.<dictcomp>.<dictcomp>.<lambda>()>,
                 {'struck out': 82,
                  'popped out': 10,
                  'flied out': 55,
                  'grounded out': 74,
                  'lined out': 28,
                  'singled': 34,
                  'walked': 38,
                  'doubled': 14,
                  'homered': 18,
                  'infield single': 12,
                  'ground rule double': 1,
                  'fielding error': 2,
                  'tripled': 1,
                  'fouled out': 2})},
    '100': {'runs': 7,
     'types': defaultdict(<function __main__.create_graph.<locals>.<dictcomp>.<dictcomp>.<dictcomp>.<lambda>()>,
                 {'doubled': 4,
                  'grounded into double play': 12,
                  'hit by pitch': 2,
                  'fouled out': 4,
                  'singled': 11,
                

In [6]:
def compute_likelihoods(graph):
    """Return a copy of ``graph`` whose leaves are runs per recorded event.

    The input is deep-copied and slimmed first, so ``graph`` itself is left
    untouched and states without any recorded event are absent from the
    result. Each remaining ``{'runs': ..., 'types': ...}`` leaf is replaced by
    ``runs / total`` rounded to three decimals, where ``total`` is the sum of
    the ``'types'`` counts; a leaf with zero runs becomes ``0.0``.

    Note that the value is a ratio of runs to events, not a probability: it is
    not capped and may be greater than 1.
    """
    rates = slim_graph_down(deepcopy(graph))

    for areas in rates.values():
        for outs_map in areas.values():
            for states in outs_map.values():
                ## overwriting existing keys keeps the dict size constant, so
                ## assigning while iterating is safe here
                for state_key, cell in states.items():
                    scored = cell['runs']
                    events = sum(cell['types'].values())
                    if scored == 0:
                        states[state_key] = 0.0
                    else:
                        states[state_key] = round(float(scored) / events, 3)

    return rates

## compute_likelihoods works on a deep copy, so `graph` keeps its raw counts;
## the bare name on the last line displays the result as the cell output.
likelihood_graph = compute_likelihoods(graph)
likelihood_graph

{'min': {'home': {'0': {'000': 0.049,
    '100': 0.085,
    '010': 0.16,
    '110': 0.05,
    '001': 0.0,
    '101': 0.333,
    '011': 0.2,
    '111': 1.111},
   '1': {'000': 0.044,
    '100': 0.104,
    '010': 0.059,
    '110': 0.375,
    '001': 0.545,
    '101': 0.609,
    '011': 0.818,
    '111': 0.875},
   '2': {'000': 0.029,
    '100': 0.029,
    '010': 0.125,
    '110': 0.298,
    '001': 0.0,
    '101': 0.379,
    '011': 0.214,
    '111': 0.765}},
  'away': {'0': {'000': 0.022,
    '100': 0.137,
    '010': 0.13,
    '110': 0.273,
    '001': 1.0,
    '101': 0.833,
    '011': 0.875,
    '111': 1.0},
   '1': {'000': 0.004,
    '100': 0.127,
    '010': 0.257,
    '110': 0.303,
    '001': 0.333,
    '101': 0.818,
    '011': 1.25,
    '111': 0.7},
   '2': {'000': 0.028,
    '100': 0.179,
    '010': 0.179,
    '110': 0.156,
    '001': 0.227,
    '101': 0.3,
    '011': 0.667,
    '111': 0.7}}}}

In [7]:
def flatten_graph(graph):
    """Flatten ``graph[team][where][outs]`` into a list of row dicts.

    One row is produced per (team, where, outs) combination. Each row starts
    with the keys ``'team'``, ``'where'`` and ``'outs'`` and is then extended
    with whatever mapping is stored at ``graph[team][where][outs]`` (one
    column per key of that mapping).

    Args:
        graph: three-level nested dict whose third-level values are mappings.

    Returns:
        list[dict]: rows suitable for ``pd.DataFrame``; empty for an empty
        graph.
    """
    records = []
    for team, areas in graph.items():
        for where, outs_map in areas.items():
            for outs, states in outs_map.items():
                ## the original literal listed 'team' twice; once is enough
                row = {
                    'team': team,
                    'where': where,
                    'outs': outs,
                }
                row.update(states)
                records.append(row)

    return records

def flatten_full_graph(graph):
    """Flatten ``graph[team][where][outs][bases]`` into a list of row dicts.

    One row is produced per (team, where, outs, bases) combination. Each row
    starts with the keys ``'team'``, ``'where'``, ``'outs'`` and ``'bases'``
    and is then extended with the leaf's ``'types'`` mapping (one column per
    key of that mapping). The leaf's ``'runs'`` value is not included.

    Args:
        graph: four-level nested dict whose leaves contain a ``'types'``
            mapping.

    Returns:
        list[dict]: rows suitable for ``pd.DataFrame``; empty for an empty
        graph.
    """
    records = []
    for team, areas in graph.items():
        for where, outs_map in areas.items():
            for outs, states in outs_map.items():
                for bases, cell in states.items():
                    ## the original literal listed 'team' twice; once is enough
                    row = {
                        'team': team,
                        'where': where,
                        'outs': outs,
                        'bases': bases,
                    }
                    row.update(cell['types'])
                    records.append(row)

    return records

In [8]:
## Tabulate the raw counts: one row per (team, where, outs, bases) state and
## one column per play type found in that state's 'types' mapping.
pd.DataFrame(flatten_full_graph(graph))

Unnamed: 0,team,where,outs,bases,struck out,popped out,flied out,grounded out,lined out,singled,...,grounded into fielder's choice,lined into double play,sacrificed,throwing error,bunt single,sacrifice fly,catcher's interference,flied into double play,popped into double play,intentionally walked
0,min,home,0,0,82.0,10.0,55.0,74.0,28.0,34.0,...,,,,,,,,,,
1,min,home,0,100,13.0,2.0,13.0,1.0,2.0,11.0,...,8.0,1.0,2.0,,,,,,,
2,min,home,0,10,5.0,,2.0,5.0,1.0,1.0,...,,,,1.0,,,,,,
3,min,home,0,110,5.0,,3.0,1.0,,3.0,...,2.0,1.0,,,1.0,,,,,
4,min,home,0,1,2.0,,,,,,...,,,,,,,,,,
5,min,home,0,101,1.0,,1.0,,,,...,,,,,,1.0,,,,
6,min,home,0,11,1.0,,,1.0,,,...,1.0,,,,,,,,,
7,min,home,0,111,2.0,,,,,,...,2.0,,,,,1.0,,,,
8,min,home,1,0,63.0,8.0,39.0,55.0,13.0,30.0,...,,,,,,,,,,
9,min,home,1,100,21.0,3.0,18.0,9.0,2.0,11.0,...,11.0,,,,,,1.0,,,


In [9]:
## Tabulate the per-state ratios: one row per (team, where, outs) and one
## column per base state remaining in likelihood_graph.
pd.DataFrame(flatten_graph(likelihood_graph))

Unnamed: 0,team,where,outs,000,100,010,110,001,101,011,111
0,min,home,0,0.049,0.085,0.16,0.05,0.0,0.333,0.2,1.111
1,min,home,1,0.044,0.104,0.059,0.375,0.545,0.609,0.818,0.875
2,min,home,2,0.029,0.029,0.125,0.298,0.0,0.379,0.214,0.765
3,min,away,0,0.022,0.137,0.13,0.273,1.0,0.833,0.875,1.0
4,min,away,1,0.004,0.127,0.257,0.303,0.333,0.818,1.25,0.7
5,min,away,2,0.028,0.179,0.179,0.156,0.227,0.3,0.667,0.7
