In [1]:
import os
import sys
import json
from collections import defaultdict
import pandas as pd

In [2]:



# Load the schedule and keep only the rows that carry a game id. Built as one
# chain so that no column is assigned onto a filtered slice of the raw frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = (
    pd.read_csv('../../../data/mlb/schedules.csv', index_col=None)
    .dropna(subset=['GAME_ID'])
    .astype({'GAME_ID': int})
    # WIN is the first character of RESULT. `.str[0]` yields NaN for a missing
    # RESULT, where indexing the raw cell value with `[0]` raises TypeError.
    .assign(WIN=lambda schedule: schedule['RESULT'].str[0])
)

import re
def get_score(score):
    """Return the first ``<digits>-<digits>`` substring found in ``score``.

    Parameters
    ----------
    score : str
        Raw result text to search.

    Returns
    -------
    str or None
        The matched text (e.g. ``'10-4'``), or ``None`` when ``score`` is not
        a string (such as the NaN pandas uses for an empty cell) or contains
        no such pattern.
    """
    if not isinstance(score, str):
        return None

    match = re.search(r'\d+-\d+', score)
    # re.search returns None when nothing matches; calling .group() on that
    # would raise AttributeError and abort the whole column mapping.
    return match.group(0) if match else None

# Extract the score text from every RESULT value, then reduce the frame to the
# game id, win flag and score, with the game id as the index.
df['SCORE'] = df['RESULT'].apply(get_score)
df = df[['GAME_ID', 'WIN', 'SCORE']].set_index(['GAME_ID'])
df



Unnamed: 0_level_0,WIN,SCORE
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
401427971,L,2-1
401354277,L,4-3
401354292,W,10-4
401354303,W,4-0
401354312,L,7-2
...,...,...
401356561,L,3-2
401356576,L,5-2
401423191,L,3-2
401423208,L,8-3


In [3]:
# Gate for the last cell: when True, that cell rewrites desc_to_entities.json.
dump_file = True

In [4]:
# Load every play-by-play JSON document in the pbp directory into memory.
pbp_dir = '../../../data/mlb/pbp/'

games = []
# os.listdir returns names in arbitrary order; sorting keeps `games` (and the
# order of anything printed while iterating it) the same from run to run.
for file in sorted(os.listdir(pbp_dir)):
    pbp_path = os.path.join(pbp_dir, file)
    # Skip anything that is not a regular file (e.g. a sub-directory), which
    # open() could not read.
    if not os.path.isfile(pbp_path):
        continue

    with open(pbp_path, 'r', encoding='UTF8') as pbp_input:
        games.append(json.load(pbp_input))

In [5]:
def _tally_runs(game):
    """Sum the runs recorded for each ``atBat`` value of one game.

    Walks ``game['periods']`` and, for every event that has a ``'runs'`` entry
    in its ``entities``, adds it to the total kept under that period's
    ``atBat`` value. Any event entities or period carrying an ``'issues'`` key
    is printed as it is encountered.

    Returns a dict mapping each ``atBat`` value to its run total; a value that
    appears in some period but never scores is present with 0.
    """
    runs = defaultdict(int)

    for period in game['periods']:
        at_bat = period['atBat']
        # Touch the key so a side without any scoring event still contributes
        # a "0" to the reconstructed score.
        runs[at_bat] += 0

        for event in period['events']:
            entities = event['entities']
            if 'issues' in entities:
                print('issues')
                print(event)
                print()

            if 'runs' in entities:
                runs[at_bat] += entities['runs']

        if 'issues' in period:
            print('issues')
            print(period)
            print()

    return dict(runs)


# Cross-check the score reconstructed from each game's events against the
# SCORE recorded in the schedule frame, printing every disagreement.
for game in games:
    runs = _tally_runs(game)
    # Totals are joined largest first, e.g. {home: 2, away: 5} -> "5-2".
    current_game_score = '-'.join(
        str(total) for total in sorted(runs.values(), reverse=True)
    )

    game_id = int(game['id'])
    if game_id not in df.index:
        # df.loc raises KeyError for an unknown label, which would abort the
        # check for every remaining game; report the gap and carry on.
        print(game['id'], 'not in schedule', current_game_score)
        continue

    # NOTE(review): assumes each GAME_ID occurs once in the schedule; with
    # duplicates this lookup yields a Series and the comparison below raises.
    score = df.loc[game_id].SCORE

    if score != current_game_score:
        print(game['id'], score, current_game_score)


issues
{'isScoringPlay': False, 'pitches': [{'coords': {'x': 155, 'y': 150}, 'velocity': 95, 'type': 'four-seam fb', 'order': 1, 'result': {'type': 'strike', 'outcome': 'strike looking', 'bases': [1, 0, 0]}, 'count': {'balls': 0, 'strikes': 0}, 'prior': {'bases': [1, 0, 0]}}, {'coords': {'x': 120, 'y': 163}, 'velocity': 88, 'type': 'slider', 'order': 2, 'result': {'type': 'strike', 'outcome': 'strike swinging', 'bases': [1, 0, 0]}, 'count': {'balls': 0, 'strikes': 1}, 'prior': {'bases': [1, 0, 0]}}, {'coords': {'x': 109, 'y': 209}, 'velocity': 87, 'type': 'slider', 'order': 3, 'result': {'type': 'strike', 'outcome': 'strike swinging', 'bases': [1, 0, 0]}, 'count': {'balls': 0, 'strikes': 2}, 'prior': {'bases': [1, 0, 0]}}], 'score': {'away': 0, 'home': 1}, 'desc': 'Kirilloff struck out swinging, catcher Tom Murphy to first baseman Ty France.', 'entities': {'player': 'Kirilloff', 'type': 'struck out', 'outs': 1, 'order': ['catcher Tom Murphy', 'first baseman Ty France'], 'effort': 'swin

In [6]:
if dump_file:
    # Collect one (description, entities) pair per distinct event description
    # across all play-by-play files and write them out as a JSON document.
    pbp_dir = '../../../data/mlb/pbp/'

    keys = set()
    text_instances = []

    # os.listdir makes no ordering guarantee. Sorting fixes both the order of
    # the written pairs and which file's entities are kept when the same
    # description occurs in several games, so the output is reproducible.
    for file in sorted(os.listdir(pbp_dir)):
        pbp_path = os.path.join(pbp_dir, file)
        # Skip anything that is not a regular file (e.g. a sub-directory).
        if not os.path.isfile(pbp_path):
            continue

        with open(pbp_path, 'r', encoding='UTF8') as pbp_input:
            game = json.load(pbp_input)

        for period in game['periods']:
            for event in period['events']:

                description = event['desc']
                # Only the first occurrence of a description is recorded.
                if description not in keys:
                    text_instances.append((description, event['entities']))
                    keys.add(description)

    # Explicit encoding to match how the inputs are read, rather than relying
    # on the platform default.
    with open('../../../pipelines/tasks/mlb/common/tests/docs/desc_to_entities.json', 'w', encoding='UTF8') as test_instances:
        json.dump(text_instances, test_instances, indent=2)