# Retrosheet Pre-Processing
This notebook pre-processes the raw Retrosheet event files directly, rather than working from the Baseball on a Stick data.

In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import numpy as np
import pandas as pd
import datetime, copy, imp
import time
import os
import re
import matplotlib.pyplot as plt

from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
tqdm.pandas()

import sys
sys.path.insert(0, '../../util/')

%aimport data_cache
from data_cache import CacheResult

In [127]:
# Directory handed to the CacheResult-decorated functions through their path= argument.
cachePathStr = '/Users/gmessier/data/baseball/cache/'
# Root of the raw data; season sub-directories ('<season>/') are appended to it.
dataPathStr = '/Users/gmessier/data/baseball/'
tgtSeasons = [ 2016, 2017 ]  # The seasons that define our player population.
startSeason = 1992  # Player event start season.
endSeason = 2019 # Player event end season.
topPlayerFrc = 0.1 # Fraction of players to select as top performers.

### Target Player Roster
- The player population is every non-pitcher who was playing in the target seasons.

In [16]:
# Collect the roster files of every target season as paths relative to
# dataPathStr ('<season>/<file name>').
fileRegex = re.compile('ROS')  # Loop-invariant, so compiled once up front.
rosterFiles = []
for tgtSeason in tgtSeasons:
    pathStr = dataPathStr + '{}/'.format(tgtSeason)
    # os.listdir returns names in arbitrary order; sorting keeps the roster
    # order (and therefore which duplicated player record is kept later on)
    # reproducible from run to run.
    allFiles = sorted(os.listdir(pathStr))
    rosterFiles += [ '{}/{}'.format(tgtSeason,x) for x in allFiles if fileRegex.search(x) ]

In [18]:
@CacheResult
def ProcessRosters(fileNames):
    """Build a one-row-per-player roster table from roster files.

    Args:
        fileNames: Roster file paths relative to the notebook-level
            ``dataPathStr``.

    Returns:
        DataFrame with columns PlayerId, LastName, FirstName, Posn and Team,
        holding the first record encountered for each PlayerId.
    """
    columns = ['PlayerId', 'LastName', 'FirstName', 'Posn', 'Team']
    rowList = []
    for fileName in tqdm(fileNames):
        # The context manager closes the file even if a line fails to parse.
        with open(dataPathStr + fileName) as file:
            for line in file:
                # Strip the line terminator explicitly; slicing off the last
                # character would truncate the position field on a final
                # line that has no trailing newline.
                line = line.rstrip('\r\n')
                if not line:
                    continue  # Tolerate blank lines instead of raising IndexError.
                flds = line.split(',')
                rowList.append({
                    'PlayerId': flds[0],
                    'LastName': flds[1],
                    'FirstName': flds[2],
                    'Posn': flds[-1],
                    'Team': flds[-2]
                })

    # Naming the columns keeps the table well-formed when no rows were read.
    tbl = pd.DataFrame(rowList, columns=columns)

    # Since we mainly care about position and not team, arbitrarily keep one duplicated record.
    return tbl.loc[~(tbl.PlayerId.duplicated())]

In [19]:
rstr = ProcessRosters(rosterFiles,path=cachePathStr)

  0%|          | 0/60 [00:00<?, ?it/s]

In [20]:
# Do not include pitchers in the assessment of offensive capability.
offPlayers = rstr[rstr.Posn != 'P'].PlayerId

### Player Transactions

In [24]:
tranFileStr = '~/data/baseball/tran/tran.txt'

In [25]:
# Column names of the transaction file, in file order.
headers = (
    'PriDate Time ApxInd SecDate ApxIndSec TrId '
    'Player Type FromTeam FromLeague ToTeam '
    'ToLeague DftType DftRnd PickNo Info').split()

# Every column is read as a plain Python object (no numeric or date inference).
types = { name: 'object' for name in headers }

In [26]:
trnRaw = pd.read_csv(tranFileStr,sep=',',names=headers,dtype=types)

In [27]:
# Maps the first character of a transaction's Type code to an event name.
tranEventType = { 
    'F': 'FreeAgent', 'R': 'Released', 'D': 'Draft', 'T': 'Trade', 'W': 'Waivers', 'C': 'CndDeal', 'P': 'Purchase',
    'U': 'Unknown', 'X': 'ExpansionPick', 'M': 'MinorLeague', 'L': 'Loan', 'J': 'Jump', 'A': 'Assigned', 'V': 'LeagueCtrl',
    'Z': 'Retired', 'p': 'Purchase', np.nan: np.nan }

def TransactionTimeline(tbl):
    """Convert raw transaction rows into a date-sorted (Date, Event) table.

    Args:
        tbl: Transaction rows with a ``PriDate`` column of 'YYYYMMDD' strings
            and a ``Type`` column of transaction type codes.

    Returns:
        DataFrame with columns Date and Event, sorted by Date.  Month and day
        values below 1 are raised to 1, and every timestamp is placed one
        minute after midnight.  A missing Type yields a missing Event.

    Raises:
        KeyError: If a Type code starts with a character that is not a key of
            ``tranEventType``.
    """
    dates = pd.to_datetime(pd.DataFrame({
        'year': tbl.PriDate.str[:4].astype('int'),
        # Unknown months/days are coded as 0; clamp them to a valid value.
        'month': tbl.PriDate.str[4:6].astype('int').clip(lower=1),
        'day': tbl.PriDate.str[6:8].astype('int').clip(lower=1),
        'minute': 1}))

    # Test for missing codes explicitly: looking NaN up as a dictionary key
    # only succeeds when it happens to be the very same np.nan object.
    events = [ np.nan if pd.isna(code) else tranEventType[code]
               for code in tbl.Type.str[0] ]

    return pd.DataFrame({ 'Date': dates, 'Event': events }).sort_values('Date')

In [28]:
@CacheResult
def GenTransactionTimelines(trn):
    """Build a transaction timeline for every player.

    Args:
        trn: Raw transaction table with a ``Player`` column plus the columns
            TransactionTimeline reads.

    Returns:
        The TransactionTimeline results of all players, combined by
        groupby-apply and keyed by Player.
    """
    # Group the table that was passed in rather than the notebook-level
    # trnRaw, so the result always reflects the argument.
    return trn.groupby('Player').progress_apply(TransactionTimeline)

In [29]:
trn = GenTransactionTimelines(trnRaw,path=cachePathStr)
trn = trn[trn.index.get_level_values(0).isin(offPlayers)]

  0%|          | 0/18755 [00:00<?, ?it/s]

### Process Event Files

In [63]:
seasons = range(startSeason,endSeason+1)

In [64]:
dataPathStr = '/Users/gmessier/data/baseball/'
fileRegex = re.compile('EV(A|N)')

# Gather the full path of every event file of every season.
eventFiles = []
for season in seasons:
    pathStr = '{}{}/'.format(dataPathStr,season)
    # os.listdir returns names in arbitrary order; sorting makes the order in
    # which the event files are processed reproducible from run to run.
    allFiles = sorted(os.listdir(pathStr))
    eventFiles += [ '{}{}'.format(pathStr,x) for x in allFiles if fileRegex.search(x) ]

In [163]:
# Patterns applied to the event field of a 'play' record.  Raw strings keep
# the regex escapes (\d) from being read as invalid Python string escapes.
hitRegex = re.compile(r"^(S\d|D\d|T\d|DGR|HR|HP)")  # For now, includes being hit by a pitch.
singleRegex = re.compile(r"^S|^HP")
doubleRegex = re.compile(r"^D")
tripleRegex = re.compile(r"^T")
homerunRegex = re.compile(r"^H")
rbiRegex = re.compile(r"-H")  # Counted once per advance to home in the event string.
bbRegex = re.compile(r"^(?:IW|W(?:[^P]|$))")  # 'W' or 'IW', but not 'WP'.

In [77]:
def ProcessOffensiveEvents(eventFileStr):
    """Extract game starts, hits and walks from a single event file.

    Each game gets a nominal clock: it starts at 12:00, 15:01 or 18:02
    depending on whether the game id ends in '0', '1' or '2', and advances by
    one minute for every 'play' record (a walk advances it one extra minute).

    Args:
        eventFileStr: Path of the event file to read.

    Returns:
        DataFrame with one row per 'start' record, hit and walk, holding the
        columns Date, PlayerId, Event ('Start', '1B', '2B', '3B', 'HR' or
        'BB') and Rbi (the number of '-H' advances in the play's event
        string; 0 for 'Start' rows).

    Raises:
        SystemExit: If a game id ends in a character other than '0', '1', '2'.
    """
    rowList = []

    # The context manager guarantees the file is closed, even on a parse error.
    with open(eventFileStr) as fp:
        for line in fp:
            # Drop the line terminator before splitting so the last field is
            # complete even when the final line has no trailing newline.
            flds = line.rstrip('\r\n').split(',')

            if flds[0] == 'id':
                gameStr = flds[1]
                dateStr = gameStr[3:7] + '-' + gameStr[7:9] + '-' + gameStr[9:11]
                if gameStr[-1] == '0':
                    curTime = pd.to_datetime(dateStr + ' 12:00')
                elif gameStr[-1] == '1':
                    curTime = pd.to_datetime(dateStr + ' 15:01')
                elif gameStr[-1] == '2':
                    curTime = pd.to_datetime(dateStr + ' 18:02')
                else:
                    sys.exit("ERROR: Can't handle more than a triple header.")

            if flds[0] == 'start':
                rowList.append({ 'Date': curTime, 'PlayerId': flds[1], 'Event': 'Start', 'Rbi': 0 })

            if flds[0] == 'play':
                curTime += pd.Timedelta('1 min')
                result = flds[6]

                if hitRegex.search(result):
                    # hitRegex has already matched, so anything that is not a
                    # single, double or triple is a home run.
                    if singleRegex.search(result):
                        hitType = '1B'
                    elif doubleRegex.search(result):
                        hitType = '2B'
                    elif tripleRegex.search(result):
                        hitType = '3B'
                    else:
                        hitType = 'HR'

                    rowList.append({ 'Date': curTime, 'PlayerId': flds[3], 'Event': hitType,
                                     'Rbi': len(rbiRegex.findall(result)) })

                if bbRegex.search(result):
                    curTime += pd.Timedelta('1 min')
                    rowList.append({ 'Date': curTime, 'PlayerId': flds[3], 'Event': 'BB',
                                     'Rbi': len(rbiRegex.findall(result)) })

    return pd.DataFrame(rowList)
        

In [78]:
#tt = ProcessOffensiveEvents('/Users/gmessier/data/baseball/2017/2017ANA.EVA')

In [79]:
@CacheResult
def ProcessEventFiles(files):
    """Process every event file and stack the results into one table.

    Args:
        files: Iterable of event file paths accepted by
            ProcessOffensiveEvents.

    Returns:
        DataFrame with the rows of all files in file order and a fresh
        0..n-1 index; an empty DataFrame when ``files`` is empty.
    """
    # Collect the per-file tables and concatenate once; concatenating inside
    # the loop re-copies every previously read row on each iteration.
    frames = [ ProcessOffensiveEvents(file) for file in tqdm(files) ]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames,ignore_index=True)

In [80]:
evnt = ProcessEventFiles(eventFiles,path=cachePathStr)
evnt = evnt[evnt.PlayerId.isin(offPlayers)]

  0%|          | 0/826 [00:00<?, ?it/s]

### Offensive Performance

In [115]:
# Points awarded per offensive event when scoring a player's game (later
# cells refer to these as Yahoo points).  The scoring functions in this
# notebook only read the '1B', '2B', '3B', 'HR', 'BB' and 'RBI' entries.
yhooPts = {
    '1B': 2.6, '2B': 5.2, '3B': 7.8, 'HR': 10.4, 'R': 1.9, 'RBI': 1.9, 'BB': 2.6, 'SB': 4.2, 'HBP': 2.6
}

In [116]:
def TotalGamePoints(tbl):
    """Score a table of offensive events with the yhooPts weights.

    Args:
        tbl: Events with an ``Event`` column and a numeric ``Rbi`` column.

    Returns:
        The number of '1B', '2B', '3B', 'HR' and 'BB' events, each multiplied
        by its yhooPts weight, plus the Rbi total multiplied by the 'RBI'
        weight.
    """
    points = 0

    # Same accumulation order as the weights are listed: hits, then walks.
    for code in ('1B', '2B', '3B', 'HR', 'BB'):
        occurrences = (tbl.Event == code).sum()
        points += occurrences * yhooPts[code]

    points += tbl.Rbi.sum() * yhooPts['RBI']

    return points

In [117]:
def PlayerGamePointTotals(tbl):
    """Total one player's points for each calendar date they have events on.

    Args:
        tbl: One player's events with a datetime ``Date`` column plus the
            columns TotalGamePoints reads.

    Returns:
        DataFrame indexed by calendar date with columns Date (the date
        stamped at 23:00, i.e. after the nominal game times used for the
        events) and Points.
    """
    # The scoring function defined in this notebook is TotalGamePoints; the
    # name CalcGamePoints is not defined by any cell.
    pts = tbl.groupby(tbl.Date.dt.date).apply(TotalGamePoints)
    return pd.DataFrame({
        'Date': pd.to_datetime(pts.index)+pd.Timedelta('23 hours'),
        'Points': pts
        })

In [119]:
@CacheResult
def CalcPlayerPoints(evnt):
    """Apply PlayerGamePointTotals to the events of every PlayerId in evnt."""
    eventsByPlayer = evnt.groupby('PlayerId')
    return eventsByPlayer.progress_apply(PlayerGamePointTotals)

In [130]:
# Determines the total Yahoo points scored by each player on each of their game dates.
playerPts = CalcPlayerPoints(evnt,path=cachePathStr)

In [132]:
def CalcPlayerSeasonTotals(tbl):
    """Sum the Points column of ``tbl`` for each calendar year of its Date column.

    Returns:
        Series of point totals indexed by year.
    """
    def SeasonPoints(seasonRows):
        # Built-in sum keeps a plain left-to-right accumulation of the values.
        return sum(seasonRows.Points)

    seasonOfRow = tbl.Date.dt.year
    return tbl.groupby(seasonOfRow).apply(SeasonPoints)

In [138]:
@CacheResult
def CalcSeasonTotals(pts):
    """Apply CalcPlayerSeasonTotals to each first-index-level group of pts."""
    pointsByPlayer = pts.groupby(level=0)
    return pointsByPlayer.progress_apply(CalcPlayerSeasonTotals)

In [142]:
# Determines the total points scored by each player in a season.
seaTtls = CalcSeasonTotals(playerPts,path=cachePathStr)

In [158]:
def CalcTopPlayerCutoff(tbl,cutoffFrc):
    """Find, for every season, the point total that marks the top fraction of players.

    Args:
        tbl: Series of season point totals whose second index level is the
            season.
        cutoffFrc: Fraction of players regarded as top performers.

    Returns:
        Series indexed by season (ascending).  Each value is the total found
        at position ``int(nPlayers*(1-cutoffFrc))`` of that season's totals
        sorted in ascending order, limited to the last position.
    """
    years = tbl.index.get_level_values(1).unique().sort_values()
    cutoffs = []
    for year in years:
        ranked = tbl.loc[:,year].sort_values()
        nPlayers = len(ranked)
        # Clamp to the last position so that cutoffFrc == 0 selects the
        # highest total instead of indexing one past the end.
        rank = min(int(nPlayers*(1-cutoffFrc)), nPlayers-1)
        cutoffs.append(ranked.iloc[rank])
    return pd.Series(cutoffs,index=years)

In [162]:
# The cutoffs in each season to be in the top topPlayerFrc of players.
topThsh = CalcTopPlayerCutoff(seaTtls,topPlayerFrc)

In [None]:
# NOTE(review): FindTopPlayerEvents is defined in a later cell, so this call
# only works after that cell has been run; FindTopPlayers wraps the same call.
playerPts.groupby(level=0).progress_apply(FindTopPlayerEvents,thresholds=topThsh)

In [183]:
def FindTopPlayerEvents(tbl,thresholds):
    """Find the dates on which a player's running season total passes the threshold.

    Args:
        tbl: One player's per-game totals with datetime ``Date`` and numeric
            ``Points`` columns.
        thresholds: Point thresholds that can be indexed by season year.

    Returns:
        DataFrame with columns Date and Event.  It holds one 'TopPlayer' row
        for every season whose final cumulative total exceeds that season's
        threshold, dated with the earliest Date on which the cumulative total
        is above it.
    """
    seasonOfRow = tbl.Date.dt.year
    crossingDates = []
    for season in seasonOfRow.unique():
        seasonRows = tbl[seasonOfRow == season]
        runningTotal = seasonRows.Points.cumsum()
        cutoff = thresholds[season]
        if not (runningTotal.iloc[-1] > cutoff):
            continue  # The threshold was not passed in this season.
        aboveCutoff = seasonRows.loc[runningTotal > cutoff]
        crossingDates.append(aboveCutoff.Date.min())

    labels = [ 'TopPlayer' ]*len(crossingDates)
    return pd.DataFrame({'Date': crossingDates, 'Event': labels})

In [189]:
@CacheResult
def FindTopPlayers(playerPts,thresholds):
    """Apply FindTopPlayerEvents to every player's per-game point totals.

    Args:
        playerPts: Per-game point totals whose first index level identifies
            the player.
        thresholds: Season thresholds forwarded to FindTopPlayerEvents.

    Returns:
        The FindTopPlayerEvents results of all players, combined by
        groupby-apply.
    """
    # Forward the argument rather than the notebook-level topThsh, so the
    # result always reflects the thresholds that were passed in.
    return playerPts.groupby(level=0).progress_apply(FindTopPlayerEvents,thresholds=thresholds)

In [190]:
topEvnt = FindTopPlayers(playerPts,topThsh,path=cachePathStr)

  0%|          | 0/874 [00:00<?, ?it/s]

### Merge Major Events and Transaction Events

In [245]:
def MergeEventsPlayer(evnt,trans,topPlayer):
    """Merge one player's major game events, transactions and top-player events.

    Args:
        evnt: Events of a single player with PlayerId, Date and Event columns.
        trans: Transaction timelines whose first index level is the player id.
        topPlayer: Top-player events whose first index level is the player id.

    Returns:
        DataFrame with columns Date and Event holding the player's 'Start'
        and 'HR' events together with their transaction and top-player rows
        (when present), sorted by Date with a fresh 0..n-1 index.
    """
    playerId = evnt.iloc[0].PlayerId
    pieces = [ evnt.loc[evnt.Event.isin(['Start','HR'])][['Date','Event']] ]

    # A player can be missing from either side table, so only look up ids
    # that are present; an unguarded .loc would raise KeyError.
    if playerId in trans.index.get_level_values(0):
        pieces.append(trans.loc[playerId])
    if playerId in topPlayer.index.get_level_values(0):
        pieces.append(topPlayer.loc[playerId])

    merged = pd.concat(pieces,ignore_index=True)
    return merged.sort_values(by=['Date'],ignore_index=True)

In [246]:
#t = MergeEventsPlayer(evnt.loc[evnt.PlayerId == 'abrej003'],trn,topEvnt)

In [247]:
@CacheResult
def MergeEventsPlayers():
    """Apply MergeEventsPlayer to every player of the notebook-level evnt table.

    Reads the notebook-level ``evnt``, ``trn`` and ``topEvnt`` tables.
    """
    eventsByPlayer = evnt.groupby('PlayerId')
    return eventsByPlayer.progress_apply(MergeEventsPlayer,trans=trn,topPlayer=topEvnt)

In [248]:
tbl = MergeEventsPlayers(path=cachePathStr)

  0%|          | 0/874 [00:00<?, ?it/s]

Unnamed: 0,Date,Event
0,2009-06-09 00:01:00,Draft
1,2012-06-04 00:01:00,Draft
2,2013-06-12 12:00:00,Start
3,2013-06-14 12:00:00,Start
4,2013-06-14 12:53:00,HR
...,...,...
718,2019-09-12 12:00:00,Start
719,2019-09-15 12:00:00,Start
720,2019-09-22 12:00:00,Start
721,2019-09-29 12:00:00,Start


In [74]:
def ExtractMajorEvents(tbl,trans):
    """Scratch attempt at merging a player's major events with their transactions.

    Returns the column index of the merged table rather than the table itself.

    NOTE(review): this looks like an abandoned draft of MergeEventsPlayer and
    cannot run as written; see the notes on the individual lines.
    """

    # NOTE(review): a single Event value cannot equal both 'Start' and 'HR',
    # so this mask is always False and the selection is empty.  An "or"
    # (or isin) was presumably intended -- confirm.
    tbl = tbl.loc[ (tbl.Event == 'Start') & (tbl.Event == 'HR') ][['Date','Event']]

    # NOTE(review): CalcGamePoints is not defined in any cell visible here,
    # and ptsDf below is never used.
    pts = tbl.groupby(tbl.Date.dt.date).apply(CalcGamePoints)
    ptsDf = pd.DataFrame({
                'Date': pd.to_datetime(pts.index)+pd.Timedelta('23 hours'),
                'Points': pts
                })
    
    # NOTE(review): tbl was reduced to the Date and Event columns above, so
    # PlayerId is no longer available at this point.
    playerId = tbl.iloc[0].PlayerId

    # NOTE(review): this concatenates the notebook-level evnt table, not the
    # filtered tbl argument.
    tbl = pd.concat([ evnt[['Date','Event']], trans.loc[playerId] ],ignore_index=True)
    tbl.sort_values(by=['Date'],inplace=True,ignore_index=True)    
    
    
    return tbl.columns

In [87]:
plyr = evnt.loc[evnt.PlayerId == 'troum001']

In [90]:
plyrH = plyr.loc[plyr.Event != 'Start']

In [99]:
v = plyrH.groupby(plyrH.Date.dt.date).apply(CalcGamePoints)

In [105]:
pd.DataFrame({ 'Date': pd.to_datetime(v.index), 'Points': v }).dtypes

Date      datetime64[ns]
Points           float64
dtype: object

In [111]:
pd.to_datetime(v.index) + pd.Timedelta('23 hours')

DatetimeIndex(['2011-07-09 23:00:00', '2011-07-15 23:00:00',
               '2011-07-16 23:00:00', '2011-07-20 23:00:00',
               '2011-07-22 23:00:00', '2011-07-24 23:00:00',
               '2011-08-19 23:00:00', '2011-08-20 23:00:00',
               '2011-08-21 23:00:00', '2011-08-24 23:00:00',
               ...
               '2019-08-21 23:00:00', '2019-08-24 23:00:00',
               '2019-08-25 23:00:00', '2019-08-27 23:00:00',
               '2019-08-30 23:00:00', '2019-08-31 23:00:00',
               '2019-09-01 23:00:00', '2019-09-03 23:00:00',
               '2019-09-05 23:00:00', '2019-09-07 23:00:00'],
              dtype='datetime64[ns]', name='Date', length=1035, freq=None)

In [86]:
evnt.iloc[0].Date.hour

12

In [12]:
@CacheResult
def ProcessTargetSeason():
    """Run ProcessEventFile over every entry of eventFiles and stack the results.

    Reads the notebook-level ``eventFiles`` and ``tgtSeasonPathStr``.

    Returns:
        DataFrame with the rows of all files, each keeping its per-file
        index; an empty DataFrame when there are no files.
    """
    # NOTE(review): tgtSeasonPathStr is not assigned in any cell visible here
    # and ProcessEventFile is defined in a later cell -- confirm both exist
    # before running this.
    # DataFrame.append was removed in pandas 2.0; collect the per-file tables
    # and concatenate them once instead.
    frames = [ ProcessEventFile(tgtSeasonPathStr + eventFile) for eventFile in tqdm(eventFiles) ]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [13]:
tgt = ProcessTargetSeason(path=cachePathStr)

In [15]:
def OffsensiveImpact(tbl):
    """Score a table of player events with the yhooPts weights.

    Args:
        tbl: Events with ``Event``, ``EndBase`` and ``Rbi`` columns.

    Returns:
        The weighted count of 'Hit' events (weighted by the base reached:
        1 -> '1B', 2 -> '2B', 3 -> '3B', 4 -> 'HR') and 'Walk' events, plus
        the Rbi total multiplied by the 'RBI' weight.
    """
    isHit = tbl.Event == 'Hit'

    impact = 0
    for endBase, weightKey in ((1, '1B'), (2, '2B'), (3, '3B'), (4, 'HR')):
        hitsToBase = (isHit & (tbl.EndBase == endBase)).sum()
        impact += hitsToBase * yhooPts[weightKey]

    impact += (tbl.Event == 'Walk').sum() * yhooPts['BB']
    impact += tbl.Rbi.sum() * yhooPts['RBI']

    return impact

In [16]:
@CacheResult
def CalcOffsensiveImpact():
    """Apply OffsensiveImpact to every PlayerId group of the notebook-level tgt table."""
    eventsByPlayer = tgt.groupby('PlayerId')
    return eventsByPlayer.progress_apply(OffsensiveImpact)

In [17]:
imp = CalcOffsensiveImpact(path=cachePathStr)
imp = imp[imp.index.isin(offPlayers)]

In [18]:
impPercentile = 0.1
impPercentileRank = int(impPercentile * len(imp))
threshold = imp.sort_values(ascending=False).iloc[impPercentileRank]

playerLabel = pd.DataFrame({'Super': (imp >= threshold)},index = imp.index)

### Save Data

In [36]:
baseballDataFileStr = '/Users/gmessier/data/baseball/2019SeasonClassification.hdf'
tbl.to_hdf(baseballDataFileStr,key='Features',mode='w')
playerLabel.to_hdf(baseballDataFileStr,key='Labels')

In [37]:
# Rename on a copy: renaming in place through an alias would also change
# playerLabel, so re-running the earlier save cell would write an 'Acute'
# column instead of 'Super'.
label = playerLabel.rename(columns={'Super': 'Acute'})

In [38]:
translate = { 
    'FreeAgent': 'Referral', 'Start': 'Stay', 'HR': 'MajorEvent', 
    'Released': 'Referral', 'Draft': 'Referral', 'Trade': 'Referral',
    'Waivers': 'Referral', 'CndDeal': 'Referral', 'Purchase': 'Referral',
    'TopPlayer': 'AdverseOutcome'
}

In [39]:
tblGeneric = tbl.replace(translate)

In [40]:
baseballDataFileStr = '/Users/gmessier/data/baseball/2019GenericClassification.hdf'
tblGeneric.to_hdf(baseballDataFileStr,key='Features',mode='w')
label.to_hdf(baseballDataFileStr,key='Labels')

# Unused Code
- Start of a more holistic Retrosheet processing routine that includes positions on base.

In [None]:
# Patterns applied to the fields of a 'play' record.  Raw strings keep the
# regex escapes (\d, \() from being read as invalid Python string escapes.
hitRegex = re.compile(r"^(S\d|D\d|T\d|DGR|HR|HP)")  # For now, includes being hit by a pitch.
runRegex = re.compile(r"\d-(?:\d|H)")  # Runner advance: start base, '-', end base or 'H'.
stealRegex = re.compile(r"SB(?:\d|H)")
singleRegex = re.compile(r"^S|^HP")
doubleRegex = re.compile(r"^D")
tripleRegex = re.compile(r"^T")
homerunRegex = re.compile(r"^H")
rbiRegex = re.compile(r"-H")
bbRegex = re.compile(r"^(?:IW|W(?:[^P]|$))")  # 'W' or 'IW', but not 'WP'.
strikeOutRegex = re.compile(r"^K")
errorRegex = re.compile(r"E\d")
pitchRegex = re.compile(r"(B|C|F|H|I|K|L|M|N|O|P|Q|R|S|T|U|V|X|Y)")
fieldOutRegex = re.compile(r"^(?:\d+(?:\(.\))*)+")

In [8]:
allFiles = os.listdir(tgtSeasonPathStr)
fileRegex = re.compile('EV(A|N)')
eventFiles = [ x for x in allFiles if fileRegex.search(x) ]

In [10]:
def CreatePlayerEvent(eventType, position, playerId, teamId,
                      pitchCount = np.nan, rbi = np.nan, startBase = np.nan, endBase = np.nan,
                      outs = np.nan, players = np.nan, score = [ 0, 0 ]):
    """Assemble the player-specific fields of one event row.

    Args:
        eventType: Stored under 'Event'.
        position: Stored under 'Position'.
        playerId: Stored under 'PlayerId'.
        teamId: Stored under 'PlayerTeam'.
        pitchCount, rbi, startBase, endBase, outs, players: Optional details,
            NaN when not supplied.
        score: Two-element sequence; element 0 is stored as 'AwayScore' and
            element 1 as 'HomeScore'.

    Returns:
        Dictionary with the keys HomeScore, AwayScore, PlayerId, Position,
        PlayerTeam, Event, PitchCount, Rbi, StartBase, EndBase, Outs and
        Players, in that order.
    """
    # score is only read, never modified, so the shared default list is safe.
    event = dict(HomeScore = score[1], AwayScore = score[0])
    event.update(PlayerId = playerId, Position = position)
    event.update(PlayerTeam = teamId, Event = eventType)
    event.update(PitchCount = pitchCount, Rbi = rbi)
    event.update(StartBase = startBase, EndBase = endBase)
    event.update(Outs = outs, Players = players)
    return event


In [11]:
def ProcessEventFile(eventFileStr):
    """Parse one event file into a table of per-play player events.

    Emits one row for every runner advance ('Run'), hit ('Hit'), stolen base
    ('Steal'), walk ('Walk') and strikeout ('StrikeOut') found in the 'play'
    records.  Each row carries the game id, a nominal timestamp, both team
    codes, the inning and half inning, plus the fields produced by
    CreatePlayerEvent.

    The nominal clock of a game starts at 12:00, 15:01 or 18:02 depending on
    whether the game id ends in '0', '1' or '2', and advances six minutes per
    play.  Runner identities are not tracked yet, so 'Run' and 'Steal' rows
    use the placeholder player id 'TBD'.

    Args:
        eventFileStr: Path of the event file to read.

    Returns:
        DataFrame with one row per extracted event.

    Raises:
        SystemExit: If a game id ends in a character other than '0', '1', '2'.
    """
    rowList = []
    topBottom = [ 'Top', 'Bottom' ]

    # The context manager guarantees the file is closed, even on a parse error.
    with open(eventFileStr) as fp:
        for line in fp:
            # Drop the line terminator before splitting so the last field is
            # complete even when the final line has no trailing newline.
            flds = line.rstrip('\r\n').split(',')

            if flds[0] == 'id':
                gameStr = flds[1]
                score = [ 0, 0 ]    # [ away, home ]
                teams = [ '', '' ]  # [ away, home ]

                dateStr = gameStr[3:7] + '-' + gameStr[7:9] + '-' + gameStr[9:11]
                if gameStr[-1] == '0':
                    curTime = pd.to_datetime(dateStr + ' 12:00')
                elif gameStr[-1] == '1':
                    curTime = pd.to_datetime(dateStr + ' 15:01')
                elif gameStr[-1] == '2':
                    curTime = pd.to_datetime(dateStr + ' 18:02')
                else:
                    sys.exit("ERROR: Can't handle more than a triple header.")

            if flds[0] == 'info':
                if flds[1] == 'visteam':
                    teams[0] = flds[2]
                if flds[1] == 'hometeam':
                    teams[1] = flds[2]

            if flds[0] == 'play':

                # Field 2 selects the batting side; it indexes teams, score
                # and topBottom (0 -> away / 'Top', 1 -> home / 'Bottom').
                atBatId = int(flds[2])
                result = flds[6]
                batter = flds[3]
                pitchCount = len(pitchRegex.findall(flds[5]))

                timeInfo = {
                    'GameId': gameStr, 'Time': curTime,
                    'HomeTeam': teams[1], 'AwayTeam': teams[0],
                    'Inning': int(flds[1]), 'TopBottom': topBottom[atBatId]
                }

                # --------- Run -----------
                for runStr in runRegex.findall(result):

                    startBase = int(runStr[0])
                    if runStr[-1] == 'H':
                        endBase = 4
                        score[atBatId] += 1
                    else:
                        endBase = int(runStr[-1])

                    eventInfo = CreatePlayerEvent(
                        'Run',
                        position = 'Runner',
                        playerId = 'TBD',  # Need to implement on base functionality.
                        teamId = teams[atBatId],
                        pitchCount = pitchCount,
                        rbi = len(rbiRegex.findall(result)),
                        # Record the base the runner left; it was computed
                        # above but previously never stored in the row.
                        startBase = startBase,
                        endBase = endBase,
                        score = score)

                    rowList.append({ **timeInfo, **eventInfo })

                # --------- Hit -----------
                if hitRegex.search(result):

                    if singleRegex.search(result):
                        endBase = 1
                    elif doubleRegex.search(result):
                        endBase = 2
                    elif tripleRegex.search(result):
                        endBase = 3
                    else:
                        # Home run: the batter scores as well.
                        endBase = 4
                        score[atBatId] += 1

                    eventInfo = CreatePlayerEvent(
                        'Hit',
                        position = 'Batter',
                        playerId = batter,
                        teamId = teams[atBatId],
                        pitchCount = pitchCount,
                        rbi = len(rbiRegex.findall(result)),
                        endBase = endBase,
                        score = score)

                    rowList.append({ **timeInfo, **eventInfo })

                # --------- Steal -----------
                for stealStr in stealRegex.findall(result):

                    if stealStr[-1] == 'H':
                        endBase = 4
                        score[atBatId] += 1
                    else:
                        endBase = int(stealStr[-1])
                    startBase = endBase-1

                    eventInfo = CreatePlayerEvent(
                        'Steal',
                        position = 'Runner',
                        playerId = 'TBD',  # Need to implement on base functionality.
                        teamId = teams[atBatId],
                        startBase = startBase,
                        endBase = endBase,
                        score = score)

                    rowList.append({ **timeInfo, **eventInfo })

                # --------- Base on Balls -----------
                if bbRegex.search(result):

                    eventInfo = CreatePlayerEvent(
                        'Walk',
                        position = 'Batter',
                        playerId = batter,
                        teamId = teams[atBatId],
                        pitchCount = pitchCount,
                        endBase = 1,
                        score = score)

                    rowList.append({ **timeInfo, **eventInfo })

                # --------- Strike Out -----------
                if strikeOutRegex.search(result):

                    eventInfo = CreatePlayerEvent(
                        'StrikeOut',
                        position = 'Batter',
                        playerId = batter,
                        teamId = teams[atBatId],
                        pitchCount = pitchCount,
                        score = score)

                    rowList.append({ **timeInfo, **eventInfo })

                curTime += pd.Timedelta('6 min')

    return pd.DataFrame(rowList)
   