In [1]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Data import

Assumes the data lives in `DATA_DIR` and keeps the directory and file structure of the event-file archives as downloaded from [retrosheet.org](https://www.retrosheet.org).

In [2]:
# Root directory of the downloaded data, relative to this notebook.
DATA_DIR = "../data"

In [3]:
import glob

# glob returns matches in arbitrary, OS-dependent order; sort them so that the
# positional row labels used in later cells (e.g. plays.loc[...]) are
# reproducible from one run/machine to the next.
all_files = sorted(glob.glob("{}/*seve/*.EV*".format(DATA_DIR)))

Constrain to years of interest

In [4]:
# Years to keep: 2010 through 2015 inclusive (the range end is exclusive).
years = [*range(2010, 2016)]

In [5]:
# The first four characters of each file name are read as the year.
# os.path.basename copes with the platform's path separator, whereas
# split('/') fails on Windows, where glob returns backslash-separated paths.
year_files = [f for f in all_files if int(os.path.basename(f)[:4]) in years]

Parse game information, including plays and lineup changes.

In [6]:
from io import StringIO

def parse_file(filename):
    """Parse one event file into game-info, play and lineup tables.

    The file is read line by line and each line is dispatched on its record
    type (the text before the first comma):

    * ``id``    -- starts a new game: the last field becomes the current game
      ID and the lineup counter restarts at 0.
    * ``info``  -- copied to the game-info table, prefixed with the game ID.
    * ``start`` -- copied to the lineup table, prefixed with the game ID and
      the current lineup counter.
    * ``sub``   -- increments the lineup counter, then copied to the lineup
      table in the same way as ``start``.
    * ``play``  -- copied to the play table, prefixed with the game ID and the
      lineup counter in effect when the play occurs.

    Lines of any other record type are ignored.

    Parameters
    ----------
    filename : str
        Path of the event file to read.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(game_info, plays, lineups)`` where ``game_info`` has one row per
        game (indexed by ``Game_ID``, one column per info variable), ``plays``
        has one row per ``play`` record and ``lineups`` has one row per
        ``start``/``sub`` record.
    """
    game_info_io, game_play_io, lineup_io = buffers = [StringIO() for _ in range(3)]
    game_id = ''
    lineup_id = 0
    with open(filename) as f:
        for line in f:
            fields = line.rstrip('\n').split(',')
            # Match the record type exactly instead of by prefix.
            record_type, payload = fields[0], fields[1:]
            if record_type == 'id':
                game_id = fields[-1]
                # Every game starts again from lineup 0.
                lineup_id = 0
            elif record_type == 'info':
                game_info_io.write(','.join([game_id] + payload) + '\n')
            elif record_type == 'start':
                lineup_io.write(','.join([game_id, str(lineup_id)] + payload) + '\n')
            elif record_type == 'sub':
                # Each substitution defines a new lineup number; plays that
                # follow are tagged with it.
                lineup_id += 1
                lineup_io.write(','.join([game_id, str(lineup_id)] + payload) + '\n')
            elif record_type == 'play':
                game_play_io.write(','.join([game_id, str(lineup_id)] + payload) + '\n')

    # "rewind" to the beginning of the StringIO objects so read_csv sees the
    # text that was just written
    for buffer in buffers:
        buffer.seek(0)

    # pivot must be called with keywords: its arguments are keyword-only from
    # pandas 2.0 on, so the positional form raises TypeError there.
    game_info = pd.read_csv(
        game_info_io, header=None, names=['Game_ID', 'Var', 'Value'],
    ).pivot(index='Game_ID', columns='Var', values='Value')

    # NOTE(review): column types are inferred by read_csv, so a Count such as
    # "01" is read as the integer 1 (the leading zero is dropped) -- confirm
    # that downstream code does not need the two-character form.
    game_plays = pd.read_csv(
        game_play_io, header=None, index_col=False,
        names=['Game_ID', 'Lineup_ID', 'Inning', 'Home', 'Retrosheet_ID',
               'Count', 'Pitches', 'Play'])

    game_lineups = pd.read_csv(
        lineup_io, header=None, index_col=False,
        names=['Game_ID', 'Lineup_ID', 'Retrosheet_ID', 'Name', 'Home',
               'Order', 'Position'])

    return (game_info, game_plays, game_lineups)

Parse all files

In [7]:
# One (game_info, plays, lineups) tuple per selected event file.
parsed_files = list(map(parse_file, year_files))

Concatenate game info, plays and lineup data

In [8]:
# Keep the Game_ID index that parse_file's pivot puts on each game-info frame.
# With ignore_index=True the game IDs are replaced by 0..n-1 and a row can no
# longer be tied back to its plays or lineups.
games = pd.concat([game_info for game_info, _, _ in parsed_files])
games.shape

(14579, 27)

In [9]:
# Stack the per-file play tables into one frame with a fresh 0..n-1 index.
plays = pd.concat(
    [file_plays for _, file_plays, _ in parsed_files],
    ignore_index=True,
)
plays.shape

(1304164, 8)

In [10]:
# Peek at a run of consecutive plays (label slice, both ends inclusive).
plays.loc[301:320, :]

Unnamed: 0,Game_ID,Lineup_ID,Inning,Home,Retrosheet_ID,Count,Pitches,Play
301,ANA201004080,2,7,0,hudso001,1,C+3X,43/G
302,ANA201004080,2,7,0,mauej001,21,BBCX,53/G
303,ANA201004080,2,7,1,aybae001,0,,NP
304,ANA201004080,3,7,1,aybae001,12,.LFFFBT,K
305,ANA201004080,3,7,1,abreb001,22,CFBFBFX,8/F
306,ANA201004080,3,7,1,huntt001,2,CFFS,K
307,ANA201004080,3,8,0,mornj001,0,,NP
308,ANA201004080,4,8,0,mornj001,32,.CBBBCFFB,W
309,ANA201004080,4,8,0,cuddm001,10,BX,S7/G.1-2
310,ANA201004080,4,8,0,thomj002,0,X,HR/89/F.2-H;1-H


In [11]:
# Stack the per-file lineup tables into one frame with a fresh 0..n-1 index.
lineups = pd.concat(
    [file_lineups for _, _, file_lineups in parsed_files],
    ignore_index=True,
)
lineups.shape

(435719, 7)

Create hierarchical index for lineups

In [12]:
# Move the four key columns into a hierarchical (multi-level) row index.
lineups_hi = lineups.set_index(
    keys=['Game_ID', 'Lineup_ID', 'Home', 'Order'],
)
lineups_hi.iloc[:25]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Retrosheet_ID,Name,Position
Game_ID,Lineup_ID,Home,Order,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANA201004050,0,0,1,spand001,Denard Span,8
ANA201004050,0,0,2,hudso001,Orlando Hudson,4
ANA201004050,0,0,3,mauej001,Joe Mauer,2
ANA201004050,0,0,4,mornj001,Justin Morneau,3
ANA201004050,0,0,5,cuddm001,Michael Cuddyer,9
ANA201004050,0,0,6,kubej002,Jason Kubel,10
ANA201004050,0,0,7,yound003,Delmon Young,7
ANA201004050,0,0,8,hardj003,J.J. Hardy,6
ANA201004050,0,0,9,puntn001,Nick Punto,5
ANA201004050,0,0,0,bakes002,Scott Baker,1


Function for constructing a given lineup from lineup changes

In [13]:
def get_lineup(game_id, lineup_id, data=None):
    """Reconstruct the lineup in effect for a given game and lineup number.

    Starts from the rows stored under lineup 0 of the game and applies the
    rows stored under lineup numbers 1..``lineup_id`` in order; each change
    overwrites the row that has the same (Home, Order) index.

    Parameters
    ----------
    game_id : str
        Value of the ``Game_ID`` index level to select.
    lineup_id : int
        Number of the lineup to reconstruct; 0 returns the lineup-0 rows.
    data : DataFrame, optional
        Lineup table indexed by (Game_ID, Lineup_ID, Home, Order). When
        omitted, the notebook-level ``lineups_hi`` is looked up at call time,
        so rebuilding that table does not leave this function bound to a
        stale copy.

    Returns
    -------
    DataFrame or None
        A copy of the reconstructed lineup indexed by (Home, Order), or
        ``None`` (after printing a message) when the game has no rows for one
        of the lineup numbers up to ``lineup_id``.

    Raises
    ------
    KeyError
        If ``game_id`` is not present in ``data``.
    """
    if data is None:
        data = lineups_hi
    game_data = data.loc[game_id]
    # Copy so that applying changes never writes back into the source table.
    current_lineup = game_data.loc[0].copy()
    for change_id in range(1, lineup_id + 1):
        try:
            lineup_change = game_data.loc[change_id]
        except (KeyError, IndexError):
            # .loc raises KeyError (not IndexError) for a label that does not
            # exist; IndexError is still caught for backward compatibility.
            print('Invalid lineup number', lineup_id)
            return None
        current_lineup.loc[lineup_change.index] = lineup_change
    return current_lineup

For example, pick an arbitrary play and reconstruct the lineup that was in effect at the time:

In [18]:
# Show every column of a single play, selected by its row label.
plays.loc[24776, :]

Game_ID          BAL201006220
Lineup_ID                   4
Inning                      8
Home                        0
Retrosheet_ID        stanm004
Count                      11
Pitches                  .CBX
Play                     S7/G
Name: 24776, dtype: object

In [19]:
# Game_ID and Lineup_ID of the play displayed above.
get_lineup(game_id='BAL201006220', lineup_id=4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Retrosheet_ID,Name,Position
Home,Order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,coghc001,Chris Coghlan,7
0,2,sancg001,Gaby Sanchez,3
0,3,ramih003,Hanley Ramirez,6
0,4,cantj001,Jorge Cantu,10
0,5,uggld001,Dan Uggla,4
0,6,rossc001,Cody Ross,8
0,7,stanm004,Mike Stanton,9
0,8,helmw001,Wes Helms,5
0,9,paulr001,Ronny Paulino,2
0,0,tankt001,Taylor Tankersley,1
