## Predicting 2023 MLB Season: 01 - Get Data + Data Wrangling
This notebook wrangles data downloaded from www.retrosheet.org into a dataframe for model building. Specifically, for each game, calculate team statistics over their past 162 and 30 games.

The resulting dataframe is saved to a file. This file will be the starting point for the next notebook, in which model v1 will be  built.

The game logs in the <raw_data> folder can be found here: https://www.retrosheet.org/gamelogs/index.html


In [1]:
import numpy as np
import pandas as pd

# Display more data
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

In [2]:
# Testing one of the csvs
fname = './raw_data/gl2000.txt'
df = pd.read_csv(fname, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160
0,20000329,0,Wed,CHN,NL,1,NYN,NL,1,5,3,54,N,,,,TOK01,55000,206,100010210,001000020,33,12,1,0,2,5,1,0,1,10,0,5,1,0,4,0,13,3,3,3,0,0,27,10,2,0,1,0,33,7,1,0,1,3,1,1,0,3,0,4,0,0,1,0,8,5,5,5,1,0,27,12,0,0,4,0,marsr901,Randy Marsh,herna901,Angel Hernandez,fostm901,Marty Foster,kulpr901,Ron Kulpa,,(none),,(none),bayld001,Don Baylor,valeb102,Bobby Valentine,liebj001,Jon Lieber,hampm001,Mike Hampton,aguir001,Rick Aguilera,andrs001,Shane Andrews,liebj001,Jon Lieber,hampm001,Mike Hampton,youne001,Eric Young,4,bufod001,Damon Buford,8,gracm001,Mark Grace,3,sosas001,Sammy Sosa,9,rodrh001,Henry Rodriguez,7,andrs001,Shane Andrews,5,nievj002,Jose Nieves,6,giraj001,Joe Girardi,2,liebj001,Jon Lieber,1,hendr001,Rickey Henderson,7,hamid001,Darryl Hamilton,8,alfoe001,Edgardo Alfonzo,4,piazm001,Mike Piazza,2,ventr001,Robin Ventura,5,belld001,Derek Bell,9,zeilt001,Todd Zeile,3,ordor001,Rey Ordonez,6,hampm001,Mike Hampton,1,,Y
1,20000330,0,Thu,NYN,NL,2,CHN,NL,2,5,1,66,N,,,,TOK01,55000,235,1000004,00001000000,37,6,2,0,1,5,1,1,1,8,0,5,1,0,0,0,10,5,0,0,0,0,33,14,2,0,2,0,36,5,0,0,0,0,2,0,0,6,1,9,0,0,2,0,10,7,5,5,0,0,33,14,0,0,0,0,herna901,Angel Hernandez,fostm901,Marty Foster,kulpr901,Ron Kulpa,marsr901,Randy Marsh,,(none),,(none),valeb102,Bobby Valentine,bayld001,Don Baylor,cookd001,Dennis Cook,yound002,Danny Young,,(none),agbab001,Benny Agbayani,reedr002,Rick Reed,farnk001,Kyle Farnsworth,hendr001,Rickey Henderson,7,hamid001,Darryl Hamilton,8,alfoe001,Edgardo Alfonzo,4,piazm001,Mike Piazza,2,ventr001,Robin Ventura,5,belld001,Derek Bell,9,zeilt001,Todd Zeile,3,ordor001,Rey Ordonez,6,reedr002,Rick Reed,1,youne001,Eric Young,4,bufod001,Damon Buford,8,sosas001,Sammy Sosa,9,gracm001,Mark Grace,3,rodrh001,Henry Rodriguez,7,andrs001,Shane Andrews,5,husoj001,Jeff Huson,6,giraj001,Joe Girardi,2,farnk001,Kyle Farnsworth,1,,Y
2,20000403,0,Mon,COL,NL,1,ATL,NL,1,0,2,51,D,,,,ATL02,42255,134,0,00000020x,31,6,2,0,0,0,1,0,0,2,2,7,0,0,1,0,7,3,2,2,1,0,24,10,0,0,1,0,30,7,0,0,2,2,0,0,1,1,0,6,1,0,1,0,6,2,0,0,0,0,27,12,0,0,1,0,hirsj901,John Hirschbeck,willc901,Charlie Williams,wegnm901,Mark Wegner,reynj901,Jim Reynolds,,(none),,(none),bellb001,Buddy Bell,cox-b103,Bobby Cox,maddg002,Greg Maddux,astap001,Pedro Astacio,remlm001,Mike Remlinger,galaa001,Andres Galarraga,astap001,Pedro Astacio,maddg002,Greg Maddux,goodt001,Tom Goodwin,8,lansm001,Mike Lansing,4,walkl001,Larry Walker,9,cirij001,Jeff Cirillo,5,heltt001,Todd Helton,3,hammj001,Jeffrey Hammonds,7,peren001,Neifi Perez,6,maynb001,Brent Mayne,2,astap001,Pedro Astacio,1,veraq001,Quilvio Veras,4,sandr002,Reggie Sanders,7,jonec004,Chipper Jones,5,jordb001,Brian Jordan,9,galaa001,Andres Galarraga,3,jonea002,Andruw Jones,8,peree002,Eddie Perez,2,weisw001,Walt Weiss,6,maddg002,Greg Maddux,1,,Y
3,20000403,0,Mon,MIL,NL,1,CIN,NL,1,3,3,31,D,,,,CIN08,55596,111,2100,21000x,22,7,1,0,0,2,0,0,0,5,0,1,1,0,0,0,8,1,3,3,0,0,15,5,0,0,0,0,19,5,1,0,1,3,0,0,0,1,0,4,0,0,0,0,2,2,2,2,0,0,16,8,2,0,0,0,marsr901,Randy Marsh,herna901,Angel Hernandez,fostm901,Marty Foster,kulpr901,Ron Kulpa,,(none),,(none),loped001,Davey Lopes,mckej801,Jack McKeon,,,,,,(none),,(none),woods001,Steve Woodard,harnp001,Pete Harnisch,grism001,Marquis Grissom,8,lorem001,Mark Loretta,6,burnj001,Jeromy Burnitz,9,jenkg001,Geoff Jenkins,7,hernj001,Jose Hernandez,5,barkk001,Kevin Barker,3,bellr002,Ronnie Belliard,4,blanh001,Henry Blanco,2,woods001,Steve Woodard,1,reesp001,Pokey Reese,4,larkb001,Barry Larkin,6,grifk002,Ken Griffey,8,bichd001,Dante Bichette,9,yound001,Dmitri Young,3,taube001,Ed Taubensee,2,boona001,Aaron Boone,5,tuckm001,Michael Tucker,7,harnp001,Pete Harnisch,1,,Y
4,20000403,0,Mon,SFN,NL,1,FLO,NL,1,4,6,51,N,,,,MIA01,35101,166,2100001,20002101x,35,10,2,2,1,4,0,0,0,1,0,8,0,0,2,0,5,2,4,4,0,0,24,7,2,0,1,0,36,12,3,0,0,5,0,0,1,1,0,7,1,0,1,0,8,3,4,4,0,0,27,15,0,0,2,0,demud901,Dana DeMuth,relic901,Charlie Reliford,eddid901,Doug Eddings,carlm901,Mark Carlson,,(none),,(none),baked002,Dusty Baker,bolej801,John Boles,ferna001,Alex Fernandez,hernl003,Livan Hernandez,alfoa001,Antonio Alfonseca,,(none),hernl003,Livan Hernandez,ferna001,Alex Fernandez,benam001,Marvin Benard,8,muelb001,Bill Mueller,5,bondb001,Barry Bonds,7,kentj001,Jeff Kent,4,snowj001,J.T. Snow,3,burke001,Ellis Burks,9,aurir001,Rich Aurilia,6,estab001,Bobby Estalella,2,hernl003,Livan Hernandez,1,castl001,Luis Castillo,4,gonza002,Alex Gonzalez,6,floyc001,Cliff Floyd,7,wilsp002,Preston Wilson,8,lowem001,Mike Lowell,5,millk005,Kevin Millar,3,browb003,Brant Brown,9,redmm001,Mike Redmond,2,ferna001,Alex Fernandez,1,,Y


In [3]:
# Add column names to df
colnames = ['date','dblheader_code','day_of_week','team_v','league_v','game_no_v',
           'team_h','league_h','game_no_h', 'runs_v', 'runs_h','outs_total','day_night',
            'completion_info','forfeit_info','protest_info','ballpark_id','attendance','game_minutes',
            'linescore_v','linescore_h',
           'AB_v','H_v','2B_v','3B_v','HR_v','RBI_v','SH_v','SF_v','HBP_v','BB_v','IBB_v','SO_v',
            'SB_v', 'CS_v','GIDP_v','CI_v','LOB_v',
            'P_num_v','ERind_v','ERteam_v','WP_v','balk_v',
            'PO_v','ASST_v','ERR_v','PB_v','DP_v','TP_v',
           'AB_h', 'H_h', '2B_h', '3B_h', 'HR_h', 'RBI_h', 'SH_h', 'SF_h', 'HBP_h', 'BB_h', 'IBB_h','SO_h',
            'SB_h', 'CS_h', 'GIDP_h', 'CI_h', 'LOB_h',
            'P_num_h', 'ERind_h', 'ERteam_h', 'WP_h', 'balk_h',
            'PO_h', 'ASST_h', 'ERR_h', 'PB_h', 'DP_h', 'TP_h',
            'ump_HB_id', 'ump_HB_name','ump_1B_id', 'ump_1B_name','ump_2B_id', 'ump_2B_name',
            'ump_3B_id', 'ump_3B_name','ump_LF_id', 'ump_LF_name','ump_RF_id', 'ump_RF_name',
            'mgr_id_v', 'mgr_name_v', 'mgr_id_h', 'mgr_name_h',
            'pitcher_id_w','pitcher_name_w','pitcher_id_l','pitcher_name_l','pitcher_id_s','pitcher_name_s',
            'GWRBI_id','GWRBI_name','pitcher_start_id_v','pitcher_start_name_v','pitcher_start_id_h','pitcher_start_name_h',
            'batter1_name_v', 'batter1_id_v', 'batter1_pos_v', 'batter2_name_v', 'batter2_id_v', 'batter2_pos_v',
            'batter3_name_v', 'batter3_id_v', 'batter3_pos_v', 'batter4_name_v', 'batter4_id_v', 'batter4_pos_v',
            'batter5_name_v', 'batter5_id_v', 'batter5_pos_v', 'batter6_name_v', 'batter6_id_v', 'batter6_pos_v',
            'batter7_name_v', 'batter7_id_v', 'batter7_pos_v', 'batter8_name_v', 'batter8_id_v', 'batter8_pos_v',
            'batter9_name_v', 'batter9_id_v', 'batter9_pos_v', 'batter1_name_h', 'batter1_id_h', 'batter1_pos_h',
            'batter2_name_h', 'batter2_id_h', 'batter2_pos_h', 'batter3_name_h', 'batter3_id_h', 'batter3_pos_h',
            'batter4_name_h', 'batter4_id_h', 'batter4_pos_h', 'batter5_name_h', 'batter5_id_h', 'batter5_pos_h',
            'batter6_name_h', 'batter6_id_h', 'batter6_pos_h', 'batter7_name_h', 'batter7_id_h', 'batter7_pos_h',
            'batter8_name_h', 'batter8_id_h', 'batter8_pos_h', 'batter9_name_h', 'batter9_id_h', 'batter9_pos_h',           
           'misc_info','acqui_info'
           ]

df.columns = colnames
df.sample(10)

Unnamed: 0,date,dblheader_code,day_of_week,team_v,league_v,game_no_v,team_h,league_h,game_no_h,runs_v,runs_h,outs_total,day_night,completion_info,forfeit_info,protest_info,ballpark_id,attendance,game_minutes,linescore_v,linescore_h,AB_v,H_v,2B_v,3B_v,HR_v,RBI_v,SH_v,SF_v,HBP_v,BB_v,IBB_v,SO_v,SB_v,CS_v,GIDP_v,CI_v,LOB_v,P_num_v,ERind_v,ERteam_v,WP_v,balk_v,PO_v,ASST_v,ERR_v,PB_v,DP_v,TP_v,AB_h,H_h,2B_h,3B_h,HR_h,RBI_h,SH_h,SF_h,HBP_h,BB_h,IBB_h,SO_h,SB_h,CS_h,GIDP_h,CI_h,LOB_h,P_num_h,ERind_h,ERteam_h,WP_h,balk_h,PO_h,ASST_h,ERR_h,PB_h,DP_h,TP_h,ump_HB_id,ump_HB_name,ump_1B_id,ump_1B_name,ump_2B_id,ump_2B_name,ump_3B_id,ump_3B_name,ump_LF_id,ump_LF_name,ump_RF_id,ump_RF_name,mgr_id_v,mgr_name_v,mgr_id_h,mgr_name_h,pitcher_id_w,pitcher_name_w,pitcher_id_l,pitcher_name_l,pitcher_id_s,pitcher_name_s,GWRBI_id,GWRBI_name,pitcher_start_id_v,pitcher_start_name_v,pitcher_start_id_h,pitcher_start_name_h,batter1_name_v,batter1_id_v,batter1_pos_v,batter2_name_v,batter2_id_v,batter2_pos_v,batter3_name_v,batter3_id_v,batter3_pos_v,batter4_name_v,batter4_id_v,batter4_pos_v,batter5_name_v,batter5_id_v,batter5_pos_v,batter6_name_v,batter6_id_v,batter6_pos_v,batter7_name_v,batter7_id_v,batter7_pos_v,batter8_name_v,batter8_id_v,batter8_pos_v,batter9_name_v,batter9_id_v,batter9_pos_v,batter1_name_h,batter1_id_h,batter1_pos_h,batter2_name_h,batter2_id_h,batter2_pos_h,batter3_name_h,batter3_id_h,batter3_pos_h,batter4_name_h,batter4_id_h,batter4_pos_h,batter5_name_h,batter5_id_h,batter5_pos_h,batter6_name_h,batter6_id_h,batter6_pos_h,batter7_name_h,batter7_id_h,batter7_pos_h,batter8_name_h,batter8_id_h,batter8_pos_h,batter9_name_h,batter9_id_h,batter9_pos_h,misc_info,acqui_info
1911,20000826,0,Sat,HOU,NL,129,MON,NL,126,4,5,51,N,,,,MON02,8619,141,4,00010004x,35,10,1,0,0,4,0,0,0,1,0,7,0,0,1,0,5,3,5,5,0,0,24,10,0,0,1,0,30,9,2,1,1,5,2,0,0,2,0,2,0,0,1,0,5,2,4,4,0,0,27,9,0,1,1,0,hudsm901,Marvin Hudson,welkt901,Tim Welke,cedeg901,Gary Cederstrom,scotd901,Dale Scott,,(none),,(none),dierl101,Larry Dierker,alouf101,Felipe Alou,hermd001,Dustin Hermanson,limaj001,Jose Lima,klins002,Steve Kline,vidrj001,Jose Vidro,limaj001,Jose Lima,hermd001,Dustin Hermanson,ceder001,Roger Cedeno,7,lugoj001,Julio Lugo,4,bagwj001,Jeff Bagwell,3,hidar001,Richard Hidalgo,8,aloum001,Moises Alou,9,spieb001,Bill Spiers,5,euset001,Tony Eusebio,2,bogat001,Tim Bogar,6,limaj001,Jose Lima,1,jonet004,Terry Jones,7,bradm001,Milton Bradley,8,vidrj001,Jose Vidro,4,guerv001,Vladimir Guerrero,9,stevl001,Lee Stevens,3,blumg001,Geoff Blum,5,cabro001,Orlando Cabrera,6,barrm003,Michael Barrett,2,hermd001,Dustin Hermanson,1,,Y
2104,20000909,0,Sat,HOU,NL,142,CHN,NL,141,14,4,54,D,,,,CHI11,38203,173,202500131,000003100,45,19,4,0,7,14,0,0,0,3,0,11,0,0,1,0,7,3,4,4,0,0,27,7,1,0,1,0,36,9,2,1,0,4,0,0,0,1,0,3,0,0,0,0,6,5,14,14,0,0,27,9,0,0,1,0,barrt901,Ted Barrett,randt901,Tony Randazzo,monte901,Ed Montague,laynj901,Jerry Layne,,(none),,(none),dierl101,Larry Dierker,bayld001,Don Baylor,holtc001,Chris Holt,quevr001,Ruben Quevedo,,(none),bogat001,Tim Bogar,holtc001,Chris Holt,quevr001,Ruben Quevedo,lugoj001,Julio Lugo,4,bogat001,Tim Bogar,6,bagwj001,Jeff Bagwell,3,berkl001,Lance Berkman,9,hidar001,Richard Hidalgo,8,wardd002,Daryle Ward,7,trubc001,Chris Truby,5,chavr001,Raul Chavez,2,holtc001,Chris Holt,1,youne001,Eric Young,4,gutir001,Ricky Gutierrez,6,sosas001,Sammy Sosa,9,gracm001,Mark Grace,3,browr001,Roosevelt Brown,7,bufod001,Damon Buford,8,greew001,Willie Greene,5,reedj001,Jeff Reed,2,quevr001,Ruben Quevedo,1,,Y
1797,20000818,0,Fri,PIT,NL,120,CIN,NL,121,6,3,54,N,,,,CIN08,31891,167,2022,100001010,36,11,3,0,1,5,1,0,0,2,1,7,1,0,0,0,6,3,3,3,0,0,27,10,1,0,0,0,32,6,2,0,1,3,0,1,0,3,0,6,0,0,0,0,6,5,5,5,1,0,27,10,1,0,0,0,morrd901,Dan Morrison,fleta901,Andy Fletcher,,(none),timmt901,Tim Timmons,,(none),,(none),lamog101,Gene Lamont,mckej801,Jack McKeon,ritct001,Todd Ritchie,sulls001,Scott Sullivan,willm005,Mike Williams,,(none),ritct001,Todd Ritchie,wills002,Scott Williamson,browa001,Adrian Brown,8,kendj001,Jason Kendall,2,gileb002,Brian Giles,7,vandj001,John Vander Wal,3,ramia001,Aramis Ramirez,5,morrw001,Warren Morris,4,ramia002,Alex Ramirez,9,wilse001,Enrique Wilson,6,ritct001,Todd Ritchie,1,tuckm001,Michael Tucker,7,larkb001,Barry Larkin,6,grifk002,Ken Griffey,8,bichd001,Dante Bichette,9,cases001,Sean Casey,3,stync001,Chris Stynes,5,castj004,Juan Castro,4,laruj001,Jason LaRue,2,wills002,Scott Williamson,1,,Y
361,20000430,0,Sun,CHA,AL,25,DET,AL,23,3,4,70,D,,,,DET05,28435,216,10000011000,000000102001,43,10,1,0,1,3,1,0,0,4,0,13,0,1,0,0,9,5,3,3,0,0,34,19,2,0,2,0,45,15,2,0,0,4,1,0,0,4,1,4,0,1,2,0,12,5,3,3,1,0,36,10,0,0,0,0,reedr901,Rick Reed,wendh902,Hunter Wendelstedt,buckc901,CB Bucknor,vanvm901,Mike Vanvleet,,(none),,(none),nossj101,Joe Nossek,melvb001,Bob Melvin,andem002,Matt Anderson,eyres001,Scott Eyre,,(none),higgb001,Bobby Higginson,sirom001,Mike Sirotka,nomoh001,Hideo Nomo,durhr001,Ray Durham,4,valej003,Jose Valentin,6,thomf001,Frank Thomas,10,ordom001,Magglio Ordonez,9,konep001,Paul Konerko,3,singc001,Chris Singleton,8,abboj002,Jeff Abbott,7,nortg001,Greg Norton,5,johnm003,Mark Johnson,2,jeffg001,Gregg Jefferies,3,ausmb001,Brad Ausmus,2,encaj001,Juan Encarnacion,8,gonzj002,Juan Gonzalez,10,higgb001,Bobby Higginson,7,easld001,Damion Easley,4,magew001,Wendell Magee,9,halts001,Shane Halter,5,cruzd001,Deivi Cruz,6,,Y
727,20000528,0,Sun,BOS,AL,46,NYA,AL,46,2,0,54,N,,,,NYC16,55339,179,2,000000000,30,5,0,1,1,2,0,0,0,0,0,13,0,1,0,0,1,1,0,0,0,0,27,4,0,0,1,0,30,4,1,0,0,0,0,0,2,1,0,9,2,0,1,0,6,1,2,2,0,0,27,9,0,0,0,0,rapue901,Ed Rapuano,rungb901,Brian Runge,shulj901,John Shulock,millb901,Bill Miller,,(none),,(none),willj107,Jimy Williams,torrj101,Joe Torre,martp001,Pedro Martinez,clemr001,Roger Clemens,,(none),nixot001,Trot Nixon,martp001,Pedro Martinez,clemr001,Roger Clemens,fryej001,Jeff Frye,4,nixot001,Trot Nixon,9,daubb001,Brian Daubach,10,garcn001,Nomar Garciaparra,6,everc001,Carl Everett,8,stanm002,Mike Stanley,3,oleat001,Troy O'Leary,7,valej002,John Valentin,5,varij001,Jason Varitek,2,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,posaj001,Jorge Posada,2,martt002,Tino Martinez,3,spens001,Shane Spencer,10,leder001,Ricky Ledee,7,bross001,Scott Brosius,5,"umpchange,6,umphome,rungb901,6,ump1b,shulj901,...",Y
161,20000415,0,Sat,ATL,NL,11,MIL,NL,12,3,6,51,D,,,,MIL05,24755,144,10000020,20001003x,30,5,1,0,1,2,1,0,0,5,1,4,0,0,2,0,6,2,5,5,0,0,24,11,1,0,0,0,32,9,2,0,2,5,1,0,0,1,0,4,0,0,0,0,4,3,2,2,0,0,27,13,1,0,2,0,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,mcclt901,Tim McClelland,craft901,Terry Craft,,(none),,(none),cox-b103,Bobby Cox,loped001,Davey Lopes,weatd001,David Weathers,burkj001,John Burkett,wickb001,Bob Wickman,,(none),millk004,Kevin Millwood,haynj001,Jimmy Haynes,veraq001,Quilvio Veras,4,jonea002,Andruw Jones,8,lockk001,Keith Lockhart,5,galaa001,Andres Galarraga,3,bonib001,Bobby Bonilla,7,lopej001,Javy Lopez,2,sandr002,Reggie Sanders,9,furcr001,Rafael Furcal,6,millk004,Kevin Millwood,1,grism001,Marquis Grissom,8,lorem001,Mark Loretta,6,burnj001,Jeromy Burnitz,9,jenkg001,Geoff Jenkins,7,hernj001,Jose Hernandez,5,barkk001,Kevin Barker,3,bellr002,Ronnie Belliard,4,casar001,Raul Casanova,2,haynj001,Jimmy Haynes,1,,Y
1332,20000715,0,Sat,SLN,NL,90,CHA,AL,90,7,15,51,N,,,,CHI12,40681,240,110020003,30021090x,37,13,1,0,2,6,0,1,2,5,0,8,1,0,2,0,11,5,15,15,1,0,24,6,1,0,0,0,38,15,6,0,1,15,0,1,2,6,0,8,1,1,0,0,8,5,6,6,0,0,27,11,3,0,2,0,marqa901,Alfonso Marquez,iassd901,Dan Iassogna,nelsj901,Jeff Nelson,cousd901,Derryl Cousins,,(none),,(none),larut101,Tony LaRussa,manuj101,Jerry Manuel,parqj001,Jim Parque,kiled001,Darryl Kile,,(none),thomf001,Frank Thomas,kiled001,Darryl Kile,parqj001,Jim Parque,vinaf001,Fernando Vina,4,rente001,Edgar Renteria,6,edmoj001,Jim Edmonds,8,tatif001,Fernando Tatis,5,davie001,Eric Davis,10,lankr001,Ray Lankford,7,peree001,Eduardo Perez,3,mathm001,Mike Matheny,2,dunss001,Shawon Dunston,9,durhr001,Ray Durham,4,valej003,Jose Valentin,6,thomf001,Frank Thomas,10,ordom001,Magglio Ordonez,9,konep001,Paul Konerko,3,singc001,Chris Singleton,8,abboj002,Jeff Abbott,7,perrh001,Herbert Perry,5,fordb001,Brook Fordyce,2,,Y
1887,20000824,0,Thu,OAK,AL,126,CLE,AL,123,11,7,54,N,,,,CLE08,43276,231,53021000,200000203,41,12,1,0,1,10,0,0,1,6,0,10,1,0,0,0,10,4,5,5,1,0,27,9,3,0,2,0,37,11,2,0,0,6,1,0,0,9,1,9,0,0,1,0,13,5,10,10,1,0,27,5,1,0,0,0,drakr901,Rob Drake,scotd901,Dale Scott,hudsm901,Marvin Hudson,cedeg901,Gary Cederstrom,,(none),,(none),howea001,Art Howe,manuc101,Charlie Manuel,zitob001,Barry Zito,woods001,Steve Woodard,mecij001,Jim Mecir,longt002,Terrence Long,zitob001,Barry Zito,woods001,Steve Woodard,longt002,Terrence Long,8,velar001,Randy Velarde,4,giamj001,Jason Giambi,10,stanm002,Mike Stanley,3,grieb001,Ben Grieve,7,tejam001,Miguel Tejada,6,staim001,Matt Stairs,9,chave001,Eric Chavez,5,fasas001,Sal Fasano,2,loftk001,Kenny Lofton,8,vizqo001,Omar Vizquel,6,alomr001,Roberto Alomar,4,ramim002,Manny Ramirez,9,frymt001,Travis Fryman,5,segud001,David Segui,3,thomj002,Jim Thome,10,cordw001,Wil Cordero,7,aloms001,Sandy Alomar,2,,Y
1476,20000725,0,Tue,ARI,NL,100,SLN,NL,99,3,7,51,N,,,,STL09,44454,179,300000,00001600x,30,5,1,0,0,3,0,0,0,4,0,2,0,0,1,0,4,4,6,6,0,0,24,4,3,0,2,0,30,7,2,0,2,6,1,0,0,9,0,9,3,0,2,0,9,2,3,3,0,0,27,14,0,1,2,0,marsr901,Randy Marsh,fostm901,Marty Foster,kulpr901,Ron Kulpa,vanvm901,Mike Vanvleet,,(none),,(none),showb801,Buck Showalter,larut101,Tony LaRussa,stepg001,Garrett Stephenson,johnr005,Randy Johnson,,(none),tatif001,Fernando Tatis,johnr005,Randy Johnson,stepg001,Garrett Stephenson,womat001,Tony Womack,6,counc001,Craig Counsell,4,gonzl001,Luis Gonzalez,7,finls001,Steve Finley,8,willm003,Matt Williams,5,durae001,Erubiel Durazo,3,bautd001,Danny Bautista,9,stink001,Kelly Stinnett,2,johnr005,Randy Johnson,1,polap001,Placido Polanco,4,rente001,Edgar Renteria,6,edmoj001,Jim Edmonds,8,tatif001,Fernando Tatis,5,peree001,Eduardo Perez,3,dunss001,Shawon Dunston,7,paquc001,Craig Paquette,9,mathm001,Mike Matheny,2,stepg001,Garrett Stephenson,1,,Y
1351,20000716,0,Sun,ATL,NL,92,TBA,AL,89,6,4,54,D,,,,STP01,41066,149,2110002,000010300,34,8,1,0,3,6,1,0,0,4,0,8,0,0,1,0,6,3,4,4,0,0,27,7,0,0,0,0,31,5,4,0,1,4,0,1,0,4,0,8,0,0,0,0,5,2,6,6,0,0,27,14,0,0,1,0,hudsm901,Marvin Hudson,timmt901,Tim Timmons,cedeg901,Gary Cederstrom,scotd901,Dale Scott,,(none),,(none),corrp102,Pat Corrales,rothl101,Larry Rothschild,kamis001,Scott Kamieniecki,mecij001,Jim Mecir,,(none),furcr001,Rafael Furcal,millk004,Kevin Millwood,yan-e001,Esteban Yan,furcr001,Rafael Furcal,4,jonea002,Andruw Jones,8,jonec004,Chipper Jones,5,jordb001,Brian Jordan,9,galaa001,Andres Galarraga,10,joynw001,Wally Joyner,3,lopej001,Javy Lopez,2,sandr002,Reggie Sanders,7,weisw001,Walt Weiss,6,willg001,Gerald Williams,8,cox-s001,Steve Cox,3,vaugg001,Greg Vaughn,7,mcgrf001,Fred McGriff,10,tramb001,Bubba Trammell,9,johnr006,Russ Johnson,5,difem001,Mike Difelice,2,guilo001,Ozzie Guillen,6,cairm001,Miguel Cairo,4,,Y


## Create a Table with every game since 2000

In [4]:
df = pd.DataFrame()
# Concatenate every file together
for year in range(2000,2023):
    fname = f'./raw_data/gl{year}.txt'
    df_temp = pd.read_csv(fname, header=None)
    #  Assign column names
    df_temp.columns = colnames
    # Add year to df
    df_temp['season'] = year
    df = pd.concat((df, df_temp))
    
# Explore df size
df.shape

(54345, 162)

In [5]:
df.info(max_cols=200)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54345 entries, 0 to 2429
Data columns (total 162 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    date                  54345 non-null  int64  
 1    dblheader_code        54345 non-null  int64  
 2    day_of_week           54345 non-null  object 
 3    team_v                54345 non-null  object 
 4    league_v              54345 non-null  object 
 5    game_no_v             54345 non-null  int64  
 6    team_h                54345 non-null  object 
 7    league_h              54345 non-null  object 
 8    game_no_h             54345 non-null  int64  
 9    runs_v                54345 non-null  int64  
 10   runs_h                54345 non-null  int64  
 11   outs_total            54345 non-null  int64  
 12   day_night             54345 non-null  object 
 13   completion_info       43 non-null     object 
 14   forfeit_info          0 non-null      float64
 15   p

In [6]:
## Calculate additional useful columns
df['run_diff'] = df['runs_h']-df['runs_v']
df['home_victory'] = (df['run_diff']>0).astype(int)
df['run_total'] = df['runs_h'].copy()+df['runs_v'].copy()
df['date_dblhead'] = (df['date'].astype(str) + df['dblheader_code'].astype(str)).astype(int)

In [7]:
# On average, how often did the home team win?
df.home_victory.mean()

0.5388352194314104

In [8]:
# Retrieve single teams games
df_yankees = df.loc[((df.team_v=='NYA') | (df.team_h=='NYA'))]
df_yankees.shape

(3622, 166)

In [9]:
df_yankees.head(100)

Unnamed: 0,date,dblheader_code,day_of_week,team_v,league_v,game_no_v,team_h,league_h,game_no_h,runs_v,runs_h,outs_total,day_night,completion_info,forfeit_info,protest_info,ballpark_id,attendance,game_minutes,linescore_v,linescore_h,AB_v,H_v,2B_v,3B_v,HR_v,RBI_v,SH_v,SF_v,HBP_v,BB_v,IBB_v,SO_v,SB_v,CS_v,GIDP_v,CI_v,LOB_v,P_num_v,ERind_v,ERteam_v,WP_v,balk_v,PO_v,ASST_v,ERR_v,PB_v,DP_v,TP_v,AB_h,H_h,2B_h,3B_h,HR_h,RBI_h,SH_h,SF_h,HBP_h,BB_h,IBB_h,SO_h,SB_h,CS_h,GIDP_h,CI_h,LOB_h,P_num_h,ERind_h,ERteam_h,WP_h,balk_h,PO_h,ASST_h,ERR_h,PB_h,DP_h,TP_h,ump_HB_id,ump_HB_name,ump_1B_id,ump_1B_name,ump_2B_id,ump_2B_name,ump_3B_id,ump_3B_name,ump_LF_id,ump_LF_name,ump_RF_id,ump_RF_name,mgr_id_v,mgr_name_v,mgr_id_h,mgr_name_h,pitcher_id_w,pitcher_name_w,pitcher_id_l,pitcher_name_l,pitcher_id_s,pitcher_name_s,GWRBI_id,GWRBI_name,pitcher_start_id_v,pitcher_start_name_v,pitcher_start_id_h,pitcher_start_name_h,batter1_name_v,batter1_id_v,batter1_pos_v,batter2_name_v,batter2_id_v,batter2_pos_v,batter3_name_v,batter3_id_v,batter3_pos_v,batter4_name_v,batter4_id_v,batter4_pos_v,batter5_name_v,batter5_id_v,batter5_pos_v,batter6_name_v,batter6_id_v,batter6_pos_v,batter7_name_v,batter7_id_v,batter7_pos_v,batter8_name_v,batter8_id_v,batter8_pos_v,batter9_name_v,batter9_id_v,batter9_pos_v,batter1_name_h,batter1_id_h,batter1_pos_h,batter2_name_h,batter2_id_h,batter2_pos_h,batter3_name_h,batter3_id_h,batter3_pos_h,batter4_name_h,batter4_id_h,batter4_pos_h,batter5_name_h,batter5_id_h,batter5_pos_h,batter6_name_h,batter6_id_h,batter6_pos_h,batter7_name_h,batter7_id_h,batter7_pos_h,batter8_name_h,batter8_id_h,batter8_pos_h,batter9_name_h,batter9_id_h,batter9_pos_h,misc_info,acqui_info,season,run_diff,home_victory,run_total,date_dblhead
8,20000403,0,Mon,NYA,AL,1,ANA,AL,1,3,2,54,N,,,,ANA01,42704.0,182,2100,010000001,32,6,0,0,2,3,0,0,0,3,0,3,0,2,0,0,5,3,2,2,0,0,27,5,0,0,1,0,35,10,1,0,1,2,0,0,0,5,0,6,0,1,0,0,11,3,3,3,0,0,27,16,1,0,0,0,mcclt901,Tim McClelland,craft901,Terry Craft,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,,(none),,(none),torrj101,Joe Torre,sciom001,Mike Scioscia,herno001,Orlando Hernandez,hillk001,Ken Hill,rivem002,Mariano Rivera,oneip001,Paul O'Neill,herno001,Orlando Hernandez,hillk001,Ken Hill,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,martt002,Tino Martinez,3,leder001,Ricky Ledee,8,posaj001,Jorge Posada,2,spens001,Shane Spencer,7,bross001,Scott Brosius,5,erstd001,Darin Erstad,7,kenna001,Adam Kennedy,4,vaugm001,Mo Vaughn,3,salmt001,Tim Salmon,9,andeg001,Garret Anderson,8,glaut001,Troy Glaus,5,spies001,Scott Spiezio,10,molib001,Bengie Molina,2,disag001,Gary Disarcina,6,,Y,2000,-1,0,5,200004030
20,20000404,0,Tue,NYA,AL,2,ANA,AL,2,5,3,54,N,,,,ANA01,25818.0,200,101102,000003000,35,9,2,1,1,5,0,0,0,6,2,7,1,0,2,0,9,4,0,0,0,0,27,11,1,0,2,0,36,10,0,0,0,3,0,0,0,6,0,9,1,0,2,0,12,4,5,5,0,0,27,12,1,0,2,0,craft901,Terry Craft,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,mcclt901,Tim McClelland,,(none),,(none),torrj101,Joe Torre,sciom001,Mike Scioscia,mendr001,Ramiro Mendoza,perct001,Troy Percival,rivem002,Mariano Rivera,willb002,Bernie Williams,clemr001,Roger Clemens,bottk001,Kent Bottenfield,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,martt002,Tino Martinez,3,leder001,Ricky Ledee,8,posaj001,Jorge Posada,2,spens001,Shane Spencer,7,bellc001,Clay Bellinger,5,erstd001,Darin Erstad,7,kenna001,Adam Kennedy,4,vaugm001,Mo Vaughn,3,salmt001,Tim Salmon,9,andeg001,Garret Anderson,8,glaut001,Troy Glaus,5,spies001,Scott Spiezio,10,molib001,Bengie Molina,2,disag001,Gary Disarcina,6,,Y,2000,-2,0,8,200004040
34,20000405,0,Wed,NYA,AL,3,ANA,AL,3,6,12,51,N,,,,ANA01,24560.0,181,1202010,12610110x,40,13,3,0,2,6,0,0,0,0,0,5,1,0,2,0,7,4,12,12,0,0,24,8,0,0,0,0,33,12,4,0,1,12,0,2,1,8,0,4,1,0,0,0,8,3,6,6,0,0,27,13,2,0,2,0,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,mcclt901,Tim McClelland,craft901,Terry Craft,,(none),,(none),torrj101,Joe Torre,sciom001,Mike Scioscia,schos001,Scott Schoeneweis,coned001,David Cone,,(none),erstd001,Darin Erstad,coned001,David Cone,schos001,Scott Schoeneweis,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,spens001,Shane Spencer,7,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,kellb002,Roberto Kelly,8,bellc001,Clay Bellinger,5,erstd001,Darin Erstad,7,kenna001,Adam Kennedy,4,vaugm001,Mo Vaughn,3,salmt001,Tim Salmon,9,andeg001,Garret Anderson,8,glaut001,Troy Glaus,5,spies001,Scott Spiezio,10,walbm001,Matt Walbeck,2,disag001,Gary Disarcina,6,,Y,2000,6,1,18,200004050
65,20000407,0,Fri,NYA,AL,4,SEA,AL,4,5,7,51,N,,,,SEA03,40827.0,189,2010002,11003101x,35,9,4,0,2,4,0,0,0,0,0,8,0,0,1,0,3,3,7,7,0,1,24,8,0,0,0,0,35,12,2,0,2,6,0,1,0,5,1,9,3,0,0,0,10,4,4,4,0,0,27,9,1,0,1,0,randt901,Tony Randazzo,monte901,Ed Montague,laynj901,Jerry Layne,barrt901,Ted Barrett,,(none),,(none),torrj101,Joe Torre,pinil001,Lou Piniella,halaj001,John Halama,petta001,Andy Pettitte,sasak001,Kazuhiro Sasaki,olivj001,Joe Oliver,petta001,Andy Pettitte,halaj001,John Halama,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,spens001,Shane Spencer,10,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,kellb002,Roberto Kelly,7,bellc001,Clay Bellinger,5,camem001,Mike Cameron,8,javis001,Stan Javier,10,rodra001,Alex Rodriguez,6,olerj001,John Olerud,3,buhnj001,Jay Buhner,9,belld002,David Bell,4,olivj001,Joe Oliver,2,guilc001,Carlos Guillen,5,gipsc001,Charles Gipson,7,,Y,2000,2,1,12,200004070
80,20000408,0,Sat,NYA,AL,5,SEA,AL,5,3,2,54,D,,,,SEA03,45261.0,215,10000101,000011000,34,10,2,0,2,3,1,0,0,5,1,7,1,0,1,0,10,5,2,2,0,0,27,8,2,0,1,0,32,5,1,0,1,2,1,0,1,2,1,10,0,0,1,0,7,4,3,3,0,0,27,10,0,1,2,0,monte901,Ed Montague,laynj901,Jerry Layne,barrt901,Ted Barrett,randt901,Tony Randazzo,,(none),,(none),torrj101,Joe Torre,pinil001,Lou Piniella,nelsj001,Jeff Nelson,mesaj001,Jose Mesa,rivem002,Mariano Rivera,oneip001,Paul O'Neill,herno001,Orlando Hernandez,mechg001,Gil Meche,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,leder001,Ricky Ledee,7,posaj001,Jorge Posada,2,spens001,Shane Spencer,10,soria001,Alfonso Soriano,5,mclem001,Mark McLemore,4,javis001,Stan Javier,10,rodra001,Alex Rodriguez,6,olerj001,John Olerud,3,mabrj001,John Mabry,9,camem001,Mike Cameron,8,ibanr001,Raul Ibanez,7,wilsd001,Dan Wilson,2,guilc001,Carlos Guillen,5,,Y,2000,-1,0,5,200004080
94,20000409,0,Sun,NYA,AL,6,SEA,AL,6,3,9,51,D,,,,SEA03,45488.0,160,201000,00106020x,29,3,2,0,1,3,0,1,0,0,0,5,0,0,0,0,0,3,3,3,0,0,24,7,2,0,3,0,30,9,2,0,1,9,1,1,0,6,0,6,1,1,1,0,5,2,3,3,0,0,27,11,0,0,0,0,laynj901,Jerry Layne,barrt901,Ted Barrett,randt901,Tony Randazzo,monte901,Ed Montague,,(none),,(none),torrj101,Joe Torre,pinil001,Lou Piniella,moyej001,Jamie Moyer,clemr001,Roger Clemens,,(none),rodra001,Alex Rodriguez,clemr001,Roger Clemens,moyej001,Jamie Moyer,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,leyrj001,Jim Leyritz,10,kellb002,Roberto Kelly,7,soria001,Alfonso Soriano,5,mclem001,Mark McLemore,7,camem001,Mike Cameron,8,rodra001,Alex Rodriguez,6,olerj001,John Olerud,3,marte001,Edgar Martinez,10,buhnj001,Jay Buhner,9,belld002,David Bell,4,wilsd001,Dan Wilson,2,guilc001,Carlos Guillen,5,,Y,2000,6,1,12,200004090
129,20000412,0,Wed,TEX,AL,8,NYA,AL,7,6,8,51,D,,,,NYC16,48487.0,213,2002200,00004130x,33,10,2,0,0,4,1,0,1,7,0,8,0,1,3,0,9,6,7,7,0,0,24,9,1,0,0,0,38,13,2,1,1,7,0,0,0,2,0,7,1,0,0,0,8,5,5,5,0,0,27,9,1,0,3,0,timmt901,Tim Timmons,gormb901,Brian Gorman,everm901,Mike Everitt,crawj901,Jerry Crawford,,(none),,(none),oatej101,Johnny Oates,torrj101,Joe Torre,nelsj001,Jeff Nelson,munom001,Mike Munoz,rivem002,Mariano Rivera,martt002,Tino Martinez,olivd001,Darren Oliver,coned001,David Cone,clayr001,Royce Clayton,6,greer001,Rusty Greer,7,rodri001,Ivan Rodriguez,2,palmr001,Rafael Palmeiro,3,segud001,David Segui,10,mater001,Ruben Mateo,8,cataf001,Frank Catalanotto,4,kaplg001,Gabe Kapler,9,alicl001,Luis Alicea,5,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,spens001,Shane Spencer,10,posaj001,Jorge Posada,2,kellb002,Roberto Kelly,7,soria001,Alfonso Soriano,5,,Y,2000,2,1,14,200004120
142,20000413,0,Thu,TEX,AL,9,NYA,AL,8,1,5,51,N,,,,NYC16,23805.0,179,10000000,10000400x,32,4,1,0,0,0,0,0,0,2,0,5,1,0,1,0,6,1,5,5,0,0,24,13,2,1,1,0,31,8,0,1,0,5,0,0,0,3,0,4,1,0,1,0,5,2,0,0,0,0,27,5,2,0,1,0,gormb901,Brian Gorman,everm901,Mike Everitt,crawj901,Jerry Crawford,timmt901,Tim Timmons,,(none),,(none),oatej101,Johnny Oates,torrj101,Joe Torre,herno001,Orlando Hernandez,rogek001,Kenny Rogers,,(none),posaj001,Jorge Posada,rogek001,Kenny Rogers,herno001,Orlando Hernandez,clayr001,Royce Clayton,6,curtc001,Chad Curtis,7,rodri001,Ivan Rodriguez,2,palmr001,Rafael Palmeiro,3,segud001,David Segui,10,mater001,Ruben Mateo,8,kaplg001,Gabe Kapler,9,alicl001,Luis Alicea,4,evant001,Tom Evans,5,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,kellb002,Roberto Kelly,7,posaj001,Jorge Posada,2,spens001,Shane Spencer,10,soria001,Alfonso Soriano,5,,Y,2000,4,1,6,200004130
157,20000414,0,Fri,KCA,AL,12,NYA,AL,9,5,7,51,N,,,,NYC16,33094.0,228,20012000,02122000x,35,9,2,0,1,4,0,0,1,5,0,7,1,1,1,0,9,3,7,7,0,0,24,8,2,0,0,0,32,10,3,0,1,7,0,0,0,11,1,8,3,1,0,0,12,3,5,5,1,0,27,11,1,0,1,0,willc901,Charlie Williams,hirsj901,John Hirschbeck,wegnm901,Mark Wegner,reynj901,Jim Reynolds,,(none),,(none),muset101,Tony Muser,torrj101,Joe Torre,clemr001,Roger Clemens,witaj001,Jay Witasick,rivem002,Mariano Rivera,leder001,Ricky Ledee,witaj001,Jay Witasick,clemr001,Roger Clemens,damoj001,Johnny Damon,7,feblc001,Carlos Febles,4,beltc001,Carlos Beltran,8,dye-j001,Jermaine Dye,9,sweem002,Mike Sweeney,3,randj002,Joe Randa,5,quinm001,Mark Quinn,10,zaung001,Gregg Zaun,2,sancr001,Rey Sanchez,6,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,leder001,Ricky Ledee,7,posaj001,Jorge Posada,2,spens001,Shane Spencer,10,delgw001,Wilson Delgado,5,,Y,2000,2,1,12,200004140
170,20000415,0,Sat,KCA,AL,13,NYA,AL,10,1,7,51,D,,,,NYC16,34056.0,153,100,30002020x,29,2,1,0,0,1,0,0,0,0,0,5,0,0,0,0,1,2,6,6,0,0,24,7,1,0,1,0,34,11,0,0,3,6,0,0,4,0,0,8,0,1,1,0,7,3,1,1,0,0,27,4,0,0,0,0,hirsj901,John Hirschbeck,wegnm901,Mark Wegner,reynj901,Jim Reynolds,willc901,Charlie Williams,,(none),,(none),muset101,Tony Muser,torrj101,Joe Torre,mendr001,Ramiro Mendoza,rosaj001,Jose Rosado,,(none),spens001,Shane Spencer,rosaj001,Jose Rosado,mendr001,Ramiro Mendoza,damoj001,Johnny Damon,7,feblc001,Carlos Febles,4,beltc001,Carlos Beltran,8,dye-j001,Jermaine Dye,9,sweem002,Mike Sweeney,3,randj002,Joe Randa,5,quinm001,Mark Quinn,10,johnb002,Brian Johnson,2,sancr001,Rey Sanchez,6,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,willb002,Bernie Williams,8,spens001,Shane Spencer,9,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,leyrj001,Jim Leyritz,10,kellb002,Roberto Kelly,7,bellc001,Clay Bellinger,5,,Y,2000,6,1,8,200004150


In [10]:
## Create a team-specific data frame, given the team

# Remove home or away suffix
def strip_suffix(x, suff):
    if x.endswith(suff):
        return(x[:-len(suff)])
    else:
        return(x)

# Remove home columns for visitor table
visit_cols = [col for col in df.columns if not col.endswith('_h')]
# Strip the suffix from remaining columns
visit_cols_stripped = [strip_suffix(col,'_v') for col in visit_cols]

home_cols = [col for col in df.columns if not col.endswith('_v')]
home_cols_stripped = [strip_suffix(col,'_h') for col in home_cols]

## This subsets the game level df by team, to aggregate team statistics easily
## Create rolling sums with an offset, so that the rollsum number represents statistics up to, but not including, the game in question

def create_team_df(team):
    df_team_v = df[(df.team_v==team)]
    opponent = df_team_v['team_h']
    df_team_v = df_team_v[visit_cols]
    df_team_v.columns = visit_cols_stripped
    df_team_v['home_game'] = 0
    df_team_v['opponent'] = opponent

    df_team_h = df[(df.team_h==team)]
    opponent = df_team_h['team_v']
    df_team_h = df_team_h[home_cols]
    df_team_h.columns = home_cols_stripped
    df_team_h['home_game'] = 1
    df_team_h['opponent'] = opponent


    df_team = pd.concat((df_team_h, df_team_v))
    df_team.sort_values(['date', 'game_no'],inplace=True)

    # Creating rollsums for the previous 162 and 30 games
    for winsize in [162,30]:
        suff = str(winsize)
        # Create rolloing sum for all basic metrics
        for raw_col in ['AB','H','2B','3B','HR','BB','runs','SO','SB','CS','ERR']:
            new_col = f'rollsum_{raw_col}_{suff}'
            df_team[new_col] = df_team[raw_col].rolling(winsize, closed='left').sum()

        df_team[f'rollsum_BATAVG_{suff}'] = df_team[f'rollsum_H_{suff}'] / df_team[f'rollsum_AB_{suff}']
        df_team[f'rollsum_OBP_{suff}'] = (df_team[f'rollsum_H_{suff}'] + df_team[f'rollsum_BB_{suff}']) / (
                                    df_team[f'rollsum_AB_{suff}']+df_team[f'rollsum_BB_{suff}'])
        df_team[f'rollsum_SLG_{suff}'] = (df_team[f'rollsum_H_{suff}'] + df_team[f'rollsum_2B_{suff}'] 
                                 + 2*df_team[f'rollsum_3B_{suff}']+ 
                                3*df_team[f'rollsum_HR_{suff}'] ) / (df_team[f'rollsum_AB_{suff}'])
        df_team[f'rollsum_OPS_{suff}'] = df_team[f'rollsum_OBP_{suff}'] + df_team[f'rollsum_SLG_{suff}']
        df_team[f'rollsum_SO_perc_{suff}'] = df_team[f'rollsum_SO_{suff}'] / df_team[f'rollsum_AB_{suff}']
        df_team[f'rollsum_BB_perc_{suff}'] = df_team[f'rollsum_BB_{suff}'] / df_team[f'rollsum_AB_{suff}']

    df_team['season_game'] = df_team['season']*1000 + df_team['game_no']
    df_team.set_index('season_game', inplace=True)
    return(df_team)

In [11]:
df_yankees = create_team_df('NYA')
df_yankees.sample(10)

Unnamed: 0_level_0,date,dblheader_code,day_of_week,team,league,game_no,runs,outs_total,day_night,completion_info,forfeit_info,protest_info,ballpark_id,attendance,game_minutes,linescore,AB,H,2B,3B,HR,RBI,SH,SF,HBP,BB,IBB,SO,SB,CS,GIDP,CI,LOB,P_num,ERind,ERteam,WP,balk,PO,ASST,ERR,PB,DP,TP,ump_HB_id,ump_HB_name,ump_1B_id,ump_1B_name,ump_2B_id,ump_2B_name,ump_3B_id,ump_3B_name,ump_LF_id,ump_LF_name,ump_RF_id,ump_RF_name,mgr_id,mgr_name,pitcher_id_w,pitcher_name_w,pitcher_id_l,pitcher_name_l,pitcher_id_s,pitcher_name_s,GWRBI_id,GWRBI_name,pitcher_start_id,pitcher_start_name,batter1_name,batter1_id,batter1_pos,batter2_name,batter2_id,batter2_pos,batter3_name,batter3_id,batter3_pos,batter4_name,batter4_id,batter4_pos,batter5_name,batter5_id,batter5_pos,batter6_name,batter6_id,batter6_pos,batter7_name,batter7_id,batter7_pos,batter8_name,batter8_id,batter8_pos,batter9_name,batter9_id,batter9_pos,misc_info,acqui_info,season,run_diff,home_victory,run_total,date_dblhead,home_game,opponent,rollsum_AB_162,rollsum_H_162,rollsum_2B_162,rollsum_3B_162,rollsum_HR_162,rollsum_BB_162,rollsum_runs_162,rollsum_SO_162,rollsum_SB_162,rollsum_CS_162,rollsum_ERR_162,rollsum_BATAVG_162,rollsum_OBP_162,rollsum_SLG_162,rollsum_OPS_162,rollsum_SO_perc_162,rollsum_BB_perc_162,rollsum_AB_30,rollsum_H_30,rollsum_2B_30,rollsum_3B_30,rollsum_HR_30,rollsum_BB_30,rollsum_runs_30,rollsum_SO_30,rollsum_SB_30,rollsum_CS_30,rollsum_ERR_30,rollsum_BATAVG_30,rollsum_OBP_30,rollsum_SLG_30,rollsum_OPS_30,rollsum_SO_perc_30,rollsum_BB_perc_30
season_game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1
2008003,20080403,0,Thu,NYA,AL,3,3,51,N,,,,NYC16,47785.0,165,00000201x,25,6,1,0,0,2,2,1,2,3,0,6,0,0,2,0,6,5,2,2,0,0,27,10,1,0,0,0,millb901,Bill Miller,emmep901,Paul Emmel,darlg901,Gary Darling,mealj901,Jerry Meals,,(none),,(none),giraj001,Joe Girardi,chamj002,Joba Chamberlain,wolfb001,Brian Wolfe,rivem002,Mariano Rivera,abreb001,Bobby Abreu,hughp001,Phil Hughes,damoj001,Johnny Damon,10,jeted001,Derek Jeter,6,abreb001,Bobby Abreu,9,rodra001,Alex Rodriguez,5,giamj001,Jason Giambi,3,canor001,Robinson Cano,4,matsh001,Hideki Matsui,7,molij001,Jose Molina,2,cabrm002,Melky Cabrera,8,,Y,2008,1,1,5,200804030,1,TOR,5705.0,1645.0,326.0,33.0,201.0,634.0,958.0,993.0,120.0,41.0,83.0,0.288344,0.35952,0.462752,0.822272,0.174058,0.111131,1071.0,300.0,68.0,5.0,34.0,131.0,184.0,209.0,21.0,11.0,18.0,0.280112,0.358569,0.448179,0.806748,0.195145,0.122316
2021149,20210918,0,Sat,NYA,AL,149,3,54,D,,,,NYC21,39088.0,206,000000120,32,8,1,0,2,3,0,0,1,4,0,6,0,1,2,0,7,4,10,7,0,0,27,5,2,0,0,0,welkb901,Bill Welke,hobep901,Pat Hoberg,conrc901,Chris Conroy,may-b901,Ben May,,(none),,(none),boona001,Aaron Boone,civaa001,Aaron Civale,gil-l001,Luis Gil,,(none),chany001,Yu Chang,gil-l001,Luis Gil,lemad001,DJ LeMahieu,5,voitl001,Luke Voit,3,judga001,Aaron Judge,9,stanm004,Giancarlo Stanton,10,gallj002,Joey Gallo,7,torrg001,Gleyber Torres,4,sancg002,Gary Sanchez,2,gardb001,Brett Gardner,8,urshg001,Giovanny Urshela,6,,Y,2021,-8,0,14,202109180,1,CLE,5359.0,1289.0,219.0,12.0,227.0,646.0,741.0,1464.0,66.0,16.0,107.0,0.24053,0.322231,0.41295,0.735182,0.273185,0.120545,985.0,235.0,32.0,2.0,54.0,97.0,145.0,266.0,15.0,4.0,18.0,0.238579,0.306839,0.439594,0.746433,0.270051,0.098477
2022018,20220427,0,Wed,NYA,AL,18,5,51,N,,,,NYC21,31122.0,177,20000120x,31,8,0,0,2,4,0,1,0,2,0,9,1,0,0,0,5,3,2,2,0,0,27,14,0,0,0,0,cejan901,Nestor Ceja,barrt901,Ted Barrett,barkl901,Lance Barksdale,lentn901,Nic Lentz,,(none),,(none),boona001,Aaron Boone,kingm002,Michael King,krehj001,Joey Krehbiel,holmc001,Clay Holmes,stanm004,Giancarlo Stanton,montj004,Jordan Montgomery,lemad001,DJ LeMahieu,5,judga001,Aaron Judge,8,rizza001,Anthony Rizzo,3,stanm004,Giancarlo Stanton,9,donaj001,Josh Donaldson,10,torrg001,Gleyber Torres,4,gallj002,Joey Gallo,7,kinei001,Isiah Kiner-Falefa,6,trevj001,Jose Trevino,2,,Y,2022,3,1,7,202204270,1,BAL,5342.0,1289.0,219.0,13.0,227.0,610.0,722.0,1480.0,68.0,21.0,90.0,0.241295,0.319052,0.414639,0.733691,0.27705,0.114189,987.0,239.0,46.0,2.0,41.0,95.0,132.0,269.0,12.0,6.0,10.0,0.242148,0.308688,0.417427,0.726114,0.272543,0.096251
2014047,20140523,0,Fri,NYA,AL,47,5,51,N,,,,CHI12,27091.0,232,300000200,31,8,0,0,1,4,1,1,0,8,0,4,0,0,2,0,9,4,4,4,1,0,24,8,2,1,0,0,reybd901,D.J. Reyburn,nelsj901,Jeff Nelson,belld901,Dan Bellino,woodt901,Tom Woodring,,(none),,(none),giraj001,Joe Girardi,webbd001,Daniel Webb,robed002,David Robertson,,(none),dunna001,Adam Dunn,kuroh001,Hiroki Kuroda,gardb001,Brett Gardner,7,jeted001,Derek Jeter,6,ellsj001,Jacoby Ellsbury,8,teixm001,Mark Teixeira,10,mccab002,Brian McCann,2,solay001,Yangervis Solarte,5,suzui001,Ichiro Suzuki,9,robeb003,Brian Roberts,4,johnk003,Kelly Johnson,3,,Y,2014,1,1,11,201405230,0,CHA,5501.0,1352.0,257.0,25.0,130.0,475.0,650.0,1236.0,120.0,30.0,74.0,0.245773,0.305723,0.372478,0.678201,0.224686,0.086348,1056.0,263.0,44.0,6.0,27.0,95.0,131.0,228.0,22.0,4.0,19.0,0.249053,0.311034,0.378788,0.689822,0.215909,0.089962
2012117,20120815,0,Wed,NYA,AL,117,3,51,N,,,,NYC21,45921.0,185,00300000x,31,10,1,0,0,3,0,1,0,6,0,8,1,0,2,0,11,4,2,2,0,0,27,10,1,0,1,0,carav901,Vic Carapazza,fostm901,Marty Foster,timmt901,Tim Timmons,kellj901,Jeff Kellogg,,(none),,(none),giraj001,Joe Girardi,garcf002,Freddy Garcia,felds001,Scott Feldman,sorir001,Rafael Soriano,swisn001,Nick Swisher,garcf002,Freddy Garcia,jeted001,Derek Jeter,6,swisn001,Nick Swisher,3,granc001,Curtis Granderson,8,teixm001,Mark Teixeira,10,chave001,Eric Chavez,5,ibanr001,Raul Ibanez,7,martr004,Russell Martin,2,suzui001,Ichiro Suzuki,9,nix-j001,Jayson Nix,4,,Y,2012,1,1,5,201208150,1,TEX,5539.0,1463.0,287.0,17.0,247.0,566.0,822.0,1170.0,88.0,27.0,85.0,0.264127,0.332351,0.455858,0.788209,0.211229,0.102185,1027.0,287.0,55.0,2.0,44.0,78.0,159.0,201.0,12.0,8.0,7.0,0.279455,0.330317,0.465433,0.79575,0.195716,0.075949
2010050,20100530,0,Sun,NYA,AL,50,7,51,D,,,,NYC21,45706.0,168,00000052x,35,12,3,0,1,7,0,1,0,1,0,8,1,0,0,0,6,2,1,1,0,0,27,10,1,0,0,0,eddid901,Doug Eddings,demud901,Dana DeMuth,danlk901,Kerwin Danley,buckc901,CB Bucknor,,(none),,(none),giraj001,Joe Girardi,burna001,A.J. Burnett,sippt001,Tony Sipp,,(none),teixm001,Mark Teixeira,burna001,A.J. Burnett,jeted001,Derek Jeter,6,granc001,Curtis Granderson,8,teixm001,Mark Teixeira,3,rodra001,Alex Rodriguez,5,canor001,Robinson Cano,4,swisn001,Nick Swisher,9,miraj001,Juan Miranda,10,gardb001,Brett Gardner,7,moelc001,Chad Moeller,2,,Y,2010,4,1,10,201005300,1,CLE,5600.0,1587.0,308.0,26.0,219.0,677.0,907.0,1021.0,109.0,30.0,86.0,0.283393,0.360682,0.465,0.825682,0.182321,0.120893,1045.0,304.0,55.0,7.0,31.0,115.0,171.0,192.0,13.0,3.0,12.0,0.290909,0.361207,0.445933,0.80714,0.183732,0.110048
2019068,20190614,0,Fri,NYA,AL,68,2,51,N,,,,CHI12,31438.0,182,100000001,32,7,1,0,1,2,0,0,0,5,0,8,0,0,2,0,8,3,8,8,0,0,24,14,3,0,4,0,ticht901,Todd Tichenor,cuzzp901,Phil Cuzzi,lentn901,Nic Lentz,hamaa901,Adam Hamari,,(none),,(none),boona001,Aaron Boone,gioll001,Lucas Giolito,sabac001,CC Sabathia,,(none),jimee001,Eloy Jimenez,sabac001,CC Sabathia,lemad001,DJ LeMahieu,3,voitl001,Luke Voit,10,hicka001,Aaron Hicks,8,sancg002,Gary Sanchez,2,gregd001,Didi Gregorius,6,torrg001,Gleyber Torres,4,frazc001,Clint Frazier,9,urshg001,Giovanny Urshela,5,gardb001,Brett Gardner,7,,Y,2019,8,1,12,201906140,0,CHA,5487.0,1379.0,237.0,20.0,262.0,610.0,852.0,1396.0,60.0,19.0,106.0,0.251321,0.326226,0.445052,0.771278,0.25442,0.111172,1040.0,269.0,46.0,4.0,52.0,104.0,172.0,275.0,9.0,6.0,25.0,0.258654,0.326049,0.460577,0.786626,0.264423,0.1
2021077,20210627,0,Sun,NYA,AL,77,2,51,D,,,,BOS07,34504.0,189,000002000,33,7,0,0,1,2,0,0,0,3,0,13,0,0,1,0,7,3,8,8,1,0,24,8,1,0,0,0,segac901,Chris Segal,hoyej901,James Hoye,carlm901,Mark Carlson,bakej902,Jordan Baker,,(none),,(none),boona001,Aaron Boone,rodre004,Eduardo Rodriguez,coleg001,Gerrit Cole,,(none),herne001,Enrique Hernandez,coleg001,Gerrit Cole,lemad001,DJ LeMahieu,4,judga001,Aaron Judge,8,voitl001,Luke Voit,3,stanm004,Giancarlo Stanton,10,urshg001,Giovanny Urshela,5,torrg001,Gleyber Torres,6,andum001,Miguel Andujar,7,frazc001,Clint Frazier,9,higak001,Kyle Higashioka,2,,Y,2021,7,1,11,202106270,0,BOS,5324.0,1285.0,227.0,15.0,249.0,644.0,770.0,1437.0,51.0,15.0,105.0,0.24136,0.323224,0.42994,0.753164,0.26991,0.120962,1005.0,244.0,40.0,3.0,41.0,117.0,121.0,284.0,7.0,1.0,19.0,0.242786,0.321747,0.410945,0.732692,0.282587,0.116418
2015084,20150708,0,Wed,NYA,AL,84,5,51,N,,,,NYC21,41626.0,189,01020101x,31,9,1,0,3,5,0,1,0,4,1,8,1,0,1,0,7,5,4,4,1,0,27,10,1,0,1,0,hirsj901,John Hirschbeck,welkb901,Bill Welke,tumpj901,John Tumpane,hoyej901,James Hoye,,(none),,(none),giraj001,Joe Girardi,sabac001,CC Sabathia,scrie001,Evan Scribner,milla002,Andrew Miller,pirej001,Jose Pirela,sabac001,CC Sabathia,ellsj001,Jacoby Ellsbury,8,gardb001,Brett Gardner,7,rodra001,Alex Rodriguez,10,teixm001,Mark Teixeira,3,younc004,Chris Young,9,murpj001,John Ryan Murphy,2,gregd001,Didi Gregorius,6,pirej001,Jose Pirela,4,petig001,Gregorio Petit,5,,Y,2015,1,1,9,201507080,1,OAK,5506.0,1356.0,263.0,21.0,183.0,475.0,685.0,1150.0,91.0,22.0,103.0,0.246277,0.306136,0.40138,0.707516,0.208863,0.08627,1040.0,275.0,52.0,3.0,42.0,104.0,142.0,216.0,8.0,2.0,16.0,0.264423,0.331294,0.441346,0.77264,0.207692,0.1
2020019,20200814,0,Fri,NYA,AL,19,10,51,N,,,,NYC21,,215,00203032x,38,14,3,0,1,10,0,0,0,5,0,8,1,0,0,0,9,4,3,3,0,0,27,7,0,0,0,0,fairc901,Chad Fairchild,viscj901,Jansen Visconti,addir901,Ryan Additon,torrc901,Carlos Torres,,(none),,(none),boona001,Aaron Boone,coleg001,Gerrit Cole,brewc002,Colten Brewer,,(none),torrg001,Gleyber Torres,coleg001,Gerrit Cole,lemad001,DJ LeMahieu,4,voitl001,Luke Voit,3,hicka001,Aaron Hicks,8,urshg001,Giovanny Urshela,5,torrg001,Gleyber Torres,6,taucm001,Mike Tauchman,9,sancg002,Gary Sanchez,2,frazc001,Clint Frazier,10,gardb001,Brett Gardner,7,,Y,2020,7,1,13,202008140,1,BOS,5562.0,1493.0,291.0,18.0,307.0,566.0,950.0,1428.0,57.0,22.0,99.0,0.268429,0.335999,0.492808,0.828807,0.256742,0.101762,961.0,233.0,45.0,2.0,52.0,104.0,155.0,279.0,10.0,3.0,18.0,0.242456,0.316432,0.455775,0.772207,0.290323,0.108221


In [12]:
# Create the team level dataframe for each team - put in dict for easy access
team_data_dict = {}
for team in df.team_v.unique():
    team_data_dict[team] = create_team_df(team)

team_data_dict['NYA']

Unnamed: 0_level_0,date,dblheader_code,day_of_week,team,league,game_no,runs,outs_total,day_night,completion_info,forfeit_info,protest_info,ballpark_id,attendance,game_minutes,linescore,AB,H,2B,3B,HR,RBI,SH,SF,HBP,BB,IBB,SO,SB,CS,GIDP,CI,LOB,P_num,ERind,ERteam,WP,balk,PO,ASST,ERR,PB,DP,TP,ump_HB_id,ump_HB_name,ump_1B_id,ump_1B_name,ump_2B_id,ump_2B_name,ump_3B_id,ump_3B_name,ump_LF_id,ump_LF_name,ump_RF_id,ump_RF_name,mgr_id,mgr_name,pitcher_id_w,pitcher_name_w,pitcher_id_l,pitcher_name_l,pitcher_id_s,pitcher_name_s,GWRBI_id,GWRBI_name,pitcher_start_id,pitcher_start_name,batter1_name,batter1_id,batter1_pos,batter2_name,batter2_id,batter2_pos,batter3_name,batter3_id,batter3_pos,batter4_name,batter4_id,batter4_pos,batter5_name,batter5_id,batter5_pos,batter6_name,batter6_id,batter6_pos,batter7_name,batter7_id,batter7_pos,batter8_name,batter8_id,batter8_pos,batter9_name,batter9_id,batter9_pos,misc_info,acqui_info,season,run_diff,home_victory,run_total,date_dblhead,home_game,opponent,rollsum_AB_162,rollsum_H_162,rollsum_2B_162,rollsum_3B_162,rollsum_HR_162,rollsum_BB_162,rollsum_runs_162,rollsum_SO_162,rollsum_SB_162,rollsum_CS_162,rollsum_ERR_162,rollsum_BATAVG_162,rollsum_OBP_162,rollsum_SLG_162,rollsum_OPS_162,rollsum_SO_perc_162,rollsum_BB_perc_162,rollsum_AB_30,rollsum_H_30,rollsum_2B_30,rollsum_3B_30,rollsum_HR_30,rollsum_BB_30,rollsum_runs_30,rollsum_SO_30,rollsum_SB_30,rollsum_CS_30,rollsum_ERR_30,rollsum_BATAVG_30,rollsum_OBP_30,rollsum_SLG_30,rollsum_OPS_30,rollsum_SO_perc_30,rollsum_BB_perc_30
season_game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1
2000001,20000403,0,Mon,NYA,AL,1,3,54,N,,,,ANA01,42704.0,182,000002100,32,6,0,0,2,3,0,0,0,3,0,3,0,2,0,0,5,3,2,2,0,0,27,5,0,0,1,0,mcclt901,Tim McClelland,craft901,Terry Craft,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,,(none),,(none),torrj101,Joe Torre,herno001,Orlando Hernandez,hillk001,Ken Hill,rivem002,Mariano Rivera,oneip001,Paul O'Neill,herno001,Orlando Hernandez,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,martt002,Tino Martinez,3,leder001,Ricky Ledee,8,posaj001,Jorge Posada,2,spens001,Shane Spencer,7,bross001,Scott Brosius,5,,Y,2000,-1,0,5,200004030,0,ANA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2000002,20000404,0,Tue,NYA,AL,2,5,54,N,,,,ANA01,25818.0,200,000101102,35,9,2,1,1,5,0,0,0,6,2,7,1,0,2,0,9,4,0,0,0,0,27,11,1,0,2,0,craft901,Terry Craft,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,mcclt901,Tim McClelland,,(none),,(none),torrj101,Joe Torre,mendr001,Ramiro Mendoza,perct001,Troy Percival,rivem002,Mariano Rivera,willb002,Bernie Williams,clemr001,Roger Clemens,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,martt002,Tino Martinez,3,leder001,Ricky Ledee,8,posaj001,Jorge Posada,2,spens001,Shane Spencer,7,bellc001,Clay Bellinger,5,,Y,2000,-2,0,8,200004040,0,ANA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2000003,20000405,0,Wed,NYA,AL,3,6,51,N,,,,ANA01,24560.0,181,001202010,40,13,3,0,2,6,0,0,0,0,0,5,1,0,2,0,7,4,12,12,0,0,24,8,0,0,0,0,schrp901,Paul Schrieber,cuzzp901,Phil Cuzzi,mcclt901,Tim McClelland,craft901,Terry Craft,,(none),,(none),torrj101,Joe Torre,schos001,Scott Schoeneweis,coned001,David Cone,,(none),erstd001,Darin Erstad,coned001,David Cone,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,10,spens001,Shane Spencer,7,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,kellb002,Roberto Kelly,8,bellc001,Clay Bellinger,5,,Y,2000,6,1,18,200004050,0,ANA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2000004,20000407,0,Fri,NYA,AL,4,5,51,N,,,,SEA03,40827.0,189,002010002,35,9,4,0,2,4,0,0,0,0,0,8,0,0,1,0,3,3,7,7,0,1,24,8,0,0,0,0,randt901,Tony Randazzo,monte901,Ed Montague,laynj901,Jerry Layne,barrt901,Ted Barrett,,(none),,(none),torrj101,Joe Torre,halaj001,John Halama,petta001,Andy Pettitte,sasak001,Kazuhiro Sasaki,olivj001,Joe Oliver,petta001,Andy Pettitte,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,spens001,Shane Spencer,10,martt002,Tino Martinez,3,posaj001,Jorge Posada,2,kellb002,Roberto Kelly,7,bellc001,Clay Bellinger,5,,Y,2000,2,1,12,200004070,0,SEA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2000005,20000408,0,Sat,NYA,AL,5,3,54,D,,,,SEA03,45261.0,215,010000101,34,10,2,0,2,3,1,0,0,5,1,7,1,0,1,0,10,5,2,2,0,0,27,8,2,0,1,0,monte901,Ed Montague,laynj901,Jerry Layne,barrt901,Ted Barrett,randt901,Tony Randazzo,,(none),,(none),torrj101,Joe Torre,nelsj001,Jeff Nelson,mesaj001,Jose Mesa,rivem002,Mariano Rivera,oneip001,Paul O'Neill,herno001,Orlando Hernandez,knobc001,Chuck Knoblauch,4,jeted001,Derek Jeter,6,oneip001,Paul O'Neill,9,willb002,Bernie Williams,8,martt002,Tino Martinez,3,leder001,Ricky Ledee,7,posaj001,Jorge Posada,2,spens001,Shane Spencer,10,soria001,Alfonso Soriano,5,,Y,2000,-1,0,5,200004080,0,SEA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022158,20221002,0,Sun,NYA,AL,158,1,54,D,,,,NYC21,44332.0,215,000010000,30,4,1,0,0,0,0,0,0,5,0,11,0,0,0,0,7,6,3,3,0,0,27,7,0,0,1,0,addir901,Ryan Additon,merzd901,Dan Merzel,bakej902,Jordan Baker,carlm901,Mark Carlson,,(none),,(none),boona001,Aaron Boone,gilll001,Logan Gillaspie,chapa001,Aroldis Chapman,tated001,Dillon Tate,hendg002,Gunnar Henderson,gonzc002,Chi Chi Gonzalez,judga001,Aaron Judge,10,rizza001,Anthony Rizzo,3,lemad001,DJ LeMahieu,4,donaj001,Josh Donaldson,5,cabro002,Oswaldo Cabrera,9,badeh001,Harrison Bader,8,kinei001,Isiah Kiner-Falefa,6,trevj001,Jose Trevino,2,hicka001,Aaron Hicks,7,,Y,2022,-2,0,4,202210020,1,BAL,5429.0,1308.0,229.0,9.0,252.0,607.0,811.0,1398.0,99.0,33.0,72.0,0.240928,0.317263,0.425677,0.742940,0.257506,0.111807,1008.0,248.0,44.0,0.0,46.0,112.0,155.0,274.0,21.0,6.0,16.0,0.246032,0.321429,0.426587,0.748016,0.271825,0.111111
2022159,20221003,0,Mon,NYA,AL,159,3,54,N,,,,ARL03,35906.0,149,000010020,29,8,0,0,2,3,0,0,0,7,0,4,1,0,4,0,6,3,1,1,0,0,27,8,0,0,1,0,fleta901,Andy Fletcher,morag901,Gabe Morales,segac901,Chris Segal,willl901,Lew Williams,,(none),,(none),boona001,Aaron Boone,sevel001,Luis Severino,perem004,Martin Perez,effrs001,Scott Effross,higak001,Kyle Higashioka,sevel001,Luis Severino,judga001,Aaron Judge,9,stanm004,Giancarlo Stanton,10,lemad001,DJ LeMahieu,3,donaj001,Josh Donaldson,5,perao002,Oswald Peraza,4,hicka001,Aaron Hicks,8,kinei001,Isiah Kiner-Falefa,6,higak001,Kyle Higashioka,2,gonzm002,Marwin Gonzalez,7,,Y,2022,-2,0,4,202210030,0,TEX,5426.0,1305.0,227.0,9.0,252.0,612.0,807.0,1399.0,99.0,33.0,72.0,0.240509,0.317489,0.424991,0.742480,0.257833,0.112790,1007.0,248.0,45.0,0.0,46.0,114.0,155.0,276.0,21.0,6.0,16.0,0.246276,0.322926,0.428004,0.750930,0.274081,0.113208
2022160,20221004,1,Tue,NYA,AL,160,5,54,D,,,,ARL03,30553.0,165,110010020,34,8,0,0,3,5,1,0,1,1,0,6,2,0,0,0,5,4,4,4,0,0,27,5,0,0,1,0,morag901,Gabe Morales,roser901,Randy Rosenberg,willl901,Lew Williams,fleta901,Andy Fletcher,,(none),,(none),boona001,Aaron Boone,chapa001,Aroldis Chapman,burkb001,Brock Burke,loaij001,Jonathan Loaisiga,badeh001,Harrison Bader,tailj001,Jameson Taillon,judga001,Aaron Judge,10,rizza001,Anthony Rizzo,3,cabro002,Oswaldo Cabrera,5,badeh001,Harrison Bader,8,hicka001,Aaron Hicks,7,perao002,Oswald Peraza,4,kinei001,Isiah Kiner-Falefa,6,gonzm002,Marwin Gonzalez,9,higak001,Kyle Higashioka,2,,Y,2022,-1,0,9,202210041,0,TEX,5422.0,1307.0,227.0,9.0,249.0,615.0,804.0,1392.0,100.0,33.0,72.0,0.241055,0.318370,0.424013,0.742383,0.256732,0.113427,1004.0,250.0,44.0,0.0,46.0,118.0,155.0,271.0,22.0,6.0,16.0,0.249004,0.327986,0.430279,0.758265,0.269920,0.117530
2022161,20221004,2,Tue,NYA,AL,161,2,51,N,,,,ARL03,38832.0,156,100010000,31,5,0,0,2,2,0,0,0,5,0,12,0,0,1,0,7,2,2,2,0,0,24,8,1,0,1,0,segac901,Chris Segal,willl901,Lew Williams,fleta901,Andy Fletcher,roser901,Randy Rosenberg,,(none),,(none),boona001,Aaron Boone,allak001,Kolby Allard,coleg001,Gerrit Cole,moorm003,Matt Moore,tavel001,Leody Taveras,coleg001,Gerrit Cole,judga001,Aaron Judge,9,stanm004,Giancarlo Stanton,10,cabro002,Oswaldo Cabrera,4,donaj001,Josh Donaldson,5,lemad001,DJ LeMahieu,3,perao002,Oswald Peraza,6,badeh001,Harrison Bader,8,trevj001,Jose Trevino,2,hicka001,Aaron Hicks,7,,Y,2022,1,1,5,202210042,0,TEX,5420.0,1306.0,225.0,9.0,252.0,616.0,806.0,1388.0,102.0,33.0,72.0,0.240959,0.318423,0.425277,0.743700,0.256089,0.113653,1001.0,249.0,44.0,0.0,46.0,116.0,153.0,271.0,24.0,6.0,16.0,0.248751,0.326768,0.430569,0.757338,0.270729,0.115884


In [13]:
## Create a variety of summarized statistics for each game
## For each game, we look up the home and visiting team in the team data dictionary, and then look up the game, and pull the relevant stats

BATAVG_162_h = np.zeros(df.shape[0])
BATAVG_162_v = np.zeros(df.shape[0])
OBP_162_h = np.zeros(df.shape[0])
OBP_162_v = np.zeros(df.shape[0])
SLG_162_h = np.zeros(df.shape[0])
SLG_162_v = np.zeros(df.shape[0])
OPS_162_h = np.zeros(df.shape[0])
OPS_162_v = np.zeros(df.shape[0])
SB_162_h = np.zeros(df.shape[0])
SB_162_v = np.zeros(df.shape[0])
CS_162_h = np.zeros(df.shape[0])
CS_162_v = np.zeros(df.shape[0])
ERR_162_h = np.zeros(df.shape[0])
ERR_162_v = np.zeros(df.shape[0])
SO_perc_162_h = np.zeros(df.shape[0])
SO_perc_162_v = np.zeros(df.shape[0])
BB_perc_162_h = np.zeros(df.shape[0])
BB_perc_162_v = np.zeros(df.shape[0])
BATAVG_30_h = np.zeros(df.shape[0])
BATAVG_30_v = np.zeros(df.shape[0])
OBP_30_h = np.zeros(df.shape[0])
OBP_30_v = np.zeros(df.shape[0])
SLG_30_h = np.zeros(df.shape[0])
SLG_30_v = np.zeros(df.shape[0])
OPS_30_h = np.zeros(df.shape[0])
OPS_30_v = np.zeros(df.shape[0])
SB_30_h = np.zeros(df.shape[0])
SB_30_v = np.zeros(df.shape[0])
CS_30_h = np.zeros(df.shape[0])
CS_30_v = np.zeros(df.shape[0])
ERR_30_h = np.zeros(df.shape[0])
ERR_30_v = np.zeros(df.shape[0])
SO_perc_30_h = np.zeros(df.shape[0])
SO_perc_30_v = np.zeros(df.shape[0])
BB_perc_30_h = np.zeros(df.shape[0])
BB_perc_30_v = np.zeros(df.shape[0])


i=0
for index, row in df.iterrows():
    if i%1000==0:
        print(i)
    home_team = row['team_h']
    visit_team = row['team_v']
    game_index_v = row['season']*1000 + row['game_no_v']
    game_index_h = row['season']*1000 + row['game_no_h']
    BATAVG_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_BATAVG_162']
    BATAVG_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_BATAVG_162']
    OBP_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_OBP_162']
    OBP_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_OBP_162']
    SLG_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SLG_162']
    SLG_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SLG_162']
    OPS_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_OPS_162']
    OPS_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_OPS_162']
    SB_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SB_162']
    SB_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SB_162']
    CS_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_CS_162']
    CS_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_CS_162']
    ERR_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_ERR_162']
    ERR_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_ERR_162']
    SO_perc_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SO_perc_162']
    SO_perc_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SO_perc_162']
    BB_perc_162_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_BB_perc_162']
    BB_perc_162_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_BB_perc_162']
    BATAVG_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_BATAVG_30']
    BATAVG_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_BATAVG_30']
    OBP_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_OBP_30']
    OBP_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_OBP_30']
    SLG_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SLG_30']
    SLG_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SLG_30']
    OPS_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_OPS_30']
    OPS_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_OPS_30']
    SB_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SB_30']
    SB_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SB_30']
    CS_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_CS_30']
    CS_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_CS_30']
    ERR_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_ERR_30']
    ERR_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_ERR_30']
    SO_perc_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_SO_perc_30']
    SO_perc_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_SO_perc_30']
    BB_perc_30_h[i] = team_data_dict[home_team].loc[game_index_h,'rollsum_BB_perc_30']
    BB_perc_30_v[i] = team_data_dict[visit_team].loc[game_index_v,'rollsum_BB_perc_30']
    i+=1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000


In [14]:
# Add constructed arrays into the main game level dataframe
df['BATAVG_162_h'] = BATAVG_162_h
df['BATAVG_162_v'] = BATAVG_162_v
df['OBP_162_h'] = OBP_162_h
df['OBP_162_v'] = OBP_162_v
df['SLG_162_h'] = SLG_162_h
df['SLG_162_v'] = SLG_162_v
df['OPS_162_h'] = OPS_162_h
df['OPS_162_v'] = OPS_162_v
df['SB_162_h'] = SB_162_h
df['SB_162_v'] = SB_162_v
df['CS_162_h'] = CS_162_h
df['CS_162_v'] = CS_162_v
df['ERR_162_h'] = ERR_162_h
df['ERR_162_v'] = ERR_162_v
df['SO_perc_162_h'] = SO_perc_162_h
df['SO_perc_162_v'] = SO_perc_162_v
df['BB_perc_162_h'] = BB_perc_162_h
df['BB_perc_162_v'] = BB_perc_162_v
df['BATAVG_30_h'] = BATAVG_30_h
df['BATAVG_30_v'] = BATAVG_30_v
df['OBP_30_h'] = OBP_30_h
df['OBP_30_v'] = OBP_30_v
df['SLG_30_h'] = SLG_30_h
df['SLG_30_v'] = SLG_30_v
df['OPS_30_h'] = OPS_30_h
df['OPS_30_v'] = OPS_30_v
df['SB_30_h'] = SB_30_h
df['SB_30_v'] = SB_30_v
df['CS_30_h'] = CS_30_h
df['CS_30_v'] = CS_30_v
df['ERR_30_h'] = ERR_30_h
df['ERR_30_v'] = ERR_30_v
df['SO_perc_30_h'] = SO_perc_30_h
df['SO_perc_30_v'] = SO_perc_30_v
df['BB_perc_30_h'] = BB_perc_30_h
df['BB_perc_30_v'] = BB_perc_30_v

In [15]:
df.shape

(54345, 202)

In [16]:
df.sample(5)

Unnamed: 0,date,dblheader_code,day_of_week,team_v,league_v,game_no_v,team_h,league_h,game_no_h,runs_v,runs_h,outs_total,day_night,completion_info,forfeit_info,protest_info,ballpark_id,attendance,game_minutes,linescore_v,linescore_h,AB_v,H_v,2B_v,3B_v,HR_v,RBI_v,SH_v,SF_v,HBP_v,BB_v,IBB_v,SO_v,SB_v,CS_v,GIDP_v,CI_v,LOB_v,P_num_v,ERind_v,ERteam_v,WP_v,balk_v,PO_v,ASST_v,ERR_v,PB_v,DP_v,TP_v,AB_h,H_h,2B_h,3B_h,HR_h,RBI_h,SH_h,SF_h,HBP_h,BB_h,IBB_h,SO_h,SB_h,CS_h,GIDP_h,CI_h,LOB_h,P_num_h,ERind_h,ERteam_h,WP_h,balk_h,PO_h,ASST_h,ERR_h,PB_h,DP_h,TP_h,ump_HB_id,ump_HB_name,ump_1B_id,ump_1B_name,ump_2B_id,ump_2B_name,ump_3B_id,ump_3B_name,ump_LF_id,ump_LF_name,ump_RF_id,ump_RF_name,mgr_id_v,mgr_name_v,mgr_id_h,mgr_name_h,pitcher_id_w,pitcher_name_w,pitcher_id_l,pitcher_name_l,pitcher_id_s,pitcher_name_s,GWRBI_id,GWRBI_name,pitcher_start_id_v,pitcher_start_name_v,pitcher_start_id_h,pitcher_start_name_h,batter1_name_v,batter1_id_v,batter1_pos_v,batter2_name_v,batter2_id_v,batter2_pos_v,batter3_name_v,batter3_id_v,batter3_pos_v,batter4_name_v,batter4_id_v,batter4_pos_v,batter5_name_v,batter5_id_v,batter5_pos_v,batter6_name_v,batter6_id_v,batter6_pos_v,batter7_name_v,batter7_id_v,batter7_pos_v,batter8_name_v,batter8_id_v,batter8_pos_v,batter9_name_v,batter9_id_v,batter9_pos_v,batter1_name_h,batter1_id_h,batter1_pos_h,batter2_name_h,batter2_id_h,batter2_pos_h,batter3_name_h,batter3_id_h,batter3_pos_h,batter4_name_h,batter4_id_h,batter4_pos_h,batter5_name_h,batter5_id_h,batter5_pos_h,batter6_name_h,batter6_id_h,batter6_pos_h,batter7_name_h,batter7_id_h,batter7_pos_h,batter8_name_h,batter8_id_h,batter8_pos_h,batter9_name_h,batter9_id_h,batter9_pos_h,misc_info,acqui_info,season,run_diff,home_victory,run_total,date_dblhead,BATAVG_162_h,BATAVG_162_v,OBP_162_h,OBP_162_v,SLG_162_h,SLG_162_v,OPS_162_h,OPS_162_v,SB_162_h,SB_162_v,CS_162_h,CS_162_v,ERR_162_h,ERR_162_v,SO_perc_162_h,SO_perc_162_v,BB_perc_162_h,BB_perc_162_v,BATAVG_30_h,BATAVG_30_v,OBP_30_h,OBP_30_v,SLG_30_h,SLG_30_v,OPS_30_h,OPS_30_v,SB_30_h,SB_30_v,CS_30_h,CS_30_v,ERR_30_h,ERR_30_v,SO_perc_30_h,SO_perc_30_v,BB_perc_30_h,BB_perc_30_v
316,20000427,0,Thu,SDN,NL,22,PIT,NL,21,12,4,54,D,,,,PIT07,15459.0,164,410100006,000040000,41,15,2,1,2,11,0,0,1,9,1,5,2,1,0,0,12,3,4,4,1,0,27,8,0,0,0,0,31,4,0,0,1,4,0,0,1,3,0,6,0,0,0,0,4,4,7,7,1,0,27,13,1,0,0,0,rungb901,Brian Runge,shulj901,John Shulock,rapue901,Ed Rapuano,millb901,Bill Miller,,(none),,(none),bochb002,Bruce Bochy,lamog101,Gene Lamont,clemm001,Matt Clement,parrj002,Jose Parra,,(none),jackd003,Damian Jackson,clemm001,Matt Clement,parrj002,Jose Parra,marta001,Al Martin,7,jackd003,Damian Jackson,6,klesr001,Ryan Klesko,3,nevip001,Phil Nevin,5,owene001,Eric Owens,8,boonb002,Bret Boone,4,hernc001,Carlos Hernandez,2,dehak001,Kory DeHaan,9,clemm001,Matt Clement,1,browa001,Adrian Brown,8,morrw001,Warren Morris,4,vandj001,John Vander Wal,3,gileb002,Brian Giles,9,cordw001,Wil Cordero,7,mearp001,Pat Meares,6,ramia001,Aramis Ramirez,5,osikk001,Keith Osik,2,parrj002,Jose Parra,1,,Y,2000,-8,0,16,200004270,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2141,20020909,0,Mon,SDN,NL,144,ARI,NL,144,2,5,51,N,,,,PHO01,30486.0,179,1010,02120000x,36,10,2,0,1,2,0,0,0,4,0,10,0,0,1,0,11,4,5,5,0,0,24,10,1,0,1,0,35,12,1,1,1,4,0,0,0,2,0,7,0,1,0,0,8,3,2,2,0,0,27,9,0,0,1,0,nelsj901,Jeff Nelson,kulpr901,Ron Kulpa,scotd901,Dale Scott,joycj901,Jim Joyce,,(none),,(none),bochb002,Bruce Bochy,brenb001,Bob Brenly,johnr005,Randy Johnson,tomkb001,Brett Tomko,kim-b001,Byung-Hyun Kim,durae001,Erubiel Durazo,tomkb001,Brett Tomko,johnr005,Randy Johnson,kingg001,Gene Kingsale,8,cruzd001,Deivi Cruz,6,nevip001,Phil Nevin,5,gantr001,Ron Gant,7,tramb001,Bubba Trammell,9,buchb002,Brian Buchanan,3,gonzw001,Wiki Gonzalez,2,matoj001,Julius Matos,4,tomkb001,Brett Tomko,1,womat001,Tony Womack,6,finls001,Steve Finley,8,spivj001,Junior Spivey,4,gonzl001,Luis Gonzalez,7,durae001,Erubiel Durazo,3,willm003,Matt Williams,5,delld001,David Dellucci,9,moelc001,Chad Moeller,2,johnr005,Randy Johnson,1,,Y,2002,3,1,7,200209090,0.266799,0.255373,0.343603,0.323211,0.421663,0.391728,0.765266,0.714939,90.0,71.0,43.0,41.0,85.0,123.0,0.182938,0.193968,0.117008,0.100235,0.241062,0.28146,0.325159,0.342707,0.374872,0.432277,0.700031,0.774983,11.0,19.0,9.0,7.0,13.0,19.0,0.225741,0.183477,0.124617,0.09318
126,20020411,0,Thu,OAK,AL,10,TEX,AL,9,0,7,51,D,,,,ARL02,21903.0,160,0,02112001x,28,2,1,0,0,0,0,0,1,1,0,6,0,1,0,0,3,3,7,7,1,0,24,9,0,0,1,0,36,14,4,2,1,6,0,1,1,4,0,5,0,0,1,0,11,1,0,0,0,0,27,7,0,0,0,0,kulpr901,Ron Kulpa,joycj901,Jim Joyce,guccc901,Chris Guccione,scotd901,Dale Scott,,(none),,(none),howea001,Art Howe,narrj001,Jerry Narron,david002,Doug Davis,muldm001,Mark Mulder,,(none),menck001,Kevin Mench,muldm001,Mark Mulder,david002,Doug Davis,giamj002,Jeremy Giambi,7,menef001,Frank Menechino,4,chave001,Eric Chavez,5,justd001,David Justice,10,tejam001,Miguel Tejada,6,longt002,Terrence Long,8,hernr002,Ramon Hernandez,2,colam001,Mike Colangelo,9,penac001,Carlos Pena,3,kaplg001,Gabe Kapler,8,greer001,Rusty Greer,7,rodra001,Alex Rodriguez,6,palmr001,Rafael Palmeiro,3,rodri001,Ivan Rodriguez,10,perrh001,Herbert Perry,5,menck001,Kevin Mench,9,haseb001,Bill Haselman,2,younm003,Michael Young,4,,Y,2002,7,1,7,200204110,0.273638,0.266429,0.337129,0.341667,0.465905,0.447321,0.803034,0.788988,96.0,67.0,30.0,27.0,114.0,124.0,0.19174,0.184286,0.095782,0.114286,0.250729,0.27676,0.318302,0.351211,0.419825,0.483124,0.738127,0.834335,12.0,9.0,4.0,4.0,16.0,26.0,0.186589,0.211186,0.099125,0.114754
339,20020427,0,Sat,CHA,AL,24,OAK,AL,24,1,16,51,D,,,,OAK01,26111.0,162,10000000,11720014x,31,4,3,0,1,1,0,0,0,0,0,4,0,0,0,0,3,5,15,15,1,0,24,5,1,0,0,0,42,19,2,0,5,15,0,2,1,7,0,9,0,0,0,0,12,3,1,1,0,0,27,10,0,0,0,0,reilm901,Mike Reilly,kellj901,Jeff Kellogg,mealj901,Jerry Meals,coope901,Eric Cooper,,(none),,(none),manuj101,Jerry Manuel,howea001,Art Howe,fyhrm001,Mike Fyhrie,buehm001,Mark Buehrle,,(none),,(none),buehm001,Mark Buehrle,fyhrm001,Mike Fyhrie,loftk001,Kenny Lofton,8,durhr001,Ray Durham,4,thomf001,Frank Thomas,10,ordom001,Magglio Ordonez,9,konep001,Paul Konerko,3,valej003,Jose Valentin,5,lee-c001,Carlos Lee,7,aloms001,Sandy Alomar,2,clayr001,Royce Clayton,6,giamj002,Jeremy Giambi,7,menef001,Frank Menechino,4,hatts001,Scott Hatteberg,10,dye-j001,Jermaine Dye,9,chave001,Eric Chavez,5,tejam001,Miguel Tejada,6,longt002,Terrence Long,8,hernr002,Ramon Hernandez,2,penac001,Carlos Pena,3,,Y,2002,15,1,17,200204270,0.264059,0.27483,0.338678,0.339575,0.442862,0.461355,0.781541,0.80093,64.0,124.0,27.0,62.0,123.0,117.0,0.186193,0.178998,0.112833,0.098036,0.261645,0.292201,0.334821,0.352888,0.454906,0.474827,0.789727,0.827715,6.0,28.0,3.0,11.0,21.0,24.0,0.208127,0.162883,0.11001,0.093781
52,20180402,0,Mon,CHN,NL,5,CIN,NL,4,0,1,51,D,,,,CIN09,18963.0,153,0,00010000x,27,2,0,1,0,0,0,0,1,5,0,11,0,1,1,0,6,3,1,1,1,0,24,9,1,0,1,0,28,5,1,1,0,1,1,0,0,6,0,7,0,0,1,0,10,4,0,0,0,0,27,8,0,0,1,0,danlk901,Kerwin Danley,nauep901,Paul Nauert,barrs901,Scott Barry,torrc901,Carlos Torres,,(none),,(none),maddj801,Joe Maddon,pricb801,Bryan Price,mahlt001,Tyler Mahle,chatt001,Tyler Chatwood,igler001,Raisel Iglesias,duvaa001,Adam Duvall,chatt001,Tyler Chatwood,mahlt001,Tyler Mahle,happi001,Ian Happ,8,bryak001,Kris Bryant,5,rizza001,Anthony Rizzo,3,contw001,Willson Contreras,2,schwk001,Kyle Schwarber,7,russa002,Addison Russell,6,heywj001,Jason Heyward,9,baezj001,Javier Baez,4,chatt001,Tyler Chatwood,1,winkj002,Jesse Winker,9,pennc001,Cliff Pennington,6,vottj001,Joey Votto,3,genns001,Scooter Gennett,4,suare001,Eugenio Suarez,5,duvaa001,Adam Duvall,7,barnt001,Tucker Barnhart,2,hamib001,Billy Hamilton,8,mahlt001,Tyler Mahle,1,,Y,2018,1,1,1,201804020,0.252775,0.255338,0.322666,0.331329,0.431119,0.439197,0.753785,0.770526,119.0,62.0,39.0,30.0,80.0,92.0,0.244222,0.254976,0.103185,0.113645,0.244174,0.253602,0.3054,0.332474,0.395137,0.40634,0.700537,0.738814,16.0,12.0,8.0,6.0,17.0,12.0,0.276596,0.26513,0.088146,0.118156


In [17]:
df.to_csv('./output_data/input_model_data_01.csv', index=False)