# Glossary

## Setup

In [1]:
import numpy as np
import pandas as pd

teams_df = pd.read_csv('./data/core/Teams.csv',
                        delimiter=',',
                        usecols=['yearID','teamID','G','W','L','R','RA'])

batting_df = pd.read_csv('./data/core/Batting.csv',
                        delimiter=',')

## Notable Years

1921 affects of dead ball era gone <br />
1970 lahman starts recording HBP and SF <br />
1981 players strike <br />
1994 players strike <br />
2006 post-steroid era <br />

## Win Estimators

### Ratio Estimators: W% = (RS^x)/(RS^x + RA^x)
RPG = (RA + RS)/(games played)

In [22]:
ratio_win_df = teams_df.copy()
ratio_win_df['WIN_PERC'] = ratio_win_df['W']/ratio_win_df['G']

In [23]:
# Pythagorean 1.83 (Alternatively use 1.81 or 2): x = 1.83
ratio_win_df['PYTHAG_183'] = (ratio_win_df['R']**1.81)/((ratio_win_df['R']**1.81)+(ratio_win_df['RA']**1.81))
ratio_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,PYTHAG_183
0,1871,BS1,31,20,10,401,303,0.645161,0.624153
1,1871,CH1,28,19,9,302,241,0.678571,0.600702
2,1871,CL1,29,10,19,249,341,0.344828,0.361440
3,1871,FW1,19,7,12,137,243,0.368421,0.261676
4,1871,NY2,33,16,17,302,313,0.484848,0.483817
...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.542371
2891,2018,TBA,162,90,72,716,646,0.555556,0.546419
2892,2018,TEX,162,67,95,737,848,0.413580,0.436856
2893,2018,TOR,162,73,89,709,832,0.450617,0.428112


In [29]:
# Pythagenport: x = 1.5 * log(RPG) + .45
ratio_win_df['PORT_X'] = (1.5*np.log10((ratio_win_df['RA']+ratio_win_df['RA'])/ratio_win_df['G']))+.45
ratio_win_df['PYTHAGENPORT'] = (ratio_win_df['R']**ratio_win_df['PORT_X'])/((ratio_win_df['R']**ratio_win_df['PORT_X'])+(ratio_win_df['RA']**ratio_win_df['PORT_X']))
ratio_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,PYTHAG_183,PORT_X,PYTHAGENPORT
0,1871,BS1,31,20,10,401,303,0.645161,0.624153,2.386666,0.661237
1,1871,CH1,28,19,9,302,241,0.678571,0.600702,2.303834,0.627104
2,1871,CL1,29,10,19,249,341,0.344828,0.361440,2.507080,0.312534
3,1871,FW1,19,7,12,137,243,0.368421,0.261676,2.561824,0.187227
4,1871,NY2,33,16,17,302,313,0.484848,0.483817,2.367091,0.478841
...,...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.542371,1.846490,0.543221
2891,2018,TBA,162,90,72,716,646,0.555556,0.546419,1.802621,0.546231
2892,2018,TEX,162,67,95,737,848,0.413580,0.436856,1.979866,0.431003
2893,2018,TOR,162,73,89,709,832,0.450617,0.428112,1.967457,0.421956


In [30]:
# Pythagenpat: x = RPG^.287
ratio_win_df['PAT_X'] = ((ratio_win_df['RA']+ratio_win_df['RA'])/ratio_win_df['G'])**.287
ratio_win_df['PYTHAGENPAT'] = (ratio_win_df['R']**ratio_win_df['PAT_X'])/((ratio_win_df['R']**ratio_win_df['PAT_X'])+(ratio_win_df['RA']**ratio_win_df['PAT_X']))
ratio_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,PYTHAG_183,PORT_X,PYTHAGENPORT,PAT_X,PYTHAGENPAT
0,1871,BS1,31,20,10,401,303,0.645161,0.624153,2.386666,0.661237,2.347193,0.658755
1,1871,CH1,28,19,9,302,241,0.678571,0.600702,2.303834,0.627104,2.263081,0.624952
2,1871,CL1,29,10,19,249,341,0.344828,0.361440,2.507080,0.312534,2.475072,0.314700
3,1871,FW1,19,7,12,137,243,0.368421,0.261676,2.561824,0.187227,2.535492,0.189534
4,1871,NY2,33,16,17,302,313,0.484848,0.483817,2.367091,0.478841,2.327037,0.479199
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.542371,1.846490,0.543221,1.850099,0.543305
2891,2018,TBA,162,90,72,716,646,0.555556,0.546419,1.802621,0.546231,1.814686,0.546539
2892,2018,TEX,162,67,95,737,848,0.413580,0.436856,1.979866,0.431003,1.962070,0.431615
2893,2018,TOR,162,73,89,709,832,0.450617,0.428112,1.967457,0.421956,1.951373,0.422584


### Differential Estimators: W% = X * (R - RA) / G + .5
RPG = (RA + RS)/(games played)

In [33]:
diff_win_df = teams_df.copy()
diff_win_df['WIN_PERC'] = diff_win_df['W']/diff_win_df['G']

In [34]:
# Palmer-RPW: x = 1 / (10 * sqrt(runs per inning))
# NOTE: Using 9 innings per G, even though that is not accurate
diff_win_df['PALMER_X'] = 1/(10*np.sqrt((diff_win_df['R'] + diff_win_df['RA'])/(diff_win_df['G']*9)))
diff_win_df['PALMER'] = ((diff_win_df['PALMER_X']*(diff_win_df['R']-diff_win_df['RA']))/diff_win_df['G'])+.5
diff_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,PALMER_X,PALMER
0,1871,BS1,31,20,10,401,303,0.645161,0.062953,0.699012
1,1871,CH1,28,19,9,302,241,0.678571,0.068124,0.648413
2,1871,CL1,29,10,19,249,341,0.344828,0.066511,0.288999
3,1871,FW1,19,7,12,137,243,0.368421,0.067082,0.125753
4,1871,NY2,33,16,17,302,313,0.484848,0.069493,0.476836
...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.100275,0.542091
2891,2018,TBA,162,90,72,716,646,0.555556,0.103464,0.544707
2892,2018,TEX,162,67,95,737,848,0.413580,0.095910,0.434284
2893,2018,TOR,162,73,89,709,832,0.450617,0.097270,0.426147


In [35]:
# Tango-RPW: x = 1 / (RPG / 2 + 5)
diff_win_df['TANGO_X'] = 1/((((diff_win_df['R']+diff_win_df['RA'])/diff_win_df['G'])/2)+5)
diff_win_df['TANGO'] = ((diff_win_df['TANGO_X']*(diff_win_df['R']-diff_win_df['RA']))/diff_win_df['G'])+.5
diff_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,PALMER_X,PALMER,TANGO_X,TANGO
0,1871,BS1,31,20,10,401,303,0.645161,0.062953,0.699012,0.061144,0.693294
1,1871,CH1,28,19,9,302,241,0.678571,0.068124,0.648413,0.068044,0.648238
2,1871,CL1,29,10,19,249,341,0.344828,0.066511,0.288999,0.065909,0.290909
3,1871,FW1,19,7,12,137,243,0.368421,0.067082,0.125753,0.066667,0.128070
4,1871,NY2,33,16,17,302,313,0.484848,0.069493,0.476836,0.069841,0.476720
...,...,...,...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.100275,0.542091,0.105537,0.544300
2891,2018,TBA,162,90,72,716,646,0.555556,0.103464,0.544707,0.108652,0.546948
2892,2018,TEX,162,67,95,737,848,0.413580,0.095910,0.434284,0.101092,0.430733
2893,2018,TOR,162,73,89,709,832,0.450617,0.097270,0.426147,0.102499,0.422177


### Other Win Estimators

In [38]:
other_win_df = teams_df.copy()
other_win_df['WIN_PERC'] = diff_win_df['W']/diff_win_df['G']

In [40]:
# Ben V-L: W% = 0.91 * (RS-RA) / (RS+RA) + .5
other_win_df['BEN_VL'] = ((.91*(other_win_df['R']-other_win_df['RA']))/(other_win_df['R']+other_win_df['RA']))+.5
other_win_df

Unnamed: 0,yearID,teamID,G,W,L,R,RA,WIN_PERC,BEN_VL
0,1871,BS1,31,20,10,401,303,0.645161,0.626676
1,1871,CH1,28,19,9,302,241,0.678571,0.602228
2,1871,CL1,29,10,19,249,341,0.344828,0.358102
3,1871,FW1,19,7,12,137,243,0.368421,0.246158
4,1871,NY2,33,16,17,302,313,0.484848,0.483724
...,...,...,...,...,...,...,...,...,...
2890,2018,SLN,162,88,74,759,691,0.543210,0.542676
2891,2018,TBA,162,90,72,716,646,0.555556,0.546769
2892,2018,TEX,162,67,95,737,848,0.413580,0.436271
2893,2018,TOR,162,73,89,709,832,0.450617,0.427365


## Basic Batting Stats

In [4]:
# Filter to 1970+ to get rid of NA values
basic_batting_df = batting_df[batting_df['yearID'] >= 1970].copy()
basic_batting_df['PA'] = basic_batting_df['AB'] + basic_batting_df['BB'] + basic_batting_df['HBP']+ basic_batting_df['SF'] + basic_batting_df['SH']
basic_batting_df['AVG'] = basic_batting_df['H'] / basic_batting_df['AB']
basic_batting_df['OBP'] = ((basic_batting_df['H'] + basic_batting_df['BB'] + basic_batting_df['HBP']) / (basic_batting_df['AB'] + basic_batting_df['BB'] + basic_batting_df['HBP'] + basic_batting_df['SF']))
basic_batting_df['SLG'] = (basic_batting_df['H'] + basic_batting_df['2B'] + (2 * basic_batting_df['3B']) + (3 * basic_batting_df['HR'])) / basic_batting_df['AB']
basic_batting_df['OPS'] = ((basic_batting_df['H'] + basic_batting_df['BB'] + basic_batting_df['HBP']) / (basic_batting_df['AB'] + basic_batting_df['BB'] + basic_batting_df['HBP'] + basic_batting_df['SF'])) + (((basic_batting_df['H'] - basic_batting_df['2B'] - basic_batting_df['3B'] - basic_batting_df['HR']) + (2 * basic_batting_df['2B']) + (3 * basic_batting_df['3B']) + (4 * basic_batting_df['HR'])) / basic_batting_df['AB'])
basic_batting_df['ISO'] = (((basic_batting_df['2B']) + (2 * basic_batting_df['3B']) + (3 * basic_batting_df['HR'])) / basic_batting_df['AB'])
basic_batting_df['BABIP'] = ((basic_batting_df['H'] - basic_batting_df['HR']) / ((basic_batting_df['AB'] + basic_batting_df['BB'] + basic_batting_df['HBP'] + basic_batting_df['SF'] + basic_batting_df['SH']) - basic_batting_df['SO'] - basic_batting_df['BB'] - basic_batting_df['HR']))
basic_batting_df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,SH,SF,GIDP,PA,AVG,OBP,SLG,OPS,ISO,BABIP
48184,aaronha01,1970,1,ATL,NL,150,516,103,154,26,...,0.0,6.0,13.0,598.0,0.298450,0.384615,0.573643,0.958259,0.275194,0.274232
48185,aaronto01,1970,1,ATL,NL,44,63,3,13,2,...,0.0,0.0,5.0,66.0,0.206349,0.242424,0.333333,0.575758,0.126984,0.215686
48186,abernte02,1970,1,CHN,NL,11,0,0,0,0,...,0.0,0.0,0.0,0.0,,,,,,
48187,abernte02,1970,2,SLN,NL,11,3,0,0,0,...,0.0,0.0,0.0,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
48188,abernte02,1970,3,KCA,AL,36,14,1,3,0,...,3.0,0.0,0.0,17.0,0.214286,0.214286,0.214286,0.428571,0.000000,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105856,zimmebr01,2018,1,CLE,AL,34,106,14,24,5,...,0.0,0.0,1.0,114.0,0.226415,0.280702,0.330189,0.610890,0.103774,0.360656
105857,zimmejo02,2018,1,DET,AL,25,2,0,0,0,...,0.0,0.0,0.0,2.0,0.000000,0.000000,0.000000,0.000000,0.000000,
105858,zimmery01,2018,1,WAS,NL,85,288,33,76,21,...,0.0,2.0,10.0,323.0,0.263889,0.337461,0.486111,0.823572,0.222222,0.280000
105859,zobribe01,2018,1,CHN,NL,139,455,67,139,28,...,1.0,7.0,8.0,520.0,0.305495,0.377649,0.439560,0.817210,0.134066,0.328283


## Run Estimators

In [None]:
# TODO