# I. Data Preparation

In [1]:
# import packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

In [2]:
# Load the master table, derive the squared-experience term and the extra
# performance metrics, then keep only salary years 1995 through 2015 (inclusive)
Master = (
    pd.read_csv('../Master.csv')
    .assign(
        Exp2=lambda df: df['Exp'] ** 2,
        AVG=lambda df: df['H'] / df['AB'],
        ISO=lambda df: df['SLG'] - df['AVG'],  # uses the AVG column created just above
        Eye=lambda df: (df['BB'] + df['HBP']) / df['PA'],
    )
    .loc[lambda df: df['SalYear'].between(1995, 2015)]
)
display(Master)

Unnamed: 0.1,Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,...,Exp,Arb,Free,POS,Catch,Infld,Exp2,AVG,ISO,Eye
0,0,abbotje01,1998,89,244,33,68,14,1,12,...,1,0,0,OF,0,0,1,0.278689,0.213115,0.034615
1,1,abbotje01,2000,80,215,31,59,15,1,3,...,3,1,0,OF,0,0,9,0.274419,0.120930,0.095436
2,2,abbotku01,1994,101,345,41,86,17,3,9,...,1,0,0,SS,0,1,1,0.249275,0.144928,0.056604
3,3,abbotku01,1995,120,420,60,107,18,7,17,...,2,0,0,SS,0,1,4,0.254762,0.197619,0.087607
4,4,abbotku01,1996,109,320,37,81,18,7,8,...,3,1,0,SS,0,1,9,0.253125,0.175000,0.071633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9895,9895,zobribe01,2012,157,560,88,151,39,7,20,...,6,1,0,OF,0,0,36,0.269643,0.201786,0.149701
9896,9896,zobribe01,2013,157,612,77,168,36,3,12,...,7,0,1,2B,0,1,49,0.274510,0.127451,0.113181
9897,9897,zobribe01,2014,146,570,83,155,34,3,10,...,8,0,1,2B,0,1,64,0.271930,0.122807,0.116208
9899,9899,zuninmi01,2013,52,173,22,37,5,0,5,...,0,0,0,C,1,0,0,0.213873,0.115607,0.098446


# II. Running Regressions for Each Season

In [3]:
# 1. Write a function to run the Moneyball regression annually for free agents only
def MBExpandFA(Season):
    """Fit the expanded Moneyball salary regression for one season's free agents.

    Selects the rows of the notebook-level ``Master`` frame whose ``SalYear``
    equals ``Season`` and whose ``Free`` flag equals 1, then fits an OLS model
    of ``lnSal`` on AVG, ISO, Eye, PA, Exp, Exp2 and categorical POS.

    Parameters
    ----------
    Season
        Value compared against ``Master.SalYear`` (the notebook passes the
        integers 1995 through 2014).

    Returns
    -------
    The fitted results object produced by ``smf.ols(...).fit()``.

    Notes
    -----
    The fitted model is also stored in the global ``lm`` so that cells which
    call this function and then read ``lm`` keep working unchanged.
    """
    global lm
    MB_Seas = Master[(Master.SalYear == Season) & (Master.Free == 1)]
    lm = smf.ols(formula='lnSal ~ AVG + ISO + Eye + PA + Exp + Exp2 + C(POS)', data=MB_Seas).fit()
    return lm

In [4]:
# 2. Create list to store regression results
# Twenty integer placeholders (0-19), one slot per season; a later cell
# overwrites each slot with that season's fitted regression.
lm_Results = list(range(20))
display(lm_Results)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [5]:
# 3. Run regression and store results
# MBExpandFA publishes each fitted model through the global `lm`, so read it
# immediately after the call and store it in that season's slot
# (slot 0 -> 1995, ..., slot 19 -> 2014).
for i, Season in enumerate(range(1995, 2015)):
    MBExpandFA(Season)
    lm_Results[i] = lm

In [6]:
# 4. Give each regression result a name, which is the season to which it corresponds
# One string label per season, "1995" through "2014", in chronological order.
lm_Season = [str(year) for year in range(1995, 2015)]

In [7]:
# Split the season labels into three eras: the first six seasons,
# the next eight, and everything after that
Pre_MB, MB_Period, Post_MB = lm_Season[:6], lm_Season[6:14], lm_Season[14:]

# Show each era's labels on its own line
print(Pre_MB, MB_Period, Post_MB, sep="\n")

['1995', '1996', '1997', '1998', '1999', '2000']
['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008']
['2009', '2010', '2011', '2012', '2013', '2014']


In [8]:
# Regression results from 1995-2000
# Extra per-model statistics handed to summary_col through info_dict;
# this dictionary is reused by the later summary cells.
info_dict = {'R-squared': lambda x: f"{x.rsquared:.2f}",
             'No. observations': lambda x: f"{int(x.nobs):d}"}
# lm_Results[:6] holds the fits for the six seasons named in Pre_MB.
PreMB_Out = summary_col(lm_Results[:6], model_names=Pre_MB,
                        regressor_order=['AVG', 'Eye', 'ISO'], stars=True, info_dict=info_dict)
print(PreMB_Out)


                    1995      1996       1997       1998       1999       2000   
---------------------------------------------------------------------------------
AVG              3.7261    1.6921     4.4403**   4.0121*    2.8133     2.8979    
                 (2.4150)  (2.8851)   (2.1651)   (2.1222)   (2.0204)   (1.8524)  
Eye              2.4417    0.5096     2.0994     3.8206**   2.0140     1.0060    
                 (1.9611)  (2.0089)   (1.6643)   (1.6066)   (1.5924)   (1.5256)  
ISO              3.5872*** 4.9581***  2.7886***  3.1340***  2.4689***  3.5986*** 
                 (1.1991)  (1.3861)   (1.0009)   (1.1908)   (0.9210)   (0.9752)  
C(POS)[T.2B]     -0.8179** -0.1286    -0.4298    -0.0151    -0.2160    -0.2726   
                 (0.3753)  (0.3466)   (0.2642)   (0.2872)   (0.2294)   (0.2175)  
C(POS)[T.3B]     -0.2760   -0.3602    -0.5173*   -0.0798    0.2404     -0.1103   
                 (0.3050)  (0.3140)   (0.2694)   (0.2479)   (0.2184)   (0.2205)  
C(POS)[T.C]    

In [9]:
# Regression results from 2001-2008
# Slots 6 through 13 of lm_Results pair up with the eight labels in MB_Period.
MB_Out = summary_col(
    lm_Results[6:14],
    model_names=MB_Period,
    stars=True,
    info_dict=info_dict,
    regressor_order=['AVG', 'Eye', 'ISO'],
)
print(MB_Out)


                    2001       2002       2003       2004       2005       2006       2007       2008   
--------------------------------------------------------------------------------------------------------
AVG              0.7142     2.2282     2.1671     2.8623     5.1070**   6.1979**   3.6265*    -0.7795   
                 (2.1011)   (2.6597)   (2.5275)   (2.6710)   (2.2766)   (2.5742)   (2.0114)   (2.0713)  
Eye              -3.0326*   1.7225     3.0337     9.3959***  2.8751     2.9151     4.2304**   3.4829    
                 (1.6224)   (2.0778)   (2.1865)   (2.1082)   (1.8720)   (2.0959)   (1.8341)   (2.1335)  
ISO              5.0715***  2.7926**   1.4127     1.8573     3.1916***  2.6517**   3.0405***  3.1244**  
                 (1.0084)   (1.3380)   (1.3554)   (1.2495)   (1.2057)   (1.2112)   (1.0456)   (1.2600)  
C(POS)[T.2B]     0.1785     -0.0029    -0.2719    0.0510     -0.4057    -0.1014    -0.2304    0.0278    
                 (0.2204)   (0.2865)   (0.2921)   (0.2

In [10]:
# Regression results from 2009-2014
# Slots 14 through 19 of lm_Results pair up with the six labels in Post_MB.
PostMB_Out = summary_col(
    lm_Results[14:20],
    model_names=Post_MB,
    stars=True,
    info_dict=info_dict,
    regressor_order=['AVG', 'Eye', 'ISO'],
)
print(PostMB_Out)


                    2009       2010      2011      2012       2013       2014   
--------------------------------------------------------------------------------
AVG              7.6219***  7.9516*** 6.2201**  -1.5959    2.7462     5.7890*** 
                 (2.4116)   (2.7847)  (2.7746)  (2.6167)   (2.1495)   (2.1811)  
Eye              4.3845**   6.2334*** 4.0906    2.6060     4.1774*    5.8097**  
                 (2.1487)   (2.3060)  (2.7579)  (2.5512)   (2.3415)   (2.5174)  
ISO              1.6764     2.5375    3.1109**  3.2232**   2.7647*    2.9424**  
                 (1.4667)   (1.5673)  (1.4932)  (1.6000)   (1.4094)   (1.3941)  
C(POS)[T.2B]     -0.1089    0.0035    0.2046    0.1738     -0.5499**  0.3901    
                 (0.3495)   (0.3420)  (0.3225)  (0.3226)   (0.2748)   (0.2568)  
C(POS)[T.3B]     0.3158     0.4651    -0.0762   0.4312     -0.2036    0.2050    
                 (0.2555)   (0.3045)  (0.3072)  (0.3012)   (0.2614)   (0.2823)  
C(POS)[T.C]      0.6320**  