In [1]:
# Project goal: Predict MLB batters average yearly salary based on production stats from 5 yrs prior to free agency
# outline:
# Clean data
# EDA on production stats vs salary info
# ideas:
# 1. explore, plot relationship between individual stats and salary
# 2. variable selection using step-wise regression
# 3. explore corralation between salary vs basic stats & advanced stats

In [2]:
import pandas as pd
from pybaseball import playerid_lookup
from pybaseball import player_search_list
import numpy as np

pd.set_option('display.max_rows', 10)

In [3]:
# clean, feature eng salary data:

salary_df = pd.read_csv('mlb-free-agency.csv')
print(len(salary_df.index))
salary_df.head()

1613


Unnamed: 0.1,Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year
0,0,Wil Nieves,C,33.2,WSH,MIL,1,775000.0,775000.0,2011
1,1,Albert Pujols,DH,31.8,STL,LAA,10,240000000.0,24000000.0,2012
2,2,Prince Fielder,DH,27.7,MIL,DET,9,214000000.0,23777778.0,2012
3,3,Jose Reyes,SS,28.4,NYM,MIA,6,106000000.0,17666667.0,2012
4,4,C.J. Wilson,SP,31.0,TEX,LAA,5,77500000.0,15500000.0,2012


In [4]:
# check how many entries have no salary info
print(len(salary_df[salary_df.isnull().any(axis=1)].index))

392


In [5]:
# drop rows without salary data
salary_df = salary_df[salary_df['total_salary'].notna()]
len(salary_df.index)

1221

In [6]:
# drop first row since only 1 observation in 2011
salary_df = salary_df.drop([0])
len(salary_df.index)

1220

In [7]:
# drop first column and reset index
salary_df.drop(columns=salary_df.columns[0], 
        axis=1, 
        inplace=True)
salary_df.reset_index(drop=True)

Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year
0,Albert Pujols,DH,31.8,STL,LAA,10,240000000.0,24000000.0,2012
1,Prince Fielder,DH,27.7,MIL,DET,9,214000000.0,23777778.0,2012
2,Jose Reyes,SS,28.4,NYM,MIA,6,106000000.0,17666667.0,2012
3,C.J. Wilson,SP,31.0,TEX,LAA,5,77500000.0,15500000.0,2012
4,Mark Buehrle,SP,32.7,CHW,MIA,4,58000000.0,14500000.0,2012
...,...,...,...,...,...,...,...,...,...
1215,Sam McWilliams,RP,25.2,TB,NYM,1,750000.0,750000.0,2021
1216,Kohl Stewart,SP,26.2,BAL,CHC,1,700000.0,700000.0,2021
1217,Scott Heineman,OF,27.9,TEX,TEX,1,595000.0,595000.0,2021
1218,Albert Pujols,DH,41.2,LAA,LAD,1,570500.0,570500.0,2021


In [8]:
# since salary data from spotrac.com don't come with unique playerID, I need to assign each player and fangraphID
# create a list of unique player names, and see how many there are
players = salary_df.name.unique()

players = list(players)
print(len(players))

777


In [9]:
# look up fangraphID by name
players[:10]

['Albert Pujols',
 'Prince Fielder',
 'Jose Reyes',
 'C.J. Wilson',
 'Mark Buehrle',
 'Jonathan Papelbon',
 'Aramis Ramirez',
 'Michael Cuddyer',
 'Heath Bell',
 'Carlos Beltran']

In [10]:
player_list = []
for i in players:
    name = i.split(' ')
    first_name = name[0].lower()
    last_name = name[-1].lower()
    name_tup = (last_name, first_name)
    player_list.append(name_tup)

playerid = player_search_list(player_list)
playerid

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,pujols,albert,405395,pujoa001,pujolal01,1177,2001.0,2021.0
1,fielder,prince,425902,fielp001,fieldpr01,4613,2005.0,2016.0
2,reyes,jose,431170,reyej002,reyesjo02,2797,2006.0,2006.0
3,reyes,jose,408314,reyej001,reyesjo01,1736,2003.0,2018.0
4,buehrle,mark,279824,buehm001,buehrma01,225,2000.0,2015.0
...,...,...,...,...,...,...,...,...
755,brebbia,john,605154,brebj001,brebbjo01,12777,2017.0,2021.0
756,middleton,keynan,641871,middk001,middlke01,15264,2017.0,2021.0
757,drake,oliver,543118,drako001,drakeol01,8823,2015.0,2020.0
758,holder,jonathan,656547,holdj002,holdejo02,16588,2016.0,2020.0


In [11]:
playerid['full_name'] = playerid['name_first'] + ' ' + playerid['name_last']

In [12]:
# check the names of plyers that didn't turn up serach result
unique_names = list(playerid['full_name'].unique())
players_missing = []
for i in players:
    if i.lower() not in unique_names:
        players_missing.append(i.lower())
players_missing

['c.j. wilson',
 'jerry hairston jr.',
 'vincent padilla',
 'melvin upton',
 'a.j. pierzynski',
 'j.p. howell',
 'juan carlos oviedo',
 'a.j. burnett',
 'j.p. arencibia',
 'yorbit torrealba',
 'j.c. gutiérrez',
 'john mayberry jr.',
 'eric young jr.',
 'nick massett',
 'jeremy mcbryde',
 'j.a. happ',
 'alejandro de aza',
 'norichika aoki',
 'steven pearce',
 'alexei ramirez',
 'alberto alburquerque',
 'justin de fratus',
 'a.j. griffin',
 'yoenis cespedes',
 'michael dunn',
 'r.a. dickey',
 'a.j. ellis',
 'tom milone',
 'j.d. martinez',
 'c.c. sabathia',
 'michael fiers',
 'seung-hwan oh',
 'leonys martin',
 'a.j. pollock',
 'd.j. lemahieu',
 'hyun-jin ryu',
 'c.j. cron',
 'jose iglesias',
 'alexander claudio',
 'c.j. edwards',
 'daniel winkler',
 'j.t. realmuto',
 'jackie bradley jr.',
 'tommy la stella',
 'daniel santana',
 'sam mcwilliams',
 'joe gatto']

In [13]:
print(f"Number of players not searched: {len(players_missing)}")

Number of players not searched: 47


In [14]:
# there are quite a few missing players and formatting and searching all of them is too tedious
# I decided to search only players with abbreviated first name, e.g: c.j. wilson

# come up with a list of tuple of player names that contain '.', and turn them into a list of tuple for ID search
# also, insert an empty space after the first '.' so the last name complies with ID search format
players_missing_tup = []
for i in players_missing:
    name = i.split(' ')
    if '.' not in name[0]:
        continue
    first_name = name[0].replace('.', '. ', 1).lower()
    last_name = name[-1].lower()
    name_tup = (last_name, first_name)
    players_missing_tup.append(name_tup)
print(f"Number of players with abbreviated name: {len(players_missing_tup)}")
players_missing_tup

Number of players with abbreviated name: 17


[('wilson', 'c. j.'),
 ('pierzynski', 'a. j.'),
 ('howell', 'j. p.'),
 ('burnett', 'a. j.'),
 ('arencibia', 'j. p.'),
 ('gutiérrez', 'j. c.'),
 ('happ', 'j. a.'),
 ('griffin', 'a. j.'),
 ('dickey', 'r. a.'),
 ('ellis', 'a. j.'),
 ('martinez', 'j. d.'),
 ('sabathia', 'c. c.'),
 ('pollock', 'a. j.'),
 ('lemahieu', 'd. j.'),
 ('cron', 'c. j.'),
 ('edwards', 'c. j.'),
 ('realmuto', 'j. t.')]

In [15]:
# do another ID search on players with abbreviated first name, store in playerid_abb

playerid_abb = player_search_list(players_missing_tup)
playerid_abb
# we don't have a ton of new search results but it's better than nothing

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,wilson,c. j.,450351,wilsc004,wilsocj01,3580,2005.0,2015.0
1,pierzynski,a. j.,150229,piera001,pierza.01,746,1998.0,2016.0
2,howell,j. p.,434442,howej003,howeljp01,8245,2005.0,2017.0
3,burnett,a. j.,150359,burna001,burnea.01,512,1999.0,2015.0
4,arencibia,j. p.,450317,arenj001,arencjp01,697,2010.0,2015.0
...,...,...,...,...,...,...,...,...
7,dickey,r. a.,285079,dickr001,dicker.01,1245,2001.0,2017.0
8,ellis,a. j.,454560,ellia001,ellisaj01,5677,2008.0,2018.0
9,martinez,j. d.,502110,martj006,martijd02,6184,2011.0,2021.0
10,cron,c. j.,543068,cronc002,croncj01,12546,2014.0,2021.0


In [16]:
# add "full_name" into the new search results, then append new results into playerid df
playerid_abb['full_name'] = playerid_abb['name_first'] + ' ' + playerid['name_last']

playerid = playerid.append(playerid_abb)
playerid.shape
# The total number of IDs will be more than # of unique player names because there are multiple players with the same name

(772, 9)

In [17]:
# since the salary data contains info starting from 2011, and we consider player stats 5 yrs prior to the salary year

# we drop players who retired before 2006

index_names = playerid[playerid['mlb_played_last'] <= 2006 ].index
playerid = playerid.drop(index_names)

playerid.shape

(750, 9)

In [18]:
# also we will drop the mlbam, key_retro, and key_bbref columns
playerid = playerid.drop(columns=['key_mlbam', 'key_retro', 'key_bbref'])
playerid

Unnamed: 0,name_last,name_first,key_fangraphs,mlb_played_first,mlb_played_last,full_name
0,pujols,albert,1177,2001.0,2021.0,albert pujols
1,fielder,prince,4613,2005.0,2016.0,prince fielder
3,reyes,jose,1736,2003.0,2018.0,jose reyes
4,buehrle,mark,225,2000.0,2015.0,mark buehrle
5,papelbon,jonathan,5975,2005.0,2016.0,jonathan papelbon
...,...,...,...,...,...,...
7,dickey,r. a.,1245,2001.0,2017.0,r. a. cuddyer
8,ellis,a. j.,5677,2008.0,2018.0,a. j. bell
9,martinez,j. d.,6184,2011.0,2021.0,j. d. beltran
10,cron,c. j.,12546,2014.0,2021.0,c. j. willingham


In [19]:
mask = playerid.duplicated(subset=['full_name'], keep=False)
playerid[mask]

Unnamed: 0,name_last,name_first,key_fangraphs,mlb_played_first,mlb_played_last,full_name
79,young,chris,3196,2004.0,2017.0,chris young
80,young,chris,3882,2006.0,2018.0,chris young
163,hernandez,roberto,3273,2006.0,2016.0,roberto hernandez
164,hernandez,roberto,605,1991.0,2007.0,roberto hernandez
272,rodriguez,francisco,1642,2002.0,2017.0,francisco rodriguez
...,...,...,...,...,...,...
561,jennings,dan,-1,,,dan jennings
634,smith,will,8048,2012.0,2021.0,will smith
635,smith,will,19197,2019.0,2021.0,will smith
738,taylor,michael,11489,2014.0,2021.0,michael taylor


In [20]:
# drop all players(including the origional one) if other players with same name exist
playerid = playerid.drop_duplicates(subset=['full_name'], keep=False)
# reset index
playerid = playerid.reset_index(drop=True)
playerid

Unnamed: 0,name_last,name_first,key_fangraphs,mlb_played_first,mlb_played_last,full_name
0,pujols,albert,1177,2001.0,2021.0,albert pujols
1,fielder,prince,4613,2005.0,2016.0,prince fielder
2,reyes,jose,1736,2003.0,2018.0,jose reyes
3,buehrle,mark,225,2000.0,2015.0,mark buehrle
4,papelbon,jonathan,5975,2005.0,2016.0,jonathan papelbon
...,...,...,...,...,...,...
727,dickey,r. a.,1245,2001.0,2017.0,r. a. cuddyer
728,ellis,a. j.,5677,2008.0,2018.0,a. j. bell
729,martinez,j. d.,6184,2011.0,2021.0,j. d. beltran
730,cron,c. j.,12546,2014.0,2021.0,c. j. willingham


In [21]:
# reformat full_name so that it is the same as the full name in the salary data
playerid['full_name_new'] = playerid['name_first'].str.strip() + playerid['name_last']
playerid.head()

Unnamed: 0,name_last,name_first,key_fangraphs,mlb_played_first,mlb_played_last,full_name,full_name_new
0,pujols,albert,1177,2001.0,2021.0,albert pujols,albertpujols
1,fielder,prince,4613,2005.0,2016.0,prince fielder,princefielder
2,reyes,jose,1736,2003.0,2018.0,jose reyes,josereyes
3,buehrle,mark,225,2000.0,2015.0,mark buehrle,markbuehrle
4,papelbon,jonathan,5975,2005.0,2016.0,jonathan papelbon,jonathanpapelbon


In [22]:
# add full_name_new column to salary_df for joining
salary_df['full_name_new'] = salary_df['name'].str.lower()
salary_df['full_name_new'] = salary_df['full_name_new'].str.replace(' ', '')

salary_df

Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year,full_name_new
1,Albert Pujols,DH,31.8,STL,LAA,10,240000000.0,24000000.0,2012,albertpujols
2,Prince Fielder,DH,27.7,MIL,DET,9,214000000.0,23777778.0,2012,princefielder
3,Jose Reyes,SS,28.4,NYM,MIA,6,106000000.0,17666667.0,2012,josereyes
4,C.J. Wilson,SP,31.0,TEX,LAA,5,77500000.0,15500000.0,2012,c.j.wilson
5,Mark Buehrle,SP,32.7,CHW,MIA,4,58000000.0,14500000.0,2012,markbuehrle
...,...,...,...,...,...,...,...,...,...,...
1597,Sam McWilliams,RP,25.2,TB,NYM,1,750000.0,750000.0,2021,sammcwilliams
1598,Kohl Stewart,SP,26.2,BAL,CHC,1,700000.0,700000.0,2021,kohlstewart
1599,Scott Heineman,OF,27.9,TEX,TEX,1,595000.0,595000.0,2021,scottheineman
1600,Albert Pujols,DH,41.2,LAA,LAD,1,570500.0,570500.0,2021,albertpujols


In [23]:
# now the data in salary_df and playerid are all cleaned. Time to merge them 

salaryFinalDF = pd.merge(salary_df, playerid, on='full_name_new', how='inner')
salaryFinalDF.head()

Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year,full_name_new,name_last,name_first,key_fangraphs,mlb_played_first,mlb_played_last,full_name
0,Albert Pujols,DH,31.8,STL,LAA,10,240000000.0,24000000.0,2012,albertpujols,pujols,albert,1177,2001.0,2021.0,albert pujols
1,Albert Pujols,DH,41.2,LAA,LAD,1,570500.0,570500.0,2021,albertpujols,pujols,albert,1177,2001.0,2021.0,albert pujols
2,Prince Fielder,DH,27.7,MIL,DET,9,214000000.0,23777778.0,2012,princefielder,fielder,prince,4613,2005.0,2016.0,prince fielder
3,Jose Reyes,SS,28.4,NYM,MIA,6,106000000.0,17666667.0,2012,josereyes,reyes,jose,1736,2003.0,2018.0,jose reyes
4,Jose Reyes,SS,33.0,COL,NYM,1,507500.0,507500.0,2016,josereyes,reyes,jose,1736,2003.0,2018.0,jose reyes


In [24]:
# drop unneeded columns in new DF

salaryFinalDF = salaryFinalDF.drop(columns=['full_name_new', 'name_last', 'name_first', 'full_name'])
print(salaryFinalDF.shape)
salaryFinalDF.head()

(1140, 12)


Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year,key_fangraphs,mlb_played_first,mlb_played_last
0,Albert Pujols,DH,31.8,STL,LAA,10,240000000.0,24000000.0,2012,1177,2001.0,2021.0
1,Albert Pujols,DH,41.2,LAA,LAD,1,570500.0,570500.0,2021,1177,2001.0,2021.0
2,Prince Fielder,DH,27.7,MIL,DET,9,214000000.0,23777778.0,2012,4613,2005.0,2016.0
3,Jose Reyes,SS,28.4,NYM,MIA,6,106000000.0,17666667.0,2012,1736,2003.0,2018.0
4,Jose Reyes,SS,33.0,COL,NYM,1,507500.0,507500.0,2016,1736,2003.0,2018.0


In [25]:
# for unknow reason, there are a few entires where the salary year info is later than the mlb_played_last, thus we need to investigate
salaryFinalDF[salaryFinalDF['year'] - salaryFinalDF['mlb_played_last'] > 1]
# hypothesis: the player signed their next FA contract from over 1 yr before FA, then retired before the next contract starts


Unnamed: 0,name,position,age,from_team,to_tam,contract_length,total_salary,avg_salary,year,key_fangraphs,mlb_played_first,mlb_played_last
336,Rich Harden,SP,31.0,OAK,MIN,1,1000000.0,1000000.0,2013,1772,2003.0,2011.0
418,Johan Santana,SP,34.9,NYM,BAL,1,8050000.0,8050000.0,2014,755,2000.0,2012.0
419,Johan Santana,SP,35.9,BAL,TOR,1,2500000.0,2500000.0,2015,755,2000.0,2012.0
425,Josh Johnson,SP,30.9,SD,SD,1,1000000.0,1000000.0,2015,4567,2005.0,2013.0
634,Pedro Feliciano,RP,38.4,STL,CHC,1,700000.0,700000.0,2015,1601,2002.0,2013.0
762,Roger Bernadina,LF,31.6,COL,NYM,1,750000.0,750000.0,2016,6421,2008.0,2014.0
763,Carlos Quentin,LF,33.4,SEA,MIN,1,750000.0,750000.0,2016,6274,2006.0,2014.0
768,Maikel Cleto,RP,27.2,CHW,ATL,1,507500.0,507500.0,2016,5529,2011.0,2014.0
838,Jacob Lindgren,RP,23.7,NYY,ATL,1,1025000.0,1025000.0,2017,16215,2015.0,2015.0
1131,Felix Hernandez,SP,34.8,ATL,BAL,1,1000000.0,1000000.0,2021,4772,2005.0,2019.0


In [26]:
# Still, we will drop the entires if there is >1 gap between salary year and mlb_played_last, since there will be missing years in production stats prior to saalary year

index_names = salaryFinalDF[salaryFinalDF['year'] - salaryFinalDF['mlb_played_last'] > 1].index

salaryFinalDF.drop(index_names, inplace = True)


In [27]:
# also we need to drop all pitchers since we're only considering batters
drop_SP = salaryFinalDF[salaryFinalDF['position'] == 'SP'].index
salaryFinalDF.drop(drop_SP, inplace = True)
drop_RP = salaryFinalDF[salaryFinalDF['position'] == 'RP'].index
salaryFinalDF.drop(drop_RP, inplace = True)

# reset index
salaryFinalDF = salaryFinalDF.reset_index(drop=True)

In [28]:
# change 'year' column to 'year-fa': year free agency

salaryFinalDF.rename(columns={'year': 'year_fa'}, inplace=True)

In [29]:
# replace all outfield positions with: OF
salaryFinalDF.position.replace(['LF', 'CF', 'RF'], 'OF', inplace=True)

In [30]:
pd.set_option('display.max_rows', 10)
salaryFinalDF.position.value_counts()

OF    177
C      95
2B     68
3B     59
1B     58
DH     36
SS     34
Name: position, dtype: int64

In [31]:
unique_ID = list(salaryFinalDF.key_fangraphs.unique())
print(f"Total number of unique ID's: {len(unique_ID)}")
unique_ID[:10]

Total number of unique ID's: 341


[1177, 4613, 1736, 1002, 1534, 589, 2103, 1572, 88, 6104]

## import mlb batting stats from 07-20 and clean data furthermore
1. Remove entries with BA less than 400:
2. Remove unwanted variables

In [32]:
# import mlb all batting stats from 2007-2020
batting_df = pd.read_csv('mlb-batting.csv')
batting_df.head()
print(batting_df.shape)

(13069, 320)


In [33]:
# drop rows with AB < 400
# batting_df = batting_df.drop(batting_df[batting_df['AB']<400].index)
# batting_df.shape

In [34]:
# drop all unwanted stats
pd.set_option('display.max_columns', None)
batting_df.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,GB,FB,LD,IFFB,Pitches,Balls,Strikes,IFH,BU,BUH,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,GB/FB,LD%,GB%,FB%,IFFB%,HR/FB,IFH%,BUH%,wOBA,wRAA,wRC,Bat,Fld,Rep,Pos,RAR,WAR,Dol,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,phLI,PH,WPA/LI,Clutch,FB% (Pitch),FBv,SL%,SLv,CT%,CTv,CB%,CBv,CH%,CHv,SF%,SFv,KN%,KNv,XX%,PO%,wFB,wSL,wCT,wCB,wCH,wSF,wKN,wFB/C,wSL/C,wCT/C,wCB/C,wCH/C,wSF/C,wKN/C,O-Swing%,Z-Swing%,Swing%,O-Contact%,Z-Contact%,Contact%,Zone%,F-Strike%,SwStr%,BsR,FA% (sc),FT% (sc),FC% (sc),FS% (sc),FO% (sc),SI% (sc),SL% (sc),CU% (sc),KC% (sc),EP% (sc),CH% (sc),SC% (sc),KN% (sc),UN% (sc),vFA (sc),vFT (sc),vFC (sc),vFS (sc),vFO (sc),vSI (sc),vSL (sc),vCU (sc),vKC (sc),vEP (sc),vCH (sc),vSC (sc),vKN (sc),FA-X (sc),FT-X (sc),FC-X (sc),FS-X (sc),FO-X (sc),SI-X (sc),SL-X (sc),CU-X (sc),KC-X (sc),EP-X (sc),CH-X (sc),SC-X (sc),KN-X (sc),FA-Z (sc),FT-Z (sc),FC-Z (sc),FS-Z (sc),FO-Z (sc),SI-Z (sc),SL-Z (sc),CU-Z (sc),KC-Z (sc),EP-Z (sc),CH-Z (sc),SC-Z (sc),KN-Z (sc),wFA (sc),wFT (sc),wFC (sc),wFS (sc),wFO (sc),wSI (sc),wSL (sc),wCU (sc),wKC (sc),wEP (sc),wCH (sc),wSC (sc),wKN (sc),wFA/C (sc),wFT/C (sc),wFC/C (sc),wFS/C (sc),wFO/C (sc),wSI/C (sc),wSL/C (sc),wCU/C (sc),wKC/C (sc),wEP/C (sc),wCH/C (sc),wSC/C (sc),wKN/C (sc),O-Swing% (sc),Z-Swing% (sc),Swing% (sc),O-Contact% (sc),Z-Contact% (sc),Contact% (sc),Zone% (sc),Pace,Def,wSB,UBR,Age Rng,Off,Lg,wGDP,Pull%,Cent%,Oppo%,Soft%,Med%,Hard%,TTO%,CH% (pi),CS% (pi),CU% (pi),FA% (pi),FC% (pi),FS% (pi),KN% (pi),SB% (pi),SI% (pi),SL% (pi),XX% (pi),vCH (pi),vCS (pi),vCU (pi),vFA (pi),vFC (pi),vFS (pi),vKN (pi),vSB (pi),vSI (pi),vSL (pi),vXX (pi),CH-X (pi),CS-X (pi),CU-X (pi),FA-X (pi),FC-X (pi),FS-X (pi),KN-X (pi),SB-X (pi),SI-X (pi),SL-X (pi),XX-X (pi),CH-Z (pi),CS-Z (pi),CU-Z (pi),FA-Z (pi),FC-Z (pi),FS-Z (pi),KN-Z (pi),SB-Z (pi),SI-Z (pi),SL-Z (pi),XX-Z (pi),wCH (pi),wCS (pi),wCU (pi),wFA (pi),wFC (pi),wFS (pi),wKN (pi),wSB (pi),wSI (pi),wSL (pi),wXX (pi),wCH/C (pi),wCS/C (pi),wCU/C (pi),wFA/C (pi),wFC/C (pi),wFS/C (pi),wKN/C (pi),wSB/C (pi),wSI/C (pi),wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi),FRM,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,LD+%,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,0,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,47,5,32,129,80,81,8,91,8,5,0,5,30,6,0.346,147,195,92,18,2582,1095,1487,14,0,0,0.132,0.148,0.89,0.438,0.64,1.078,0.294,0.368,0.0075,0.212,0.339,0.449,0.092,0.164,0.095,0.0,0.449,67.2,139,62.4,16.8,18.4,-5.3,101.1,10.4,$83.3,7.0,185,5.77,-8.16,13.93,62.93,6.11,0.92,3.62,4,6.44,-0.15,0.547,93.2,0.199,84.5,0.05,88.7,0.073,79.1,0.114,84.4,0.017,85.4,,,0.009,,40.1,11.2,4.2,1.1,11.1,2.3,,2.84,2.19,3.28,0.58,3.79,5.06,,0.198,0.57,0.356,0.706,0.93,0.859,0.426,0.562,0.05,6.9,0.35,0.097,0.047,0.022,,0.101,0.204,0.058,0.015,,0.106,,,,93.5,92.8,88.4,85.2,,92.1,84.6,78.4,80.8,,84.4,,,-2.4,-6.1,0.4,-4.1,,-1.6,1.6,2.9,4.5,,0.3,,,9.6,6.0,4.9,3.2,,5.4,1.2,-5.3,-5.8,,4.5,,,18.3,13.3,3.1,2.5,,7.1,9.8,1.5,2.5,,10.1,,,2.03,5.33,2.58,4.34,,2.72,1.87,1.03,6.59,,3.72,,,0.158,0.546,0.355,0.635,0.919,0.857,0.509,,11.6,3.1,2.7,25 - 25,69.3,1.9,1.1,0.472,0.348,0.18,0.122,0.433,0.445,0.332,0.105,,0.07,0.366,0.046,0.026,,,0.173,0.201,0.0,84.6,,79.0,93.6,88.6,84.6,,,92.6,84.7,93.5,0.4,,3.6,-1.9,0.5,-3.2,,,-4.2,2.1,-6.4,3.1,,-7.2,8.2,4.2,1.6,,,4.4,-0.2,5.9,10.5,,0.9,18.2,4.0,4.0,,,21.4,10.5,0.0,3.82,,0.48,1.91,3.3,5.95,,,4.75,2.0,2.04,0.162,0.539,0.355,0.647,0.917,0.857,0.512,,,139.0,159,68,137.0,154.0,176.0,125.0,1.0,80.0,123.0,129.0,115.0,102.0,73.0,69.0,92.0,125.0,92.3,18.5,57.0,0.131,110.6,217.0,0.5,434,0.22,0.27,,,
1,1,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,39,9,27,109,97,110,10,136,9,8,0,8,33,7,0.323,191,164,106,6,3015,1295,1720,31,0,0,0.154,0.19,0.81,0.432,0.557,0.988,0.234,0.376,0.0116,0.23,0.414,0.356,0.037,0.165,0.162,0.0,0.423,61.1,140,62.2,1.6,20.5,-1.2,94.1,10.2,$75.2,6.8,176,5.01,-10.61,15.62,75.41,8.22,1.01,,0,7.6,-2.62,0.605,92.3,0.162,83.8,0.047,88.2,0.081,77.3,0.089,83.1,0.013,84.5,0.004,76.5,0.013,,47.0,4.2,3.8,3.0,8.8,1.1,-0.2,2.61,0.86,2.68,1.23,3.35,3.0,-1.78,0.237,0.556,0.375,0.703,0.89,0.823,0.432,0.535,0.066,8.2,0.398,0.124,0.047,0.01,,0.084,0.167,0.066,0.014,0.001,0.086,,0.003,,92.4,91.4,88.9,83.4,,91.9,83.7,76.8,79.3,65.9,82.9,,78.4,-2.0,-2.1,1.2,-1.6,,-2.1,2.0,4.4,2.5,-5.3,1.9,,-2.1,8.6,6.2,5.1,2.8,,5.9,0.3,-6.3,-5.7,-8.9,4.1,,1.8,25.0,9.7,4.9,-0.5,,11.4,2.0,2.3,0.7,-0.4,10.9,,0.2,2.11,2.62,3.51,-1.72,,4.57,0.41,1.15,1.72,-17.57,4.23,,2.12,0.205,0.539,0.375,0.668,0.877,0.821,0.511,24.3,0.5,3.1,3.6,21 - 21,70.5,2.8,1.5,0.308,0.382,0.31,0.113,0.508,0.38,0.381,0.084,0.0,0.085,0.405,0.053,0.017,0.004,,0.196,0.155,,83.6,61.8,77.9,93.1,88.8,84.6,76.9,,92.4,84.3,,2.5,10.4,4.5,-1.5,1.5,-5.6,-2.3,,-3.1,2.3,,3.5,-15.9,-7.3,8.5,4.2,1.2,-0.5,,5.0,-0.3,,1.0,0.0,0.6,2.1,1.0,0.3,0.0,,1.6,0.6,,0.39,0.0,0.23,0.18,0.66,0.58,0.22,,0.28,0.13,,0.203,0.537,0.375,0.646,0.882,0.821,0.517,,,126.0,189,96,134.0,137.0,157.0,126.0,1.08,96.0,100.0,151.0,78.0,107.0,126.0,70.0,94.0,126.0,,,0.0,,,0.0,,0,0.2,0.266,,,
2,2,10155,2012,Mike Trout,LAA,20,139,559,639,182,117,27,8,30,129,83,67,4,139,6,7,0,7,49,5,0.326,187,139,95,6,2608,992,1616,22,6,3,0.105,0.218,0.48,0.399,0.564,0.963,0.238,0.383,0.0135,0.226,0.444,0.33,0.043,0.216,0.118,0.5,0.409,48.2,121,50.1,10.4,18.9,-0.2,96.2,10.1,$65.5,8.6,167,5.41,-8.48,13.9,54.86,5.79,0.91,,0,6.24,-0.32,0.647,92.2,0.153,83.8,0.048,87.7,0.078,77.7,0.061,82.8,0.013,84.2,,,0.005,,18.8,11.5,8.6,2.6,10.2,-0.4,,1.12,2.9,6.89,1.27,6.44,-1.08,,0.26,0.551,0.396,0.709,0.876,0.818,0.469,0.573,0.072,14.3,0.398,0.123,0.058,0.014,0.001,0.115,0.152,0.059,0.017,,0.063,,,,92.4,91.8,88.5,84.1,88.4,91.2,83.8,76.8,79.8,,82.7,,,-2.3,-1.8,1.0,-6.0,-5.8,-2.8,2.3,2.6,1.9,,2.6,,,8.6,6.9,5.6,3.8,3.4,5.1,0.3,-5.9,-6.0,,4.4,,,10.9,7.7,4.6,0.2,0.3,3.2,10.2,4.5,-1.0,,9.8,,,1.06,2.41,3.06,0.48,15.75,1.06,2.59,2.93,-2.27,,6.05,,,0.236,0.53,0.396,0.696,0.862,0.817,0.543,23.4,10.2,7.0,6.2,20 - 20,64.4,2.8,1.1,0.323,0.384,0.293,0.143,0.529,0.328,0.369,0.057,0.001,0.077,0.422,0.058,0.014,,,0.207,0.15,,83.3,66.2,78.1,93.1,88.1,84.6,,,91.7,84.5,,2.7,8.2,2.4,-1.5,1.0,-5.7,,,-3.5,2.9,,3.5,-11.7,-6.8,8.2,4.3,1.8,,,4.6,-0.3,,0.1,0.0,0.9,-3.4,-0.5,0.1,,,-0.6,1.7,,0.09,0.23,0.43,-0.31,-0.36,0.39,,,-0.11,0.43,,0.233,0.532,0.396,0.681,0.866,0.817,0.543,,,127.0,131,113,125.0,137.0,152.0,130.0,1.08,100.0,95.0,182.0,80.0,110.0,119.0,93.0,95.0,114.0,,,0.0,,,0.0,,0,0.221,0.293,,,
3,3,9166,2012,Buster Posey,SFG,25,148,530,610,178,114,39,1,24,78,103,69,7,96,2,9,0,19,1,1,0.336,206,128,109,5,2599,1027,1572,17,0,0,0.113,0.157,0.72,0.408,0.549,0.957,0.213,0.368,0.0161,0.246,0.465,0.289,0.039,0.188,0.083,0.0,0.406,44.7,114,46.4,30.4,18.0,6.2,96.9,10.1,$66.0,2.0,164,4.93,-9.49,14.42,51.59,5.57,1.05,2.51,5,5.11,-0.41,0.55,91.3,0.2,83.4,0.062,87.3,0.1,76.1,0.081,81.6,0.007,85.7,,,0.007,,29.8,-4.1,2.3,-1.9,14.4,1.6,,2.1,-0.79,1.4,-0.73,6.9,8.84,,0.264,0.607,0.418,0.771,0.896,0.853,0.45,0.572,0.061,-4.6,0.31,0.104,0.068,0.009,,0.129,0.2,0.091,0.011,,0.079,,,,91.7,90.6,87.5,84.5,,90.4,83.5,75.7,80.7,,81.6,,,-1.4,-0.6,0.5,-3.2,,-5.2,2.1,2.5,4.6,,1.4,,,7.9,6.1,5.5,3.1,,4.3,0.4,-6.3,-5.8,,3.9,,,15.7,4.7,5.0,1.3,,7.3,-4.5,-1.9,-0.5,,14.3,,,1.96,1.75,2.88,5.74,,2.21,-0.88,-0.83,-1.84,,6.99,,,0.234,0.584,0.418,0.727,0.896,0.852,0.526,22.2,36.5,-1.0,-2.4,25 - 25,41.8,0.5,-1.2,0.381,0.363,0.255,0.108,0.576,0.316,0.31,0.074,0.001,0.103,0.301,0.087,0.01,,,0.222,0.179,,82.4,75.0,77.2,92.9,87.6,84.8,,,91.3,84.5,,1.5,7.9,3.4,-1.6,0.9,-3.3,,,-2.4,2.4,,3.3,-8.3,-7.2,8.1,4.0,1.5,,,4.4,-0.2,,0.1,0.1,1.1,-1.0,-0.3,0.5,,,-0.8,1.8,,0.04,3.76,0.42,-0.13,-0.14,1.95,,,-0.14,0.4,,0.225,0.59,0.418,0.712,0.899,0.852,0.528,,24.8,129.0,137,82,125.0,133.0,140.0,121.0,1.17,103.0,86.0,171.0,98.0,102.0,100.0,70.0,103.0,109.0,,,0.0,,,0.0,,0,0.19,0.251,,,
4,4,10155,2016,Mike Trout,LAA,24,159,549,681,173,107,32,5,29,123,100,116,12,137,11,5,0,5,30,7,0.315,172,153,92,4,3014,1293,1721,18,0,0,0.17,0.201,0.85,0.441,0.55,0.991,0.235,0.371,0.0112,0.221,0.412,0.367,0.026,0.19,0.105,0.0,0.418,56.2,136,57.3,3.3,20.5,1.0,94.7,9.7,$77.5,6.6,170,6.55,-9.08,15.62,73.56,7.76,1.0,5.3,1,6.76,-0.19,0.58,92.5,0.162,84.2,0.059,87.6,0.09,78.6,0.08,84.2,0.018,85.0,0.012,76.1,0.012,,31.5,0.4,8.8,13.2,7.4,0.3,1.0,1.83,0.07,5.02,4.9,3.11,0.57,2.93,0.23,0.603,0.388,0.706,0.87,0.814,0.423,0.558,0.071,9.6,0.38,0.098,0.058,0.022,,0.107,0.16,0.069,0.023,0.0,0.071,,0.013,,92.8,92.2,88.0,85.0,,91.2,84.0,77.7,82.0,65.8,84.3,,75.8,-2.0,-0.7,1.7,-3.4,,-3.5,2.6,4.4,4.6,-3.2,0.9,,-2.5,9.9,7.1,5.3,3.2,,5.8,1.9,-5.3,-6.2,2.7,4.0,,1.3,23.2,6.8,2.9,2.6,,4.0,1.7,11.6,-0.6,0.1,4.5,,0.8,2.05,2.35,1.71,4.03,,1.27,0.35,5.68,-0.87,9.2,2.15,,1.93,0.209,0.572,0.387,0.671,0.868,0.814,0.491,22.6,4.3,2.6,3.5,24 - 24,67.0,2.9,3.5,0.405,0.329,0.266,0.12,0.463,0.417,0.414,0.074,0.001,0.092,0.386,0.066,0.02,0.011,,0.166,0.147,0.0,84.7,74.7,79.3,93.6,88.7,85.4,76.4,,92.2,85.1,0.0,0.9,10.4,4.4,-1.1,1.9,-4.2,-1.2,,-3.6,3.3,0.0,2.6,-9.9,-7.1,8.6,4.1,1.0,-0.6,,5.0,0.4,0.0,7.1,0.0,13.2,22.0,6.9,0.9,0.7,,8.9,-1.0,-0.1,3.22,-0.77,4.82,1.91,3.54,1.53,1.94,,1.79,-0.22,-5.71,0.208,0.574,0.387,0.67,0.868,0.814,0.489,,,122.0,212,97,137.0,130.0,141.0,124.0,1.08,94.0,103.0,146.0,99.0,96.0,106.0,64.0,93.0,133.0,90.9,13.7,56.0,0.134,117.1,167.0,0.4,417,0.188,0.259,,,


In [35]:
var_list_basic = ['IDfg', 
            'Season', 
            'Name', 
            'Team', 
            'Age', 
            'G', 
            'PA', 
            'AB', 
            'R', 
            'H',
            '2B',
            '3B',
            'HR',
            'RBI',
            'SB',
            'CS',
            'BB',
            'SO',
            'GDP',
            'HBP',
            'SH',
            'SF',
            'IBB',
            'AVG',
            'OBP',
            'SLG',
            'BABIP',
            'ISO',
            'OPS',
            'wRC',
            'wRAA',
            'wOBA',
            'wRC+',
            'WPA',
            'WAR']
# not the full list
batting_basic_df = batting_df[var_list_basic]
batting_basic_df = batting_basic_df.sort_values(by=['IDfg', 'Season'], ascending=True)
batting_basic_df.reset_index(drop=True, inplace=True)
batting_basic_df

Unnamed: 0,IDfg,Season,Name,Team,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GDP,HBP,SH,SF,IBB,AVG,OBP,SLG,BABIP,ISO,OPS,wRC,wRAA,wOBA,wRC+,WPA,WAR
0,1,2007,Alfredo Amezaga,FLA,29,133,448,400,46,105,14,9,2,30,13,7,35,52,4,4,4,5,0,0.263,0.324,0.358,0.293,0.095,0.682,45,-9.9,0.305,79,-2.52,2.0
1,1,2008,Alfredo Amezaga,FLA,30,125,337,311,41,82,13,5,3,32,8,2,19,47,6,3,4,0,1,0.264,0.312,0.367,0.303,0.103,0.679,33,-7.5,0.301,77,-0.59,1.2
2,1,2009,Alfredo Amezaga,FLA,31,27,75,69,6,15,3,0,0,5,1,1,5,16,0,0,0,1,2,0.217,0.267,0.261,0.278,0.043,0.528,3,-6.2,0.228,28,-0.90,-0.2
3,1,2011,Alfredo Amezaga,- - -,33,40,87,77,6,14,1,0,0,4,0,0,7,14,1,0,2,1,0,0.182,0.247,0.195,0.219,0.013,0.442,2,-7.4,0.208,17,-0.85,-0.6
4,2,2007,Garret Anderson,LAA,35,108,450,417,67,124,31,1,16,80,1,0,27,54,8,0,0,6,9,0.297,0.336,0.492,0.306,0.194,0.827,62,6.3,0.348,110,-0.02,1.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13064,22581,2020,Alejandro Kirk,TOR,21,9,25,24,4,9,2,0,1,3,0,0,1,4,0,0,0,0,0,0.375,0.400,0.583,0.421,0.208,0.983,5,2.1,0.418,165,0.08,0.2
13065,23378,2020,Jose Barrero,CIN,22,24,68,67,4,13,0,0,0,2,1,1,1,26,1,0,0,0,0,0.194,0.206,0.194,0.317,0.000,0.400,0,-8.1,0.179,0,-0.98,-0.6
13066,24618,2020,Ryan Jeffers,MIN,23,26,62,55,5,15,0,0,3,7,0,0,5,19,0,2,0,0,0,0.273,0.355,0.436,0.364,0.164,0.791,9,1.4,0.346,120,-0.04,0.5
13067,27459,2020,Yoshi Tsutsugo,TBR,28,51,185,157,27,31,5,1,8,24,0,0,26,50,5,1,0,1,1,0.197,0.314,0.395,0.230,0.197,0.708,21,-1.7,0.309,99,0.01,0.3


### create player position tabel for later use

In [36]:
# positions_df = salaryFinalDF.drop_duplicates(subset=['key_fangraphs'])
# positions_df = positions_df[['name', 'position', 'key_fangraphs']]

# Merge salary and batting for EDA

In [37]:
# re_index batting_basic_df
batting_basic_df.reset_index(drop=True, inplace=True)

In [38]:
batting_basic_df = batting_basic_df.set_index(['IDfg'])
batting_basic_df.sort_index(level=['IDfg'], inplace=True)

In [39]:
batting_basic_df

Unnamed: 0_level_0,Season,Name,Team,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GDP,HBP,SH,SF,IBB,AVG,OBP,SLG,BABIP,ISO,OPS,wRC,wRAA,wOBA,wRC+,WPA,WAR
IDfg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1,2007,Alfredo Amezaga,FLA,29,133,448,400,46,105,14,9,2,30,13,7,35,52,4,4,4,5,0,0.263,0.324,0.358,0.293,0.095,0.682,45,-9.9,0.305,79,-2.52,2.0
1,2008,Alfredo Amezaga,FLA,30,125,337,311,41,82,13,5,3,32,8,2,19,47,6,3,4,0,1,0.264,0.312,0.367,0.303,0.103,0.679,33,-7.5,0.301,77,-0.59,1.2
1,2009,Alfredo Amezaga,FLA,31,27,75,69,6,15,3,0,0,5,1,1,5,16,0,0,0,1,2,0.217,0.267,0.261,0.278,0.043,0.528,3,-6.2,0.228,28,-0.90,-0.2
1,2011,Alfredo Amezaga,- - -,33,40,87,77,6,14,1,0,0,4,0,0,7,14,1,0,2,1,0,0.182,0.247,0.195,0.219,0.013,0.442,2,-7.4,0.208,17,-0.85,-0.6
2,2007,Garret Anderson,LAA,35,108,450,417,67,124,31,1,16,80,1,0,27,54,8,0,0,6,9,0.297,0.336,0.492,0.306,0.194,0.827,62,6.3,0.348,110,-0.02,1.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22581,2020,Alejandro Kirk,TOR,21,9,25,24,4,9,2,0,1,3,0,0,1,4,0,0,0,0,0,0.375,0.400,0.583,0.421,0.208,0.983,5,2.1,0.418,165,0.08,0.2
23378,2020,Jose Barrero,CIN,22,24,68,67,4,13,0,0,0,2,1,1,1,26,1,0,0,0,0,0.194,0.206,0.194,0.317,0.000,0.400,0,-8.1,0.179,0,-0.98,-0.6
24618,2020,Ryan Jeffers,MIN,23,26,62,55,5,15,0,0,3,7,0,0,5,19,0,2,0,0,0,0.273,0.355,0.436,0.364,0.164,0.791,9,1.4,0.346,120,-0.04,0.5
27459,2020,Yoshi Tsutsugo,TBR,28,51,185,157,27,31,5,1,8,24,0,0,26,50,5,1,0,1,1,0.197,0.314,0.395,0.230,0.197,0.708,21,-1.7,0.309,99,0.01,0.3


## Select rows based on FA year

In [40]:
# aggregate batting_basic based on FA year
agg_method = {'Age':'max',
             'G':'sum',
             'PA':'sum',
             'AB':'sum',
             'R':'sum',
             'H':'sum',
             '2B':'sum',
             '3B':'sum',
             'HR':'sum',
             'RBI':'sum',
             'SB':'sum',
             'CS':'sum',
             'BB':'sum',
             'SO':'sum',
             'GDP':'sum',
             'HBP':'sum',
             'SH':'sum',
             'SF':'sum',
             'IBB':'sum',
             'AVG':'mean',
             'OBP':'mean',
             'SLG':'mean',
             'BABIP':'mean',
             'ISO':'mean',
             'OPS':'mean',
             'wRC':'sum',
             'wRAA':'sum',
             'wRC+':'sum',
             'WPA':'sum',
             'WAR':'sum'}
batting_aggDF = pd.DataFrame()
for row in salaryFinalDF.itertuples():
    selected_years = [row.year_fa-1, row.year_fa-2, row.year_fa-3, row.year_fa-4, row.year_fa-5,]
    player = batting_basic_df.loc[row.key_fangraphs]
    player = player[player['Season'].isin(selected_years)].groupby(by=['IDfg','Name']).agg(agg_method)
    player['Year_FA']=row.year_fa
    player['Salary']=row.avg_salary
    batting_aggDF = batting_aggDF.append(player)

In [41]:
batting_aggDF

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GDP,HBP,SH,SF,IBB,AVG,OBP,SLG,BABIP,ISO,OPS,wRC,wRAA,wRC+,WPA,WAR,Year_FA,Salary
IDfg,Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
1177,Albert Pujols,31,772,3371,2823,543,914,195,3,195,571,48,18,482,310,118,29,0,37,153,0.3244,0.42280,0.60320,0.30600,0.27880,1.0258,675,277.2,830,29.50,35.5,2012,24000000.0
1177,Albert Pujols,40,588,2492,2294,244,570,86,0,102,402,11,0,166,326,87,10,0,22,16,0.2444,0.29460,0.41580,0.24480,0.17120,0.7102,272,-28.9,441,1.33,-2.3,2021,570500.0
4613,Prince Fielder,27,802,3500,2899,487,825,161,8,200,565,9,8,505,637,64,66,0,30,110,0.2846,0.39900,0.55280,0.29860,0.26800,0.9516,629,216.0,735,26.24,19.2,2012,23777778.0
1736,Jose Reyes,28,614,2883,2616,434,776,140,59,48,238,214,55,235,283,30,4,16,12,35,0.2950,0.35440,0.44240,0.31600,0.14760,0.7968,394,53.7,566,4.12,20.4,2012,17666667.0
1736,Jose Reyes,32,638,2895,2652,396,785,146,34,44,242,148,32,204,279,31,2,18,19,21,0.2962,0.34440,0.42580,0.31580,0.12960,0.7702,371,49.3,553,8.78,16.6,2016,507500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14109,Albert Almora Jr.,26,489,1316,1224,160,332,63,4,28,134,4,4,67,227,33,5,10,10,6,0.2528,0.30100,0.37180,0.29660,0.11920,0.6724,141,-18.0,391,-4.20,2.0,2021,1250000.0
13329,Jake Lamb,29,465,1792,1541,237,362,82,15,74,267,14,8,217,451,37,19,0,15,19,0.2210,0.32040,0.40980,0.26680,0.18880,0.7300,237,21.4,457,5.56,5.0,2021,1000000.0
5486,Abraham Almonte,31,210,591,530,76,122,32,7,8,49,13,4,52,136,14,2,2,5,1,0.2114,0.30400,0.33800,0.27100,0.12660,0.6422,56,-15.0,362,-2.92,0.0,2021,990000.0
14145,Daniel Robertson,26,250,855,728,95,170,32,3,16,74,5,5,99,215,22,23,1,4,0,0.2535,0.35475,0.34225,0.34125,0.08825,0.6970,99,-5.9,396,-0.18,2.4,2021,900000.0


## natural log salaries

In [42]:
batting_aggDF.insert(len(batting_aggDF.columns), 'Salary_log',
         np.log(batting_aggDF['Salary']))

In [43]:
batting_aggDF

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,GDP,HBP,SH,SF,IBB,AVG,OBP,SLG,BABIP,ISO,OPS,wRC,wRAA,wRC+,WPA,WAR,Year_FA,Salary,Salary_log
IDfg,Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1177,Albert Pujols,31,772,3371,2823,543,914,195,3,195,571,48,18,482,310,118,29,0,37,153,0.3244,0.42280,0.60320,0.30600,0.27880,1.0258,675,277.2,830,29.50,35.5,2012,24000000.0,16.993564
1177,Albert Pujols,40,588,2492,2294,244,570,86,0,102,402,11,0,166,326,87,10,0,22,16,0.2444,0.29460,0.41580,0.24480,0.17120,0.7102,272,-28.9,441,1.33,-2.3,2021,570500.0,13.254268
4613,Prince Fielder,27,802,3500,2899,487,825,161,8,200,565,9,8,505,637,64,66,0,30,110,0.2846,0.39900,0.55280,0.29860,0.26800,0.9516,629,216.0,735,26.24,19.2,2012,23777778.0,16.984262
1736,Jose Reyes,28,614,2883,2616,434,776,140,59,48,238,214,55,235,283,30,4,16,12,35,0.2950,0.35440,0.44240,0.31600,0.14760,0.7968,394,53.7,566,4.12,20.4,2012,17666667.0,16.687190
1736,Jose Reyes,32,638,2895,2652,396,785,146,34,44,242,148,32,204,279,31,2,18,19,21,0.2962,0.34440,0.42580,0.31580,0.12960,0.7702,371,49.3,553,8.78,16.6,2016,507500.0,13.137252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14109,Albert Almora Jr.,26,489,1316,1224,160,332,63,4,28,134,4,4,67,227,33,5,10,10,6,0.2528,0.30100,0.37180,0.29660,0.11920,0.6724,141,-18.0,391,-4.20,2.0,2021,1250000.0,14.038654
13329,Jake Lamb,29,465,1792,1541,237,362,82,15,74,267,14,8,217,451,37,19,0,15,19,0.2210,0.32040,0.40980,0.26680,0.18880,0.7300,237,21.4,457,5.56,5.0,2021,1000000.0,13.815511
5486,Abraham Almonte,31,210,591,530,76,122,32,7,8,49,13,4,52,136,14,2,2,5,1,0.2114,0.30400,0.33800,0.27100,0.12660,0.6422,56,-15.0,362,-2.92,0.0,2021,990000.0,13.805460
14145,Daniel Robertson,26,250,855,728,95,170,32,3,16,74,5,5,99,215,22,23,1,4,0,0.2535,0.35475,0.34225,0.34125,0.08825,0.6970,99,-5.9,396,-0.18,2.4,2021,900000.0,13.710150
