In [1]:
### Improvements to Week 3
###    -DONE- Demarcating and not fitting borders
###       Not "done done", in that we still lose some of Paul Abbott's data because of pandas's bizarre shift behavior.
###    -DONE- Eliminating games outside 2001-2008
###    -DONE- Eliminating data loss (?)
###       No pitchers lost to age. No games lost to unrecorded pitches/strikes.


In [2]:
### Setting up the environment.

import numpy as np
import pandas as pd
import datetime

In [3]:
### Reading in the data.

#######################################################################################################################

# Part 1: Reading in performance/workload records from box score parser.

# The parser output has no header row, so the column names are supplied here.
# 'H2' is only a placeholder name for a column that is discarded right after loading.
allData = pd.read_csv('parse_012717afternoon.csv',
                      header=None, names=['Name', 'IP', 'H', 'H2', 'R',
                                          'ER', 'BB', 'SO', 'HR', 'ERA',
                                          'BF', 'Pit', 'Str', 'Ctct', 'StS',
                                          'StL', 'GB', 'FB', 'LD', 'Unk',
                                          'GSc', 'WPA', 'aLI',
                                          'RE24', 'GameDate', 'Temperature'])
# axis is passed by keyword: the positional form was removed in pandas 2.0.
allData = allData.drop('H2', axis=1)
for c in list(allData):
    if c == 'GameDate':
        # Get nicely formatted datetime data for each game (stored as YYYYMMDD).
        allData[c] = pd.to_datetime(allData[c], format='%Y%m%d')
    else:
        # Convert to numbers where possible; columns that cannot be parsed
        # (e.g. 'Name') are left untouched. This replaces errors='ignore',
        # which is deprecated in recent pandas releases.
        try:
            allData[c] = pd.to_numeric(allData[c])
        except (ValueError, TypeError):
            pass
# Sort by pitcher, then by date. Result is concatenated dataframes of each pitcher's career.
allData = allData.sort_values(['Name', 'GameDate'])
allData['interceptRow'] = np.ones(len(allData))
# Remove dates we won't be using. Injury DB only goes from 2001-2008.
# Timestamps are used for the bounds because comparing a datetime64 column with
# datetime.date objects is rejected by current pandas. The lower bound is
# inclusive so that the whole of 2001 is kept.
seasonStart = pd.Timestamp('2001-01-01')
seasonEnd = pd.Timestamp('2009-01-01')   # exclusive
inStudyWindow = (allData['GameDate'] >= seasonStart) & (allData['GameDate'] < seasonEnd)
allData = allData[inStudyWindow]
# Clean out undesired columns... things we aren't using in current analysis.
allData = allData.drop(labels=['R','HR','Ctct','StS','StL','FB','LD','Unk','GSc','WPA','aLI','RE24','Temperature'],axis=1)

In [4]:
#######################################################################################################################
# Part 2: Read in pitcher birthdays from db_age.csv and add to performance/workload records.

# db_age.csv has no header row: column 0 is matched against 'Name' and column 1
# is parsed as the birth date (unparseable dates become NaT).
ageDb = pd.read_csv('db_age.csv', header=None)
ageDb[1] = pd.to_datetime(ageDb[1], format='%Y-%m-%d', errors='coerce')
# Distinct pitcher IDs; kept as a notebook-level name because later cells use it.
pitcherList = allData['Name'].drop_duplicates()
# One birthday per pitcher ID. If an ID appears more than once, the first entry wins.
birthdayByPitcher = ageDb.drop_duplicates(subset=[0]).set_index(0)[1]
# Pitchers missing from db_age.csv get NaT here and therefore a NaN age below.
birthday = allData['Name'].map(birthdayByPitcher)
# Age in years on the day of the game. The day count is taken directly from the
# timedelta: the previous nanosecond arithmetic divided by 1e14, but a day is
# 8.64e13 ns, which made every age about 13.6% too small, and it turned missing
# birthdays into a large negative number instead of NaN.
allData['Age'] = (allData['GameDate'] - birthday).dt.days / 365.25
# To inspect the games with no known birthday:
#allData[allData['Age'].isnull()].to_csv('rejects.csv')

In [5]:
#######################################################################################################################

# Part 3: Calculate other pitcher-specific performance metrics.
#    ### notational notes as statistics multiply
#        prefixes
#        m - mean
#        c - career total (for quotients, sum numerator over sum denominator, for other quantities, sum)
#        f - fraction
#        r - recent
#        d - deviation from career average

# Career totals are computed once per pitcher with groupby/transform and
# broadcast back onto every game row. This replaces a loop that rebuilt a
# full-length boolean mask for every pitcher, and it no longer depends on
# pitcherList being defined by another cell.
careerGroups = allData.groupby('Name')
careerSum = careerGroups[['IP', 'ER', 'GB', 'BF', 'SO', 'Str', 'Pit']].transform('sum').astype(float)
# Number of game rows per pitcher (rows with a missing IP still count as games).
gamesPitched = careerGroups['Name'].transform('count')

# Series division yields NaN/inf when a career denominator is zero (for example
# when every pitch count is unrecorded); the plain-float division used before
# raised ZeroDivisionError and aborted the cell.
# NOTE(review): the IP values in the sample output (4.1, 5.2, 2.2, ...) look like
# box-score notation, where .1/.2 would mean thirds of an inning. If so, summing
# them as decimals slightly misstates innings -- confirm against the parser.
allData['mIP'] = careerSum['IP'] / gamesPitched
allData['cERA'] = 9 * careerSum['ER'] / careerSum['IP']
allData['cfGB'] = careerSum['GB'] / careerSum['BF']
allData['cfSO'] = careerSum['SO'] / careerSum['BF']
allData['cfStr'] = careerSum['Str'] / careerSum['Pit']

In [6]:
# Checkpoint: write allData as it stands at this point to CSV (the DataFrame
# index is written as well, since to_csv is called with its defaults).
allData.to_csv('db_mlb_project_postPitcherLabor2.csv')

In [7]:
# Preview the first 20 rows. A single-argument print(...) call prints the same
# text under Python 2 and Python 3, whereas the print statement is a syntax
# error on Python 3.
print(allData.head(20))


             Name   IP  H  R  ER  BB  SO   ERA  BF  Pit  Str  GB   GameDate  \
21891   abbotpa01  4.1  5  4   3   2   1  6.23  21   83   54   3 2001-04-28   
100284  abbotpa01  5.2  4  4   4   2   6  6.30  23   81   48   2 2001-05-04   
120514  abbotpa01  5.1  5  2   1   3   6  4.70  23   97   58   2 2001-05-11   
100292  abbotpa01  6.1  3  2   2   4  10  4.15  25  100   58   4 2001-05-16   
68018   abbotpa01  2.2  9  8   8   1   1  6.66  18   60   36   3 2001-05-22   
54399   abbotpa01  9.0  6  3   3   3   6  5.67  34  119   76  12 2001-05-28   
100310  abbotpa01  5.0  4  2   2   4   1  5.40  23   98   57   6 2001-06-02   
100320  abbotpa01  7.2  5  1   1   3   4  4.70  31  116   71   8 2001-06-08   
38581   abbotpa01  7.0  6  1   1   2   6  4.25  28   94   59   9 2001-06-14   
83207   abbotpa01  6.0  7  5   5   2   2  4.58  27  105   63   8 2001-06-19   
100330  abbotpa01  6.2  4  3   3   3   4  4.52  28  112   67   7 2001-06-24   
116264  abbotpa01  6.0  8  4   4   5   5  4.65  31  

In [8]:
### Part 4:
#   Set up a use matrix. This will indicate whether a row should be used in the calculation of the preceding statistics for rows below it.
gbVar = 30 #days
# Assumes no pitcher has pitched more than six times in the previous month,
# so six look-back rows are enough to cover the window.
# oK is allData shifted down by K rows: row i of oK holds the game K rows above
# row i in the (Name, GameDate)-sorted table.
o1, o2, o3, o4, o5, o6 = [allData.shift(k) for k in range(1, 7)]
# Earliest date (exclusive) that still counts as "recent" for each game.
windowStart = allData['GameDate'] - np.timedelta64(gbVar, 'D')
# uK is True where the game K rows above belongs to the same pitcher AND falls
# inside the look-back window. The first K rows of the table compare against
# NaN/NaT and therefore come out False.
u1, u2, u3, u4, u5, u6 = [(allData['Name'] == prev['Name']) & (windowStart < prev['GameDate'])
                          for prev in (o1, o2, o3, o4, o5, o6)]
# Sanity check that shifting preserved the shape. Formatting into one string
# keeps the printed text identical under Python 2 and Python 3.
print('%s %s %s' % (allData.shape, o1.shape, u1.shape))

(38861, 20) (38861, 20) (38861,)


In [9]:
# Recent ("r") statistics: the current game plus up to six preceding games by
# the same pitcher that fall inside the look-back window (masks u1..u6 over the
# shifted frames o1..o6).
_lookbackFrames = (o1, o2, o3, o4, o5, o6)
_lookbackMasks = (u1, u2, u3, u4, u5, u6)


def _recentTotal(col):
    """Return the windowed sum of `col`: this game plus every usable look-back game.

    Look-back values are zeroed with `.where(mask, 0)` rather than multiplied by
    the 0/1 mask. Multiplication gives NaN * 0 = NaN, which wiped out the
    statistic for the first six rows of the table (shift() pads them with NaN)
    and for any row whose *excluded* look-back game had a missing value.
    A missing value in a game that IS inside the window still propagates as NaN.
    """
    total = allData[col]
    for prev, use in zip(_lookbackFrames, _lookbackMasks):
        total = total + prev[col].where(use, 0)
    return total


allData['rIP'] = _recentTotal('IP')
allData['rBF'] = _recentTotal('BF')
allData['rPit'] = _recentTotal('Pit')
allData['rERA'] = 9 * _recentTotal('ER') / allData['rIP']
allData['rfGB'] = _recentTotal('GB') / allData['rBF']
allData['rfSO'] = _recentTotal('SO') / allData['rBF']
allData['rfStr'] = _recentTotal('Str') / allData['rPit']

# Deviation ("d") statistics: single-game and recent values expressed as a
# ratio to the pitcher's career value (mIP, cERA, cfGB, cfSO, cfStr).
allData['dIP'] = allData['IP'] / allData['mIP']
allData['drIP'] = allData['rIP'] / allData['mIP']
allData['dERA'] = allData['ERA'] / allData['cERA']
allData['drERA'] = allData['rERA'] / allData['cERA']
# Single-game fractions ("f") and their ratios to the career fractions.
allData['fGB'] = allData['GB'] / allData['BF']
allData['dfGB'] = allData['fGB'] / allData['cfGB']
allData['drfGB'] = allData['rfGB'] / allData['cfGB']
allData['fSO'] = allData['SO'] / allData['BF']
allData['dfSO'] = allData['fSO'] / allData['cfSO']
allData['drfSO'] = allData['rfSO'] / allData['cfSO']
allData['fStr'] = allData['Str'] / allData['Pit']
allData['dfStr'] = allData['fStr'] / allData['cfStr']
allData['drfStr'] = allData['rfStr'] / allData['cfStr']

In [None]:
#######################################################################################################################

# Part 5: Read in injury transfer dates from Disabled List transaction parser.

# db_dl_history.csv has no header row: column 0 is matched against 'Name' and
# column 1 is parsed as a YYYYMMDD date.
tarData = pd.read_csv('db_dl_history.csv', header=None)
tarData[1] = pd.to_datetime(tarData[1], format='%Y%m%d')
# targets: 0 by default, 1 for the game flagged ahead of a DL entry, -1 set below.
allData['targets'] = 0.0
for i in range(len(tarData)):
    dlPitch = tarData.iloc[i,0]
    dlDate = tarData.iloc[i,1]
    # Games by this pitcher strictly before the DL date, in chronological order
    # (allData is sorted by Name, then GameDate). Exact equality is used on the
    # ID: str.match treated it as a regex prefix and relied on the as_indexer
    # keyword, which newer pandas no longer accepts.
    priorGames = allData.index[(allData['Name'] == dlPitch) & (allData['GameDate'] < dlDate)]
    if len(priorGames) > 1:
        # Flag the second-to-last appearance before the DL date.
        allData.loc[priorGames[-2],'targets'] = 1
    else:
        # Fewer than two prior games in the table: report the unmatched DL record.
        print(tarData.iloc[i,:])

#   Set up a use matrix. This will indicate whether a row should be used in the calculation of the preceding statistics for rows below it.
gbVar = 31 #days
# We will demand that the pitcher pitches five times in the next month. Possibly a higher bar than average, but we want clear targets.

# Look AHEAD five rows: row i of `o` holds the game five rows below row i.
o = allData.shift(-5)
# The mask is built from the look-ahead frame `o`. It was previously built from
# o1 (the one-row look-BACK frame), which left `o` unused and can never satisfy
# the date test on a date-sorted table.
u = (allData['Name'] == o['Name'])&((allData['GameDate']+np.timedelta64(gbVar,'D'))<o['GameDate'])
# NOTE(review): as written this marks rows whose fifth-next game is MORE than
# gbVar days away, and it can overwrite a 1 set by the loop above. Confirm that
# -1 is meant to mean "not a clear target" and that injury labels should lose.
# Boolean-mask + column-label assignment needs .loc; .iloc rejects both.
allData.loc[u,'targets'] = -1

In [11]:
# Write the current allData table (index included, per to_csv defaults) to the
# week-3 output CSV.
allData.to_csv('db_mlb_project_weekp3.csv')

In [14]:
# Spot check: list every game date recorded for one pitcher. The single-argument
# print(...) call behaves the same under Python 2 and Python 3.
print(allData.loc[allData['Name']=='alvarwi01','GameDate'])

74746    2002-04-06
112054   2002-04-14
112094   2002-05-31
120700   2002-06-05
112106   2002-06-10
46792    2002-06-16
38749    2002-06-22
112118   2002-06-28
116440   2002-07-04
112130   2002-07-14
96420    2003-07-08
38933    2003-07-13
59064    2003-08-09
47016    2003-08-13
59070    2003-08-20
50526    2003-08-28
59088    2003-09-02
38975    2003-09-07
59092    2003-09-12
59102    2003-09-18
96478    2003-09-23
105036   2003-09-28
92235    2004-05-07
59142    2004-05-12
87949    2004-05-18
8494     2004-05-23
59152    2004-05-28
59188    2004-07-05
59198    2004-07-10
50644    2004-07-19
59208    2004-07-24
39095    2004-07-29
31045    2004-08-12
59226    2004-08-17
59236    2004-08-22
79457    2004-08-27
59254    2004-09-15
105250   2005-05-25
59318    2005-05-30
Name: GameDate, dtype: datetime64[ns]
