In [None]:
### Improvements to Week 2
###    DONE--- parsing temperature
###    --- doing extensive row-wise aggregation of pitcher statistics
###           (computationally expensive)
###           1. whole-career statistics for clustering pitchers by type
###           2. bunch-of-recent-game statistics for a more comprehensive look at recent performance/workload


In [1]:
### Setting up the environment.

import numpy as np
import pandas as pd

from datetime import timedelta

In [2]:
### Reading in the data.

#######################################################################################################################

# Part 1: Reading in performance/workload records from box score parser.

allData = pd.read_csv('parse_012717afternoon.csv', \
                      header=None, names = ['Name', 'IP', 'H', 'H2', 'R', \
                                           'ER', 'BB', 'SO', 'HR', 'ERA', \
                                           'BF', 'Pit', 'Str', 'Ctct', 'StS', \
                                           'StL', 'GB', 'FB', 'LD', 'Unk', \
                                           'GSc', 'WPA', 'aLI', \
                                           'RE24', 'GameDate', 'Temperature'])
allData = allData.drop('H2', 1)
for c in list(allData):
    # Get nicely formatted datetime data for each game. For pitcherFullParse, this is the 24th column.
    if c != 'GameDate':
        allData[c] = pd.to_numeric(allData[c], errors='ignore')
    else:
        allData[c] = pd.to_datetime(allData[c], format='%Y%m%d')
# Sort by pitcher, then by date. Result is concatenated dataframes of each pitcher's career.
allData = allData.sort_values(['Name', 'GameDate'])
allData['interceptRow'] = np.ones(len(allData))

In [3]:
#######################################################################################################################

# Part 2: Read in pitcher birthdays from age_db.csv and add to performance/workload records.
#    1/30/17 - Now includes calculating several pitcher-specific features.

ageDb = pd.read_csv('db_age.csv', header=None)
ageDb[1] = pd.to_datetime(ageDb[1], format='%Y-%m-%d', errors='coerce')
allData['birthday'] = pd.NaT #pd.Series(np.zeros(len(allData)), index=allData.index)''
pitcherList = allData['Name'].drop_duplicates()
for pitcher in pitcherList:
    if sum(ageDb[0]==pitcher) > 0:
    ########## Age Portion    
        age = ageDb.loc[ageDb[0]==pitcher,1].values[0]
        pl = allData['Name']==pitcher
        allData.loc[pl,'birthday'] = pd.Series(age, index=pl.index)
    ########## Other personal statistics.
        gbPerc = float(allData.loc[pl,'GB'].sum()) / float(allData.loc[pl,'BF'].sum())
        KpBF = float(allData.loc[pl,'SO'].sum()) / float(allData.loc[pl,'BF'].sum())
        pl2 = np.logical_and(allData['Str']!=np.int64(-1), allData['Name']==pitcher)
        paIP = float(allData.loc[pl,'IP'].sum()) / sum(pl)
        paERA = float(allData.loc[pl,'ERA'].sum()) / sum(pl)
        try:
            sPerc = float(allData.loc[pl2,'Str'].sum()) / float(allData.loc[pl2,'Pit'].sum())
        except:
            sPerc = 0
        allData.loc[pl,'carGbPerc'] = pd.Series(gbPerc, index=pl.index)
        allData.loc[pl,'carKpBF'] = pd.Series(KpBF, index=pl.index)
        allData.loc[pl,'carSPerc'] = pd.Series(sPerc, index=pl.index)
        allData.loc[pl,'paIP'] = pd.Series(paIP, index=pl.index)
        allData.loc[pl,'paERA'] = pd.Series(paERA, index=pl.index)
allData['ages'] = allData['GameDate'] - allData['birthday']
allData['ages'] = pd.to_numeric(allData['ages'].values) / 100000000000000 / 365.25
allData[allData.birthday.isnull()].to_csv('rejects.csv')

In [5]:
# Part 3: Engineered non-personal statistics.
#    N.B. Requires cleaning... currently not handling borders between pitchers.

allData = allData.set_index(pd.Series(range(len(allData))))

allData[['devIP','devERA','devSPerc','devKpBF','devGbPerc']] = \
    allData[['devIP','devERA','devSPerc','devKpBF','devGbPerc']] / \
    allData[['paIP','paERA','sPerc','KpBF','gbPerc']]


allData[['sIP']] = allData[['IP']] + allData[['IP']].shift(1) + allData[['IP']].shift(2) + \
    allData[['IP']].shift(3) + allData[['IP']].shift(4) + allData[['IP']].shift(5) + allData[['IP']].shift(6)
    
allData[['sERA','sSPerc','sKpBF','sGbPerc']]
    ( allData[['ER','Str','SO','GB']] + \
    allData[['ER','Str','SO','GB']].shift(1) + \
    allData[['ER','Str','SO','GB']].shift(2) + \
    allData[['ER','Str','SO','GB']].shift(3) + \
    allData[['ER','Str','SO','GB']].shift(4) + \
    allData[['ER','Str','SO','GB']].shift(5) + \
    allData[['ER','Str','SO','GB']].shift(6) ) / \
    ( allData[['IP','Pit','BF','BF']] + \
    allData[['IP','Pit','BF','BF']].shift(1) + \
    allData[['IP','Pit','BF','BF']].shift(2) + \
    allData[['IP','Pit','BF','BF']].shift(3) + \
    allData[['IP','Pit','BF','BF']].shift(4) + \
    allData[['IP','Pit','BF','BF']].shift(5) + \
    allData[['IP','Pit','BF','BF']].shift(6) )

allData[['devsIP','devsERA','devsSPerc','devsKpBF','devsGbPerc']] = \
    allData[['sIP','sERA','sSPerc','sKpBF','sGbPerc']] / \
    allData[['paIP','paERA','sPerc','KpBF','gbPerc']]

print allData.tail(25)

            Name   IP   H  R  ER  BB  SO  HR    ERA  BF     ...          KpBF  \
124924  zitoba01  7.0   5  0   0   1   4   0   3.42  26     ...      0.171496   
124925  zitoba01  3.2   6  6   1   1   4   0   3.29  19     ...      0.171496   
124926  zitoba01  5.0   6  1   1   4   1   0   3.06  22     ...      0.171496   
124927  zitoba01  7.0   4  1   1   0   3   1   2.75  24     ...      0.171496   
124928  zitoba01  5.2  12  8   5   2   2   0   3.40  29     ...      0.171496   
124929  zitoba01  5.2  11  5   5   2   4   0   3.91  29     ...      0.171496   
124930  zitoba01  6.0   7  4   4   1   3   1   4.13  27     ...      0.171496   
124931  zitoba01  6.0   3  1   1   6   5   0   3.88  26     ...      0.171496   
124932  zitoba01  6.0   7  4   4   3   0   0   4.06  27     ...      0.171496   
124933  zitoba01  4.2  11  8   8   1   4   0   4.79  24     ...      0.171496   
124934  zitoba01  5.2   6  2   2   3   8   0   4.67  26     ...      0.171496   
124935  zitoba01  7.0   6  1

In [None]:
#######################################################################################################################

# Part 4: Read in injury transfer dates from Disabled List transaction parser.

tarData = pd.read_csv('db_dl_history.csv', header=None)
tarData[1] = pd.to_datetime(tarData[1], format='%Y%m%d')
allData.loc[:,'targets'] = pd.Series(np.zeros(len(allData)), index=allData.index)
for i in range(len(tarData)):
    dlPitch = tarData.iloc[i,0]
    playerSubSet = allData['Name'].str.match(dlPitch,as_indexer=True)
    tData = allData.loc[playerSubSet.index[playerSubSet.values],'GameDate']
    tData2 = tData < tarData.iloc[i,1]
    tData3 = tData2[tData2.values == True]
    if sum(tData3) > 1:
        allData.loc[tData3.index[-2],'targets'] = 1
    else:
        x = True

In [None]:
allData.to_csv('db_mlb_project_week2.csv')