In [None]:
### Improvements to Week 2
###    DONE--- parsing temperature
###    --- doing extensive row-wise aggregation of pitcher statistics
###           (computationally expensive)
###           1. whole-career statistics for clustering pitchers by type
###           2. bunch-of-recent-game statistics for a more comprehensive look at recent performance/workload


In [1]:
### Setting up the environment.

import numpy as np
import pandas as pd

from datetime import timedelta

In [2]:
### Reading in the data.

#######################################################################################################################

# Part 1: Reading in performance/workload records from box score parser.

allData = pd.read_csv('parse_012717afternoon.csv', \
                      header=None, names = ['Name', 'IP', 'H', 'H2', 'R', \
                                           'ER', 'BB', 'SO', 'HR', 'ERA', \
                                           'BF', 'Pit', 'Str', 'Ctct', 'StS', \
                                           'StL', 'GB', 'FB', 'LD', 'Unk', \
                                           'GSc', 'WPA', 'aLI', \
                                           'RE24', 'GameDate', 'Temperature'])
allData = allData.drop('H2', 1)
for c in list(allData):
    # Get nicely formatted datetime data for each game. For pitcherFullParse, this is the 24th column.
    if c != 'GameDate':
        allData[c] = pd.to_numeric(allData[c], errors='ignore')
    else:
        allData[c] = pd.to_datetime(allData[c], format='%Y%m%d')
# Sort by pitcher, then by date. Result is concatenated dataframes of each pitcher's career.
allData = allData.sort_values(['Name', 'GameDate'])
allData['interceptRow'] = np.ones(len(allData))

In [3]:
#######################################################################################################################

# Part 2: Read in pitcher birthdays from age_db.csv and add to performance/workload records.
#    1/30/17 - Now includes calculating several pitcher-specific features.

ageDb = pd.read_csv('db_age.csv', header=None)
ageDb[1] = pd.to_datetime(ageDb[1], format='%Y-%m-%d', errors='coerce')
allData['birthday'] = pd.NaT #pd.Series(np.zeros(len(allData)), index=allData.index)''
pitcherList = allData['Name'].drop_duplicates()
for pitcher in pitcherList:
    if sum(ageDb[0]==pitcher) > 0:
    ########## Age Portion    
        age = ageDb.loc[ageDb[0]==pitcher,1].values[0]
        pl = allData['Name']==pitcher
        allData.loc[pl,'birthday'] = pd.Series(age, index=pl.index)
    ########## Other personal statistics.
        gbPerc = float(allData.loc[pl,'GB'].sum()) / float(allData.loc[pl,'BF'].sum())
        KpBF = float(allData.loc[pl,'SO'].sum()) / float(allData.loc[pl,'BF'].sum())
        pl2 = np.logical_and(allData['Str']!=np.int64(-1), allData['Name']==pitcher)
        paIP = float(allData.loc[pl,'IP'].sum()) / sum(pl)
        paERA = float(allData.loc[pl,'ERA'].sum()) / sum(pl)
        try:
            sPerc = float(allData.loc[pl2,'Str'].sum()) / float(allData.loc[pl2,'Pit'].sum())
        except:
            sPerc = 0
        allData.loc[pl,'carGbPerc'] = pd.Series(gbPerc, index=pl.index)
        allData.loc[pl,'carKpBF'] = pd.Series(KpBF, index=pl.index)
        allData.loc[pl,'carSPerc'] = pd.Series(sPerc, index=pl.index)
        allData.loc[pl,'paIP'] = pd.Series(paIP, index=pl.index)
        allData.loc[pl,'paERA'] = pd.Series(paERA, index=pl.index)
allData['ages'] = allData['GameDate'] - allData['birthday']
allData['ages'] = pd.to_numeric(allData['ages'].values) / 100000000000000 / 365.25
allData[allData.birthday.isnull()].to_csv('rejects.csv')


In [4]:
allData.to_csv('db_mlb_project_postPitcherLabor.csv')

In [41]:
# Part 3: Engineered non-personal statistics.
#    N.B. Requires cleaning... currently not handling borders between pitchers.

allData = allData.set_index(pd.Series(range(len(allData))))

allData['sPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['KpBF'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['gbPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData[['sPerc','KpBF','gbPerc']] = allData[['Str','SO','GB']].astype(float).values / \
    allData[['Pit','BF','BF']].astype(float).values

allData['devIP'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devERA'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devSPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devKpBF'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devGbPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData[['devIP','devERA','devSPerc','devKpBF','devGbPerc']] = \
    allData[['IP','ERA','sPerc','KpBF','gbPerc']].values / \
    allData[['paIP','paERA','carSPerc','carKpBF','carGbPerc']].values

allData['sIP'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData[['sIP']] = allData[['IP']] + allData[['IP']].shift(1) + allData[['IP']].shift(2) + \
    allData[['IP']].shift(3) + allData[['IP']].shift(4) + allData[['IP']].shift(5) + allData[['IP']].shift(6)

allData['sERA'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['sSPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['sKpBF'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['sGbPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData[['sERA','sSPerc','sKpBF','sGbPerc']] = \
    ( allData[['ER','Str','SO','GB']].values + \
    allData[['ER','Str','SO','GB']].shift(1).values + \
    allData[['ER','Str','SO','GB']].shift(2).values + \
    allData[['ER','Str','SO','GB']].shift(3).values + \
    allData[['ER','Str','SO','GB']].shift(4).values + \
    allData[['ER','Str','SO','GB']].shift(5).values + \
    allData[['ER','Str','SO','GB']].shift(6).values ) / \
    ( allData[['IP','Pit','BF','BF']].values + \
    allData[['IP','Pit','BF','BF']].shift(1).values + \
    allData[['IP','Pit','BF','BF']].shift(2).values + \
    allData[['IP','Pit','BF','BF']].shift(3).values + \
    allData[['IP','Pit','BF','BF']].shift(4).values + \
    allData[['IP','Pit','BF','BF']].shift(5).values + \
    allData[['IP','Pit','BF','BF']].shift(6).values )
allData[['sERA']] *=9
allData[['sIP']] /=7

allData['devsIP'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devsERA'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devsSPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devsKpBF'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData['devsGbPerc'] = pd.Series(np.zeros(len(allData)), index=allData.index)
allData[['devsIP','devsERA','devsSPerc','devsKpBF','devsGbPerc']] = \
    allData[['sIP','sERA','sSPerc','sKpBF','sGbPerc']].values / \
    allData[['paIP','paERA','carSPerc','carKpBF','carGbPerc']].values

#print allData[['gbPerc','sGbPerc','devGbPerc','devsGbPerc']].head(25)

      gbPerc   sGbPerc  devGbPerc  devsGbPerc
0   0.263158       NaN   0.961887         NaN
1   0.304348       NaN   1.112444         NaN
2   0.333333       NaN   1.218391         NaN
3   0.235294       NaN   0.860041         NaN
4   0.200000       NaN   0.731034         NaN
5   0.333333       NaN   1.218391         NaN
6   0.476190  0.307087   1.119854    0.722174
7   0.280000  0.308271   0.658474    0.724958
8   0.533333  0.357143   1.254236    0.839890
9   0.413793  0.371622   0.973114    0.873940
10  0.411765  0.391892   0.968344    0.921610
11  0.371429  0.411043   0.873486    0.966647
12  0.307692  0.405882   0.723598    0.954511
13  0.480000  0.408046   1.128813    0.959599
14  0.344828  0.415730   0.810929    0.977670
15  0.617647  0.434066   1.452516    1.020790
16  0.440000  0.438202   1.034745    1.030517
17  0.384615  0.433155   0.904497    1.018648
18  0.423077  0.443820   0.994947    1.043729
19  0.419355  0.448980   0.986194    1.055862
20  0.409091  0.440415   0.962056 

In [42]:
#######################################################################################################################

# Part 4: Read in injury transfer dates from Disabled List transaction parser.

tarData = pd.read_csv('db_dl_history.csv', header=None)
tarData[1] = pd.to_datetime(tarData[1], format='%Y%m%d')
allData.loc[:,'targets'] = pd.Series(np.zeros(len(allData)), index=allData.index)
for i in range(len(tarData)):
    dlPitch = tarData.iloc[i,0]
    playerSubSet = allData['Name'].str.match(dlPitch,as_indexer=True)
    tData = allData.loc[playerSubSet.index[playerSubSet.values],'GameDate']
    tData2 = tData < tarData.iloc[i,1]
    tData3 = tData2[tData2.values == True]
    if sum(tData3) > 1:
        allData.loc[tData3.index[-2],'targets'] = 1
    else:
        x = True

In [43]:
allData.to_csv('db_mlb_project_week2.csv')