In [1]:
### Setting up the environment.

import numpy as np
import pandas as pd

from datetime import timedelta

In [2]:
### Reading in the data.

#######################################################################################################################

# Part 1: Reading in performance/workload records from box score parser.

allData = pd.read_csv('db_box_scores_parsed.csv', \
                      header=None, names = ['Name', 'IP', 'H', 'H2', 'R', \
                                           'ER', 'BB', 'SO', 'HR', 'ERA', \
                                           'BF', 'Pit', 'Str', 'Ctct', 'StS', \
                                           'StL', 'GB', 'FB', 'LD', 'Unk', \
                                           'GSc', 'WPA', 'aLI', \
                                           'RE24', 'GameDate'])
allData = allData.drop('H2', 1)
for c in list(allData):
    # Get nicely formatted datetime data for each game. For pitcherFullParse, this is the 24th column.
    if c != 'GameDate':
        allData[c] = pd.to_numeric(allData[c], errors='ignore')
    else:
        allData[c] = pd.to_datetime(allData[c], format='%Y%m%d')
# Sort by pitcher, then by date. Result is concatenated dataframes of each pitcher's career.
allData = allData.sort_values(['Name', 'GameDate'])
allData['interceptRow'] = np.ones(len(allData))

In [3]:
#######################################################################################################################

# Part 2: Read in pitcher birthdays from age_db.csv and add to performance/workload records.

ageDb = pd.read_csv('db_age.csv', header=None)
ageDb[1] = pd.to_datetime(ageDb[1], format='%Y-%m-%d', errors='coerce')
allData['birthday'] = pd.NaT #pd.Series(np.zeros(len(allData)), index=allData.index)''
pitcherList = allData['Name'].drop_duplicates()
for pitcher in pitcherList:
    if sum(ageDb[0]==pitcher) > 0:
        age = ageDb.loc[ageDb[0]==pitcher,1].values[0]
        pl = allData['Name']==pitcher
        allData.loc[pl,'birthday'] = pd.Series(age, index=pl.index)
allData['ages'] = allData['GameDate'] - allData['birthday']
allData['ages'] = pd.to_numeric(allData['ages'].values) / 100000000000000 / 365.25
allData[allData.birthday.isnull()].to_csv('rejects.csv')

In [4]:
#######################################################################################################################

# Part 3: Read in injury transfer dates from Disabled List transaction parser.

tarData = pd.read_csv('db_dl_history.csv', header=None)
tarData[1] = pd.to_datetime(tarData[1], format='%Y%m%d')
allData.loc[:,'targets'] = pd.Series(np.zeros(len(allData)), index=allData.index)
for i in range(len(tarData)):
    dlPitch = tarData.iloc[i,0]
    playerSubSet = allData['Name'].str.match(dlPitch,as_indexer=True)
    tData = allData.loc[playerSubSet.index[playerSubSet.values],'GameDate']
    tData2 = tData < tarData.iloc[i,1]
    tData3 = tData2[tData2.values == True]
    if sum(tData3) > 1:
        allData.loc[tData3.index[-2],'targets'] = 1
    else:
        x = True

            Name   IP  H  H2  R  ER  BB  SO  HR   ERA   ...     Unk  GSc  \
80759   abadfe01  4.0  4   4  1   1   4   2   0  3.62   ...       0   48   
51979   abadfe01  4.1  9   9  6   6   2   1   2  4.83   ...       0   20   
93642   abadfe01  4.0  6   6  5   4   3   3   0  5.30   ...       0   32   
51989   abadfe01  3.1  4   4  1   1   3   2   0  5.08   ...       0   47   
110755  abadfe01  5.0  6   6  3   3   0   6   0  5.11   ...       0   49   

          WPA   aLI  RE24   GameDate  interceptRow   birthday       ages  \
80759   0.054  1.33   0.8 2012-08-25           1.0 1985-12-17  23.058179   
51979  -0.449  1.05  -3.9 2012-08-31           1.0 1985-12-17  23.071869   
93642  -0.310  0.75  -3.2 2012-09-05           1.0 1985-12-17  23.082820   
51989  -0.057  1.08  -0.5 2012-09-12           1.0 1985-12-17  23.099247   
110755 -0.096  0.86  -0.7 2012-09-18           1.0 1985-12-17  23.115674   

        targets  
80759       0.0  
51979       0.0  
93642       0.0  
51989       0.

In [5]:
allData.to_csv('db_mlb_project_week2.csv')