In [None]:
### Improvements to Week 2
###    --- parsing temperature
###    --- doing extensive row-wise aggregation of pitcher statistics
###           (computationally expensive)
###           1. whole-career statistics for clustering pitchers by type
###           2. bunch-of-recent-game statistics for a more comprehensive look at recent performance/workload


In [1]:
### Setting up the environment.

import numpy as np
import pandas as pd

from datetime import timedelta

In [2]:
### Reading in the data.

#######################################################################################################################

# Part 1: Reading in performance/workload records from box score parser.

# The CSV has no header row, so the column names are supplied here in file order.
boxScoreColumns = ['Name', 'IP', 'H', 'H2', 'R',
                   'ER', 'BB', 'SO', 'HR', 'ERA',
                   'BF', 'Pit', 'Str', 'Ctct', 'StS',
                   'StL', 'GB', 'FB', 'LD', 'Unk',
                   'GSc', 'WPA', 'aLI',
                   'RE24', 'GameDate']
allData = pd.read_csv('db_box_scores_parsed.csv', header=None, names=boxScoreColumns)

# 'H2' is discarded right after loading. Use the keyword form: passing the axis
# positionally (drop('H2', 1)) is rejected by pandas 2.x.
allData = allData.drop(columns='H2')

for c in list(allData):
    if c == 'GameDate':
        # Game dates are stored as YYYYMMDD; parse them into real datetimes.
        allData[c] = pd.to_datetime(allData[c], format='%Y%m%d')
    else:
        # Convert to a numeric dtype where the whole column parses; otherwise leave the
        # column untouched (e.g. 'Name'). This is what errors='ignore' used to do before
        # that option was deprecated.
        try:
            allData[c] = pd.to_numeric(allData[c])
        except (ValueError, TypeError):
            pass

# Sort by pitcher, then by date. Result is concatenated dataframes of each pitcher's career.
allData = allData.sort_values(['Name', 'GameDate'])

# Constant column of ones (float), one per row.
allData['interceptRow'] = 1.0

In [3]:
#######################################################################################################################

# Part 2: Read in pitcher birthdays from db_age.csv and add to performance/workload records.

# db_age.csv has no header: column 0 is the pitcher name, column 1 the birthday (YYYY-MM-DD).
ageDb = pd.read_csv('db_age.csv', header=None)
# Unparseable birthdays become NaT instead of raising.
ageDb[1] = pd.to_datetime(ageDb[1], format='%Y-%m-%d', errors='coerce')

# Build a name -> birthday lookup. If a name is listed more than once, the first row wins.
birthdayByName = (
    ageDb.dropna(subset=[0])
         .drop_duplicates(subset=0, keep='first')
         .set_index(0)[1]
)

# One vectorised lookup instead of re-scanning both tables for every pitcher.
# Pitchers with no entry in db_age.csv get NaT.
allData['birthday'] = pd.to_datetime(allData['Name'].map(birthdayByName))

# Age in years on the day of each game.
# NOTE(review): the previous version divided the nanosecond timedelta by 1e14 rather than
# 8.64e13 (ns per day), so ages came out ~13.6% too small, and rows with no birthday were
# cast to a huge negative number instead of NaN. Anything fitted on the old 'ages' values
# will need to be refit.
allData['ages'] = (allData['GameDate'] - allData['birthday']).dt.days / 365.25

# Dump the rows whose pitcher has no known birthday so they can be inspected.
allData[allData['birthday'].isnull()].to_csv('rejects.csv')

In [4]:
#######################################################################################################################

# Part 3: Read in injury transfer dates from Disabled List transaction parser.

# db_dl_history.csv has no header: column 0 is the pitcher name, column 1 the DL date (YYYYMMDD).
tarData = pd.read_csv('db_dl_history.csv', header=None)
tarData[1] = pd.to_datetime(tarData[1], format='%Y%m%d')

# 'targets' is 0.0 everywhere except the rows flagged below.
allData['targets'] = 0.0

pitcherNames = allData['Name']
gameDates = allData['GameDate']
for dlPitch, dlDate in zip(tarData[0], tarData[1]):
    # Exact name comparison, consistent with the birthday lookup. The previous
    # str.match(..., as_indexer=True) call used a keyword that pandas has since removed,
    # and it treated the name as a regex anchored only at the start, so '.' acted as a
    # wildcard and one name could also match longer names that begin with it.
    priorGames = allData.index[(pitcherNames == dlPitch) & (gameDates < dlDate)]

    # allData is sorted by pitcher and date, so priorGames is in chronological order.
    # Flag the second-to-last appearance before the DL date; a pitcher with fewer than
    # two earlier games gets no flag for this DL stint.
    if len(priorGames) > 1:
        allData.loc[priorGames[-2], 'targets'] = 1

In [5]:
# Write the combined table (box scores, birthday/age, DL targets) to disk, index included.
week2OutputFile = 'db_mlb_project_week2.csv'
allData.to_csv(week2OutputFile)