### Yankees/Mets home game schedules 2014-2016
#### Data processing scripts
Source data: http://retrosheet.org/schedule/index.html

Assumptions used to get game start and end times:
- Typical day game start time: NYY - 1:05pm, NYM - 1:10pm 
- Typical night game start time: NYY - 7:05pm, NYM - 7:10pm 
- Typical Saturday afternoon Mets start time: 4:05pm
- Average length of a game: 2 hours 56 minutes (the 2015 average, applied to every season processed here)

Team codes used in source data (and output CSVs): 
- Yankees - NYA
- Mets - NYN

In [122]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta

In [123]:
# Column names for the header-less schedule files, in file order.
fields = [
    'Date',
    'Double header code',
    'Day of week',
    'Visiting team',
    'Visitor league',
    'Visitor game number',
    'Home team',
    'Home team league',
    'Home team game number',
    'Time of day code',
    'Postponement/cancellation',
    'Date of makeup',
]

# Every source column except 'Home team' is removed from the final output.
drop_fields = [name for name in fields if name != 'Home team']

In [124]:
def setTimes(row):
    """Return the assumed start time of a home game as an 'HHMM' string.

    `row` is one schedule record, indexable by 'Home team', 'Day of week'
    and 'Time of day code'.

    Mets (NYN): day games on Saturday -> '1605', other day games -> '1310',
    anything else -> '1910'.
    Yankees (NYA): day games -> '1305', anything else -> '1905'.
    Any other home team yields None.
    """
    home = row['Home team']

    if home == 'NYN':
        saturday = row['Day of week'] == 'Sat'
        is_day = row['Time of day code'] == 'D'
        if not is_day:
            return '1910'
        return '1605' if saturday else '1310'

    if home == 'NYA':
        is_day = row['Time of day code'] == 'D'
        return '1305' if is_day else '1905'

    # Not a New York home game: no assumed start time.
    return None

In [127]:
def getSchedules(fileName):
    """Build the Mets/Yankees home-game time table from one schedule file.

    Parameters
    ----------
    fileName : str
        Path to a header-less, comma-separated schedule file whose columns
        match the module-level `fields` list.

    Returns
    -------
    pandas.DataFrame
        One row per game whose home team is 'NYN' or 'NYA' and whose
        'Postponement/cancellation' value is empty, re-indexed from 0.
        Columns: 'Home team', 'starttime', 'endtime', 'startwindow_start',
        'startwindow_end', 'endwindow_start', 'endwindow_end'
        (the six time columns are datetimes).

    Notes
    -----
    Start times come from `setTimes`; the end time is the start time plus a
    fixed game length of 2 h 56 min.  The start window runs from 1 h 30 min
    before to 30 min after the start; the end window runs from 30 min before
    to 1 h 30 min after the end.
    """
    # Keyword arguments make the durations readable; the values are the same
    # as the original positional timedelta(0,0,0,0,<minutes>,<hours>) calls.
    game_length = timedelta(hours=2, minutes=56)
    wide_margin = timedelta(hours=1, minutes=30)
    narrow_margin = timedelta(minutes=30)

    sched = pd.read_csv(fileName, header=None)
    sched.columns = fields

    home_in_ny = sched['Home team'].isin(['NYN', 'NYA'])
    as_scheduled = sched['Postponement/cancellation'].isnull()
    # Explicit copy: columns are added below, and assigning onto a filtered
    # slice is what triggers pandas' SettingWithCopyWarning.
    team_sched = sched[home_in_ny & as_scheduled].copy().reset_index(drop=True)

    # set game start times for each team
    team_sched['time'] = team_sched.apply(setTimes, axis=1)

    # create start and end times as datetime fields
    # astype(str) keeps the concatenation valid whether read_csv inferred
    # 'Date' as text or as an integer.
    team_sched['starttime'] = pd.to_datetime(
        team_sched['Date'].astype(str) + team_sched['time'],
        format='%Y%m%d%H%M',
    )
    team_sched['endtime'] = team_sched['starttime'] + game_length

    # get start and end time windows
    team_sched['startwindow_start'] = team_sched['starttime'] - wide_margin
    team_sched['startwindow_end'] = team_sched['starttime'] + narrow_margin
    team_sched['endwindow_start'] = team_sched['endtime'] - narrow_margin
    team_sched['endwindow_end'] = team_sched['endtime'] + wide_margin

    # Drop the raw source columns and the helper 'time' column in one step.
    return team_sched.drop(drop_fields + ['time'], axis=1)

In [128]:
files = ['2014SKED.TXT', '2015SKED.TXT', '2016SKED.TXT']
output_columns = ['Home team', 'starttime', 'endtime', 'startwindow_start',
                  'startwindow_end', 'endwindow_start', 'endwindow_end']

# Process each season and combine the results in a single concat.
# DataFrame.append (used previously) was removed in pandas 2.0, and growing a
# frame inside a loop re-copies all earlier rows on every iteration.
# ignore_index=True gives the combined table one continuous 0..n-1 index
# instead of an index that restarts at 0 for every season.
output = pd.concat(
    [getSchedules(sked_file) for sked_file in files],
    ignore_index=True,
)[output_columns]

output.to_csv('combined_schedules.csv')
output.head()

Unnamed: 0,Home team,starttime,endtime,startwindow_start,startwindow_end,endwindow_start,endwindow_end
0,NYN,2014-03-31 13:10:00,2014-03-31 16:06:00,2014-03-31 11:40:00,2014-03-31 13:40:00,2014-03-31 15:36:00,2014-03-31 17:36:00
1,NYN,2014-04-02 19:10:00,2014-04-02 22:06:00,2014-04-02 17:40:00,2014-04-02 19:40:00,2014-04-02 21:36:00,2014-04-02 23:36:00
2,NYN,2014-04-03 13:10:00,2014-04-03 16:06:00,2014-04-03 11:40:00,2014-04-03 13:40:00,2014-04-03 15:36:00,2014-04-03 17:36:00
3,NYN,2014-04-04 19:10:00,2014-04-04 22:06:00,2014-04-04 17:40:00,2014-04-04 19:40:00,2014-04-04 21:36:00,2014-04-04 23:36:00
4,NYN,2014-04-05 16:05:00,2014-04-05 19:01:00,2014-04-05 14:35:00,2014-04-05 16:35:00,2014-04-05 18:31:00,2014-04-05 20:31:00
