In [None]:
import datetime
import pandas as pd
import urllib2
import csv
import numpy as np
from matplotlib import pyplot as plt

In [None]:
def get_mta_df(source, original_format=True):
    """Load one MTA turnstile file from ``source`` into a DataFrame.

    ``original_format`` selects the parser: truthy uses ``get_mta_df_old``
    (the layout used before 10/18/14), falsy uses ``get_mta_df_new``.
    """
    loader = get_mta_df_old if original_format else get_mta_df_new
    return loader(source)

def get_mta_df_old(source):
    """Download and parse a turnstile file in the layout used prior to 10/18/14.

    Each CSV row starts with C/A, UNIT and SCP, followed by up to eight
    readings of five fields each (date, time, description, entries, exits).
    Every reading that parses cleanly becomes one output row.

    Parameters
    ----------
    source : str
        URL passed to ``urllib2.urlopen``.

    Returns
    -------
    pandas.DataFrame
        Columns C/A, UNIT, SCP, DATETIME, DESC, ENTRIES, EXITS.
    """
    import contextlib  # local import so this block does not depend on the import cell

    col_names = ['C/A', 'UNIT', 'SCP', 'DATETIME', 'DESC', 'ENTRIES', 'EXITS']
    readings_per_row = 8
    fields_per_reading = 5
    mta_list = []
    # closing() guarantees the HTTP response is released even if parsing fails.
    with contextlib.closing(urllib2.urlopen(source)) as raw_data:
        # NUL bytes are stripped because csv.reader rejects them.
        reader = csv.reader((raw_line.replace('\0', '') for raw_line in raw_data),
                            delimiter=",")
        for row in reader:
            for entry_num in range(readings_per_row):
                offset = entry_num * fields_per_reading
                try:
                    timestamp = datetime.datetime.strptime(
                        '{} {}'.format(row[3 + offset], row[4 + offset]),
                        '%m-%d-%y %H:%M:%S')
                    mta_list.append([row[0], row[1], row[2], timestamp,
                                     row[5 + offset],
                                     int(row[6 + offset]), int(row[7 + offset])])
                except (IndexError, ValueError):
                    # IndexError: the row holds fewer readings than the maximum.
                    # ValueError: malformed date/time or non-numeric counter.
                    # Skip that reading only; any other error should surface.
                    continue
    return pd.DataFrame(mta_list, columns=col_names)

def get_mta_df_new(source):
    """Read a turnstile file in the layout used after 10/18/14.

    The file's first line is skipped and the column names are supplied
    explicitly. A ``DATETIME`` column is added by combining the ``DATE``
    (``%m/%d/%Y``) and ``TIME`` (``%H:%M:%S``) text columns.

    Parameters
    ----------
    source : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The eleven file columns plus ``DATETIME``.
    """
    col_names = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
                 'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
    mta_df = pd.read_csv(source, sep=',', skiprows=1, header=None, names=col_names)
    # Vectorized parse: one to_datetime call instead of a Python-level
    # strptime per row.
    stamp_text = mta_df['DATE'].astype(str) + ' ' + mta_df['TIME'].astype(str)
    mta_df['DATETIME'] = pd.to_datetime(stamp_text, format='%m/%d/%Y %H:%M:%S')
    return mta_df

def get_mta_df_by_date(date):
    """Fetch the turnstile file whose name carries ``date`` (formatted ``%y%m%d``).

    Dates earlier than 2014-10-18 are parsed with the original layout;
    that date and later use the new layout.
    """
    is_old_layout = date < datetime.datetime(2014, 10, 18)
    file_stamp = date.strftime('%y%m%d')
    source_url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'.format(
        file_stamp)
    return get_mta_df(source_url, is_old_layout)

def get_mta_df_by_date_range(date, num_weeks):
    """Fetch ``num_weeks`` consecutive files, 7 days apart, starting at ``date``.

    Parameters
    ----------
    date : datetime.datetime
        Date of the first file.
    num_weeks : int
        Number of files to fetch; must be at least 1 because ``pd.concat``
        raises ``ValueError`` on an empty list.

    Returns
    -------
    pandas.DataFrame
        All files stacked, with a fresh 0..n-1 index so row labels are
        unique across files.
    """
    weekly_frames = [
        get_mta_df_by_date(date + datetime.timedelta(days=week * 7))
        for week in range(num_weeks)
    ]
    # ignore_index avoids repeating each file's own 0..k labels in the result.
    return pd.concat(weekly_frames, ignore_index=True)

def agg_by_station(target_df, stat_array):
    """Sum the ``stat_array`` columns per ``Station``.

    Station names are attached first via ``merge_station``.
    """
    per_station = merge_station(target_df).groupby('Station')
    return per_station[stat_array].sum()

def agg_by_station_date(target_df, stat_array):
    """Sum the ``stat_array`` columns per (``Station``, ``DATE``) pair.

    Station names are attached first via ``merge_station``; ``target_df``
    must already carry a ``DATE`` column.
    """
    per_station_day = merge_station(target_df).groupby(['Station', 'DATE'])
    return per_station_day[stat_array].sum()

def merge_station(target_df):
    """Attach station names to ``target_df``; delegates to ``merge_station_strict``."""
    merged_df = merge_station_strict(target_df)
    return merged_df

def get_yankee_schedule(source=None):
    """Load the Yankees home schedule and keep only night games.

    Parameters
    ----------
    source : str or file-like, optional
        CSV location accepted by ``pandas.read_csv``. Defaults to the
        Dropbox-hosted 2013 home schedule. The CSV must provide the columns
        ``Datetime`` (``%m/%d/%Y %H:%M``), ``D/N``, ``Opp`` and ``Attendance``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Datetime`` (parsed), ``Date`` (calendar date of
        ``Datetime``), ``Opp`` and ``Attendance`` for rows with ``D/N == 'N'``.
        The original row labels are kept.
    """
    if source is None:
        source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
    sched = pd.read_csv(source)
    # Restrict to night games; copy so the column assignments below write to
    # an independent frame rather than a slice of ``sched``.
    night_games = sched[sched['D/N'] == 'N'].copy()
    night_games['Datetime'] = pd.to_datetime(night_games['Datetime'],
                                             format='%m/%d/%Y %H:%M')
    night_games['Date'] = night_games['Datetime'].dt.date
    return night_games[['Datetime', 'Date', 'Opp', 'Attendance']]

def get_station_table():
    """Download the Remote-Booth-Station spreadsheet and return it as a DataFrame."""
    return pd.read_excel(
        'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls')

def merge_station_strict(target_df):
    """Left-join station names onto ``target_df`` using both UNIT and C/A.

    The lookup table is reduced to one row per (Remote, Booth) pair before the
    join: duplicates are dropped, then the remaining ``Station`` values of each
    pair are combined with ``sum()``.
    NOTE(review): for text station names ``sum()`` presumably concatenates the
    names of a pair that maps to several stations; confirm this is intended.
    """
    key_cols = ['Remote', 'Booth']
    lookup = get_station_table()[key_cols + ['Station']].drop_duplicates()
    lookup = lookup.groupby(key_cols).sum().reset_index()
    return target_df.merge(lookup,
                           how='left',
                           left_on=['UNIT', 'C/A'],
                           right_on=key_cols)

def merge_station_fuzzy(target_df):
    """Left-join station names onto ``target_df`` using UNIT only.

    The lookup table is reduced to one row per ``Remote`` before the join:
    duplicates are dropped, then the remaining ``Station`` values of each
    remote are combined with ``sum()``.
    NOTE(review): for text station names ``sum()`` presumably concatenates the
    names of a remote that maps to several stations; confirm this is intended.
    """
    lookup = get_station_table()[['Remote', 'Station']].drop_duplicates()
    lookup = lookup.groupby(['Remote']).sum().reset_index()
    return target_df.merge(lookup, how='left', left_on='UNIT', right_on='Remote')

def calc_deltas(df, max_delta=4800):
    """Turn per-reading counter values into per-interval entry/exit counts.

    Only rows with ``DESC == 'REGULAR'`` are kept. They are sorted by
    turnstile (C/A, UNIT, SCP) and DATETIME, and each row is paired with the
    next reading of the same turnstile.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns C/A, UNIT, SCP, DATETIME, DESC, ENTRIES, EXITS.
    max_delta : int, optional
        Largest delta accepted as plausible. The default 4800 corresponds to
        20 rotations per minute over a 4-hour interval (20*60*4).

    Returns
    -------
    pandas.DataFrame
        The filtered, sorted rows plus ``ENTRIES_end``, ``EXITS_end``,
        ``ENTRIES_delta`` and ``EXITS_delta``. Deltas that are negative or
        exceed ``max_delta`` are set to 0; the last reading of each turnstile
        has no successor, so its end/delta values are NaN.
    """
    turnstile_key = ['C/A', 'UNIT', 'SCP']
    counters = ('ENTRIES', 'EXITS')
    data_df = df[df.DESC == 'REGULAR'].sort_values(turnstile_key + ['DATETIME'])

    # Shift only the two counter columns, not the whole frame.
    next_reading = data_df.groupby(turnstile_key)[list(counters)].shift(-1)
    for counter in counters:
        # The shifted frame is row-for-row in data_df's order, so assign by
        # position; this also works when row labels are not unique.
        data_df[counter + '_end'] = next_reading[counter].values

    for counter in counters:
        delta_col = counter + '_delta'
        data_df[delta_col] = data_df[counter + '_end'] - data_df[counter]
        # NaN compares False on both sides, so missing deltas stay NaN.
        implausible = (data_df[delta_col] < 0) | (data_df[delta_col] > max_delta)
        data_df.loc[implausible, delta_col] = 0

    return data_df

In [None]:
## What is the total number of entries & exits across the subway system on August 1, 2013?

In [None]:
# Load the file dated 2013-08-03, compute per-interval deltas, keep the
# readings stamped on 2013-08-01, and total entries and exits.
data = get_mta_df_by_date(datetime.datetime(2013, 8, 3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas.loc[
    (data_deltas.DATETIME >= datetime.datetime(2013, 8, 1))
    & (data_deltas.DATETIME < datetime.datetime(2013, 8, 2))
]
data_deltas[['ENTRIES_delta', 'EXITS_delta']].sum()

In [None]:
# Let's define busyness as the sum of entry and exit counts. Which station was the busiest on August 1, 2013? Which turnstile was the busiest on that date?

In [None]:
# Busiest stations on 2013-08-01: sum deltas per station and rank by the
# combined entries + exits.
data = get_mta_df_by_date(datetime.datetime(2013, 8, 3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas.loc[
    (data_deltas.DATETIME >= datetime.datetime(2013, 8, 1))
    & (data_deltas.DATETIME < datetime.datetime(2013, 8, 2))
]
station_rollup = agg_by_station(data_deltas, ['ENTRIES_delta', 'EXITS_delta'])
station_rollup['TOTAL_delta'] = (
    station_rollup['ENTRIES_delta'] + station_rollup['EXITS_delta']
)
station_rollup.sort_values('TOTAL_delta', ascending=False).head(3)

In [None]:
# Busiest individual turnstiles (C/A, UNIT, SCP) in the current data_deltas
# selection, ranked by combined entries + exits.
turnstile_rollup = (
    data_deltas
    .groupby(['C/A', 'UNIT', 'SCP'])[['ENTRIES_delta', 'EXITS_delta']]
    .sum()
)
turnstile_rollup['TOTAL_delta'] = (
    turnstile_rollup['ENTRIES_delta'] + turnstile_rollup['EXITS_delta']
)
turnstile_rollup.sort_values('TOTAL_delta', ascending=False).head(3)

In [None]:
# What were the busiest and least-busy stations in the system over all of July 2013?
# Five files starting 2013-07-06, restricted to readings stamped in July.
data = get_mta_df_by_date_range(datetime.datetime(2013, 7, 6), 5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas.loc[
    (data_deltas.DATETIME >= datetime.datetime(2013, 7, 1))
    & (data_deltas.DATETIME < datetime.datetime(2013, 8, 1))
]
station_rollup = agg_by_station(data_deltas, ['ENTRIES_delta', 'EXITS_delta'])
station_rollup['TOTAL_delta'] = (
    station_rollup['ENTRIES_delta'] + station_rollup['EXITS_delta']
)
station_rollup.sort_values('TOTAL_delta', ascending=False).head(3)

In [None]:
# Last three rows of the descending TOTAL_delta ranking, i.e. the smallest totals.
station_rollup.sort_values('TOTAL_delta',ascending=False).tail(3)

In [None]:
# Which station had the highest average number of entries between midnight & 4am on Fridays in July 2013?
# Keeps July readings stamped exactly 00:00:00 on Fridays (dayofweek == 4);
# each kept row's delta runs to that turnstile's next REGULAR reading.
# NOTE(review): the rollup sums over the selected Fridays rather than
# averaging, and turnstiles that do not report at 00:00:00 are excluded.
data = get_mta_df_by_date_range(datetime.datetime(2013, 7, 6), 5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas.loc[
    (data_deltas.DATETIME >= datetime.datetime(2013, 7, 1))
    & (data_deltas.DATETIME < datetime.datetime(2013, 8, 1))
    & (data_deltas.DATETIME.dt.time == datetime.time(0, 0, 0))
    & (data_deltas.DATETIME.dt.dayofweek == 4)
]
station_rollup = agg_by_station(data_deltas, ['ENTRIES_delta', 'EXITS_delta'])
station_rollup.sort_values('ENTRIES_delta', ascending=False).head(3)

In [None]:
# What stations have seen the most usage growth/decline in the last year?
# Two comparison windows of four files each: t0 starts 2015-01-03, t1 starts 2016-01-02.
data_t0 = get_mta_df_by_date_range(datetime.datetime(2015,1,3),4)
data_t1 = get_mta_df_by_date_range(datetime.datetime(2016,1,2),4)

In [None]:
# Year-over-year change per station, grouping on the STATION column that the
# turnstile files themselves carry. Smallest YoY values (largest declines) first.
station_rollup_t0 = (
    calc_deltas(data_t0)
    .groupby('STATION')[['ENTRIES_delta', 'EXITS_delta']]
    .sum()
)
station_rollup_t1 = (
    calc_deltas(data_t1)
    .groupby('STATION')[['ENTRIES_delta', 'EXITS_delta']]
    .sum()
)
station_rollup = station_rollup_t0.join(station_rollup_t1, lsuffix='_t0', rsuffix='_t1')
station_rollup['YoY'] = (
    station_rollup.ENTRIES_delta_t1
    + station_rollup.EXITS_delta_t1
    - station_rollup.ENTRIES_delta_t0
    - station_rollup.EXITS_delta_t0
)
station_rollup.sort_values('YoY').head()

In [None]:
# Same year-over-year comparison, but with station names taken from the
# Remote-Booth-Station lookup (via agg_by_station) instead of the file's
# own STATION column.
station_rollup_t0 = agg_by_station(calc_deltas(data_t0), ['ENTRIES_delta', 'EXITS_delta'])
station_rollup_t1 = agg_by_station(calc_deltas(data_t1), ['ENTRIES_delta', 'EXITS_delta'])
station_rollup = station_rollup_t0.join(station_rollup_t1, lsuffix='_t0', rsuffix='_t1')
station_rollup['YoY'] = (
    station_rollup.ENTRIES_delta_t1
    + station_rollup.EXITS_delta_t1
    - station_rollup.ENTRIES_delta_t0
    - station_rollup.EXITS_delta_t0
)
station_rollup.sort_values('YoY').head()

In [None]:
# Other end of the ascending YoY ranking: the largest year-over-year increases.
station_rollup.sort_values('YoY').tail()

In [None]:
# What dates are the least busy? Could you identify days on which stations were not operating at full capacity or closed entirely?
# Two files starting 2015-01-03; deltas are summed system-wide per DATE value.
data = get_mta_df_by_date_range(datetime.datetime(2015,1,3),2)
data_deltas = calc_deltas(data)
data_deltas.groupby('DATE')[['ENTRIES_delta','EXITS_delta']].sum()

In [None]:
# Per-station daily sums, indexed by (Station, DATE).
station_rollup = agg_by_station_date(data_deltas,['ENTRIES_delta','EXITS_delta'])
# "Capacity" here is each station's largest daily sum within the loaded window.
station_capacity = station_rollup.reset_index(level=1,drop=True).groupby(level=0).agg({'ENTRIES_delta':max,
                                                                                       'EXITS_delta':max})
# Ratio of each station-day to that station's maximum.
# NOTE(review): relies on pandas aligning the (Station, DATE) index with the
# Station-indexed capacity frame - TODO confirm on the pandas version in use.
station_utilization = station_rollup.div(station_capacity)

In [None]:
# Station-days where entries or exits fell below half of that station's maximum.
station_utilization[(station_utilization.ENTRIES_delta<0.5)|
                    (station_utilization.EXITS_delta<0.5)]

In [None]:
# Day of week, Day of month, month, Line, Division, Station, unit, Yankees

In [None]:
# Night-game schedule plus four turnstile files starting 2013-08-17.
yankee_schedule = get_yankee_schedule()

data = get_mta_df_by_date_range(datetime.datetime(2013,8,17),4)

In [None]:
# Exits at remote unit R195 for readings stamped 16:22:00, one row per
# timestamp, left-joined to the night-game schedule on calendar date.
# NOTE(review): R195 is presumably the unit serving Yankee Stadium - confirm.
data_deltas = calc_deltas(data)
data_deltas = data_deltas.loc[
    (data_deltas.UNIT == 'R195')
    & (data_deltas.DATETIME.dt.time == datetime.time(16, 22, 0))
]
data_exits = data_deltas.groupby('DATETIME')[['EXITS_delta']].sum()
data_exits['DATE'] = data_exits.index.date
data_exits['DAY'] = data_exits.index.dayofweek
data_exits = data_exits.merge(
    yankee_schedule,
    how='left',
    left_on='DATE',
    right_on='Date',
)

In [None]:
# Each row shows the following row's EXITS_delta (last row becomes NaN).
data_exits.EXITS_delta.shift(-1)

In [None]:
# NOTE(review): scratch arithmetic; the notebook does not say what 44117 and
# .34 represent - label these values or remove the cell.
44117*.34

In [None]:
# Re-fetches the schedule; repeats the earlier get_yankee_schedule() call.
yankee_schedule = get_yankee_schedule()

In [None]:
# Display the full night-game schedule frame.
yankee_schedule

In [None]:
# Label-based lookup of the row labelled 0. get_yankee_schedule filters to
# night games without resetting the index, so this works only if the row
# labelled 0 survived that filter.
yankee_schedule['Date'][0]

In [None]:
# Sum of the EXITS column as loaded (the counter readings, not EXITS_delta).
data.EXITS.sum()

In [None]:
# Attach the lookup-table Station column to the t1 readings, then sum the
# EXITS column for rows whose file-provided STATION is 'SUTPHIN BLVD'.
x = merge_station_strict(data_t1)
x[x.STATION=='SUTPHIN BLVD'].EXITS.sum()

In [None]:
# t0 window grouped on the file's STATION column; show the '42 ST-PA BUS TE' row.
station_rollup_t0 = calc_deltas(data_t0).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t0[station_rollup_t0.index=='42 ST-PA BUS TE']

In [None]:
# NOTE(review): the variable is named station_rollup_t0 but is built from
# data_t1 (the 2016 window); it overwrites the t0 rollup of the same name.
station_rollup_t0 = calc_deltas(data_t1).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
# Show the '42 ST-PORT AUTH' row of that rollup.
station_rollup_t0[station_rollup_t0.index=='42 ST-PORT AUTH']

In [None]:
# t0 window rolled up by lookup-table Station name; show the 'SUTPHIN BLVD' row.
station_rollup_t0 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
station_rollup_t0[station_rollup_t0.index=='SUTPHIN BLVD']

In [None]:
# Which lookup-table Station names are attached to rows whose file-provided
# STATION is 'SUTPHIN BLVD'? (x is rebound here to the merged t0 deltas.)
x = merge_station(calc_deltas(data_t0))
x[x.STATION=='SUTPHIN BLVD'].Station.unique()

In [None]:
# NOTE(review): the variable is named station_rollup_t1 but is built from
# data_t0 (the 2015 window); it overwrites the t1 rollup of the same name.
station_rollup_t1 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
# Show the '42 ST-PA BUS TE' row of that rollup.
station_rollup_t1[station_rollup_t1.index=='42 ST-PA BUS TE']

In [None]:
# File-provided STATION values for rows whose lookup-table Station is '42 ST-PA BUS TE'.
x[x.Station=='42 ST-PA BUS TE'].STATION.unique()

In [None]:
# Lookup-table Station values for rows whose file-provided STATION is '42 ST-PA BUS TE'.
x[x.STATION=='42 ST-PA BUS TE'].Station.unique()

In [None]:
# t0 window grouped on the file's STATION column; show the '42 ST-PORT AUTH' row.
station_rollup_t0 = calc_deltas(data_t0).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t0[station_rollup_t0.index=='42 ST-PORT AUTH']

In [None]:
# Display the unprocessed schedule CSV (same URL that get_yankee_schedule reads).
source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
pd.read_csv(source)

In [None]:
    # Scratch copy of the first lines of get_yankee_schedule's body.
    # NOTE(review): unlike get_yankee_schedule, this apply() has no axis=1, so
    # the lambda receives columns rather than rows and x.Datetime is presumably
    # not available; this looks like a failed experiment. The later
    # strptime(sched.Datetime[0], ...) cell depends on sched.Datetime still
    # holding strings, so changing this cell would affect it.
    source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
    sched = pd.read_csv(source)
    sched['Datetime'] = sched.apply(lambda x:
                                    datetime.datetime.strptime(x.Datetime,
                                                               '%m/%d/%Y %H:%M'))

In [None]:
# Parse a single schedule value (the row labelled 0) to check the format string.
datetime.datetime.strptime(sched.Datetime[0], '%m/%d/%Y %H:%M')

In [None]:
# Parse every schedule row's Datetime string. axis=1 makes apply() pass rows,
# so x.Datetime is that row's value; the default (axis=0) passes whole columns,
# which have no 'Datetime' entry to look up.
sched.apply(lambda x: datetime.datetime.strptime(x.Datetime, '%m/%d/%Y %H:%M'), axis=1)

In [None]:
# NOTE(review): rebinds x (previously a merged DataFrame) to the URL template string.
x = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'

In [None]:
# Check that the {} placeholder in the URL template is filled by format().
x.format('3434')