In [1]:
import datetime
import pandas as pd
import urllib2
import csv
import datetime

In [2]:
def get_mta_df(source, original_format=True):
    if original_format:
        return get_mta_df_old(source)
    else:
        return get_mta_df_new(source)

def get_mta_df_old(source):
    # Import data format prior to 10/18/14
    col_names = ['C/A', 'UNIT', 'SCP', 'DATETIME', 'DESC', 'ENTRIES', 'EXITS']
    raw_data = urllib2.urlopen(source)
    reader = csv.reader((raw_line.replace('\0','') for raw_line in raw_data), delimiter=",")
    mta_list = list()
    for row in reader:
        for entry_num in range(0,8):
            offset = entry_num*5
            try:
                mta_list.append([row[0], row[1], row[2], 
                                 datetime.datetime.strptime('{} {}'.format(row[3+offset],
                                                                           row[4+offset]),'%m-%d-%y %H:%M:%S'),
                                 row[5+offset], int(row[6+offset]), int(row[7+offset]),
                                ])
            except:
                pass
    mta_df = pd.DataFrame(mta_list, columns = col_names)
    return mta_df

def get_mta_df_new(source):
    # Import data format post 10/18/14
    col_names = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 
                 'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
    mta_df = pd.read_csv(source, sep = ',', skiprows=1, header=None, names=col_names)
    mta_df['DATETIME'] = mta_df.apply(lambda x: 
                                      datetime.datetime.strptime('{} {}'.format(x.DATE,x.TIME),
                                                                 '%m/%d/%Y %H:%M:%S'), axis=1)
    return mta_df

def get_mta_df_by_date(date):
    original_format = date < datetime.datetime(2014,10,18)
    source_url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'.format(date.strftime('%y%m%d'))
    return get_mta_df(source_url,original_format)

def get_mta_df_by_date_range(date, num_weeks):
    df_arry = []
    for week in range(0, num_weeks):
        run_date = date + datetime.timedelta(days = week*7)
        df_arry.append(get_mta_df_by_date(run_date))
    return pd.concat(df_arry)

def agg_by_station(target_df, stat_array):
    station_table = get_station_table()
    merged_df = target_df.merge(station_table,left_on='UNIT',right_on='Remote',how='left')
    return merged_df.groupby('Station')[stat_array].sum()

def calc_deltas(df):
    data_df = df[df.DESC=='REGULAR'].sort_values(['C/A','UNIT','SCP','DATETIME'])
    data_df_lag = data_df.groupby(['C/A','UNIT','SCP']).transform(lambda x:x.shift(-1))
    data_df.loc[:,'ENTRIES_end'] = data_df_lag['ENTRIES']
    data_df.loc[:,'EXITS_end'] = data_df_lag['EXITS']
    data_df['ENTRIES_delta'] = data_df.ENTRIES_end - data_df.ENTRIES
    data_df['EXITS_delta'] = data_df.EXITS_end - data_df.EXITS
    
    # Discard negative counts
    data_df.loc[data_df['ENTRIES_delta']<0,'ENTRIES_delta'] = 0
    data_df.loc[data_df['EXITS_delta']<0,'EXITS_delta'] = 0
    
    # Discard counts implying > 20 rotations per minute (20*60*4)
    data_df.loc[data_df['ENTRIES_delta']>4800,'ENTRIES_delta'] = 0
    data_df.loc[data_df['EXITS_delta']>4800,'EXITS_delta'] = 0
    
    return data_df

def get_station_table():
    source = 'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls'
    station_table = pd.read_excel(source)
    return station_table[['Remote','Station']].drop_duplicates().groupby('Remote').sum().reset_index()

In [3]:
##What is the total number of entries & exits across the subway system for August 1, 2013?

In [4]:
data = get_mta_df_by_date(datetime.datetime(2013,8,3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,2))]
data_deltas[['ENTRIES_delta','EXITS_delta']].sum()

ENTRIES_delta    5562793
EXITS_delta      4409520
dtype: float64

In [5]:
#Let’s define the busy-ness as sum of entry & exit count. What station was the busiest on August 1, 2013? What turnstile was the busiest on that date?

In [6]:
data = get_mta_df_by_date(datetime.datetime(2013,8,3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,2))]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup['TOTAL_delta'] = station_rollup.ENTRIES_delta + station_rollup.EXITS_delta
station_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34 ST-PENN STA,175185,155858,331043
42 ST-GRD CNTRL,164823,151271,316094
34 ST-HERALD SQ,122548,114529,237077


In [7]:
turnstile_rollup = data_deltas.groupby(['C/A','UNIT','SCP'])[['ENTRIES_delta','EXITS_delta']].sum()
turnstile_rollup['TOTAL_delta'] = turnstile_rollup.ENTRIES_delta + turnstile_rollup.EXITS_delta
turnstile_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
C/A,UNIT,SCP,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N063A,R011,00-00-00,1882,9963,11845
R249,R179,01-00-09,964,10559,11523
R240,R047,00-00-00,3755,7302,11057


In [8]:
#What were the busiest and least-busy stations in the system over all of July 2013?
data = get_mta_df_by_date_range(datetime.datetime(2013,7,6),5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,7,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,1))]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup['TOTAL_delta'] = station_rollup.ENTRIES_delta + station_rollup.EXITS_delta
station_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34 ST-PENN STA,4577775,4018278,8596053
42 ST-GRD CNTRL,3848128,3610248,7458376
34 ST-HERALD SQ,3205652,2985543,6191195


In [9]:
station_rollup.sort_values('TOTAL_delta',ascending=False).tail(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ORCHARD BEACH,16260,1002,17262
BROAD CHANNEL,5772,2677,8449
AQUEDUCT TRACK,117,152,269


In [10]:
#Which station had the highest average number of entries between midnight & 4am on Fridays in July 2013?
data = get_mta_df_by_date_range(datetime.datetime(2013,7,6),5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,7,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME.apply(lambda x:x.time()==datetime.time(0,0,0)))&
                          (data_deltas.DATETIME.apply(lambda x:x.weekday()==4))]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup.sort_values('ENTRIES_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1
42 ST-TIMES SQ,23885,6115
34 ST-HERALD SQ,11211,4441
34 ST-PENN STA,9361,5119


In [11]:
#What stations have seen the most usage growth/decline in the last year?
data_t0 = get_mta_df_by_date_range(datetime.datetime(2015,1,3),4)
data_t1 = get_mta_df_by_date_range(datetime.datetime(2016,1,2),4)

In [12]:
station_rollup_t0 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
station_rollup_t1 = agg_by_station(calc_deltas(data_t1),['ENTRIES_delta','EXITS_delta'])
station_rollup = station_rollup_t0.join(station_rollup_t1, lsuffix='_t0', rsuffix='_t1')
station_rollup['YoY'] = station_rollup.ENTRIES_delta_t1 + station_rollup.EXITS_delta_t1 - station_rollup.ENTRIES_delta_t0 - station_rollup.EXITS_delta_t0
station_rollup.sort_values('YoY').head()

Unnamed: 0_level_0,ENTRIES_delta_t0,EXITS_delta_t0,ENTRIES_delta_t1,EXITS_delta_t1,YoY
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42 ST-PA BUS TE,2464347,1891440,2204050,1719541,-432196
57 ST-7 AVE,645657,396508,496571,208273,-337321
ROCKAWAY AVE,251385,196693,129160,74923,-243995
34 ST-PENN STA,3977861,3389783,3884926,3269389,-213329
METS-WILLETS PT,144020,167220,49402,53041,-208797


In [13]:
station_rollup.sort_values('YoY').tail()

Unnamed: 0_level_0,ENTRIES_delta_t0,EXITS_delta_t0,ENTRIES_delta_t1,EXITS_delta_t1,YoY
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PARK PLACEWORLD TRADE CTRCHAMBERS ST,1086861,606336,1168761,616102,91666
ESSEX STDELANCEY ST,569412,441528,602674,501440,93174
SARATOGA AVE,132489,53927,201410,84654,99648
FULTON ST,1535037,1359578,1667897,1420392,193674
MAIN ST,1238318,1067246,1393136,1151015,238587


In [14]:
#What dates are the least busy? Could you identify days on which stations were not operating at full capacity or closed entirely?
data = get_mta_df_by_date_range(datetime.datetime(2015,1,3),2)

In [None]:
    data_df = df[df.DESC=='REGULAR'].sort_values(['C/A','UNIT','SCP','DATETIME'])
    data_df_lag = data_df.groupby(['C/A','UNIT','SCP']).transform(lambda x:x.shift(-1))
    data_df.loc[:,'ENTRIES_end'] = data_df_lag['ENTRIES']
    data_df.loc[:,'EXITS_end'] = data_df_lag['EXITS']