In [1]:
import datetime
import pandas as pd
import urllib2
import csv
import numpy as np
from matplotlib import pyplot as plt

In [2]:
def get_mta_df(source, original_format=True):
    if original_format:
        return get_mta_df_old(source)
    else:
        return get_mta_df_new(source)

def get_mta_df_old(source):
    # Import data format prior to 10/18/14
    col_names = ['C/A', 'UNIT', 'SCP', 'DATETIME', 'DESC', 'ENTRIES', 'EXITS']
    raw_data = urllib2.urlopen(source)
    reader = csv.reader((raw_line.replace('\0','') for raw_line in raw_data), delimiter=",")
    mta_list = list()
    for row in reader:
        for entry_num in range(0,8):
            offset = entry_num*5
            try:
                mta_list.append([row[0], row[1], row[2], 
                                 datetime.datetime.strptime('{} {}'.format(row[3+offset],
                                                                           row[4+offset]),
                                                            '%m-%d-%y %H:%M:%S'),
                                 row[5+offset], int(row[6+offset]), int(row[7+offset]),
                                ])
            except:
                pass
    mta_df = pd.DataFrame(mta_list, columns = col_names)
    return mta_df

def get_mta_df_new(source):
    # Import data format post 10/18/14
    col_names = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 
                 'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
    mta_df = pd.read_csv(source, sep = ',', skiprows=1, header=None, names=col_names)
    mta_df['DATETIME'] = mta_df.apply(lambda x: 
                                      datetime.datetime.strptime('{} {}'.format(x.DATE,x.TIME),
                                                                 '%m/%d/%Y %H:%M:%S'), axis=1)
    return mta_df

def get_mta_df_by_date(date):
    original_format = date < datetime.datetime(2014,10,18)
    source_url_temp = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'
    source_url = source_url_temp.format(date.strftime('%y%m%d'))
    return get_mta_df(source_url,original_format)

def get_mta_df_by_date_range(date, num_weeks):
    df_arry = []
    for week in range(0, num_weeks):
        run_date = date + datetime.timedelta(days = week*7)
        df_arry.append(get_mta_df_by_date(run_date))
    return pd.concat(df_arry)

def agg_by_station(target_df, stat_array):
    merged_df = merge_station(target_df)
    return merged_df.groupby('Station')[stat_array].sum()

def agg_by_station_date(target_df, stat_array):
    merged_df = merge_station(target_df)
    return merged_df.groupby(['Station','DATE'])[stat_array].sum()

def merge_station(target_df):
    return merge_station_strict(target_df)

def get_yankee_schedule():
    source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
    sched = pd.read_csv(source)
    # Restrict to night games
    sched = sched[sched['D/N']=='N']
    sched['Datetime'] = sched.apply(lambda x:
                                    datetime.datetime.strptime(x.Datetime,
                                                               '%m/%d/%Y %H:%M'),axis=1)
    sched['Date'] = sched.Datetime.dt.date
    return sched[['Date','Opp','Attendance']]

def get_station_table():
    source = 'http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls'
    station_table = pd.read_excel(source)
    return station_table

def merge_station_strict(target_df):
    station_table_raw = get_station_table()
    station_table = station_table_raw[['Remote','Booth','Station']].drop_duplicates()
    station_table = station_table.groupby(['Remote','Booth']).sum().reset_index()
    merged_df = target_df.merge(station_table,
                                left_on=['UNIT','C/A'],
                                right_on=['Remote','Booth'],
                                how='left')
    return merged_df

def merge_station_fuzzy(target_df):
    station_table_raw = get_station_table()
    station_table = station_table_raw[['Remote','Station']].drop_duplicates()
    station_table = station_table.groupby(['Remote']).sum().reset_index()
    merged_df = target_df.merge(station_table,left_on='UNIT',right_on='Remote',how='left')
    return merged_df

def calc_deltas(df):
    data_df = df[df.DESC=='REGULAR'].sort_values(['C/A','UNIT','SCP','DATETIME'])
    data_df_lag = data_df.groupby(['C/A','UNIT','SCP']).transform(lambda x:x.shift(-1))
    data_df.loc[:,'ENTRIES_end'] = data_df_lag['ENTRIES']
    data_df.loc[:,'EXITS_end'] = data_df_lag['EXITS']
    data_df['ENTRIES_delta'] = data_df.ENTRIES_end - data_df.ENTRIES
    data_df['EXITS_delta'] = data_df.EXITS_end - data_df.EXITS
    
    # Discard negative counts
    data_df.loc[data_df['ENTRIES_delta']<0,'ENTRIES_delta'] = 0
    data_df.loc[data_df['EXITS_delta']<0,'EXITS_delta'] = 0
    
    # Discard counts implying > 20 rotations per minute (20*60*4)
    data_df.loc[data_df['ENTRIES_delta']>4800,'ENTRIES_delta'] = 0
    data_df.loc[data_df['EXITS_delta']>4800,'EXITS_delta'] = 0
    
    return data_df

# Total Entries and Exits

What is the total number of entries & exits across the subway system for August 1, 2013?

In [4]:
data = get_mta_df_by_date(datetime.datetime(2013,8,3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,2))]
data_deltas[['ENTRIES_delta','EXITS_delta']].sum()

ENTRIES_delta    5562793
EXITS_delta      4409520
dtype: float64

The subway system saw 5,562,793 entries and 4,409,520 exits across the subway system on August 1, 2013

Note turnstiles that reported less than zero / more than 20 rotations per minute over any four hour period were discarded.

# The busiest stations and turnstiles

Let’s define the busy-ness as sum of entry & exit count. What station was the busiest on August 1, 2013? What turnstile was the busiest on that date?

In [7]:
data = get_mta_df_by_date(datetime.datetime(2013,8,3))
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,2))]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup['TOTAL_delta'] = station_rollup.ENTRIES_delta + station_rollup.EXITS_delta
station_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34 ST-PENN STA,175185,155858,331043
42 ST-GRD CNTRL,164823,151271,316094
34 ST-HERALD SQ,122548,114529,237077


The busiest station on August 1, 2013 was 34th Street – Penn Station

In [8]:
turnstile_rollup = data_deltas.groupby(['C/A','UNIT','SCP'])[['ENTRIES_delta','EXITS_delta']].sum()
turnstile_rollup['TOTAL_delta'] = turnstile_rollup.ENTRIES_delta + turnstile_rollup.EXITS_delta
turnstile_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
C/A,UNIT,SCP,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N063A,R011,00-00-00,1882,9963,11845
R249,R179,01-00-09,964,10559,11523
R240,R047,00-00-00,3755,7302,11057


The busiest turnstile on August 1, 2013 was 00-00-00 at Times Square – 42nd Street / Port Authority Bus Terminal (N063A R011)

# The busiest stations in July

What were the busiest and least-busy stations in the system over all of July 2013?

In [10]:
data = get_mta_df_by_date_range(datetime.datetime(2013,7,6),5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,7,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,1))]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup['TOTAL_delta'] = station_rollup.ENTRIES_delta + station_rollup.EXITS_delta
station_rollup.sort_values('TOTAL_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34 ST-PENN STA,4577775,4018278,8596053
42 ST-GRD CNTRL,3848128,3610248,7458376
34 ST-HERALD SQ,3205652,2985543,6191195


The busiest station in August 2013 was 34th Street – Penn Station

In [11]:
station_rollup.sort_values('TOTAL_delta',ascending=False).tail(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta,TOTAL_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ORCHARD BEACH,16260,1002,17262
BROAD CHANNEL,5772,2677,8449
AQUEDUCT TRACK,117,152,269


The least busiest station in August 2013 was Aqueduct Racetrack

# The busiest stations on Friday nights

Which station had the highest average number of entries between midnight & 4am on Fridays in July 2013?

In [18]:
data = get_mta_df_by_date_range(datetime.datetime(2013,7,6),5)
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.DATETIME>=datetime.datetime(2013,7,1))&
                          (data_deltas.DATETIME<datetime.datetime(2013,8,1))&
                          (data_deltas.DATETIME.dt.time==datetime.time(0,0,0))&
                          (data_deltas.DATETIME.dt.dayofweek==4)]
station_rollup = agg_by_station(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_rollup.sort_values('ENTRIES_delta',ascending=False).head(3)

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta
Station,Unnamed: 1_level_1,Unnamed: 2_level_1
42 ST-TIMES SQ,23885,6115
34 ST-HERALD SQ,11211,4441
34 ST-PENN STA,9361,5119


Times Square – 42nd Street / Port Authority Bus Terminal had the highest average number of entries between midnight & 4am on Fridays in July 2013.

# Usage Growth

What stations have seen the most usage growth/decline in the last year?

In [19]:
data_t0 = get_mta_df_by_date_range(datetime.datetime(2015,1,3),4)
data_t1 = get_mta_df_by_date_range(datetime.datetime(2016,1,2),4)

We can define station a few different ways. The most obvious is the self reported station name

In [20]:
station_rollup_t0 = calc_deltas(data_t0).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t1 = calc_deltas(data_t1).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup = station_rollup_t0.join(station_rollup_t1, lsuffix='_t0', rsuffix='_t1')
station_rollup['YoY'] = station_rollup.ENTRIES_delta_t1 + station_rollup.EXITS_delta_t1 - station_rollup.ENTRIES_delta_t0 - station_rollup.EXITS_delta_t0
station_rollup.sort_values('YoY').head()

Unnamed: 0_level_0,ENTRIES_delta_t0,EXITS_delta_t0,ENTRIES_delta_t1,EXITS_delta_t1,YoY
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SUTPHIN BLVD,653496,507420,104683,62885,-993348
21 ST,203449,174611,39987,44831,-293242
VAN SICLEN AVE,215916,155170,74333,67641,-229112
34 ST-PENN STA,3977861,3389783,3884926,3269389,-213329
METS-WILLETS PT,144020,167220,49402,53041,-208797


In [60]:
data_t0[data_t0.STATION == '34 ST-PENN STA'].UNIT.unique()
data_t1[data_t1.STATION == '34 ST-PENN STA'].UNIT.unique()
#data_t1[data_t1.UNIT in ['R434', 'R441', 'R068'])].STATION.unique()
#data_t1[data_t1.UNIT == 'R434'].STATION.unique()
#data_t0[data_t0.UNIT == 'R068'].STATION.unique()
#array(['R012', 'R013', 'R031', 'R293'], dtype=object)

array(['R012', 'R013', 'R031', 'R293'], dtype=object)

By this metric, Sutphin Boulevard saw the largest drop. However this is largely because unit R024 was transferred from SUTPHIN BLVD to SUTPHIN-ARCHER in mid 2015. Similarly unit R303 was transferred from 21 ST to 21 ST-QNSBRIDGE and R434 was transferred from VAN SICLEN AVE to VAN SICLEN AV. The largest station drop with consistent units was 34th Street – Penn Station.

In [21]:
station_rollup_t0 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
station_rollup_t1 = agg_by_station(calc_deltas(data_t1),['ENTRIES_delta','EXITS_delta'])
station_rollup = station_rollup_t0.join(station_rollup_t1, lsuffix='_t0', rsuffix='_t1')
station_rollup['YoY'] = station_rollup.ENTRIES_delta_t1 + station_rollup.EXITS_delta_t1 - station_rollup.ENTRIES_delta_t0 - station_rollup.EXITS_delta_t0
station_rollup.sort_values('YoY').head()

Unnamed: 0_level_0,ENTRIES_delta_t0,EXITS_delta_t0,ENTRIES_delta_t1,EXITS_delta_t1,YoY
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42 ST-PA BUS TE,2464347,1891440,2204050,1719541,-432196
57 ST-7 AVE,645657,396508,496571,208273,-337321
ROCKAWAY AVE,251385,196693,129160,74923,-243995
34 ST-PENN STA,3977861,3389783,3884926,3269389,-213329
METS-WILLETS PT,144020,167220,49402,53041,-208797


In [22]:
station_rollup.sort_values('YoY').tail()

Unnamed: 0_level_0,ENTRIES_delta_t0,EXITS_delta_t0,ENTRIES_delta_t1,EXITS_delta_t1,YoY
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BOYD-88 ST,10407,1645,76025,25816,89789
SARATOGA AVE,132489,53927,201410,84654,99648
FULTON ST,1134795,970892,1267925,1018455,180693
PATH WTC 2,9950,26927,15662,204521,183306
MAIN ST,1238318,1067246,1393136,1151015,238587


In [None]:
What stations have seen the most usage growth/decline in the last year

In [23]:
#What dates are the least busy? Could you identify days on which stations were not operating at full capacity or closed entirely?
data = get_mta_df_by_date_range(datetime.datetime(2015,1,3),2)
data_deltas = calc_deltas(data)
data_deltas.groupby('DATE')[['ENTRIES_delta','EXITS_delta']].sum()

Unnamed: 0_level_0,ENTRIES_delta,EXITS_delta
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
01/01/2015,2451260,2039406
01/02/2015,4640488,3666934
01/03/2015,2984875,2422803
01/04/2015,2392262,1946628
01/05/2015,5709449,4358453
01/06/2015,5620817,4274236
01/07/2015,5667333,4313189
01/08/2015,5547177,4238099
01/09/2015,5346750,4027541
12/27/2014,3448706,2817056


In [24]:
station_rollup = agg_by_station_date(data_deltas,['ENTRIES_delta','EXITS_delta'])
station_capacity = station_rollup.reset_index(level=1,drop=True).groupby(level=0).agg({'ENTRIES_delta':max,
                                                                                       'EXITS_delta':max})
station_utilization = station_rollup.div(station_capacity)

In [25]:
station_utilization[(station_utilization.ENTRIES_delta<0.5)|
                    (station_utilization.EXITS_delta<0.5)]

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES_delta,EXITS_delta
Station,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AVE,01/04/2015,0.526088,0.497924
1 AVE,12/28/2014,0.472405,0.465515
103 ST,01/04/2015,0.482111,0.481831
103 ST-CORONA,01/01/2015,0.420748,0.492318
104 ST,01/01/2015,0.358862,0.399613
104 ST,01/04/2015,0.435449,0.491285
104 ST,12/28/2014,0.455580,0.500323
110 ST,01/01/2015,0.442475,0.504288
110 ST,01/04/2015,0.450290,0.507920
110 ST,12/28/2014,0.488076,0.548885


In [None]:
# Day of week, Day of month, month, Line, Division, Station, unit, Yankees

In [None]:
yankee_schedule = get_yankee_schedule()

data = get_mta_df_by_date_range(datetime.datetime(2013,8,17),4)

In [None]:
data_deltas = calc_deltas(data)
data_deltas = data_deltas[(data_deltas.UNIT=='R195')&
                          (data_deltas.DATETIME.dt.time==datetime.time(16,22,0))]
data_exits = data_deltas.groupby('DATETIME')[['EXITS_delta']].sum()
data_exits['DATE'] = data_exits.index.date
data_exits['DAY'] = data_exits.index.dayofweek
data_exits = data_exits.merge(yankee_schedule,left_on='DATE',right_on='Date',how='left')
data_exits.loc[data_exits.Opp.isnull(),'OffPeak'] = data_exits.loc[data_exits.Opp.isnull(),'EXITS_delta']
data_exits.loc[~data_exits.Opp.isnull(),'Peak'] = data_exits.loc[~data_exits.Opp.isnull(),'EXITS_delta']
data_exits = data_exits[['DATE','DAY','Peak','OffPeak']]

In [None]:
data_exits['t1'] = data_exits.OffPeak.shift(1)
data_exits['t7'] = data_exits.OffPeak.shift(7)
data_exits['t8'] = data_exits.OffPeak.shift(8)
data_exits['t14'] = data_exits.OffPeak.shift(14)
data_exits['t15'] = data_exits.OffPeak.shift(15)

In [None]:
data_exits['pred'] = data_exits.t1+data_exits.t7-data_exits.t8

In [None]:
data_exits

In [None]:
data_exits.head()

In [None]:
data.EXITS.sum()

In [None]:
x = merge_station_strict(data_t1)
x[x.STATION=='SUTPHIN BLVD'].EXITS.sum()

In [None]:
station_rollup_t0 = calc_deltas(data_t0).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t0[station_rollup_t0.index=='42 ST-PA BUS TE']

In [None]:
station_rollup_t0 = calc_deltas(data_t1).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t0[station_rollup_t0.index=='42 ST-PORT AUTH']

In [None]:
station_rollup_t0 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
station_rollup_t0[station_rollup_t0.index=='SUTPHIN BLVD']

In [None]:
x = merge_station(calc_deltas(data_t0))
x[x.STATION=='SUTPHIN BLVD'].Station.unique()

In [None]:
station_rollup_t1 = agg_by_station(calc_deltas(data_t0),['ENTRIES_delta','EXITS_delta'])
station_rollup_t1[station_rollup_t1.index=='42 ST-PA BUS TE']

In [None]:
x[x.Station=='42 ST-PA BUS TE'].STATION.unique()

In [None]:
x[x.STATION=='42 ST-PA BUS TE'].Station.unique()

In [None]:
station_rollup_t0 = calc_deltas(data_t0).groupby('STATION')[['ENTRIES_delta','EXITS_delta']].sum()
station_rollup_t0[station_rollup_t0.index=='42 ST-PORT AUTH']

In [None]:
yankee_schedule = get_yankee_schedule()

In [None]:
source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
pd.read_csv(source)

In [None]:
    source = 'https://www.dropbox.com/s/2g5itrjc6mo4huu/yankee_home_2013.csv?dl=1'
    sched = pd.read_csv(source)
    sched['Datetime'] = sched.apply(lambda x:
                                    datetime.datetime.strptime(x.Datetime,
                                                               '%m/%d/%Y %H:%M'))

In [None]:
datetime.datetime.strptime(sched.Datetime[0],
                                                               '%m/%d/%Y %H:%M')

In [None]:
sched.apply(lambda x:datetime.datetime.strptime(x.Datetime,'%m/%d/%Y %H:%M'))

In [None]:
x = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'

In [None]:
x.format('3434')