In [7]:
import csv
import datetime
import os
import pandas as pd
import sys
from collections import defaultdict
from collections import OrderedDict

In [2]:
# This adddress hosts the MTA turnstile data
MTA_url = 'http://web.mta.info/developers/data/nyct/turnstile/'

In [3]:
def get_filenames(start_date, n_weeks):
    """This function returns a list of MTA data file names for the
    period from start_date  spanning n_weeks.
    """
    
    ONE_WEEK = datetime.timedelta(days=7)

    # Identify first Saturday on or after specified date. Files are
    # named for the day when published, which is always a Saturday.

    if None == start_date:
        start_date = datetime.date.today() - n_weeks * ONE_WEEK
        
    first_saturday = start_date + datetime.timedelta(days=5 - start_date.weekday())
    
    # Compose list of requested dates
    
    dates = [first_saturday + ONE_WEEK * i for i in range(n_weeks)]
    
    # Return file names sorted chronologically
    
    return sorted(['turnstile_' + d.strftime('%y%m%d') + '.txt' for d in dates])

In [4]:
def read_data(n_weeks=4, start_date=None):
    # Get files to read

    frames = []
    for name in get_filenames(start_date, n_weeks):

        # Create data frame for each file.  If the file exists locally,
        # use it. Otherwise, download it and save a copy for the next
        # time

        if os.path.exists(name):
            print 'reading local', name
            frames.append(pd.read_csv(name))
        else:
            print 'reading url', MTA_url + name
            frames.append(pd.read_csv(MTA_url + name))
            frames[-1].to_csv(name, index=False)

    # Concatenate data frames

    return pd.concat(frames)

In [81]:
raw_df = read_data(12, datetime.date(2016, 6, 1))

reading local turnstile_160604.txt
reading local turnstile_160611.txt
reading local turnstile_160618.txt
reading local turnstile_160625.txt
reading local turnstile_160702.txt
reading local turnstile_160709.txt
reading local turnstile_160716.txt
reading local turnstile_160723.txt
reading local turnstile_160730.txt
reading local turnstile_160806.txt
reading local turnstile_160813.txt
reading local turnstile_160820.txt


In [122]:
def make_dict(df):
    """Convert turnstile data frame to dictionary that maps each STATION
    to a list of { DATE : [TIME, ENTRIES, EXITS], ... } values

    df: Pandas data frame with raw turnstile data
    """

    #   Build dictionary

    d = OrderedDict()
    for row in df.values:
        key = tuple(row[:4])
        date, time, entries, exits = (row[6], row[7], row[9], row[10])
        dt = datetime.datetime.strptime(date + ' ' + time, '%m/%d/%Y %H:%M:%S')
        if 0 == dt.minute + dt.second: # ignore invalid times
            d.setdefault(key, []).append([dt, entries, exits])

    #   { (C/A, UNIT, SCP, STATION) : [[DATE-TIME, ENTRIES, EXITS], ...]}

    #   Convert cumulative counts to increments
    
    for key, counts in d.items():
        for i in range(len(counts) - 1, 0, -1):
            dt1, entries1, exits1 = tuple(counts[i])
            dt2, entries2, exits2 = tuple(counts[i-1])
            if dt1 == dt2 + datetime.timedelta(hours=4):
                counts[i][1] = abs(entries1 - entries2) # handle negative deltas
                counts[i][2] = abs(exits1 - exits2)
            else:
                counts[i][1] = 0
                counts[i][2] = 0
        counts.pop(0)

    #   { (C/A, UNIT, SCP, STATION) : [[DATE-TIME, DELTA_ENTRIES, DELTA_EXITS], ...]}

    #   Remove large outliers

    cleaned = OrderedDict()
    for key, counts in d.items():
        for vals in counts:
            entries, exits = (vals[1], vals[2])
            if entries < 50000 and exits < 50000:
                cleaned.setdefault(key, []).append(vals)

    #   Aggregate counts for each station

    agg = OrderedDict()
    for key, counts in cleaned.items():
        station = key[3]
        for vals in counts:
            dt = vals[0]
            agg.setdefault((station, dt), []).append(vals[1:])

    # { (STATION, DATE-TIME) : [[DATE-TIME, DELTA_ENTRIES, DELTA_EXITS], ...]}

    d = OrderedDict()
    for key, counts in agg.items():
        station, dt = key
        entries, exits = zip(*counts)
        d.setdefault(station, []).append([dt.date(), dt.weekday(), dt.hour, sum(entries), sum(exits)])
        
    return d

In [123]:
d = make_dict(raw_df)

In [124]:
def dataframe_from_dict(d):
    flattened = []
    for station, tally in d.items():
        for data in tally:
            flattened.append([station] + data)
    return pd.DataFrame(flattened, columns=['STATION', 'DATE-TIME', 'DATE' 'DAY', 'TIME', 'ENTRIES', 'EXITS'])

In [162]:
df = dataframe_from_dict(d)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195939 entries, 0 to 195938
Data columns (total 6 columns):
STATION      195939 non-null object
DATE-TIME    195939 non-null object
DATEDAY      195939 non-null int64
TIME         195939 non-null int64
ENTRIES      195939 non-null int64
EXITS        195939 non-null int64
dtypes: int64(4), object(2)
memory usage: 9.0+ MB


In [194]:
df.loc[(df.TIME%4 != 0) & ((df.TIME-1)%4 != 0), ['STATION', 'ENTRIES', 'EXITS']].groupby('STATION').sum()

Unnamed: 0_level_0,ENTRIES,EXITS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1
34 ST-PENN STA,3994359,2806657
42 ST-PORT AUTH,2782929,2180787
5 AV/53 ST,1616525,1760132
5 AV/59 ST,1443063,1001946
9TH STREET,0,0
FULTON ST,1964983,2184815
LACKAWANNA,0,0
TOMPKINSVILLE,56246,1
WALL ST,1373558,1042785
WILSON AV,0,0


In [198]:
df.loc[(df.STATION=='WALL ST') & (df.TIME%4 != 0) & ((df.TIME-1)%4 != 0), ['STATION', 'ENTRIES', 'EXITS']].sum()

STATION    WALL STWALL STWALL STWALL STWALL STWALL STWALL...
ENTRIES                                              1373558
EXITS                                                1042785
dtype: object

In [199]:
df.loc[(df.STATION=='WALL ST') & ((df.TIME%4 == 0) | ((df.TIME-1)%4 == 0)), ['STATION', 'ENTRIES', 'EXITS']].sum()

STATION    WALL STWALL STWALL STWALL STWALL STWALL STWALL...
ENTRIES                                              1824735
EXITS                                                1772330
dtype: object

In [204]:
raw_df.loc[(raw_df.TIME=='03:00:00')]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
11716,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,05/28/2016,03:00:00,REGULAR,1639953,452005
11722,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,05/29/2016,03:00:00,REGULAR,1640271,452073
11728,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,05/30/2016,03:00:00,REGULAR,1640526,452143
11734,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,05/31/2016,03:00:00,REGULAR,1640781,452204
11740,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,06/01/2016,03:00:00,REGULAR,1641911,452429
11746,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,06/02/2016,03:00:00,REGULAR,1643066,452675
11752,A077,R028,03-00-00,FULTON ST,ACJZ2345,BMT,06/03/2016,03:00:00,REGULAR,1644167,452902
11758,A077,R028,03-03-00,FULTON ST,ACJZ2345,BMT,05/28/2016,03:00:00,REGULAR,3942349,1927703
11764,A077,R028,03-03-00,FULTON ST,ACJZ2345,BMT,05/29/2016,03:00:00,REGULAR,3942546,1928103
11770,A077,R028,03-03-00,FULTON ST,ACJZ2345,BMT,05/30/2016,03:00:00,REGULAR,3942748,1928463


In [128]:
with open('mta-summer2016-rev03.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['STATION', 'DATE', 'DAY', 'TIME', 'ENTRIES', 'EXITS'])
    for k, v in d.items():
        for e in v:
            writer.writerow([k] + e)


In [133]:
pd.options.display.float_format = '{:,.2f}'.format

In [134]:
import pandas as pd
import seaborn as sns
import pandasql as pdsql

# read csv into panda data frame to see values read from file into data frame
subway=pd.read_csv('mta-summer2016-rev03.csv') 

In [135]:
#set up and execute sql to get average number of entries for a station-time on a weekday
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select station,time,avg(exits) from subway where (day<6) group by station,time order by avg(exits) desc;"
df1 = pysql(str1)

#print df1
# use panda groupby to return average entries and exits for each station-time
grouped_data_1 = subway.groupby(['STATION','TIME']).mean()
print grouped_data_1

                      DAY  ENTRIES    EXITS
STATION         TIME                       
1 AV            0    2.98 3,700.92 3,581.36
                4    3.00   940.86   695.68
                8    3.00 1,279.35 2,403.98
                12   3.00 4,266.93 4,204.04
                16   3.00 4,094.95 3,918.45
                20   2.99 5,756.08 5,920.59
103 ST          1    2.98 2,015.99 2,797.05
                5    3.00   266.25   481.05
                9    3.00 5,993.99 2,545.69
                13   3.00 6,743.75 3,231.26
                17   3.00 6,744.89 4,432.31
                21   3.00 5,979.39 5,634.56
103 ST-CORONA   0    2.98 1,193.92 2,966.07
                4    3.00   262.13 1,083.21
                8    3.00 5,632.31   605.25
                12   3.00 4,884.13 1,413.18
                16   3.00 3,182.87 2,319.04
                20   3.00 2,664.79 4,819.50
104 ST          0    2.98   198.13   348.53
                4    3.00    53.88   102.11
                8    3.00 1,398.