In [1]:
from __future__ import division, print_function

# MTA Data Challenges

In [2]:
import csv
from datetime import datetime
from collections import Counter

### Challenge 1

In [3]:
# Download the raw turnstile file into the working directory; `curl -O` saves
# it under its remote name, turnstile_150627.txt, which the next cell opens.
!curl -O http://web.mta.info/developers/data/nyct/turnstile/turnstile_150627.txt

curl: (6) Couldn't resolve host 'web.mta.info'


In [4]:
# Read the raw turnstile file into `rows`: a list of records, each a list of
# whitespace-stripped strings (the header record is still included here).
# newline='' hands line-ending handling to the csv module, which is what the
# csv documentation recommends for any file object passed to csv.reader.
with open('turnstile_150627.txt', newline='') as f:
    reader = csv.reader(f)
    rows = [[cell.strip() for cell in row] for row in reader]

In [5]:
# Verify that the first record is the expected header, then drop it so that
# `rows` holds data records only.
# The check and the removal are separate statements: the header is still
# removed when assertions are disabled (python -O), and a failed check leaves
# `rows` untouched instead of silently discarding its first record.
expected_header = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
                   'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
                   'EXITS']
assert rows[:1] == [expected_header], (
    'unexpected header row: %r' % (rows[:1],))
del rows[0]

In [6]:
# Group the readings by turnstile. The key is the first four columns
# (C/A, UNIT, SCP, STATION); it is converted to a tuple because dict keys
# must be hashable and a list slice is not.
raw_readings = {}
for row in rows:
    turnstile_key = tuple(row[:4])
    reading = tuple(row[4:])
    if turnstile_key not in raw_readings:
        raw_readings[turnstile_key] = []
    raw_readings[turnstile_key].append(reading)
    

In [7]:
# The same grouping written with defaultdict: a missing key starts out as an
# empty list, so no setdefault call is needed.

from collections import defaultdict

t = defaultdict(list)
for row in rows:
    key, reading = tuple(row[:4]), tuple(row[4:])
    t[key].append(reading)

# To inspect one entry: list(t.items())[0]

In [8]:
# Show one (turnstile key, readings) pair from the grouped data.
raw_reading_items = list(raw_readings.items())
raw_reading_items[0]

(('A002', 'R051', '02-00-00', 'LEXINGTON AVE'),
 [('NQR456',
   'BMT',
   '06/20/2015',
   '00:00:00',
   'REGULAR',
   '0005192500',
   '0001756572'),
  ('NQR456',
   'BMT',
   '06/20/2015',
   '04:00:00',
   'REGULAR',
   '0005192550',
   '0001756580'),
  ('NQR456',
   'BMT',
   '06/20/2015',
   '08:00:00',
   'REGULAR',
   '0005192568',
   '0001756609'),
  ('NQR456',
   'BMT',
   '06/20/2015',
   '12:00:00',
   'REGULAR',
   '0005192670',
   '0001756706'),
  ('NQR456',
   'BMT',
   '06/20/2015',
   '16:00:00',
   'REGULAR',
   '0005192886',
   '0001756776'),
  ('NQR456',
   'BMT',
   '06/20/2015',
   '20:00:00',
   'REGULAR',
   '0005193250',
   '0001756837'),
  ('NQR456',
   'BMT',
   '06/21/2015',
   '00:00:00',
   'REGULAR',
   '0005193409',
   '0001756875'),
  ('NQR456',
   'BMT',
   '06/21/2015',
   '04:00:00',
   'REGULAR',
   '0005193435',
   '0001756882'),
  ('NQR456',
   'BMT',
   '06/21/2015',
   '08:00:00',
   'REGULAR',
   '0005193461',
   '0001756900'),
  ('NQR456',
   

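`raw_readings` is a solution to Challenge 1: a dict that maps each turnstile key `(C/A, UNIT, SCP, STATION)` to the list of that turnstile's readings, each reading being a tuple of the remaining columns.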

### Challenge 2

In [44]:
# typical dict comprehension 

# {n:n**2 for n in range(5)}

In [40]:
# Build {turnstile: [(timestamp, cumulative entries), ...]} with a dict
# comprehension whose values are list comprehensions.
# Each raw reading unpacks as
# (linename, division, date, time, desc, entries, exits); only the date, the
# time and the cumulative entries counter are kept.
# The time is parsed with an explicit %H:%M:%S instead of %X, because %X is
# the locale's time representation and is not guaranteed to be HH:MM:SS.
datetime_cumulative = {turnstile: [(datetime.strptime(date + time,
                                                      '%m/%d/%Y%H:%M:%S'),
                                    int(in_cumulative))
                                   for _, _, date, time,
                                       _, in_cumulative, _ in rows]
                       for turnstile, rows in raw_readings.items()}

In [11]:
# Show one (turnstile key, parsed readings) pair.
cumulative_items = list(datetime_cumulative.items())
cumulative_items[0]

(('A002', 'R051', '02-00-00', 'LEXINGTON AVE'),
 [(datetime.datetime(2015, 6, 20, 0, 0), 5192500),
  (datetime.datetime(2015, 6, 20, 4, 0), 5192550),
  (datetime.datetime(2015, 6, 20, 8, 0), 5192568),
  (datetime.datetime(2015, 6, 20, 12, 0), 5192670),
  (datetime.datetime(2015, 6, 20, 16, 0), 5192886),
  (datetime.datetime(2015, 6, 20, 20, 0), 5193250),
  (datetime.datetime(2015, 6, 21, 0, 0), 5193409),
  (datetime.datetime(2015, 6, 21, 4, 0), 5193435),
  (datetime.datetime(2015, 6, 21, 8, 0), 5193461),
  (datetime.datetime(2015, 6, 21, 12, 0), 5193533),
  (datetime.datetime(2015, 6, 21, 16, 0), 5193752),
  (datetime.datetime(2015, 6, 21, 20, 0), 5193985),
  (datetime.datetime(2015, 6, 22, 0, 0), 5194109),
  (datetime.datetime(2015, 6, 22, 4, 0), 5194120),
  (datetime.datetime(2015, 6, 22, 8, 0), 5194164),
  (datetime.datetime(2015, 6, 22, 12, 0), 5194383),
  (datetime.datetime(2015, 6, 22, 16, 0), 5194686),
  (datetime.datetime(2015, 6, 22, 20, 0), 5195595),
  (datetime.datetime(2015

In [12]:
# Sanity check (nothing is sorted here): every turnstile's readings must
# already be in ascending (timestamp, counter) order, since differences
# between consecutive list items are only meaningful for ordered readings.
# The per-turnstile list is bound to `readings`, not `rows`, so the
# module-level `rows` list of CSV records is not overwritten by this cell.
for turnstile, readings in datetime_cumulative.items():
    assert readings == sorted(readings), (
        'readings out of order for turnstile %r' % (turnstile,))

In [13]:
# For each turnstile, pair every reading with the one that follows it and
# record [start timestamp, entries delta, elapsed time] for that interval.
datetime_count_times = {
    turnstile: [[start, next_total - total, end - start]
                for (start, total), (end, next_total)
                in zip(readings, readings[1:])]
    for turnstile, readings in datetime_cumulative.items()
}

In [18]:
# Show one (turnstile key, interval records) pair.
count_time_items = list(datetime_count_times.items())
count_time_items[0]

(('A002', 'R051', '02-00-00', 'LEXINGTON AVE'),
 [[datetime.datetime(2015, 6, 20, 0, 0), 50, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 20, 4, 0), 18, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 20, 8, 0), 102, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 20, 12, 0), 216, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 20, 16, 0), 364, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 20, 20, 0), 159, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 0, 0), 26, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 4, 0), 26, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 8, 0), 72, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 12, 0), 219, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 16, 0), 233, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 6, 21, 20, 0), 124, datetime.timedelta(0, 14400)],
  [datetime.datetime(2015, 

In [19]:
# Pool the entry deltas of all turnstiles in ascending order, then print the
# 50 largest to see what the top of the distribution looks like.
all_counts = sorted(delta
                    for intervals in datetime_count_times.values()
                    for _, delta, _ in intervals)
print(all_counts[-50:])

[2428, 2443, 2444, 2447, 2448, 2455, 2456, 2459, 2473, 2474, 2487, 2492, 2514, 2521, 2536, 2548, 2552, 2554, 2558, 2567, 2569, 2579, 2602, 2612, 2621, 2622, 2630, 2630, 2644, 2656, 2670, 2671, 2692, 2728, 2731, 2757, 2765, 2804, 2844, 2848, 2920, 2926, 2926, 3488, 87577, 130932, 131711, 131792, 531430, 117440499]


In [20]:
# Print the 1200 smallest deltas (all_counts was sorted ascending when built).
smallest_counts = all_counts[:1200]
print(smallest_counts)

[-1208464910, -8664797, -7691983, -1958436, -1283643, -531174, -131370, -131215, -130432, -87562, -73467, -65355, -2545, -1929, -1852, -1782, -1720, -1703, -1687, -1677, -1662, -1659, -1645, -1608, -1572, -1571, -1553, -1543, -1533, -1524, -1492, -1484, -1482, -1455, -1410, -1404, -1403, -1402, -1401, -1397, -1378, -1376, -1362, -1357, -1345, -1339, -1332, -1326, -1323, -1311, -1303, -1302, -1299, -1292, -1290, -1290, -1289, -1268, -1263, -1260, -1257, -1246, -1236, -1233, -1226, -1217, -1205, -1196, -1194, -1184, -1182, -1136, -1133, -1130, -1130, -1126, -1120, -1119, -1116, -1113, -1111, -1105, -1094, -1092, -1089, -1086, -1084, -1079, -1077, -1071, -1065, -1059, -1058, -1044, -1036, -1035, -1028, -1021, -1021, -1012, -1000, -995, -994, -991, -991, -978, -968, -965, -965, -962, -957, -957, -954, -952, -943, -937, -933, -932, -930, -919, -911, -908, -886, -885, -884, -882, -882, -873, -871, -869, -868, -862, -858, -842, -840, -838, -832, -825, -822, -820, -818, -814, -813, -801, -800,

In [23]:
# How long are the intervals between consecutive readings? Convert every
# elapsed timedelta to hours and print the ten most common durations.

all_times = [elapsed.total_seconds() / 60 / 60
             for intervals in datetime_count_times.values()
             for _start, _delta, elapsed in intervals]
interval_hours = Counter(all_times)
print(interval_hours.most_common(10))

[(4.0, 172791), (4.2, 10460), (8.0, 189), (4.433333333333334, 161), (0.02222222222222222, 80), (0.02277777777777778, 64), (0.022500000000000003, 55), (0.02777777777777778, 26), (3.963888888888889, 24), (0.03611111111111111, 24)]


In [26]:
# Keep only (interval start, entries delta) pairs whose delta lies in
# [0, 5000]; negative deltas and deltas above 5000 are dropped, and the
# elapsed-time component is discarded.
datetime_counts = {
    turnstile: [(start, delta)
                for start, delta, _elapsed in intervals
                if not (delta < 0 or delta > 5000)]
    for turnstile, intervals in datetime_count_times.items()
}

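`datetime_counts` is a solution to Challenge 2: for each turnstile, a list of `(interval start, entries during that interval)` pairs, keeping only counts between 0 and 5000.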

In [23]:
# Flatten the retained deltas and print the fraction of all deltas that
# survived the filtering.
all_good_counts = [delta
                   for kept in datetime_counts.values()
                   for _start, delta in kept]
kept_fraction = len(all_good_counts) / len(all_counts)
print(kept_fraction)

0.9941598540495402


In [26]:
# Sort the retained deltas in place and print the five largest.
all_good_counts.sort()
largest_good_counts = all_good_counts[-5:]
print(largest_good_counts)

[2848, 2920, 2926, 2926, 3488]


In [27]:
# Print the first five retained deltas (the smallest ones once the list has
# been sorted).
smallest_good_counts = all_good_counts[:5]
print(smallest_good_counts)

[0, 0, 0, 0, 0]


In [28]:
# Show one (turnstile key, filtered counts) pair.
count_items = list(datetime_counts.items())
count_items[0]

(('A002', 'R051', '02-00-00', 'LEXINGTON AVE'),
 [(datetime.datetime(2015, 6, 20, 0, 0), 50),
  (datetime.datetime(2015, 6, 20, 4, 0), 18),
  (datetime.datetime(2015, 6, 20, 8, 0), 102),
  (datetime.datetime(2015, 6, 20, 12, 0), 216),
  (datetime.datetime(2015, 6, 20, 16, 0), 364),
  (datetime.datetime(2015, 6, 20, 20, 0), 159),
  (datetime.datetime(2015, 6, 21, 0, 0), 26),
  (datetime.datetime(2015, 6, 21, 4, 0), 26),
  (datetime.datetime(2015, 6, 21, 8, 0), 72),
  (datetime.datetime(2015, 6, 21, 12, 0), 219),
  (datetime.datetime(2015, 6, 21, 16, 0), 233),
  (datetime.datetime(2015, 6, 21, 20, 0), 124),
  (datetime.datetime(2015, 6, 22, 0, 0), 11),
  (datetime.datetime(2015, 6, 22, 4, 0), 44),
  (datetime.datetime(2015, 6, 22, 8, 0), 219),
  (datetime.datetime(2015, 6, 22, 12, 0), 303),
  (datetime.datetime(2015, 6, 22, 16, 0), 909),
  (datetime.datetime(2015, 6, 22, 20, 0), 252),
  (datetime.datetime(2015, 6, 23, 0, 0), 23),
  (datetime.datetime(2015, 6, 23, 4, 0), 62),
  (datetime.

### Challenge 3

In [34]:
# Sum the per-interval counts into daily totals. day_counts maps each
# turnstile to a date-sorted list of (date, total) pairs; an interval is
# attributed to the date of its starting timestamp.
# The per-turnstile list is bound to `readings`, not `rows`, so the
# module-level `rows` list of CSV records is not overwritten by this cell.
day_counts = {}
for turnstile, readings in datetime_counts.items():
    by_day = {}
    for time, count in readings:
        day = time.date()
        by_day[day] = by_day.get(day, 0) + count
    day_counts[turnstile] = sorted(by_day.items())


list(day_counts.items())[0]

(('N135', 'R385', '01-03-00', 'ROCKAWAY BLVD'),
 [(datetime.date(2015, 6, 20), 271),
  (datetime.date(2015, 6, 21), 201),
  (datetime.date(2015, 6, 22), 561),
  (datetime.date(2015, 6, 23), 543),
  (datetime.date(2015, 6, 24), 555),
  (datetime.date(2015, 6, 25), 556),
  (datetime.date(2015, 6, 26), 543)])

`day_counts` is a solution to Challenge 3: for each turnstile, a date-sorted list of `(date, total entries)` pairs.

In [37]:
# Same daily totals as the previous cell, accumulated with defaultdict(int)
# so that a date seen for the first time starts from 0.
# day_counts maps each turnstile to a date-sorted list of (date, total) pairs.
# The per-turnstile list is bound to `readings`, not `rows`, so the
# module-level `rows` list of CSV records is not overwritten by this cell.
day_counts = {}
for turnstile, readings in datetime_counts.items():
    by_day = defaultdict(int)
    for time, count in readings:
        day = time.date()
        by_day[day] += count
    day_counts[turnstile] = sorted(by_day.items())


# The stored value is already a list, so it is displayed directly.
day_counts['N135', 'R385', '01-03-00', 'ROCKAWAY BLVD']

[(datetime.date(2015, 6, 20), 271),
 (datetime.date(2015, 6, 21), 201),
 (datetime.date(2015, 6, 22), 561),
 (datetime.date(2015, 6, 23), 543),
 (datetime.date(2015, 6, 24), 555),
 (datetime.date(2015, 6, 25), 556),
 (datetime.date(2015, 6, 26), 543)]