# plot_wait_time.py
# Compare observed vs. scheduled waiting times and save two plots.
import bisect
import datetime
import json
import random
import seaborn
import numpy
import pandas
import scipy
from matplotlib import pyplot
def parse_time(t):
    """Convert an ``H:MM:SS`` clock string into a number of seconds.

    The hour field is not capped at 23, so a value such as ``25:10:00``
    simply yields more than 24 * 3600 seconds.

    Args:
        t: Time string made of exactly three colon-separated integer fields.

    Returns:
        The time expressed in seconds, as an int.

    Raises:
        ValueError: If ``t`` does not consist of three integer fields.
    """
    hours, minutes, seconds = (int(field) for field in t.split(':'))
    return hours * 60 * 60 + minutes * 60 + seconds
# Scheduled weekday arrivals: (stop_id, route) -> sorted list of arrival
# times in seconds since midnight, ready to be searched with bisect.
sched_trips = {}
with open('stop_times.txt') as stop_times_file:
    for i, row in enumerate(stop_times_file):
        if i == 0:
            continue  # first row is the CSV header
        trip_id, arr, dep, stop_id = row.strip().split(',')[:4]
        if 'WKD' not in trip_id:
            continue  # only trips whose id is tagged WKD are kept
        # Route name: third '_'-separated field, up to its first '.'.
        line = trip_id.split('_')[2].split('.')[0]
        key = (stop_id, line)
        # Arrival times may be written past 24:00:00 for trips that run
        # beyond midnight; reduce them to a time of day because lookups
        # against this table always use a value in [0, 24h).
        arr = parse_time(arr) % (24 * 60 * 60)
        sched_trips.setdefault(key, []).append(arr)
for stops in sched_trips.values():
    stops.sort()
# Observed weekday stop events: (stop_id, route) -> set of timestamps at
# which a vehicle reported current_status == 1 for that stop.
KNOWN_LINES = ('1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI')
# The weekday test is done in local time.  The script maps observations to a
# time of day with a fixed +19h (i.e. UTC-5) shift, so the same fixed offset
# is used here instead of testing the weekday in UTC.
LOCAL_UTC_OFFSET = -5 * 60 * 60
real_trips = {}
with open('log.jsons') as log_file:
    # One JSON document per input line, each holding a list of vehicles.
    for n_lines, record in enumerate(log_file):
        for vehicle in json.loads(record.strip()):
            if vehicle.get('current_status') != 1:  # STOPPED_AT
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if line not in KNOWN_LINES:
                    print('weird line %s' % (line,))
                    continue
                if 'stop_id' in vehicle:
                    stop_id = vehicle['stop_id']
                else:
                    # L and SI stop at every station; their records carry no
                    # stop_id, so build one from the stop sequence number
                    # and the last character of the trip id.
                    stop_id = '%d%s' % (vehicle['current_stop_sequence'],
                                        vehicle['trip']['trip_id'][-1])
                key = (stop_id, line)
                timestamp = vehicle['timestamp']
                t = datetime.datetime.utcfromtimestamp(timestamp + LOCAL_UTC_OFFSET)
                if t.weekday() < 5:
                    real_trips.setdefault(key, set()).add(timestamp)
            except Exception:
                # Best effort: report malformed records and keep going, but
                # let KeyboardInterrupt / SystemExit propagate.
                print('weird vehicle %s' % (vehicle,))
                continue
        if n_lines % 1000 == 0:
            print('%d ...' % n_lines)
# Paired samples (minutes) for the joint plot: scheduled wait vs. real wait.
xs = []
ys = []
MAX = 1800  # only waits shorter than this many seconds are recorded
# Real waits (minutes) bucketed by whole minutes of scheduled wait.
ys_by_x = [[] for _ in range(MAX // 60)]
print(ys_by_x)
max_time = 4 * 3600  # gaps between observations longer than this are skipped
for key, stops in real_trips.items():
    stops = sorted(stops)
    if len(stops) < 5:
        print('%s not enough stops' % (key,))
        continue  # stupid
    if key not in sched_trips:
        print('%s has no schedule' % (key,))
        continue
    schedule = sched_trips[key]

    # Sample random instants between the first and last observation and
    # compare the real wait with the scheduled wait at each instant.
    lo = stops[0]
    hi = stops[-1]
    for _ in range(len(stops)):  # pretty arbitrary number of samples
        t = lo + random.random() * (hi - lo)
        j = bisect.bisect(stops, t)
        if j >= len(stops):
            # Float rounding put the sample on the last observation, so
            # there is no later arrival to wait for.
            continue
        t0, t1 = stops[j - 1], stops[j]
        if t1 - t0 > max_time:
            continue
        real_wait_time = t1 - t

        # transform t to day offset (+19h is the same as -5h modulo one day)
        u = (t + (19 * 60 * 60)) % (24 * 60 * 60)
        j = bisect.bisect(schedule, u)
        if j < len(schedule):
            u1 = schedule[j]
        else:
            # Past the last scheduled arrival: wrap to the first one of the
            # following day.
            u1 = 24 * 60 * 60 + schedule[0]
        sched_wait_time = u1 - u

        if max(sched_wait_time, real_wait_time) < MAX:
            xs.append(sched_wait_time / 60.)
            ys.append(real_wait_time / 60.)
        if sched_wait_time < MAX:
            ys_by_x[int(sched_wait_time / 60.0)].append(real_wait_time / 60.)
# Hex-binned joint distribution of scheduled vs. real wait (minutes).
# x and y are passed by keyword because current seaborn releases no longer
# accept them positionally; older releases accept the keywords too.
seaborn.jointplot(x=numpy.array(xs), y=numpy.array(ys), kind='hex')
pyplot.savefig('wait_time_real_vs_sched_joint.png')
pyplot.clf()
# For every scheduled-wait bucket, compute percentiles of the real wait and
# draw one curve per percentile.
percs = [50, 60, 70, 80, 90]
results = [[] for _ in percs]
for x, bucket in enumerate(ys_by_x):
    print('%d %d' % (x, len(bucket)))
    if bucket:
        ps = numpy.percentile(bucket, percs)
    else:
        # A percentile of an empty bucket is undefined; record NaN so the
        # curves stay aligned with the bucket index.
        ps = [float('nan')] * len(percs)
    for i, y in enumerate(ps):
        results[i].append(y)

for perc, curve in zip(percs, results):
    pyplot.plot(range(len(curve)), curve, label='%d percentile' % perc)

pyplot.ylim([0, 60])
pyplot.title('How long do you have to wait given how much schedule predicts')
pyplot.xlabel('Scheduled waiting time (min)')
pyplot.ylabel('Real waiting time (min)')
pyplot.legend()
pyplot.savefig('wait_time_real_vs_sched_percentiles.png')