In [6]:
import csv
import requests
import json
import time as tm
import sqlite3
import traceback
import sys
from math import cos, asin, sqrt
from multiprocessing.dummy import Pool as ThreadPool
import os
import datetime
from IPython.display import clear_output
import pickle
import pandas as pd
import numpy as np

from tfl_journeyplanner_api_tools import *

In [12]:
# Origins: population-weighted MSOA centroids.
# Column 0 is used as the id, columns 4 and 5 as the coordinate strings.
with open('PopWeightedCentroids2011LondonMSOA_coords.csv', 'r') as orig_file:
    orig_records = [(rec[0], rec[4], rec[5]) for rec in csv.reader(orig_file, delimiter = ',')]
orig_coords = orig_records[1:] # the first record comes from the header row

# Destinations: jobs-weighted MSOA centroids.
# Column 0 is used as the id, columns 1 and 2 as the coordinate strings.
with open('JobsWeightedCentroids2011LondonMSOA_coords.csv', 'r') as dest_file:
    dest_records = [(rec[0], rec[1], rec[2]) for rec in csv.reader(dest_file, delimiter = ',')]
dest_coords = dest_records[1:] # the first record comes from the header row

# Copies of both lists are passed in; narrow the slices to generate fewer OD pairs
odpairs = GenerateODPairs(orig_coords[:], dest_coords[:])

# Sanity check that the import worked
print(len(orig_coords), len(dest_coords))
print(len(odpairs), "pairs generated. First 10:", odpairs[:10])

983 983
966289 pairs generated. First 10: [(('E02006854', '-0.025046237', '51.50164399'), ('E02000001', '-0.091155997', '51.51475914')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000002', '0.136596432', '51.58623085')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000003', '0.138810779', '51.5705831')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000004', '0.17624198', '51.56078086')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000005', '0.144375564', '51.56175313')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000007', '0.154249648', '51.55985824')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000008', '0.138580433', '51.55208708')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000009', '0.118662175', '51.55239182')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000010', '0.150976445', '51.5482683')), (('E02006854', '-0.025046237', '51.50164399'), ('E02000011', '0.162473217', '51.54805375'))]


In [5]:
# set up db
# The setup code below is deliberately disabled: it is wrapped in a bare string
# literal, so running this cell executes nothing (the notebook merely echoes the text).
# NOTE(review): if this is ever re-enabled, `batchsettings` must already exist (it is
# defined in a later cell), and with the values defined in this notebook
# ('&date=20190322', '&time=0830') the resulting path would be
# '&date=20190322_&time=0830/traveloptions_to_top10.db', which differs from the
# '20190322_0830/traveloptions.db' path used by the other cells -- confirm before use.
'''
Already run once - do not rerun! It will reset the database!

db = batchsettings['date'] + '_' + batchsettings['time'] + '/traveloptions_to_top10.db'
if not os.path.exists(batchsettings['date'] + '_' + batchsettings['time']):
    os.makedirs(batchsettings['date'] + '_' + batchsettings['time'])
setup_db(db)
'''

In [10]:
# set up network data
with open('stoplocations.json', 'r') as stops_file:
    stoplocations = json.load(stops_file)

# Lookup from each stop's NAPTAN property to its geometry coordinates
stoplist = {
    feature['properties']['NAPTAN']: feature['geometry']['coordinates']
    for feature in stoplocations['features']
}

# To resume from a saved snapshot instead of starting empty, use:
#with open('20190322_0830/networkdata_800000_900000', 'rb') as infile:
#    edges, nodes = pickle.load(infile)
edges, nodes = {}, {}
print(len(stoplist), len(edges), len(nodes))

12838 0 0


In [20]:
# set up currentpairs
# msoa_top10_jobs = ['E02000809','E02000970','E02000808','E02000575','E02000979',
#                    'E02000193','E02006854','E02000972','E02000977','E02000001'] # 10 MSOAs containing the most jobs

# Work on a fresh list holding the OD pairs from index 950000 onwards
currentpairs = list(odpairs[950000:]) # updated 20190218/0048

# Alternative: resume from a previously pickled list of outstanding pairs
#with open('20190322_0830/currentpairs850000', 'rb') as infile:
#    currentpairs = pickle.load(infile)
print(len(currentpairs))

16289


In [16]:
# set up batchsettings
# The two datetimes bound the processing window checked by the collection loop;
# the remaining entries are pre-formatted '&key=value' strings handed to call_api.
batchsettings = dict(
    processingstarttime=datetime.datetime(2019, 2, 17, 20, 0),
    processingendtime=datetime.datetime(2019, 2, 18, 6, 0),
    date='&date=20190322',
    time='&time=0830',
    timeIs='&timeIs=Arriving',
    journeyPreference='&journeyPreference=leastTime',
    useMultiModalCall='&useMultiModalCall=true',
)

# Path of the SQLite database that results are written to
db = '20190322_0830/traveloptions.db'

In [21]:
# Execute data collection from API and write to DB
# Multithreaded version
# Auto-looping to reprocess pairs with errors

# Tunables for the batching below. The 290-pairs-per-60-seconds pacing presumably
# keeps the run under an API request quota -- TODO confirm against the API terms.
BATCH_SIZE = 290
BATCH_INTERVAL_S = 60
N_THREADS = 12

while len(currentpairs) > 0:
    # Batching of OD pairs: the first batch takes the remainder (len % BATCH_SIZE),
    # every following batch takes a full BATCH_SIZE.
    startpos = 0
    nbatches = len(currentpairs) // BATCH_SIZE + 1
    initialbatchsize = len(currentpairs) % BATCH_SIZE
    processed = set()
    errors = []

    # timed start: idle until the configured processing window opens
    while datetime.datetime.now() < batchsettings['processingstarttime']:
        print('Time is', datetime.datetime.now(), '. Starting processing at', batchsettings['processingstarttime'])
        tm.sleep(10)
        clear_output()

    print("Starting processing:", len(currentpairs), "pairs in", nbatches, "batches.")

    for batch in range(nbatches):
        endpos = initialbatchsize + batch * BATCH_SIZE
        batchpairs = currentpairs[startpos:endpos]
        # next batch starts where this one ends
        startpos = endpos

        # When the number of pairs is an exact multiple of BATCH_SIZE the first
        # (remainder) batch is empty; skip it instead of sleeping a full interval.
        if not batchpairs:
            continue

        outputs = []
        # perf_counter() is a wall-clock timer. time.clock() was removed in
        # Python 3.8 and measured CPU time on Unix, which breaks the pacing below.
        starttime = tm.perf_counter()
        pool = ThreadPool(N_THREADS)
        try:
            # multithreaded query of this batch
            outputs.extend(pool.starmap(call_api, [(pair, stoplist, batchsettings) for pair in batchpairs]))
        finally:
            # release the worker threads even if a call raised
            pool.close()
            pool.join()

        downloadtime = tm.perf_counter() - starttime

        # single-thread write this batch's outputs to db - to avoid schema locks
        for output in outputs:
            for option in output:
                # option[4] is treated as the success flag; option[0] is the OD pair
                if option[4]:
                    write_to_db(db, option[0][0][0], option[0][1][0], option[1], option[2])
                    write_to_network(edges, nodes, option[3])
                    processed.add(option[0])
                else:
                    errors.append(option[0])

        processingtime = tm.perf_counter() - starttime

        print('Processed batch', batch, 'in', processingtime, 's (', str(downloadtime/processingtime * 100)[:4], '% download time).', 
              len(processed) + len(errors), 'pairs processed altogether with', len(errors), 'errors so far.')

        # check if processing time has expired
        if datetime.datetime.now() > batchsettings['processingendtime']:
            print('Abort - processing time ended at', datetime.datetime.now())
            break

        # if finished this batch in less than the interval, wait out the remainder
        if processingtime < BATCH_INTERVAL_S:
            print('Waiting', BATCH_INTERVAL_S - processingtime, 's before next batch.')
            tm.sleep(BATCH_INTERVAL_S - processingtime)
    print('Completed.')
    
    # check if processing time has expired
    if datetime.datetime.now() > batchsettings['processingendtime']:
        print('Abort - processing time ended at', datetime.datetime.now())
        break
        
    # if all pairs processed this round produced errors, then break the processing
    # NOTE(review): a pair for which call_api yields no options at all is neither
    # processed nor counted as an error, so it is retried every round until the
    # processing window ends -- confirm whether that is intended.
    if (len(errors) == len(currentpairs)):
        print('Abort - all pairs processed this batch produced errors.')
        break
    else:
        # keep only the pairs that still need (re)processing
        currentpairs = [pair for pair in currentpairs if pair not in processed]


Starting processing: 16289 pairs in 57 batches.
Processed batch 0 in 46.93143362640694 s ( 79.0 % download time). 49 pairs processed altogether with 0 errors so far.
Waiting 13.068566373593058 s before next batch.
Processed batch 1 in 281.57824571983656 s ( 78.0 % download time). 339 pairs processed altogether with 0 errors so far.
Processed batch 2 in 312.96887385760783 s ( 79.8 % download time). 629 pairs processed altogether with 1 errors so far.
Processed batch 3 in 424.04416221274005 s ( 86.3 % download time). 919 pairs processed altogether with 5 errors so far.
Processed batch 4 in 315.66441633505747 s ( 82.9 % download time). 1209 pairs processed altogether with 5 errors so far.
Processed batch 5 in 360.38804481700936 s ( 84.2 % download time). 1499 pairs processed altogether with 6 errors so far.
Processed batch 6 in 259.56675474951044 s ( 76.7 % download time). 1789 pairs processed altogether with 8 errors so far.
Processed batch 7 in 217.56075184248039 s ( 73.9 % download tim

In [29]:
# Persist the accumulated network data as one pickled (edges, nodes) tuple
network_snapshot = (edges, nodes)
with open('20190322_0830/networkdata_900000', 'wb') as snapshot_file: # updated 20190217/1035
    pickle.dump(network_snapshot, snapshot_file)

MemoryError: 

In [28]:
# if pickle dump fails due to low memory, dump to csv instead
# One ';'-delimited row per edges entry: key, value.
# NOTE(review): only `edges` is written here; `nodes` is not saved by this cell.
# newline='' lets the csv module control line endings itself; without it every
# row is followed by a blank line on Windows.
with open('20190322_0830/networkdata_900000_edges.csv', 'w', newline='') as outfile:
    dictwriter = csv.writer(outfile, delimiter = ';')
    dictwriter.writerows(edges.items())

In [30]:
# Drop every pair already recorded in `processed`; what remains is still outstanding
currentpairs = list(filter(lambda candidate: candidate not in processed, currentpairs)) # run 20190218/0048
len(currentpairs)

1

In [26]:
# Pickle the list of outstanding OD pairs to disk
with open('20190322_0830/currentpairs850000', 'wb') as pairs_file:
    pickle.dump(currentpairs, pairs_file)

In [31]:
# Display the OD pairs that are still outstanding
currentpairs

[(('E02000048', '-0.212724446', '51.60219278'),
  ('E02000048', '-0.215295512', '51.60251404'))]

In [35]:
# adding missing pairs
# NOTE: from here on `db` is an open sqlite3 connection, no longer the path string
# assigned in the batchsettings cell. Re-running the collection loop after this cell
# would pass the connection object to write_to_db instead of the path.
db = sqlite3.connect('20190322_0830/traveloptions.db')

In [36]:
# calculate average walk speed
# Only options consisting of exactly one leg, which is a walking leg, are used.
walk_query = 'SELECT orig_id, dest_id, option_id, traveltime, dist FROM traveloptions WHERE legs = 1 AND legs_walking = 1'
walktrips = pd.read_sql(walk_query, db)

walk_dist = walktrips['dist']
walk_time = walktrips['traveltime']

# Mean of the per-trip dist/traveltime ratios, reported in two unit scalings
walkspeed = np.mean(walk_dist / walk_time)
walkspeed_scaled = np.mean((walk_dist / 1000) / (walk_time / 60))
print('Mean', walkspeed, 'm/min')
print('Mean', walkspeed_scaled, 'km/h')

Mean 66.73182630050388 m/min
Mean 4.003909578030228 km/h


In [37]:
# Find origins whose same-zone pair (orig_id == dest_id) has no row in the database.

# One static query replaces a string-concatenated query per origin: it returns
# every id that already has at least one same-zone row.
same_zone_ids = set(
    pd.read_sql('SELECT DISTINCT orig_id FROM traveloptions WHERE orig_id = dest_id', db)['orig_id']
)

# Index the same-zone OD pairs by id in a single pass, so the full odpairs list
# is not rescanned for every missing origin.
same_zone_pairs = {}
for pair in odpairs:
    if pair[0][0] == pair[1][0]:
        same_zone_pairs.setdefault(pair[0][0], []).append(pair)

pairs_present = 0
pairs_missing = []
for coord in orig_coords:
    if coord[0] in same_zone_ids:
        pairs_present += 1
    else:
        # an id without a same-zone OD pair contributes nothing
        pairs_missing.extend(same_zone_pairs.get(coord[0], []))

print(pairs_present, len(pairs_missing))
print(pairs_missing)

974 9
[(('E02000954', '-0.162348358', '51.43267961'), ('E02000954', '-0.16239404', '51.43267485')), (('E02000345', '-0.069908973', '51.57408545'), ('E02000345', '-0.070098658', '51.57381318')), (('E02000444', '-0.301298513', '51.59715682'), ('E02000444', '-0.301204663', '51.59712327')), (('E02000638', '-0.124655411', '51.44946268'), ('E02000638', '-0.124364297', '51.44954231')), (('E02000438', '-0.363354407', '51.60730078'), ('E02000438', '-0.363501163', '51.60634633')), (('E02000648', '-0.100383308', '51.42597135'), ('E02000648', '-0.099983279', '51.42511491')), (('E02000058', '-0.197840395', '51.58261777'), ('E02000058', '-0.198032396', '51.58270095')), (('E02000315', '0.113142073', '51.49371007'), ('E02000315', '0.113230423', '51.49366285')), (('E02000048', '-0.212724446', '51.60219278'), ('E02000048', '-0.215295512', '51.60251404'))]


In [38]:
# Insert one synthetic single-leg walking option for every missing pair.
# Travel time is derived from the computed distance and the mean walk speed.
writecursor = db.cursor()
insert_sql = '''INSERT INTO traveloptions(orig_id, dest_id, option_id, starttime, traveltime,
    fare, transfers, legs, dist, first_zoned_station, last_zoned_station, dist_walking, legs_walking, time_walking)
    VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)'''
for pair in pairs_missing:
    origin, destination = pair
    endpoints = [[origin[1], origin[2]], [destination[1], destination[2]]]
    dist = calculate_distance(endpoints)
    walktime = dist / walkspeed
    # option_id 0, zero fare and transfers, one leg; the whole distance and time are walking
    params = (origin[0], destination[0], 0, '2019-03-22T08:27:00', walktime, 0, 0, 1, dist, None, None, dist, 1, walktime)
    writecursor.execute(insert_sql, params)
    # committed row by row, so rows already written survive a later failure
    db.commit()

In [39]:
# Close the sqlite3 connection opened for adding the missing pairs
db.close()