In [1]:
import pandas as pd
import requests
import json
import os
from datetime import datetime
import uuid
from IPython.display import display_javascript, display_html, display
import networkx as nx

class RenderJSON(object):
    def __init__(self, json_data):
        if isinstance(json_data, dict):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json
        self.uuid = str(uuid.uuid4())
        
    def _ipython_display_(self):
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

def make_query(query, path, update=False, app_id="65d0aab0", app_key="cdd81044920314738d75500411d1b504"):
    if update:
        url = f"https://api.tfl.gov.uk{query}app_id={app_id}&app_key={app_key}"
        print(query)
        response = requests.get(url)
        parsed = json.loads(response.text)
        latest_date = datetime.today().strftime('%Y-%m-%d')
        os.makedirs(path+latest_date+os.path.split(query)[0], exist_ok=True)
        with open(f"{path}{latest_date}{query[:-1]}.json", "w") as f:
            json.dump(parsed, f)
    else:
        latest_date = sorted(os.listdir(path))[-1]
        with open(f"{path}{latest_date}{query[:-1]}.json", "r") as f:
            parsed = json.load(f)
    return parsed

### Fetch data (either from old records, or from the API)

In [2]:
path = "../data_raw/tfl_api/"
update_data = False

df_routes = pd.DataFrame({"name":[], "line_id":[]})
df_times = pd.DataFrame({"route":[], "line_id":[], "from_naptan":[], "to_naptan":[], "transit_time":[]})
df_stations = pd.DataFrame({"name":[], "line_id":[], "naptan":[], "lat":[], "lon":[]})

# find the the lines for each mode
for mode in ["tube"]:
    #query = f"/Line/Mode/{mode}/Route?"
    #tmp = make_query(query, "../tmp/", update=True)
    query = f"/Line/Mode/{mode}?"
    json_lines = make_query(query, path, update=update_data)
    
    # find the stations and routes for each line
    for json_line in json_lines:
        line_id = json_line["id"]
        
        # find the stations
        query = f"/Line/{line_id}/StopPoints?"
        json_stations = make_query(query, path, update=update_data)
        for entity in json_stations:
            df_stations = df_stations.append({"name":entity["commonName"], \
                                              "line_id":line_id, \
                                              "naptan":entity["naptanId"], \
                                              "lat":entity["lat"], \
                                              "lon":entity["lon"]}, \
                                             ignore_index=True)
        
        # find the routes
        for direction in ["inbound","outbound"]:
            query = f"/Line/{line_id}/Route/Sequence/{direction}?"
            json_routes = make_query(query, path, update=update_data)
            
            # record data from each route
            for json_route in json_routes["orderedLineRoutes"]:
                name = json_route["name"]
                naptan_ids = json_route["naptanIds"]

                # record waypoints
                #for from_naptan,to_naptan in zip(naptan_ids[:-1],naptan_ids[1:]):
                #    df_routes = df_routes.append({"route":name, \
                #                                  "line_id":line_id, \
                #                                  "from_naptan":from_naptan, \
                #                                  "to_naptan":to_naptan}, \
                #                                 ignore_index=True)
                df_routes = df_routes.append({"name":name, \
                                              "line_id":line_id}, \
                                             ignore_index=True)
                    
                # record journey times
                query = f"/Line/{line_id}/Timetable/{naptan_ids[0]}/To/{naptan_ids[-1]}?"
                json_times = make_query(query, path, update=update_data)
                #succeeded_for_route = False
                for route in json_times["timetable"]["routes"]:              
                    for station_intervals in route["stationIntervals"]:
                        # do the intervals match the route waypoints?
                        test_naptan_ids = [naptan_ids[0]]
                        for interval in station_intervals["intervals"]:
                            test_naptan_ids.append(interval["stopId"])
                        # if yes, then record the timings
                        if naptan_ids==test_naptan_ids:
                            succeeded_for_route = True
                            last_time_to_arrival = 0
                            from_naptan = naptan_ids[0]
                            for interval in station_intervals["intervals"]:
                                to_naptan = interval["stopId"]
                                current_time_to_arrival = interval["timeToArrival"]
                                transit_time = current_time_to_arrival-last_time_to_arrival
                                df_times = df_times.append({"route":name, \
                                                            "line_id":line_id, \
                                                            "from_naptan":from_naptan, \
                                                            "to_naptan":to_naptan, \
                                                            "transit_time":transit_time}, \
                                                           ignore_index=True)
                                last_time_to_arrival = current_time_to_arrival
                                from_naptan = to_naptan
                                
                                

In [3]:
df_times["to_route"] = df_times["route"]
df_times.rename(inplace=True, columns={"route":"from_route"})
df_times.head()

Unnamed: 0,from_route,line_id,from_naptan,to_naptan,transit_time,to_route
0,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUHAW,940GZZLUKEN,2.0,Harrow & Wealdstone &harr; Elephant & Castle
1,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUKEN,940GZZLUSKT,2.0,Harrow & Wealdstone &harr; Elephant & Castle
2,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUSKT,940GZZLUNWY,2.0,Harrow & Wealdstone &harr; Elephant & Castle
3,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUNWY,940GZZLUWYC,2.0,Harrow & Wealdstone &harr; Elephant & Castle
4,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUWYC,940GZZLUSGP,3.0,Harrow & Wealdstone &harr; Elephant & Castle


In [4]:
# show routes with no timing values
set(df_routes["name"].unique()) - set(df_times["from_route"].unique()) - set(df_times["to_route"].unique())

{'Amersham  &harr;  Aldgate ',
 'Edgware Road (Circle Line)  &harr;  Hammersmith (H&C Line) ',
 'Edgware Road (Circle Line)  &harr;  Kensington (Olympia) ',
 'Edgware Road (Circle Line)  &harr;  Richmond ',
 'Hainault  &harr;  West Ruislip  via Woodford',
 'Hammersmith (H&C Line)  &harr;  Edgware Road (Circle Line) ',
 'Richmond  &harr;  Edgware Road (Circle Line) ',
 'West Ruislip  &harr;  Hainault  via Woodford'}

In [5]:
# show routes with too many timing values
df_times.groupby(by=["from_route","to_route","line_id","from_naptan","to_naptan"]) \
        .agg({"transit_time":"count"}) \
        .rename(columns={"transit_time":"count_of_transit_time"}) \
        .sort_values(by="count_of_transit_time", ascending=False) \
        .head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count_of_transit_time
from_route,to_route,line_id,from_naptan,to_naptan,Unnamed: 5_level_1
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUERC,940GZZLUBST,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLULVT,940GZZLUADE,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUADE,940GZZLUWPL,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBBB,940GZZLUWHM,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBBN,940GZZLUMGT,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBST,940GZZLUGPS,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBWR,940GZZLUBBB,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUEHM,940GZZLUBKG,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUESQ,940GZZLUKSX,2
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUFCN,940GZZLUBBN,2


In [6]:
# show timing values for routes with non-unique timings
tmp = df_times.groupby(by=["from_route","to_route","line_id","from_naptan","to_naptan"]) \
              .agg({"transit_time":list})
tmp.loc[tmp["transit_time"].apply(lambda x: (len(x)>1) & (len(set(x))!=1))]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,transit_time
from_route,to_route,line_id,from_naptan,to_naptan,Unnamed: 5_level_1
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBBB,940GZZLUWHM,"[2.0, 3.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUBBN,940GZZLUMGT,"[1.0, 2.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUEHM,940GZZLUBKG,"[4.0, 5.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUERC,940GZZLUBST,"[4.0, 3.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUESQ,940GZZLUKSX,"[3.0, 2.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUGPS,940GZZLUESQ,"[1.0, 2.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUMED,940GZZLUBWR,"[2.0, 1.0]"
Hammersmith (H&C Line) &harr; Barking,Hammersmith (H&C Line) &harr; Barking,hammersmith-city,940GZZLUWHM,940GZZLUPLW,"[2.0, 1.0]"


In [7]:
df_times.head()

Unnamed: 0,from_route,line_id,from_naptan,to_naptan,transit_time,to_route
0,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUHAW,940GZZLUKEN,2.0,Harrow & Wealdstone &harr; Elephant & Castle
1,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUKEN,940GZZLUSKT,2.0,Harrow & Wealdstone &harr; Elephant & Castle
2,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUSKT,940GZZLUNWY,2.0,Harrow & Wealdstone &harr; Elephant & Castle
3,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUNWY,940GZZLUWYC,2.0,Harrow & Wealdstone &harr; Elephant & Castle
4,Harrow & Wealdstone &harr; Elephant & Castle,bakerloo,940GZZLUWYC,940GZZLUSGP,3.0,Harrow & Wealdstone &harr; Elephant & Castle


In [19]:
update_df = True

if update_df:

    pedestrian_transit_time = 5

    # there's not many duplicate transit_times, and they're not dissimilar, so let's just take the average
    df_times2 = df_times.groupby(by=["from_route","to_route","line_id","from_naptan","to_naptan"]) \
                        .agg({"transit_time":"mean"}) \
                        .reset_index()

    # let's add duplicate the route column so that we can add some interchanges
    #df_times2.rename(columns={"route":"from_route"}, inplace=True)
    #df_times2["to_route"] = df_times2["from_route"]
    #df_times2["from_line_id"] = df_times2["line_id"]
    #df_times2["to_line_id"] = df_times2["line_id"]

    # let's add some pedestrian interchanges
    unique_naptans = set(df_times2["from_naptan"].to_list() + df_times2["to_naptan"].to_list())
    for naptan in unique_naptans:
        routes = set(df_times2.loc[(df_times2["from_naptan"]==naptan) | \
                                   (df_times2["to_naptan"]==naptan), \
                                   "from_route"].to_list() + \
                     df_times2.loc[(df_times2["from_naptan"]==naptan) | \
                                   (df_times2["to_naptan"]==naptan), \
                                   "to_route"].to_list())

        # firstly, interchanging between train lines
        for from_route in routes:
            for to_route in routes:
                if from_route!=to_route:
                    df_times2 = df_times2.append({"line_id":"pedestrian", \
                                                  "from_route":from_route, \
                                                  "to_route":to_route, \
                                                  "from_naptan":naptan, \
                                                  "to_naptan":naptan, \
                                                  "transit_time":pedestrian_transit_time}, \
                                                 ignore_index=True)
        # secondly, entering/exiting the station
        for route in routes:
            df_times2 = df_times2.append({"line_id":"pedestrian", \
                                          "from_route":route, \
                                          "to_route":"EntEx", \
                                          "from_naptan":naptan, \
                                          "to_naptan":naptan, \
                                          "transit_time":pedestrian_transit_time}, \
                                         ignore_index=True) \
                                 .append({"line_id":"pedestrian", \
                                          "from_route":"EntEx", \
                                          "to_route":route, \
                                          "from_naptan":naptan, \
                                          "to_naptan":naptan, \
                                          "transit_time":pedestrian_transit_time}, \
                                         ignore_index=True)  
        
        route_lines = df_routes.set_index("name").to_dict()["line_id"]
        #route_lines["Entrance"] = ""
        #route_lines["Exit"] = ""
        route_lines["EntEx"] = ""
        #df_times2["from_line_id"] = df_times2["from_route"].map(route_lines)
        #df_times2["to_line_id"] = df_times2["to_route"].map(route_lines)
        
        station_names = df_stations.set_index("naptan").to_dict()["name"]
        df_times2["from_name"] = df_times2["from_naptan"].map(station_names)
        df_times2["to_name"] = df_times2["to_naptan"].map(station_names)
        
        station_lats = df_stations.set_index("naptan").to_dict()["lat"]
        station_lons = df_stations.set_index("naptan").to_dict()["lon"]
        df_times2["from_lat"] = df_times2["from_naptan"].map(station_lats)
        df_times2["from_lon"] = df_times2["from_naptan"].map(station_lons)
        df_times2["to_lat"] = df_times2["to_naptan"].map(station_lats)
        df_times2["to_lon"] = df_times2["to_naptan"].map(station_lons)
        
        df_times2.to_csv("../data_processed/api_times.csv", index=False)
            
else:
    df_times2 = pd.read_csv("../data_processed/api_times.csv")

df_times2.head()

Unnamed: 0,from_route,to_route,line_id,from_naptan,to_naptan,transit_time,from_name,to_name,from_lat,from_lon,to_lat,to_lon
0,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUALD,940GZZLULVT,2.0,Aldgate Underground Station,Liverpool Street Underground Station,51.514246,-0.075689,51.517372,-0.083182
1,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUBBN,940GZZLUFCN,2.0,Barbican Underground Station,Farringdon Underground Station,51.520275,-0.097993,51.520252,-0.104913
2,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUBST,940GZZLUFYR,6.0,Baker Street Underground Station,Finchley Road Underground Station,51.522883,-0.15713,51.546825,-0.179845
3,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUCAL,940GZZLUAMS,4.0,Chalfont & Latimer Underground Station,Amersham Underground Station,51.667985,-0.560689,51.674126,-0.607714
4,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUCYD,940GZZLUCAL,4.0,Chorleywood Underground Station,Chalfont & Latimer Underground Station,51.654358,-0.518461,51.667985,-0.560689


In [20]:
df_times2[df_times2.isna().any(axis=1)]

Unnamed: 0,from_route,to_route,line_id,from_naptan,to_naptan,transit_time,from_name,to_name,from_lat,from_lon,to_lat,to_lon


### Let's build a graph and save the result

In [21]:
source_nodes = (df_times2["from_route"] + ": " + df_times2["from_naptan"]).to_list()
source_names = df_times2["from_name"].to_list()
source_lats = df_times2["from_lat"].to_list()
source_lons = df_times2["from_lon"].to_list()
#source_lines = df_times2["from_line_id"].to_list()
target_nodes = (df_times2["to_route"] + ": " + df_times2["to_naptan"]).to_list()
target_names = df_times2["to_name"].to_list()
target_lats = df_times2["to_lat"].to_list()
target_lons = df_times2["to_lon"].to_list()
#target_lines = df_times2["to_line_id"].to_list()

edge_weights = df_times2["transit_time"].to_list()
edge_lines = df_times2["line_id"].to_list()

ebunch = ((s, t, {"weight":w,"line_id":l}) for s,t,w,l in zip(source_nodes, target_nodes, edge_weights, edge_lines))
G = nx.DiGraph()
G.add_edges_from(ebunch)
nx.set_node_attributes(G, {k:v for k,v in zip(source_nodes,source_names)}, name="name")
nx.set_node_attributes(G, {k:v for k,v in zip(source_nodes,source_lats)}, name="lat")
nx.set_node_attributes(G, {k:v for k,v in zip(source_nodes,source_lons)}, name="lon")
nx.set_node_attributes(G, {k:v for k,v in zip(target_nodes,target_names)}, name="name")
nx.set_node_attributes(G, {k:v for k,v in zip(target_nodes,target_lats)}, name="lat")
nx.set_node_attributes(G, {k:v for k,v in zip(target_nodes,target_lons)}, name="lon")
#nx.set_node_attributes(G, {k:v for k,v in zip(source_nodes,source_lines)}, name="line_id")
#nx.set_node_attributes(G, {k:v for k,v in zip(target_nodes,target_lines)}, name="line_id")

In [22]:
print(G.edges['Aldgate  &harr;  Amersham : 940GZZLUALD', 'Aldgate  &harr;  Amersham : 940GZZLULVT'])
print(G.nodes['Aldgate  &harr;  Amersham : 940GZZLUALD'])
print(G.nodes[('Aldgate  &harr;  Amersham ', '940GZZLUALD')])

{'weight': 2.0, 'line_id': 'metropolitan'}
{'name': 'Aldgate Underground Station', 'lat': 51.514246, 'lon': -0.075689}


KeyError: ('Aldgate  &harr;  Amersham ', '940GZZLUALD')

In [23]:
data = nx.readwrite.json_graph.node_link_data(G)
with open("../data_processed/nx_graph.json", "w") as f:
    json.dump(data, f)

In [24]:
df_times2

Unnamed: 0,from_route,to_route,line_id,from_naptan,to_naptan,transit_time,from_name,to_name,from_lat,from_lon,to_lat,to_lon
0,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUALD,940GZZLULVT,2.0,Aldgate Underground Station,Liverpool Street Underground Station,51.514246,-0.075689,51.517372,-0.083182
1,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUBBN,940GZZLUFCN,2.0,Barbican Underground Station,Farringdon Underground Station,51.520275,-0.097993,51.520252,-0.104913
2,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUBST,940GZZLUFYR,6.0,Baker Street Underground Station,Finchley Road Underground Station,51.522883,-0.157130,51.546825,-0.179845
3,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUCAL,940GZZLUAMS,4.0,Chalfont & Latimer Underground Station,Amersham Underground Station,51.667985,-0.560689,51.674126,-0.607714
4,Aldgate &harr; Amersham,Aldgate &harr; Amersham,metropolitan,940GZZLUCYD,940GZZLUCAL,4.0,Chorleywood Underground Station,Chalfont & Latimer Underground Station,51.654358,-0.518461,51.667985,-0.560689
...,...,...,...,...,...,...,...,...,...,...,...,...
15688,EntEx,Epping &harr; Ealing Broadway,pedestrian,940GZZLUSWF,940GZZLUSWF,5.0,South Woodford Underground Station,South Woodford Underground Station,51.591907,0.027338,51.591907,0.027338
15689,Ealing Broadway &harr; Epping,EntEx,pedestrian,940GZZLUSWF,940GZZLUSWF,5.0,South Woodford Underground Station,South Woodford Underground Station,51.591907,0.027338,51.591907,0.027338
15690,EntEx,Ealing Broadway &harr; Epping,pedestrian,940GZZLUSWF,940GZZLUSWF,5.0,South Woodford Underground Station,South Woodford Underground Station,51.591907,0.027338,51.591907,0.027338
15691,Epping &harr; West Ruislip,EntEx,pedestrian,940GZZLUSWF,940GZZLUSWF,5.0,South Woodford Underground Station,South Woodford Underground Station,51.591907,0.027338,51.591907,0.027338
