Post-process the data generated by `generate_data.ipynb`.
Concatenate the distance matrix data and put it in a pandas DataFrame.

In [24]:
from pprint import pprint
import os
import json
import numpy as np
import pandas as pd

Load the data object.
It is stored as a JSON file.

In [25]:
# Project root. Defaults to the original checkout location; set the
# PYCOMMUTE_BASE_PATH environment variable to run the notebook elsewhere
# without editing this cell.
base_path = os.environ.get(
    'PYCOMMUTE_BASE_PATH',
    os.path.join('C:\\', 'Users', 'glenn', 'src', 'pycommute'),
)
data_directory = os.path.join(base_path, 'data')
# Input: JSON data object. Output: HDF5 file the final dataframe is stored in.
data_file = os.path.join(data_directory, 'test_data.json')
dataframe_file = os.path.join(data_directory, 'test_data_dataframe.hdf5')
# Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) would garble or reject non-ASCII characters in the addresses.
with open(data_file, 'r', encoding='utf-8') as f:
    data_object = json.load(f)

Get relevant info from data object

In [26]:
# Unpack the top-level entries of the loaded data object that the following
# cells work with; the lookups happen in the order the keys are listed.
(
    distance_matrices_transit,
    distance_matrices_driving,
    origins_batches,
    destinations_geocodes,
) = (
    data_object[key]
    for key in (
        'distance_matrices_transit',
        'distance_matrices_driving',
        'origins_batches',
        'destinations_geocodes',
    )
)
# The destination labels are nested one level deeper, under 'config'.
destination_labels = data_object['config']['destination_labels']

In [27]:
# One (lat, lng) tuple per destination geocode, read from the geocode's
# ['geometry']['location'] entry.
destination_coords = [
    (location['lat'], location['lng'])
    for location in (
        geocode['geometry']['location'] for geocode in destinations_geocodes
    )
]

Create pandas table. Columns are:
origin lat,
origin lon,
origin address,
destination lat,
destination lon,
destination address,
destination label,
travel mode,
distance text,
distance value,
duration text,
duration value,
status,

Function that takes a batch and creates a list of dictionaries, each of which corresponds to one row in the dataframe.

In [28]:
def entries_from_batch(batch_matrix, mode, batch_origins, destination_coords, destination_labels=None):
    """Flatten one distance-matrix batch into a list of row dictionaries.

    One dictionary is produced per (origin, destination) pair. The number of
    destinations is taken from ``destination_coords`` rather than being fixed.

    Args:
        batch_matrix: Mapping with the keys ``'status'``,
            ``'origin_addresses'``, ``'destination_addresses'`` and ``'rows'``.
            Each row holds an ``'elements'`` list with one element per
            destination; each element has a ``'status'`` and, when that status
            is ``'OK'``, ``'distance'`` and ``'duration'`` mappings with
            ``'text'`` and ``'value'`` entries.
            (Presumably a Google Distance Matrix response -- confirm against
            ``generate_data.ipynb``.)
        mode: Value stored in the ``'travel mode'`` field of every entry.
        batch_origins: Origin coordinates, one per origin address, each
            indexable as ``[0]`` (lat) and ``[1]`` (lon).
        destination_coords: Destination coordinates, each indexable as
            ``[0]`` (lat) and ``[1]`` (lon).
        destination_labels: Optional labels, one per destination. Defaults to
            ``None`` for every destination.

    Returns:
        A list of dictionaries keyed by the dataframe column names. Elements
        whose status is not ``'OK'`` keep empty text fields and NaN values.

    Raises:
        AssertionError: If the batch status is not ``'OK'`` or the number of
            addresses/elements doesn't match the supplied coordinates.
    """
    n_destinations = len(destination_coords)
    if destination_labels is None:
        destination_labels = [None] * n_destinations
    entries = []
    assert batch_matrix['status'] == 'OK', "Batch status isn't `OK`. It's {}.".format(batch_matrix['status'])
    destination_addresses = batch_matrix['destination_addresses']
    origin_addresses = batch_matrix['origin_addresses']
    rows = batch_matrix['rows']

    assert len(origin_addresses) == len(batch_origins), "Number of origin addresses ({}) doesn't correspond to number of origin coordinates in `batch_origins` ({}).".format(len(origin_addresses), len(batch_origins))
    # Compare against the supplied coordinates instead of a hard-coded 2 so the
    # function works for any number of destinations.
    assert len(destination_addresses) == n_destinations, "Number of destination addresses ({}) doesn't correspond to number of destination coordinates ({}).".format(len(destination_addresses), n_destinations)

    for row, origin_address, origin_coordinate in zip(rows, origin_addresses, batch_origins):
        elements = row['elements']
        assert len(elements) == n_destinations, "Number of elements ({}) doesn't correspond to number of destination coordinates ({}).".format(len(elements), n_destinations)
        for element, destination_address, destination_coordinate, destination_label in zip(elements, destination_addresses, destination_coords, destination_labels):
            # Start from "no result" defaults; they are only overwritten when
            # the element actually carries distance/duration data.
            entry = {
                'origin lat': origin_coordinate[0],
                'origin lon': origin_coordinate[1],
                'origin address': origin_address,
                'destination lat': destination_coordinate[0],
                'destination lon': destination_coordinate[1],
                'destination address': destination_address,
                'destination label': destination_label,
                'travel mode': mode,
                'status': element['status'],
                'distance text': '',
                'distance value': np.nan,
                'duration text': '',
                'duration value': np.nan,
            }
            if element['status'] == 'OK':
                entry.update({
                    'distance text': element['distance']['text'],
                    'distance value': element['distance']['value'],
                    'duration text': element['duration']['text'],
                    'duration value': element['duration']['value'],
                })
            entries.append(entry)
    return entries

Build the dataframe

In [29]:
# Gather the row dictionaries of every batch and travel mode first, then build
# the frame in a single call. `DataFrame.append` was removed in pandas 2.0 and
# growing a frame inside a loop copies all existing rows on every iteration.
all_entries = []
matrices_by_mode = (
    ('driving', distance_matrices_driving),
    ('transit', distance_matrices_transit),
)
for mode, distance_matrices in matrices_by_mode:
    # Each distance-matrix batch is paired with the origins batch at the same
    # position in `origins_batches`.
    for batch_matrix, batch_origins in zip(distance_matrices, origins_batches):
        all_entries.extend(
            entries_from_batch(batch_matrix, mode, batch_origins, destination_coords, destination_labels)
        )

# Building from a list of dicts yields a fresh 0..n-1 index, matching the
# previous `ignore_index=True` behaviour.
df = pd.DataFrame(all_entries)

df

Unnamed: 0,origin lat,origin lon,origin address,destination lat,destination lon,destination address,destination label,travel mode,status,distance text,distance value,duration text,duration value
0,63.410974,10.343429,"Torshaugveien 21, 7020 Trondheim, Norway",63.412328,10.404471,"Klæbuveien 125, 7031 Trondheim, Norway",Lerkendal Stadion,driving,OK,7.1 km,7073.0,15 mins,908.0
1,63.410974,10.343429,"Torshaugveien 21, 7020 Trondheim, Norway",63.428781,10.473247,"Landbruksvegen 2, 7047 Trondheim, Norway",Ikea,driving,OK,11.2 km,11186.0,17 mins,1040.0
2,63.413828,10.343429,"Øvre Sverresborg 9, 7020 Trondheim, Norway",63.412328,10.404471,"Klæbuveien 125, 7031 Trondheim, Norway",Lerkendal Stadion,driving,OK,6.8 km,6775.0,14 mins,853.0
3,63.413828,10.343429,"Øvre Sverresborg 9, 7020 Trondheim, Norway",63.428781,10.473247,"Landbruksvegen 2, 7047 Trondheim, Norway",Ikea,driving,OK,10.9 km,10888.0,16 mins,984.0
4,63.425246,10.343429,"Møllebakken 60, 7020 Trondheim, Norway",63.412328,10.404471,"Klæbuveien 125, 7031 Trondheim, Norway",Lerkendal Stadion,driving,OK,5.9 km,5895.0,15 mins,917.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2647,63.405264,10.509335,"63.40526436,10.50933478",63.428781,10.473247,"Landbruksvegen 2, 7047 Trondheim, Norway",Ikea,transit,OK,4.5 km,4466.0,51 mins,3074.0
2648,63.408119,10.509335,"63.40811894,10.50933478",63.412328,10.404471,"63.4123278,10.404471",Lerkendal Stadion,transit,ZERO_RESULTS,,,,
2649,63.408119,10.509335,"63.40811894,10.50933478",63.428781,10.473247,"Landbruksvegen 2, 7047 Trondheim, Norway",Ikea,transit,OK,4.5 km,4522.0,52 mins,3135.0
2650,63.405264,10.512055,"63.40526436,10.51205454",63.412328,10.404471,"63.4123278,10.404471",Lerkendal Stadion,transit,ZERO_RESULTS,,,,


Store dataframe

In [30]:
# Write the frame under the key 'commute_data', overwriting any existing file
# (mode 'w'), with maximum compression. `key` and `mode` are passed by keyword
# because positional use beyond the path is deprecated in recent pandas.
df.to_hdf(dataframe_file, key='commute_data', mode='w', complevel=9)