Post-process the data generated by `generate_data.ipynb`.
Concatenate the distance-matrix data into a single pandas DataFrame.

In [15]:
from pprint import pprint
import os
import json
import numpy as np
import pandas as pd

Load the data object.
It is stored as a JSON file.

In [16]:
# Locations of the input JSON and of the HDF5 file written at the end of the notebook.
# NOTE(review): `base_path` is an absolute, machine-specific path; adjust it when running elsewhere.
base_path = os.path.join('C:\\', 'Users', 'glenn', 'src', 'pycommute')
data_directory = os.path.join(base_path, 'data')
data_file, dataframe_file = (
    os.path.join(data_directory, file_name)
    for file_name in ('test_data.json', 'test_data_dataframe.hdf5')
)

# Read the file and parse the whole JSON document into `data_object`.
with open(data_file, 'r') as f:
    data_object = json.loads(f.read())

Get relevant info from data object

In [17]:
# Top-level entries of the data object that are used further down.
(
    distance_matrices_transit,
    distance_matrices_driving,
    origins_batches,
    destinations_geocodes,
) = (
    data_object[key]
    for key in (
        'distance_matrices_transit',
        'distance_matrices_driving',
        'origins_batches',
        'destinations_geocodes',
    )
)
# The destination labels live one level deeper, under the stored config.
destination_labels = data_object['config']['destination_labels']

In [18]:
# One (lat, lng) tuple per destination geocode, in the same order as `destinations_geocodes`.
destination_coords = [
    (location['lat'], location['lng'])
    for location in (geocode['geometry']['location'] for geocode in destinations_geocodes)
]

Create pandas table. Columns are:
origin lat,
origin lon,
origin address,
destination lat,
destination lon,
destination address,
destination label,
travel mode,
distance text,
distance value,
duration text,
duration value,
status,

Function that takes a batch and creates a list of dictionaries, each of which corresponds to a row in the dataframe.

In [19]:
def entries_from_batch(batch_matrix, mode, batch_origins, destination_coords, destination_labels=None):
    """Flatten one distance-matrix batch into a list of dataframe rows.

    One entry is produced for every (origin, destination) pair in the batch.

    Parameters
    ----------
    batch_matrix : dict
        Distance-matrix result for one batch. The keys read here are
        ``'status'``, ``'origin_addresses'``, ``'destination_addresses'`` and
        ``'rows'``; every row carries an ``'elements'`` list with one element
        per destination.
    mode : str
        Travel mode, stored verbatim in the ``'travel mode'`` field.
    batch_origins : sequence
        Origin coordinates, one per origin address and in the same order.
        Index 0 is stored as latitude, index 1 as longitude.
    destination_coords : sequence
        Destination coordinates, one per destination address and in the same
        order. Index 0 is stored as latitude, index 1 as longitude.
    destination_labels : sequence, optional
        One label per destination. Defaults to ``None`` for every destination.

    Returns
    -------
    list of dict
        One dictionary per (origin, destination) pair. The distance/duration
        texts are ``''`` and the values are NaN when the element status is
        not ``'OK'``.

    Raises
    ------
    AssertionError
        If the batch status is not ``'OK'`` or if the number of addresses,
        coordinates, labels or elements do not match each other.
    """
    if destination_labels is None:
        destination_labels = [None] * len(destination_coords)
    entries = []
    assert batch_matrix['status'] == 'OK', "Batch status isn't `OK`. It's {}.".format(batch_matrix['status'])
    destination_addresses = batch_matrix['destination_addresses']
    origin_addresses = batch_matrix['origin_addresses']
    rows = batch_matrix['rows']

    # The expected destination count comes from the caller's coordinates rather
    # than a hard-coded 2, so the function works for any number of destinations.
    # The checks matter because `zip` below would silently drop unmatched items.
    n_destinations = len(destination_coords)
    assert len(origin_addresses) == len(batch_origins), "Number of origin addresses ({}) doesn't correspond to number of origin coordinates in `batch_origins` ({}).".format(len(origin_addresses), len(batch_origins))
    assert len(destination_addresses) == n_destinations, "Number of destination addresses ({}) doesn't correspond to number of destination coordinates in `destination_coords` ({}).".format(len(destination_addresses), n_destinations)
    assert len(destination_labels) == n_destinations, "Number of destination labels ({}) doesn't correspond to number of destination coordinates in `destination_coords` ({}).".format(len(destination_labels), n_destinations)

    for row, origin_address, origin_coordinate in zip(rows, origin_addresses, batch_origins):
        elements = row['elements']
        assert len(elements) == n_destinations, "Number of elements ({}) isn't {}".format(len(elements), n_destinations)
        for element, destination_address, destination_coordinate, destination_label in zip(elements, destination_addresses, destination_coords, destination_labels):
            # Start from the "no result" placeholders; they are overwritten
            # below only when this element actually holds a result.
            entry = {
                'origin lat': origin_coordinate[0],
                'origin lon': origin_coordinate[1],
                'origin address': origin_address,
                'destination lat': destination_coordinate[0],
                'destination lon': destination_coordinate[1],
                'destination address': destination_address,
                'destination label': destination_label,
                'travel mode': mode,
                'status': element['status'],
                'distance text': '',
                'distance value': np.nan,
                'duration text': '',
                'duration value': np.nan,
            }
            if element['status'] == 'OK':
                distance = element['distance']
                duration = element['duration']
                entry.update({
                    'distance text': distance['text'],
                    'distance value': distance['value'],
                    'duration text': duration['text'],
                    'duration value': duration['value'],
                })
            entries.append(entry)
    return entries

Build the dataframe

In [20]:
# Collect every row first and build the frame once. `DataFrame.append` was
# removed in pandas 2.0, and appending batch by batch copied the whole frame
# on every iteration.
matrices_by_mode = (
    ('driving', distance_matrices_driving),
    ('transit', distance_matrices_transit),
)

all_entries = []
for mode, distance_matrices in matrices_by_mode:
    # Each distance matrix is paired with the origins batch at the same position.
    for batch_matrix, batch_origins in zip(distance_matrices, origins_batches):
        all_entries.extend(
            entries_from_batch(batch_matrix, mode, batch_origins, destination_coords, destination_labels)
        )

df = pd.DataFrame(all_entries)
if not df.empty:
    # NaN marks elements without a result, so keep these columns float even
    # when every element happens to be 'OK'; the stored schema then does not
    # depend on the data.
    df = df.astype({'distance value': 'float64', 'duration value': 'float64'})

df

Unnamed: 0,origin lat,origin lon,origin address,destination lat,destination lon,destination address,destination label,travel mode,status,distance text,distance value,duration text,duration value
0,59.928590,10.444782,"Ringeriksveien 254C, 1340 Skui, Norway",59.974551,11.048086,"Gunnar Randers Vei 48, 2007 Kjeller, Norway",FFI,driving,OK,45.0 km,45034.0,42 mins,2500.0
1,59.928590,10.444782,"Ringeriksveien 254C, 1340 Skui, Norway",59.666175,10.768215,"Universitetstunet 3, 1430 Ås, Norway",NMBU,driving,OK,53.4 km,53366.0,49 mins,2959.0
2,59.931288,10.444782,"Berghoffveien 65, 1340 Skui, Norway",59.974551,11.048086,"Gunnar Randers Vei 48, 2007 Kjeller, Norway",FFI,driving,OK,45.0 km,45010.0,40 mins,2388.0
3,59.931288,10.444782,"Berghoffveien 65, 1340 Skui, Norway",59.666175,10.768215,"Universitetstunet 3, 1430 Ås, Norway",NMBU,driving,OK,53.3 km,53342.0,48 mins,2861.0
4,59.925891,10.450150,"Jarenveien 2B, 1340 Skui, Norway",59.974551,11.048086,"Gunnar Randers Vei 48, 2007 Kjeller, Norway",FFI,driving,OK,44.7 km,44698.0,41 mins,2436.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16903,59.925891,11.078121,"Nedre Rælingsveg 310, 2008 Fjerdingby, Norway",59.666175,10.768215,"Universitetstunet 3, 1430 Ås, Norway",NMBU,transit,OK,57.0 km,57007.0,1 hour 34 mins,5641.0
16904,59.928590,11.078121,"Møllervegen 15D, 2008 Fjerdingby, Norway",59.974551,11.048086,"Gunnar Randers Vei 48, 2007 Kjeller, Norway",FFI,transit,OK,7.9 km,7904.0,46 mins,2741.0
16905,59.928590,11.078121,"Møllervegen 15D, 2008 Fjerdingby, Norway",59.666175,10.768215,"Universitetstunet 3, 1430 Ås, Norway",NMBU,transit,OK,56.8 km,56794.0,1 hour 40 mins,5984.0
16906,59.931288,11.078121,"Harabakken 4, 2008 Fjerdingby, Norway",59.974551,11.048086,"Gunnar Randers Vei 48, 2007 Kjeller, Norway",FFI,transit,OK,7.7 km,7655.0,43 mins,2589.0


Store dataframe

In [21]:
# Write the frame under the key 'commute_data', overwriting any existing file
# (mode 'w') and using the highest compression level. Everything except the
# path is passed by keyword: positional `key`/`mode` are deprecated since
# pandas 2.1 and keyword-only from pandas 3.0.
df.to_hdf(dataframe_file, key='commute_data', mode='w', complevel=9)