# Setup
This street cube is created using Hangzhou data from November and December of 2011.

In [1]:
# Imports to help query the cube, these are used for the following sections.
from IPython.display import display, HTML
from itertools import islice
from django.db import connection

from datetime import datetime
from collections import namedtuple
from IPython.display import Markdown

# Shared database cursor obtained from Django's `connection`; execute() below
# sends every query in this notebook through it.
cursor = connection.cursor()

# Default row cap used by execute(): at most this many rows are fetched per query.
num_results = 20  # Maximum results to print from a query
def format_results(results):
    """Render query results as Markdown for display in the notebook.

    Args:
        results: A list of namedtuple rows (as returned by ``execute``).
            The header is taken from the ``_fields`` of the first row.

    Returns:
        An ``IPython.display.Markdown`` object:

        * several rows -> a right-aligned Markdown table, one line per row;
        * a single row -> one line of the form ``col_a, col_b = val_a, val_b``;
        * no rows      -> the placeholder ``*No results*`` (this case used to
          raise ``IndexError`` on ``results[0]``).

    ``None`` values are rendered as ``ANY``.
    """
    if not results:
        # There is no first row to read the column names from.
        return Markdown('*No results*')

    fields = results[0]._fields

    def render_row(row, col_sep):
        # One output line for `row`; None becomes the literal text 'ANY'.
        return col_sep.join(
            'ANY' if getattr(row, f) is None else str(getattr(row, f))
            for f in row._fields
        )

    if len(results) == 1:
        # Compact "name = value" form for scalar-style answers.
        return Markdown(
            ', '.join(fields) + ' = ' + render_row(results[0], ', ').rstrip()
        )

    lines = [
        '|'.join(fields),                  # Header row
        '|'.join(['-:'] * len(fields)),    # Alignment row (right-aligned)
    ]
    lines.extend(render_row(r, '|') for r in results)
    return Markdown('\n'.join(lines).rstrip())

def execute(q_str, max_results=num_results, params=None):
    """Run a SQL query on the shared cursor and return rows as namedtuples.

    Args:
        q_str: The SQL text to execute.
        max_results: Maximum number of rows to fetch; ``None`` fetches all
            rows. Defaults to the notebook-wide ``num_results``.
        params: Optional query parameters handed to ``cursor.execute`` so
            values can be bound by the database driver instead of being
            formatted into ``q_str``. ``None`` (the default) runs ``q_str``
            as-is, exactly as before.

    Returns:
        A list of ``Result`` namedtuples whose fields are the column labels
        from ``cursor.description``. Labels that are duplicated or are not
        valid Python identifiers are replaced by positional names (``_0``,
        ``_1``, ...) rather than raising ``ValueError``. Statements that
        produce no result set return an empty list.
    """
    cursor.execute(q_str, params)
    if cursor.description is None:
        # No result set (e.g. a statement that returns no rows description).
        return []
    nt_result = namedtuple(
        'Result',
        [col[0] for col in cursor.description],
        rename=True,  # tolerate duplicate / non-identifier column labels
    )
    return [nt_result(*r) for r in islice(cursor, max_results)]

# Street Cube Statistics

### Total GPS Samples Processed to Create the Cube

In [3]:
# Count the trips in entity_trip that have a geometry, and sum the number of
# points (ST_NPoints) over those geometries to get the total sample count.
q = """
SELECT
    COUNT(geometry) AS "num_trips",
    SUM(ST_NPoints(geometry)) AS "num_samples"
FROM entity_trip
"""
format_results(execute(q))

num_trips, num_samples = 17385125, 288284766

In [14]:
# Sum the 'cnt' measure over every row of streetcube_streetcell.
# The SQL is flattened onto one line, timed with %timeit, then executed once
# more so the value can be displayed.
q = """
    SELECT sum((measures->>'cnt')::int) as "Total_Samples" 
    FROM streetcube_streetcell
    """.replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

The slowest run took 5.95 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 6.29 s per loop


Total_Samples = 40630901

### Number of Non-Empty Cells in the Cube

In [5]:
# Count of data cells.  These are the direct cube cells for all dimensions.
# A row counts as a data cell when neither time_inc nor street_id is NULL.
# NOTE(review): this filter names the column `street_id`, while the aggregate
# cell count over the same table filters on `osm_id`, and entity_id is not
# checked here -- confirm the intended table and column names.
q = """
    SELECT count(*) as data_cell_count
    FROM streetcube_streetcell
    WHERE NOT (time_inc IS NULL OR street_id IS NULL)
    """.replace('\n', ' ')
# %timeit execute(q)
results = execute(q)
# Kept as a notebook-level variable; it is added to agg_cell_count further down.
data_cell_count = results[0].data_cell_count
format_results(results)

data_cell_count = 81310980

In [6]:
# Count of aggregate cells.  These are the cells that aggregate along a dimension.
# A row counts as an aggregate cell when time_inc or osm_id is NULL.
# NOTE(review): the data cell count over the same table filters on
# `street_id` instead of `osm_id`, and entity_id IS NULL is not considered
# here -- confirm the intended table and column names.
q = """
    SELECT count(*) as agg_cell_count
    FROM streetcube_streetcell
    WHERE (time_inc IS NULL OR osm_id IS NULL)
    """.replace('\n', ' ')
# %timeit execute(q)
results = execute(q)
# Kept as a notebook-level variable; it is added to data_cell_count further down.
agg_cell_count = results[0].agg_cell_count
format_results(results)

agg_cell_count = 18244928

In [7]:
# Count of total cells.
# Counts every row of streetcube_streettaxicell, with no NULL filtering.
q = """
    SELECT count(*) as total_cell_count
    FROM streetcube_streettaxicell
    """.replace('\n', ' ')
# %timeit execute(q)
results = execute(q)
total_cell_count = results[0].total_cell_count
format_results(results)

total_cell_count = 99555908

# Cube Size Information
Having built the cube it's useful to know how much data was processed to build the cube and what kind of size each dimension has in the cube.  The queries below provide some useful statistics related to the cube that was created above.

## Memory
### Intermediate cube data

In [15]:
# Get the size on disk of the cube table.
# pg_total_relation_size reports the size of 'streetcube_streetcell' and
# pg_size_pretty turns it into a human-readable string (e.g. "3332 MB").
q = """
    SELECT pg_size_pretty(
        pg_total_relation_size('streetcube_streetcell')
    ) AS traj_cube_size__od
"""
results = execute(q)
# The formatted sentence is the cell's displayed value.
'TrajCube::ST is size {} on disk.'.format(results[0].traj_cube_size__od)

'TrajCube::ST is size 3332 MB on disk.'

### Cube set table
This is the table that stores the generated cube sets.

In [16]:
# Get the size on disk of the cube set table.
# Same approach as for the cube table, applied to 'cache_table'.
q = """
    SELECT pg_size_pretty(
        pg_total_relation_size('cache_table')
    ) AS cube_set_size
"""
results = execute(q)
# The formatted sentence is the cell's displayed value.
'Cube sets size {} on disk.'.format(results[0].cube_set_size)

'Cube sets size 10 MB on disk.'

## Date ranges

In [9]:
# Earliest and latest trip start time in entity_trip. The columns are not
# aliased, so the result fields are named after the aggregates (min, max).
# NOTE(review): the recorded output spans 1999 to 2080, so the table appears
# to contain timestamps outside the Nov/Dec 2011 collection window.
q = """
    SELECT min(start_datetime), max(start_datetime)
    FROM entity_trip
""".replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

The slowest run took 125.89 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 571 µs per loop


min, max = 1999-11-30 00:42:48+00:00, 2080-01-12 11:07:23+00:00

## Most prevalent feature in each dimension

In [10]:
# List the most prevalent taxis
# For each entity_id: number of cells, plus how many distinct streets and
# time increments those cells cover. Only the top 5 by cell count are kept.
q = """
    SELECT
        entity_id,
        count(entity_id) as entity_count,
        count(distinct osm_id) as osm_count,
        count(distinct time_inc) as time_count
    FROM streetcube_streettaxicell GROUP BY entity_id ORDER BY entity_count DESC
    LIMIT 5""".replace('\n', ' ')
%timeit execute(q)
results = execute(q)
# Text of a Python tuple, e.g. "(2711, 3023, ...)", later pasted after
# "entity_id IN" in the example queries. Gotcha: a single-element tuple
# renders as "(x,)", which is not valid SQL.
taxi_id_list = str(tuple(r.entity_id for r in results))
format_results(results)

The slowest run took 5.57 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 54.6 s per loop


entity_id|entity_count|osm_count|time_count
-:|-:|-:|-:
2711|24760|1831|708
3023|24657|1958|639
2429|22367|1794|691
5833|20722|2051|639
6460|20495|1841|668

In [11]:
# List the most prevalent streets
# For each osm_id: number of cells, plus how many distinct taxis and time
# increments those cells cover. Only the top 5 by cell count are kept.
q = """
    SELECT
        osm_id,
        count(osm_id) as osm_cells,
        count(distinct entity_id) as entity_cells,
        count(distinct time_inc) as time_cells
    FROM streetcube_streettaxicell GROUP BY osm_id ORDER BY osm_cells DESC
    LIMIT 5""".replace('\n', ' ')
%timeit execute(q)

results = execute(q)
# Text of a Python tuple, e.g. "(636180, 633894, ...)", later pasted after
# "osm_id IN" in the example queries. Gotcha: a single-element tuple renders
# as "(x,)", which is not valid SQL.
street_id_list = str(tuple(r.osm_id for r in results))
format_results(results)

The slowest run took 9.90 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 2min 43s per loop


osm_id|osm_cells|entity_cells|time_cells
-:|-:|-:|-:
636180|758404|8258|794
633894|664998|8240|734
1855111|568258|8173|728
1644171|534252|8191|725
625228|512220|8203|728

In [12]:
# List the most prevalent times
q = """
    SELECT
        time_inc,
        count(time_inc) as time_cells,
        count(distinct entity_id) as entity_cells,
        count(distinct osm_id) as osm_cells
    FROM streetcube_streettaxicell GROUP BY time_inc ORDER BY time_cells DESC
    LIMIT 5
    """.replace('\n', ' ')
%timeit execute(q)
results = execute(q)
time_id_list = str(tuple(r.time_inc for r in results))
format_results(results)

The slowest run took 11.88 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 2min 57s per loop


time_inc|time_cells|entity_cells|osm_cells
-:|-:|-:|-:
2011-12-10 02:00:00+00:00|277820|7825|3336
2011-12-10 01:00:00+00:00|277160|7820|3263
2011-12-11 01:00:00+00:00|272339|7770|3337
2011-12-10 00:00:00+00:00|271051|7795|3255
2011-12-11 00:00:00+00:00|263014|7756|3391

### Dimension counts

In [13]:
# Number of distinct taxis (entity_id values) present in the cube table.
q = "SELECT count(distinct entity_id) as count_taxis FROM streetcube_streettaxicell"
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 1min 14s per loop


count_taxis = 10294

In [14]:
# Number of distinct streets (osm_id values) present in the cube table.
q = "SELECT count(distinct osm_id) as count_streets FROM streetcube_streettaxicell"
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 1min 16s per loop


count_streets = 5628

In [15]:
# Number of distinct time increments (time_inc values) present in the cube table.
q = "SELECT count(distinct time_inc) as count_times FROM streetcube_streettaxicell"
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 1min 9s per loop


count_times = 1035

# Lookups and Dimension Aggregation
The creation of the cube provides some initial aggregation below the smallest level of fidelity that is needed for reporting or continued analysis.  The cube puts this data into a form that is easy for the database to query but also easy for the database to perform additional aggregations, which are called rollups.  A rollup summarizes the data along one or more dimensions in the cube.  Since rollups can be performed along any dimension, we demonstrate the queries using each of the following dimensions:

1. Street
1. Taxi
1. Time

In [16]:
# SQL fragments for the measures, read from the JSON `measures` column.

# Total number of speed observations in the grouped cells.
count_measure = "\n        sum((measures->'speed'->>'count')::int)\n"

# Average speed: summed speed divided by summed observation count.
avg_speed_measure = (
    "\n"
    "        (sum(CAST(measures->'speed'->>'sum' as float))\n"
    "        / sum(CAST(measures->'speed'->>'count' as float))\n"
    "        )\n"
)

# The count fragment carries its own alias and the average is left without
# one, so a query written as "<fragment> as measure" ends up selecting two
# columns: "count" and "measure".
combined_measure = count_measure + ' as "count", ' + avg_speed_measure + "\n"

# Fragment substituted into the example queries that follow.
measure_column = combined_measure

## Street-centric data and Rollups
For these queries we rollup along time and taxi. Note that the common street id is chosen so that all the queries demonstrate the same street for connectivity as well as to limit the results.  In practice the streets chosen would be done through another pre-query such as brushing an area on a map.  The common taxi used in each query is to provide connectivity between each example.  In practice the query would return all taxis per any additional filter criteria. This same concept applies to the other examples but on different dimensions.

### Cells Related to Each of the Chosen Example Dimensions
Some specific values were chosen as rollup values, and consequently filters, in order to provide better continuity between each example.  By applying the value as a filter on the non-rollup version of the query, it's possible to see that the cube supports the higher fidelity results as well as the rollup.  That is, an initial example shows the individual results and a subsequent example shows the rollup of those results.

In [17]:
# Number of cube cells that belong to any of the example streets.
# street_id_list is already the text of a parenthesised id list, so it is
# substituted directly after "IN".
q = ("""
    SELECT count(*) AS street_count FROM streetcube_streettaxicell
    WHERE osm_id IN %s""" % street_id_list).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 4.23 s per loop


street_count = 3038132

In [18]:
# Number of cube cells that belong to any of the example taxis.
# taxi_id_list is already the text of a parenthesised id list, so it is
# substituted directly after "IN".
q = ("""
    SELECT count(*) AS taxi_count FROM streetcube_streettaxicell
    WHERE entity_id IN %s"""  % taxi_id_list).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

The slowest run took 4.34 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 3: 36.3 ms per loop


taxi_count = 113001

In [19]:
q = ("""
    SELECT count(*) AS combined_count
    FROM streetcube_streettaxicell
    WHERE osm_id IN %s and entity_id IN %s
    """ % (street_id_list, taxi_id_list)).replace('\n', '')
%timeit execute(q)
format_results(execute(q))

The slowest run took 93.43 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 5.94 ms per loop


combined_count = 2764

### Full-Fidelity Query Filtered by Street and Taxi
This shows a snippet of the full fidelity of the cube, which can provide information related to an individual taxi on an individual street for an individual time increment. That is, a single cell indexed by all the cube dimensions.  The query below filters these results by taxi id and street id. 

In [20]:
# One row per (street, time increment, taxi) for the example streets and taxis.
# measure_column expands to two select expressions -- a "count" column and the
# average speed -- and the trailing "as measure" names the latter.
# Rows are sorted by average speed, highest first; execute() caps the rows shown.
q = ("""
    SELECT time_inc as time, osm_id as street_id, entity_id as taxi_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE osm_id IN %s and entity_id IN %s
    GROUP BY osm_id, time_inc, entity_id
    ORDER BY measure DESC
    """ % (measure_column, street_id_list, taxi_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

The slowest run took 12.53 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 10.9 ms per loop


time|street_id|taxi_id|count|measure
-:|-:|-:|-:|-:
2011-12-16 21:00:00+00:00|1855111|2429|4|93.0
2011-12-04 21:00:00+00:00|1855111|6460|2|90.0
2011-12-28 11:00:00+00:00|1855111|6460|4|90.0
2011-12-01 17:00:00+00:00|1855111|2711|5|89.2
2011-12-12 19:00:00+00:00|1855111|2711|1|88.0
2011-12-10 04:00:00+00:00|1855111|2711|3|86.3333333333333
2011-12-15 15:00:00+00:00|1855111|2711|5|85.4
2011-12-30 21:00:00+00:00|1855111|2429|3|85.3333333333333
2011-12-01 20:00:00+00:00|1644171|6460|1|85.0
2011-12-26 16:00:00+00:00|1855111|2429|3|85.0
2011-12-06 20:00:00+00:00|1855111|2711|5|84.8
2011-12-03 20:00:00+00:00|1855111|2711|2|84.5
2011-12-07 21:00:00+00:00|1855111|2711|8|84.5
2011-12-10 20:00:00+00:00|1855111|2711|12|84.4166666666667
2011-12-24 20:00:00+00:00|1855111|5833|1|84.08
2011-12-16 21:00:00+00:00|1644171|2711|1|84.0
2011-12-19 18:00:00+00:00|1855111|2711|18|83.8888888888889
2011-12-27 19:00:00+00:00|1855111|2711|5|83.8
2011-12-01 20:00:00+00:00|1855111|2711|6|83.5
2011-12-19 20:00:00+00:00|1644171|2711|9|83.3333333333333

### Rollup on Time
This query is largely the same as the previous except that it will rollup on the time dimension.  The result is the average speed of the defined street for the defined taxi including all time data in the cube.

In [21]:
q = ("""
    SELECT osm_id as street_id, entity_id as taxi_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE osm_id IN %s and entity_id IN %s
    GROUP BY osm_id, entity_id
    ORDER BY entity_id
    """ % (measure_column, street_id_list, taxi_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

100 loops, best of 3: 5.77 ms per loop


street_id|taxi_id|count|measure
-:|-:|-:|-:
625228|2429|1436|20.1768802228412
636180|2429|2380|19.6529411764706
1644171|2429|1632|35.8774509803922
633894|2429|1160|23.1879310344828
1855111|2429|1512|36.989417989418
633894|2711|2966|21.490222521915
1855111|2711|1776|51.9786036036036
1644171|2711|2500|32.8912
625228|2711|2754|19.3558460421206
636180|2711|5072|17.9286277602524
636180|3023|14650|19.759014334471
625228|3023|9228|18.2051495448635
1644171|3023|3126|25.6517210492642
633894|3023|8526|19.1079849870983
1855111|3023|3622|31.6386195472115
633894|5833|11222|16.4220049901978
1644171|5833|2208|32.3039855072464
1855111|5833|1868|26.5649571734475
625228|5833|4896|18.5988807189542
636180|5833|8204|23.9688127742565

### Full-Fidelity Query Filtered by Street
This query is the same as the first except the filter by taxi id is not applied here.  This shows that the cube can return results for multiple values on the taxi dimension.

In [22]:
q = ("""
    SELECT osm_id as street_id, entity_id as taxi_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE osm_id IN %s
    GROUP BY osm_id, entity_id
    ORDER BY measure DESC
    """ % (measure_column, street_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 7.64 s per loop


street_id|taxi_id|count|measure
-:|-:|-:|-:
1644171|5|2|107.42
1855111|9487|2|80.01
1855111|8745|2|77.23
633894|9577|2|77.0
1855111|10185|4|76.49
1855111|6302|2|74.0
1855111|5197|14|70.9057142857143
1644171|9336|2|70.75
1644171|8079|2|70.38
1855111|9057|2|70.38
1855111|8756|2|69.82
1855111|9363|2|69.64
1855111|9404|2|69.08
1855111|8139|4|68.62
636180|9537|2|68.52
1855111|8504|4|68.245
1855111|10179|2|66.67
1644171|9536|2|66.67
636180|9409|2|66.67
1644171|9210|2|66.67

### Rollup on Taxi and Time
This query still filters by the defined street IDs, but the result combines all values in the time dimension and all values in the taxi dimension.  Since the only remaining grouping is by street, there is a single result per street.

In [23]:
q = ("""
    SELECT osm_id as street_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE osm_id IN %s
    GROUP BY osm_id
    ORDER BY measure DESC
    """ % (measure_column, street_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 7.3 s per loop


street_id|count|measure
-:|-:|-:
1644171|6545320|31.9793958798056
1855111|7374536|31.2373378501388
625228|7803620|19.3497997698483
636180|23563112|17.0677495519281
633894|16222012|16.3162846039086

## Taxi-centric data and Rollups
In the previous examples we rolled up on time and taxi to get information related to a specific street.  In order to show that the cube supports rollups along arbitrary dimensions, the following queries get information on a specific taxi.

### Rollup on Street
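Here we query the database to roll up on the street dimension, giving one result per taxi for each time increment.  Note that the query below does not restrict the time range (for example, to the first hour of rush hour); the results are simply ordered by the measure.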

In [24]:
q = ("""
    SELECT time_inc as time, entity_id as taxi_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE entity_id IN %s
    GROUP BY time_inc, entity_id
    ORDER BY measure DESC
    """ % (measure_column, taxi_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

10 loops, best of 3: 166 ms per loop


time|taxi_id|count|measure
-:|-:|-:|-:
2011-12-19 18:00:00+00:00|5833|1098|92.6157194899817
2011-12-11 16:00:00+00:00|6460|208|85.4615384615385
2011-12-12 16:00:00+00:00|6460|298|83.2818791946309
2011-12-13 14:00:00+00:00|6460|234|82.3760683760684
2011-12-19 15:00:00+00:00|5833|1430|77.1123076923077
2011-12-21 16:00:00+00:00|6460|338|75.5207100591716
2011-12-24 03:00:00+00:00|5833|1014|73.6644773175542
2011-12-02 06:00:00+00:00|6460|266|73.4135338345865
2011-12-20 15:00:00+00:00|2711|276|70.7391304347826
2011-12-30 00:00:00+00:00|6460|332|70.5722891566265
2011-12-18 08:00:00+00:00|2711|406|69.935960591133
2011-12-12 13:00:00+00:00|6460|398|67.3668341708543
2011-12-12 05:00:00+00:00|2429|456|66.4254385964912
2011-12-03 03:00:00+00:00|2429|274|66.3941605839416
2011-12-22 04:00:00+00:00|2429|210|66.3904761904762
2011-12-12 22:00:00+00:00|6460|350|66.1485714285714
2011-12-18 22:00:00+00:00|2429|444|65.8018018018018
2011-12-13 07:00:00+00:00|2429|408|65.5392156862745
2011-12-20 07:00:00+00:00|2711|514|65.2334630350195
2011-12-09 13:00:00+00:00|6460|324|64.9320987654321

### Rollup on Street and Time
Here we query the database to roll up on both the street and time dimensions, giving a single result per taxi.

In [25]:
# Roll up on street and time: one row per example taxi.
# Rows with a NULL time_inc or osm_id are excluded, so only cells that carry
# a concrete value in both of those dimensions are summed.
q = ("""
    SELECT entity_id as taxi_id, %s as measure
    FROM streetcube_streettaxicell
    WHERE entity_id IN %s AND NOT (time_inc IS NULL OR osm_id IS NULL)
    GROUP BY entity_id
    """ % (measure_column, taxi_id_list)).replace('\n', ' ')
%timeit execute(q)
format_results(execute(q))

1 loop, best of 3: 138 ms per loop


taxi_id|count|measure
-:|-:|-:
2429|81146|25.6250831833978
2711|104083|25.864992361865
3023|382369|22.1307308123829
5833|526981|14.4939565563085
6460|92641|26.0047063395257

# Projected Cubes
Here we're trying to reduce the size of the working cube.

In [26]:
# Total cell count, taken as aggregate cells plus data cells from the counts above.
'There are {} total cells in the 3 dimension cube'.format(agg_cell_count + data_cell_count)

'There are 99555908 total cells in the 3 dimension cube'

In [27]:
# Cells for the 2 dimension projected cube on time and streets
# These are the rows whose entity_id is NULL. The count column is not
# aliased, so the result field is simply named "count".
q = """
    SELECT count(*)
    FROM streetcube_streettaxicell
    WHERE entity_id IS NULL
    """.replace('\n', ' ')
# %timeit execute(q)
format_results(execute(q))

count = 1978961

In [28]:
# Cells for the 2 dimension projected cube on time and taxis
# These are the rows whose osm_id is NULL. The count column is not aliased,
# so the result field is simply named "count".
q = """
    SELECT count(*)
    FROM streetcube_streettaxicell
    WHERE osm_id IS NULL
    """.replace('\n', ' ')
# %timeit execute(q)
format_results(execute(q))

count = 4790199