# Map Scrambling

This is an experiment to see how time and location data can be scrambled while still allowing querying based on proximity.


## Sample Data

First, let's source some open data from the City of Cape Town in order to generate sample location data sets.

In [24]:
from urllib.parse import urlencode, quote_plus
import requests
import os
import hashlib
import random
import zipfile
import pygeoif.geometry
from fastkml import kml

# Directory (under the current working directory) where downloaded documents are cached.
# makedirs(exist_ok=True) does nothing when the directory is already there, so there is no
# gap between checking for the directory and creating it.
cache_dir = os.path.join(os.getcwd(), '_cache')
os.makedirs(cache_dir, exist_ok=True)

# TODO: you need to get the City of Cape Town consent to download these files, as described
# on the home page URLs.
#
# One entry per data set. The loading code in this cell reads only two keys:
#   'url'  - location of the document to fetch
#   'skip' - optional; when true the data set is not loaded
# The remaining keys are descriptive metadata.
sample_data_sets = {
    'taxi_routes': {
        'description': 'The file stores the spatial location of features (point, line, polygon) of taxi routes.',
        'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
        'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Taxi%20routes',
        'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Taxi%20routes/Taxi_Routes.kmz',
        'skip': False,
    },
    'fire_stations': {
        'description': 'Indicates the location of fire stations.',
        'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
        'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Fire%20stations',
        'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Fire%20stations/Fire%20stations%202016.kmz',
    },
}

# Accumulated over every data set that is not skipped:
#   points - unique (x, y) tuples taken from Point placemarks
#   routes - one list of (x, y) tuples per line string
points = set()
routes = []

# read in all the data
for data_set_name, data_set_attrs in sample_data_sets.items():
    if data_set_attrs.get('skip', False):
        continue
    url = data_set_attrs['url']
    # everything after the last '/' is the document name; everything after its last '.' is the format
    document_name = url[url.rfind('/')+1:]
    document_format = document_name[document_name.rfind('.')+1:]
    actual_url = 'https://www.capetown.gov.za/_layouts/OpenDataPortalHandler/DownloadHandler.ashx?DocumentName=%s&DatasetDocument=%s' % (document_name, quote_plus(url))
    # prefixing with the URL hash keeps cache entries apart even when document names collide
    url_hash = hashlib.sha256(actual_url.encode('UTF-8')).hexdigest()
    cache_filename = os.path.join(cache_dir, '%s-%s' % (url_hash, document_name))
    if os.path.exists(cache_filename):
        with open(cache_filename, 'rb') as f:
            data = f.read()
    else:
        response = requests.get(actual_url, timeout=60)
        # fail here instead of caching an HTTP error page that every later run would re-read
        response.raise_for_status()
        data = response.content
        # write to a temporary name and rename, so an interrupted run cannot leave a
        # truncated file behind that later runs would treat as a valid cache entry
        partial_filename = cache_filename + '.part'
        with open(partial_filename, 'wb') as f:
            f.write(data)
        os.replace(partial_filename, cache_filename)
    if document_format == 'kmz':
        # a kmz file is a zip archive; the KML document is read from its 'doc.kml' member
        with zipfile.ZipFile(cache_filename) as kmz_zip:
            with kmz_zip.open('doc.kml') as kml_file:
                data = kml_file.read()
        document_format = 'kml'
    print("Name: %s; format: %s; length: %s" % (data_set_name, document_format, len(data)))
    k = kml.KML()
    k.from_string(data)
    count_points, count_paths, count_path_points = 0, 0, 0
    # placemarks are read two levels below the top-level features (document -> folder -> placemark)
    for document in k.features():
        for folder in document.features():
            for placemark in folder.features():
                geometry = placemark.geometry
                if isinstance(geometry, pygeoif.geometry.Point):
                    points.add((geometry.x, geometry.y))
                    count_points += 1
                elif isinstance(geometry, pygeoif.geometry.MultiLineString):
                    # the taxi routes data has a single linestring per MultiLineString
                    for linestring in geometry.geoms:
                        route = [(g.x, g.y) for g in linestring.geoms]
                        routes.append(route)
                        count_paths += 1
                        count_path_points += len(route)
                else:
                    print("Unexpected type of geometry: %r" % geometry)
    print("  %d points, %d paths with %d total points" % (count_points, count_paths, count_path_points))

Name: taxi_routes; format: kml; length: 24602482
  0 points, 1466 paths with 577511 total points
Name: fire_stations; format: kml; length: 10200
  30 points, 0 paths with 0 total points


## Generating time and location data

Let's generate a sample data set that simulates cellphones moving around.

In [33]:
class parameters:
    """Settings for the simulated phone location data generated in this cell."""

    # number of phones to simulate
    PHONES = 1000
    # simulated period in minutes: 14 days
    MINUTES = 14 * 24 * 60

    # mean and standard deviation of the Gaussian time step, in minutes, between two
    # consecutive samples of the same phone
    SAMPLE_FREQUENCY = 7
    SAMPLE_STDEV = 2

    # Gaussian used to draw how many locations a phone moves between
    # (the draw is truncated to an integer and raised to at least 1)
    LOCATIONS_AVG = 2
    LOCATIONS_STDEV = 4

    # Gaussian used to draw a phone's trips per day (the draw is raised to at least 0.1)
    TRIPS_PER_DAY_AVG = 3
    TRIPS_PER_DAY_STDEV = 1.5

    # standard deviation of the Gaussian noise added to the threshold for starting a trip
    TRIP_TIME_STDEV = 10 * SAMPLE_FREQUENCY

    # standard deviation of the Gaussian jitter added to x and y on each sample, while
    # following a route (TRIP) and while staying at a location (WALK);
    # presumably in degrees, as the coordinates come from KML - TODO confirm
    TRIP_STDEV = 0.003
    WALK_STDEV = 0.0005

# One (minute, phone_id, (x, y)) tuple per simulated sample, for all phones.
time_phone_location = []

total_trips = 0
total_locations = 0

minutes_per_day = 60 * 24
# random.sample requires a sequence (passing a set raises TypeError from Python 3.11);
# sort once up front so the candidate order does not depend on set iteration order
candidate_points = sorted(points)

for phone_id in range(parameters.PHONES):
    trips_per_day = max(random.gauss(parameters.TRIPS_PER_DAY_AVG, parameters.TRIPS_PER_DAY_STDEV), 0.1)
    # at least one location per phone, and never more than there are points to choose from
    location_count = max(int(random.gauss(parameters.LOCATIONS_AVG, parameters.LOCATIONS_STDEV)), 1)
    location_count = min(location_count, len(candidate_points))
    locations = random.sample(candidate_points, location_count)
    minute = 0.0
    location_n = random.randint(0, location_count-1)
    total_locations += location_count
    x, y = locations[location_n]
    trips_taken = 0
    in_trip = False
    while minute < parameters.MINUTES:
        minute += random.gauss(parameters.SAMPLE_FREQUENCY, parameters.SAMPLE_STDEV)
        if in_trip:
            if trip_point >= len(trip):
                # the route is used up: the trip ends and the phone moves to another location
                in_trip = False
                # get a different location to the last one: draw from one index fewer and
                # step over the current index
                if location_count > 1:
                    new_location_n = random.randint(0, location_count-2)
                    location_n = new_location_n + 1 if new_location_n >= location_n else new_location_n
                else:
                    location_n = 0
                x, y = locations[location_n]
            else:
                # follow the route one point per sample, with jitter on each coordinate
                x, y = trip[trip_point]
                x += random.gauss(0, parameters.TRIP_STDEV)
                y += random.gauss(0, parameters.TRIP_STDEV)
                trip_point += 1
        if not in_trip:
            x += random.gauss(0, parameters.WALK_STDEV)
            y += random.gauss(0, parameters.WALK_STDEV)
            # minutes -> days; the previous `minute / 60*24` evaluated as (minute / 60) * 24
            days_elapsed = minute / minutes_per_day
            # NOTE(review): TRIP_TIME_STDEV is derived from SAMPLE_FREQUENCY (minutes) while
            # the other terms here are in days - confirm the intended threshold
            if days_elapsed > (1/trips_per_day - trips_taken) + random.gauss(0, parameters.TRIP_TIME_STDEV):
                in_trip, trip, trip_point = True, random.choice(routes), 0
                trips_taken += 1
                total_trips += 1
        time_phone_location.append((minute, phone_id, (x, y)))

print("Tracked %d phones and generated %d time-phone-location tuples, with %d trips between %d phone-locations" % (parameters.PHONES, len(time_phone_location), total_trips, total_locations))

Tracked 1000 phones and generated 2880622 time-phone-location tuples, with 8359 trips between 2823 phone-locations
