# Map Scrambling

This is an experiment to see how time and location data can be scrambled while still allowing queries based on proximity.

In [21]:
from urllib.parse import urlencode, quote_plus
import requests
import os
import hashlib
import zipfile
import pygeoif.geometry
from fastkml import kml

# Downloaded data sets are cached in a `_cache` directory under the current
# working directory. `exist_ok=True` makes the creation idempotent and avoids
# the race between checking for the directory and creating it; it still raises
# if `_cache` exists but is not a directory.
cache_dir = os.path.join(os.getcwd(), '_cache')
os.makedirs(cache_dir, exist_ok=True)

# Registry of the sample data sets, keyed by a short name.
# Each entry carries:
#   description - human-readable summary of the data set
#   license_url - link to the licence document
#   home_page   - data set page on the open data portal
#   url         - document URL; the loader derives the file name from its last path segment
#   skip        - optional; when true the loader ignores the entry (treated as False if absent)
sample_data_sets = {
    'taxi_routes': {
        'description': 'The file stores the spatial location of features (point, line, polygon) of taxi routes.',
        'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
        'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Taxi%20routes',
        'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Taxi%20routes/Taxi_Routes.kmz',
        'skip': False,
    },
    'fire_stations': {
        'description': 'Indicates the location of fire stations.',
        'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
        'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Fire%20stations',
        'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Fire%20stations/Fire%20stations%202016.kmz',
    },
}

# Accumulators filled by the loading loop below:
#   points - unique (x, y) tuples taken from Point placemarks
#   routes - one list of (x, y) tuples per line string inside MultiLineString placemarks
points = set()
routes = []

# Read in all the data: fetch each data set (or reuse its cached copy),
# unwrap KMZ archives to KML, then collect the placemark geometries.
for data_set_name, data_set_attrs in sample_data_sets.items():
    if data_set_attrs.get('skip', False):
        continue
    url = data_set_attrs['url']
    # The document name is the last path segment of the URL; the format is its file extension.
    document_name = url[url.rfind('/')+1:]
    document_format = document_name[document_name.rfind('.')+1:]
    # The file is requested through the portal's download handler rather than from `url` directly.
    actual_url = 'https://www.capetown.gov.za/_layouts/OpenDataPortalHandler/DownloadHandler.ashx?DocumentName=%s&DatasetDocument=%s' % (document_name, quote_plus(url))
    # Cache files are keyed on a hash of the full download URL, so a changed URL gets a fresh file.
    url_hash = hashlib.sha256(actual_url.encode('UTF-8')).hexdigest()
    cache_filename = os.path.join(cache_dir, '%s-%s' % (url_hash, document_name))
    if os.path.exists(cache_filename):
        with open(cache_filename, 'rb') as f:
            data = f.read()
    else:
        # The timeout stops a stalled connection from hanging the run indefinitely.
        response = requests.get(actual_url, timeout=60)
        # Fail on an HTTP error here instead of writing the error page into the
        # cache, where it would be reused (and fail to parse) on every later run.
        response.raise_for_status()
        data = response.content
        with open(cache_filename, 'wb') as f:
            f.write(data)
    if document_format == 'kmz':
        # A KMZ file is a zip archive; the KML payload is read from its 'doc.kml' member.
        with zipfile.ZipFile(cache_filename) as kmz_zip:
            with kmz_zip.open('doc.kml') as kml_file:
                data = kml_file.read()
        document_format = 'kml'
    print("Name: %s; format: %s; length: %s" % (data_set_name, document_format, len(data)))
    k = kml.KML()
    k.from_string(data)
    # Walks exactly three levels of features (document -> folder -> placemark);
    # placemarks nested at any other depth are not visited.
    for document in k.features():
        for folder in document.features():
            for placemark in folder.features():
                geometry = placemark.geometry
                if isinstance(geometry, pygeoif.geometry.Point):
                    points.add((geometry.x, geometry.y))
                elif isinstance(geometry, pygeoif.geometry.MultiLineString):
                    # the taxi routes data has a single linestring per MultiLineString
                    for linestring in geometry.geoms:
                        routes.append([(g.x, g.y) for g in linestring.geoms])
                else:
                    print("Unexpected type of geometry: %r" % geometry)

Name: taxi_routes; format: kml; length: 24602482
Name: fire_stations; format: kml; length: 10200
