# Map Scrambling

This is an experiment to see how time and location data can be scrambled while still allowing querying based on proximity.


## Sample Data

First, lets source some open data from the City of Cape Town in order to generate sample location data sets

In [1]:
from urllib.parse import urlencode, quote_plus
import requests
import math
import os
import hashlib
import random
import zipfile
import pygeoif.geometry
from fastkml import kml

cache_dir = os.path.join(os.getcwd(), '_cache')
if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)

# TODO: you need to get the City of Cape Town consent to download these files, as described on the home page URLs
sample_data_sets = {
    'taxi_routes': {'description': 'The file stores the spatial location of features (point, line, polygon) of taxi routes.',
                    'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
                    'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Taxi%20routes',
                    'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Taxi%20routes/Taxi_Routes.kmz',
                    'skip': False,
                   },
    'fire_stations': {'description': 'Indicates the location of fire stations.',
                    'license_url': 'https://web1.capetown.gov.za/web1/OpenDataPortal/Document/GetDocument/1',
                    'home_page': 'https://web1.capetown.gov.za/web1/OpenDataPortal/DatasetDetail?DatasetName=Fire%20stations',
                    'url': 'http://cityapps.capetown.gov.za/sites/opendatacatalog/Documents/Fire%20stations/Fire%20stations%202016.kmz',
                     }
}

points = set()
routes = []
route_points = set()

# read in all the data
for data_set_name, data_set_attrs in sample_data_sets.items():
    if data_set_attrs.get('skip', False):
        continue
    url = data_set_attrs['url']
    document_name = url[url.rfind('/')+1:]
    document_format = document_name[document_name.rfind('.')+1:]
    actual_url = 'https://www.capetown.gov.za/_layouts/OpenDataPortalHandler/DownloadHandler.ashx?DocumentName=%s&DatasetDocument=%s' % (document_name, quote_plus(url))
    url_hash = hashlib.sha256(actual_url.encode('UTF-8')).hexdigest()
    cache_filename = os.path.join(cache_dir, '%s-%s' % (url_hash, document_name))
    if os.path.exists(cache_filename):
        with open(cache_filename, 'rb') as f:
            data = f.read()
    else:
        data = requests.get(actual_url).content
        with open(cache_filename, 'wb') as f:
            f.write(data)
    if document_format == 'kmz':
        with zipfile.ZipFile(cache_filename) as kmz_zip:
            with kmz_zip.open('doc.kml') as kml_file:
                data = kml_file.read()
        document_format = 'kml'
    print("Name: %s; format: %s; length: %s" % (data_set_name, document_format, len(data)))
    k = kml.KML()
    k.from_string(data)
    count_points, count_paths, count_path_points = 0, 0, 0
    for document in k.features():
        for folder in document.features():
            for placemark in folder.features():
                geometry = placemark.geometry
                if isinstance(geometry, pygeoif.geometry.Point):
                    points.add((geometry.x, geometry.y))
                    count_points += 1
                elif isinstance(geometry, pygeoif.geometry.MultiLineString):
                    # the taxi routes data has a single linestring per MultiLineString
                    for i, linestring in enumerate(geometry.geoms):
                        line_points = [(g.x, g.y) for g in linestring.geoms]
                        routes.append(line_points)
                        route_points.update(line_points)
                        count_paths += 1
                        count_path_points += len(linestring.geoms)
                else:
                    print("Unexpected type of geometry: %r" % geometry)
    print("  %d points, %d paths with %d total points" % (count_points, count_paths, count_path_points))

all_points = points.union(route_points)
print("%d unique points including routes" % len(all_points))

Name: taxi_routes; format: kml; length: 24602482
  0 points, 1466 paths with 577511 total points
Name: fire_stations; format: kml; length: 10200
  30 points, 0 paths with 0 total points
74486 unique points including routes


## Generating time and location data

Let's generate a sample set of simulated moving cellphone data

In [2]:
class gen_params:
    PHONES = 5000
    MINUTES = 60*24*14
    SAMPLE_FREQUENCY = 7
    SAMPLE_STDEV = 2
    LOCATIONS_AVG = 2
    LOCATIONS_STDEV = 4
    TRIPS_PER_DAY_AVG = 3
    TRIPS_PER_DAY_STDEV = 1.5
    TRIP_TIME_STDEV = SAMPLE_FREQUENCY*10
    TRIP_STDEV = 10/111111. # about 10 minutes
    WALK_STDEV = 2/111111. # about 2 minutes

time_phone_location = []

total_trips = 0
total_locations = 0

for phone_id in range(gen_params.PHONES):
    trips_per_day = max(random.gauss(gen_params.TRIPS_PER_DAY_AVG, gen_params.TRIPS_PER_DAY_STDEV), 0.1)
    location_count = max(int(random.gauss(gen_params.LOCATIONS_AVG, gen_params.LOCATIONS_STDEV)), 1)
    locations = random.sample(points, location_count)
    minute = 0.0
    location_n = random.randint(0, location_count-1)
    total_locations += location_count
    x, y = locations[location_n]
    trips_taken = 0
    in_trip = False
    while minute < gen_params.MINUTES:
        minute += random.gauss(gen_params.SAMPLE_FREQUENCY, gen_params.SAMPLE_STDEV)
        if in_trip:
            if trip_point >= len(trip):
                in_trip = False
                # get a different location to the last one
                if location_count > 1:
                    new_location_n = random.randint(0, location_count-2)
                    location_n = new_location_n + 1 if new_location_n >= location_n else new_location_n
                else:
                    location_n = 0
                x, y = locations[location_n]
            else:
                x, y = trip[trip_point]
                x += random.gauss(0, gen_params.TRIP_STDEV)
                y += random.gauss(0, gen_params.TRIP_STDEV)
                trip_point += 1
        if not in_trip:
            x += random.gauss(0, gen_params.WALK_STDEV)
            y += random.gauss(0, gen_params.WALK_STDEV)
            if (minute / 60*24) > (1/trips_per_day - trips_taken) + random.gauss(0, gen_params.TRIP_TIME_STDEV):
                in_trip, trip, trip_point = True, random.choice(routes), 0
                trips_taken += 1
                total_trips += 1
        time_phone_location.append((minute, phone_id, (x, y)))

time_phone_location.sort()

print("Tracked %d phones and generated %d time-phone-location tuples, with %d trips between %d phone-locations" % (gen_params.PHONES, len(time_phone_location), total_trips, total_locations))

Tracked 5000 phones and generated 14402119 time-phone-location tuples, with 41688 trips between 14391 phone-locations


## Sectoring and Anonymizing Time-Phone-Location Data

We divide the location set into roughly square sectors.

We then generate a sector ID for each location in the time-phone-location dataset

Finally we store a hash of the time and sector ID, rather than the location, whilst still remembering sector adjacency for that time

In [3]:
x_range = {p[0] for p in all_points}
y_range = {p[1] for p in all_points}

min_x, max_x = min(x_range), max(x_range)
min_y, max_y = min(y_range), max(y_range)
mid_x, mid_y = (min_x + max_x)/2, (min_y + max_y)/2

def distance(origin, destination):
    # thanks https://gist.github.com/rochacbruno/2883505
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d

x_distance = distance((min_x, mid_y), (max_x, mid_y))
y_distance = distance((mid_x, min_y), (mid_x, max_y))
print("The X values lie between %r and %r - a distance of %r km" % (min_x, max_x, x_distance))
print("The Y values lie between %r and %r - a distance of %r km" % (min_y, max_y, y_distance))

class grid_params:
    GRID_SIZE = 20/111111.  # a rough estimate
    TIME_POCKET = 60 # minutes

def coords_to_grid(x, y):
    return int(x / grid_params.GRID_SIZE), int(y / grid_params.GRID_SIZE)

time_phone_grid = [(time, phone, coords_to_grid(x, y)) for (time, phone, (x, y)) in time_phone_location]

unique_tracked_points = len(set(location for time, phone, location in time_phone_location))
unique_grid_points = len(set(grid_points for time, phone, grid_points in time_phone_grid))

print("%d unique tracked points reduce to %d unique grid points" % (unique_tracked_points, unique_grid_points))

def time_grid_to_bucket(minute, gx, gy):
    hour = int(minute / grid_params.TIME_POCKET)
    return hashlib.sha1('%d:%d,%d' % (hour, gx, gy))

phone_buckets_list = [(phone, time_grid_to_bucket(minute, gx, gy)) for (minute, phone, (gx, gy)) in time_phone_grid]
phone_bucket_map = {}
bucket_phone_map = {}

for phone, bucket in phone_buckets_list:
    phone_bucket_map.setdefault(phone, set()).add(bucket)
    bucket_phone_map.setdefault(bucket, set()).add(phone)

The X values lie between 18.13720600006003 and 18.91122499998429 - a distance of 86.06698591807307 km
The Y values lie between -34.27053699994632 and -33.05829100003106 - a distance of 127.81153196032332 km
14402119 unique tracked points reduce to 227770 unique grid points


NameError: name 'TIME_POCKET' is not defined

## Querying the Buckets for a particular phone

We now illustrate how having identified a particular phone, we can search for phones with proximity to that phone in the covered period

In [None]:
search_phone = random.randint(gen_params.PHONES)

buckets = phone_bucket_map[search_phone]

target_phones = set()

print("Found %d buckets" % buckets)

bucket_result_sizes = []

for bucket in buckets:
    bucket_phones = bucket_phone_map.get(bucket, [])
    bucket_result_sizes.append(len(bucket_phones))
    target_phones.update(bucket_phones)

print("Found %d phones" % target_phones)
bucket_len_dist = {size: bucket_result_sizes.count(size) for size in set(bucket_result_sizes)}
print("Bucket result sizes:")
import pprint
pprint.pprint(bucket_len_dist)