In [100]:
import numpy as np
import pandas as pd
import re

In [107]:
def coords_to_list(coords: str):
    """Split a parenthesised coordinate string into its textual components.

    All ``(`` and ``)`` characters are removed, then the remainder is split
    on the literal separator ``', '``.

    :param coords: coordinate text such as ``"(40.81, -73.90)"``
    :returns: list of component strings (not converted to numbers)
    """
    without_parens = re.sub(r"[()]", "", coords)
    return without_parens.split(", ")

def time_to_int(time: str):
    """Convert an ``H:M:S`` clock string into seconds since midnight.

    :param time: text with exactly three ``:``-separated integer fields
    :returns: ``hours * 3600 + minutes * 60 + seconds`` as an int
    :raises ValueError: if there are not exactly three fields or a field is
        not an integer
    """
    hours, minutes, seconds = time.split(':')
    # Horner form of h*3600 + m*60 + s
    return (int(hours) * 60 + int(minutes)) * 60 + int(seconds)

def date_to_int(date: str):
    """Convert a ``Y-M-D`` date string into the integer ``YYYYMMDD``.

    The value is built arithmetically rather than by concatenating the text
    fields, so month and day always occupy two decimal digits each even when
    the input is not zero-padded (``"2014-7-4"`` -> ``20140704``). For
    zero-padded input the result is the same as plain concatenation.

    :param date: text with exactly three ``-``-separated integer fields
    :returns: ``year * 10000 + month * 100 + day`` as an int
    :raises ValueError: if there are not exactly three fields or a field is
        not an integer
    """
    (y, m, d) = date.split('-')
    result = int(y) * 10000 + int(m) * 100 + int(d)
    return result

def carrier_to_int(carrier: str):
    """Map a carrier name to an integer category code.

    The code is the Unicode code point of the name's first character. This
    only yields distinct categories under the original author's assumption
    that no two carriers share the same first letter.

    :param carrier: non-empty carrier name
    :returns: ``ord`` of the first character
    :raises IndexError: if ``carrier`` is empty
    """
    initial = carrier[0]
    return ord(initial)

In [109]:
# Load the raw records that the reconstruction attack will be run against.
data: pd.DataFrame = pd.read_csv("small_data.csv")

# Encode every column numerically so that random hash predicates can be
# evaluated on the public identifiers.
time = [time_to_int(stamp) for stamp in data["time"]]
date = [date_to_int(day) for day in data["date"]]
carriers = [carrier_to_int(name) for name in data["carrier"]]
coords = [coords_to_list(raw) for raw in data["location"]]

# From here on `data` refers to the encoded frame, not the raw CSV contents.
data = pd.DataFrame(
    {
        "date": date,
        "time": time,
        "carrier": carriers,
        "location": coords,
    }
)

print(data)

# number of records
n = len(data.index)

# names of public identifier columns
pub = ["time", "date", "carrier"]

# variable to reconstruct
target = "location"

# Split the target column into two parallel float lists, one per component.
# NOTE(review): in the printed frame the first component is ~40.8 and the
# second is ~-73.9, which looks like (latitude, longitude) order, so these two
# names may be swapped. Confirm before renaming: the query, attack and test
# code all rely on "index 0 first, index 1 second".
longitude = [float(pair[0]) for pair in data[target]]
latitude = [float(pair[1]) for pair in data[target]]
def execute_subsetsums_exact(predicates):
    """Answer subset-sum queries on both location components without noise.

    Each predicate selects a subset of the records in the module-level `data`
    frame; the answer to a query is the sum of a location component over the
    selected records. Resembles a public query interface on a sequestered
    dataset.

    :param predicates: a list of k predicate functions, each mapping `data`
        to a 1-d boolean mask with one entry per record
    :returns: a tuple ``(long, lat)`` of two 1-d np.ndarrays of length k,
        holding the subset sums of the module-level `longitude` and
        `latitude` lists respectively
    """
    # Evaluate every predicate exactly once; column j is the mask of query j.
    membership = np.stack([pred(data) for pred in predicates], axis=1)
    long = longitude @ membership
    lat = latitude @ membership
    return long, lat

          date   time  carrier                                 location
0     20140704  62100       97                 [40.8161391, -73.904608]
1     20140706  45180       97                [40.8192885, -73.8997237]
2     20140707  17100       97                [40.8152388, -73.9160874]
3     20140709  42840       97  [40.81610072727273, -73.90402954545455]
4     20140709  68460       97                 [40.815306, -73.8987435]
...        ...    ...      ...                                      ...
3391  20140930  30480      117                      [40.7843, -73.9585]
3392  20140930  46860      117                      [40.7287, -73.9548]
3393  20140930  54000      117            [40.7764, -73.94800000000001]
3394  20140930  58320      117                      [40.7758, -73.9507]
3395  20140930  70200      117                      [40.7587, -73.9775]

[3396 rows x 4 columns]


In [111]:
def make_random_predicate():
    """Build a (pseudo)random predicate that hashes the public identifiers.

    One random integer weight in ``[0, 691)`` is drawn per column named in
    the module-level `pub` list. The returned function takes the dot product
    of each record's public columns with those weights, reduces it modulo 691
    and keeps the parity bit.

    :returns: a function mapping a frame to a 1-d boolean ndarray
    """
    modulus = 691
    weights = np.random.randint(modulus, size=len(pub))

    def predicate(data):
        # `@` is the dot product, `%` the modulus
        hashed = (data[pub].values @ weights) % modulus
        return (hashed % 2).astype(bool)

    return predicate

In [112]:
def reconstruction_attack(data_pub, predicates, answers):
    """Reconstruct both target components from the `answers` to subset-sum queries.

    The predicates are evaluated on `data_pub` to form a k x n 0/1 matrix (one
    row per query, one column per record). For each component a least-squares
    solution of ``matrix @ x = answers`` is computed.

    :param data_pub: data of length n consisting of public identifiers
    :param predicates: a list of k predicate functions
    :param answers: an indexable pair; ``answers[0]`` and ``answers[1]`` are
        the k query answers for the first and second component
    :return: a list of n ``(first, second)`` tuples of reconstructed values
    """
    # one row per predicate: which records that query selected
    design = np.array([predicate(data_pub) for predicate in predicates])

    def solve_component(component_answers):
        rhs = np.array(list(component_answers))
        solution, *_ = np.linalg.lstsq(design, rhs, rcond=None)
        return solution

    sol_long = solve_component(answers[0])
    sol_lat = solve_component(answers[1])
    return list(zip(sol_long, sol_lat))

In [117]:
def test_reconstruction_attack():
    """Run the attack against exact answers and print the recovery rates.

    Draws `n` random predicates, queries the exact subset sums, reconstructs
    the target column, and prints the percentage of records whose first
    component, second component, and both components match the true values
    after rounding to 8 decimal places.
    """
    preds = [make_random_predicate() for _ in range(n)]
    ans = execute_subsetsums_exact(preds)
    recon = reconstruction_attack(data, preds, ans)

    precision = 8

    def component_matches(component):
        # per-record flag: does the rounded reconstruction equal the rounded truth?
        return [
            round(float(truth[component]), precision) == round(guess[component], precision)
            for truth, guess in zip(data[target], recon)
        ]

    # NOTE(review): the labels follow the file's naming, where component 0 is
    # called "longitude" and component 1 "latitude" - confirm against the data.
    long = component_matches(0)
    lat = component_matches(1)

    # percentage of first components recovered
    print("longitude", 100 * (sum(long) / n))

    # percentage of second components recovered
    print("latitude", 100 * (sum(lat) / n))

    # percentage of records with both components recovered
    both_correct = sum(1 for long_ok, lat_ok in zip(long, lat) if long_ok and lat_ok)
    print("exact location", 100 * (both_correct / n))
test_reconstruction_attack()

longitude 96.99646643109541
latitude 96.96702002355713
exact location 96.87868080094229


In [None]:
def execute_subsetsums_gaussian(sigma, predicates):
    """Answer subset-sum queries on both location components with Gaussian noise.

    Mirrors `execute_subsetsums_exact`, then perturbs every answer with an
    independent zero-mean Gaussian draw. Resembles a public query interface
    on a sequestered dataset that protects its answers with additive noise.

    :param sigma: standard deviation of the noise added to each answer
    :param predicates: a list of k predicate functions, each mapping the
        module-level `data` frame to a 1-d boolean mask with one entry per
        record
    :returns: a tuple ``(long, lat)`` of two 1-d np.ndarrays of length k with
        the noisy subset sums of the module-level `longitude` and `latitude`
        lists, in the same layout `reconstruction_attack` reads
    """
    # Evaluate every predicate exactly once; column j is the mask of query j.
    membership = np.stack([pred(data) for pred in predicates], axis=1)

    # The `location` column stores each record as a list of coordinate
    # strings, so it cannot be multiplied directly; use the parsed float
    # lists instead.
    long = longitude @ membership
    lat = latitude @ membership

    # Separate draws so the two components do not share noise.
    noisy_long = long + np.random.normal(0, sigma, len(long))
    noisy_lat = lat + np.random.normal(0, sigma, len(lat))
    return noisy_long, noisy_lat