In [None]:
!pip install syft==0.2.9

# Differential Privacy example for a single feature database

The database is a single boolean column with 5000 rows. We initialize it as a random list of 1s and 0s.

In [None]:
import torch
num_entries = 5000

# Threshold uniform samples at 0.5 so each row is 1 or 0, stored as uint8.
db = (torch.rand(num_entries) > 0.5).to(torch.uint8)
db

tensor([1, 0, 1,  ..., 1, 1, 1], dtype=torch.uint8)

Key to the definition of differential privacy is the ability to ask the question: "When querying a database, if I removed someone from the database, would the output of the query be any different?" To check this, we must construct what we term "parallel databases", which are simply databases with one entry removed.

Create 5000 parallel databases, where each parallel database is the original database with one entry dropped. Hence each parallel database has 4999 entries.

To delete the element at index i, we take one slice of the db from 0 to i and another slice from i+1 to the end (5000), then concatenate the two slices.

In [None]:
def get_parallel_db(db, remove_index):
    """Return a copy of ``db`` with the entry at ``remove_index`` removed.

    Args:
        db: 1-D tensor holding the database entries.
        remove_index: Position of the entry to drop. Negative values count
            from the end, as in normal Python indexing.

    Returns:
        A new tensor with ``len(db) - 1`` entries.

    Raises:
        IndexError: If ``remove_index`` is outside ``[-len(db), len(db))``.
    """
    num_rows = len(db)
    if not -num_rows <= remove_index < num_rows:
        raise IndexError(
            f"remove_index {remove_index} is out of range for a database of size {num_rows}"
        )
    # Normalize negative indices; plain slicing with e.g. -1 would otherwise
    # concatenate db[:-1] with db[0:] and return 2n-1 entries.
    remove_index = remove_index % num_rows
    return torch.cat((db[:remove_index], db[remove_index + 1:]))

In [None]:
# Sanity check: removing one entry should leave 4999 of the 5000 rows.
get_parallel_db(db, 3).shape

torch.Size([4999])

In [None]:
def get_parallel_dbs(db):
    """Build every parallel database of ``db``.

    Returns a list with one tensor per index of ``db``; the i-th tensor is
    ``db`` with its i-th entry removed.
    """
    return [get_parallel_db(db, row) for row in range(len(db))]

In [None]:
# One parallel database per row of db, each with that row removed.
pdbs = get_parallel_dbs(db)

In [None]:
# Convenience function to do all these
def create_db_and_parallels(num_entries):
    """Create a random 0/1 database together with all of its parallel databases.

    Args:
        num_entries: Number of rows in the generated database.

    Returns:
        A tuple ``(db, pdbs)``: ``db`` is a uint8 tensor of ``num_entries``
        random 0/1 values, and ``pdbs`` is a list of ``num_entries`` tensors,
        the i-th being ``db`` with its i-th entry removed.
    """
    # Threshold uniform samples at 0.5 to get random 0/1 entries.
    db = (torch.rand(num_entries) > 0.5).to(torch.uint8)

    # Dropping row i = everything before i joined with everything after i.
    pdbs = [torch.cat((db[:row], db[row + 1:])) for row in range(len(db))]
    return db, pdbs

In [None]:
# Rebuild a small example: a 20-entry database and its 20 parallel databases.
db, pdbs = create_db_and_parallels(20)

In [None]:
# The full database has 20 entries.
db.shape

torch.Size([20])

In [None]:
# There is one parallel database per entry of db.
len(pdbs)

20

In [None]:
# Each parallel database is one entry shorter than db.
pdbs[0].shape

torch.Size([19])

In [None]:
# Display all parallel databases; each is db with exactly one entry removed.
pdbs

[tensor([1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],
        dtype=torch.uint8),
 tensor([1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,

## Evaluating the Privacy of a Function

In [None]:
# Fresh 20-entry database; `pdb` is the list of its parallel databases (one per removed entry).
db, pdb = create_db_and_parallels(20)
db

tensor([0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
       dtype=torch.uint8)

In [None]:
# Number of entries in the full database.
len(db)

20

In [None]:
# Number of entries in the first parallel database (one fewer than db).
len(pdb[0])

19

Create a function that queries this database. Let the query be a simple sum, i.e., for a binary vector, the number of ones.

In [None]:
def query(db):
    """Sum query: for a 0/1 database this is the number of ones.

    Returns the sum as a tensor; call ``.item()`` on it for a plain number.
    """
    total = db.sum()
    return total

# query() returns a tensor; .item() converts it to a plain Python number.
query(db).item()

11

So, the actual sum is 11. For each parallel database, with one person removed, the output of this query may change, as shown below.

In [None]:
# Query result for every parallel database, as plain numbers.
print([query(temp_db).item() for temp_db in pdb])

[11, 10, 10, 10, 10, 11, 10, 10, 11, 11, 10, 10, 11, 10, 10, 11, 11, 11, 11, 10]


We need to find the maximum distance between the query result on the full db and the query result on any of the parallel dbs, i.e., the largest change in the output caused by removing a single entry.

In [None]:
# Query result on the full database: the baseline each parallel result is compared to.
centralized_db_result = query(db) # returns torch.Tensor

# Start from a zero tensor rather than the int 0, so max_distance is a tensor
# (and max_distance.item() works) even if no parallel database changes the result.
max_distance = torch.zeros_like(centralized_db_result)
for temp_db in pdb:
    temp_db_result = query(temp_db) # returns torch.Tensor
    # L1 distance between the parallel result and the baseline.
    db_l1_distance = torch.abs(temp_db_result - centralized_db_result)
    if db_l1_distance > max_distance:
        max_distance = db_l1_distance

In [None]:
# Largest change in the query result caused by removing a single entry.
max_distance.item()

1

This is called sensitivity.

### Sensitivity:

The maximum amount by which the query changes when removing an individual from the data.

It is also called the L1-Sensitivity.

We have now seen how to find the sensitivity of the function sum(). Next, we generalize this to find the sensitivity of any query function.

In [None]:
def calculate_sensitivity(query, num_entries, verbose=False):
    """Empirically measure the L1 sensitivity of ``query`` on one random database.

    A random 0/1 database with ``num_entries`` rows is generated, along with
    every parallel database (the database with exactly one row removed). The
    sensitivity is the largest absolute difference between the query result on
    the full database and on any parallel database. Because the database is
    random, the result can vary from call to call.

    Args:
        query: Callable that takes a 1-D uint8 tensor and returns a scalar tensor.
        num_entries: Number of rows in the generated database.
        verbose: If True, print the databases and the query results.

    Returns:
        A scalar tensor holding the maximum distance. It is always a tensor,
        including when the query result never changes (distance 0), so callers
        can safely call ``.item()`` on it.
    """
    def create_db_and_parallels(num_entries):
        # Random 0/1 column, plus one copy per row with that row dropped.
        db = torch.rand(num_entries).gt(0.5).to(torch.uint8)
        pdbs = [torch.cat((db[:i], db[i + 1:])) for i in range(len(db))]
        return db, pdbs

    db, pdbs = create_db_and_parallels(num_entries)

    if verbose:
        print('Centralized data is: ', db)
        print('Parallel data are: ', pdbs)

    centralized_result = query(db)
    parallel_results = [query(temp_db) for temp_db in pdbs]

    if verbose:
        print('Centralized result: ', centralized_result)
        print('Parallel results: ', parallel_results)

    # Start from a zero tensor rather than the int 0: if no parallel result
    # differs from the centralized one, the int would be returned unchanged and
    # callers using .item() would fail.
    max_distance = torch.zeros_like(torch.as_tensor(centralized_result))
    for parallel_result in parallel_results:
        distance = torch.abs(centralized_result - parallel_result)
        if distance > max_distance:
            max_distance = distance
    L1_sensitivity = max_distance
    return L1_sensitivity

# Sensitivity of a plain sum over 20 random entries; verbose=True also prints
# the databases and the individual query results.
print('Sensitivity for sum function is: ', calculate_sensitivity(lambda x: x.sum(), 20, verbose=True).item())

Centralized data is:  tensor([0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8)
Parallel data are:  [tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       dtype=torch.uint8), tensor([0, 0, 1, 1, 1, 0,

What happens if the query function is to find the mean? What will the removal of one element from the database result in?

In [None]:
# Mean of each parallel database; entries are cast to float first since they are stored as uint8.
print([temp_db.float().mean() for temp_db in pdbs])

[tensor(0.4211), tensor(0.4211), tensor(0.4737), tensor(0.4737), tensor(0.4211), tensor(0.4737), tensor(0.4211), tensor(0.4737), tensor(0.4737), tensor(0.4211), tensor(0.4211), tensor(0.4211), tensor(0.4737), tensor(0.4737), tensor(0.4737), tensor(0.4211), tensor(0.4211), tensor(0.4737), tensor(0.4737), tensor(0.4737)]


In [None]:
# mean query
# or use lambda x: x.float().mean() instead
def mean_query(db):
    """Mean query: the average of the database entries, as a float tensor."""
    as_float = db.float()
    return as_float.mean()

# Each call draws a new random 200-entry database, so the value varies between runs.
for _ in range(10):
    print("Sensitivity for the query mean is: ", calculate_sensitivity(mean_query, 200).item())

Sensitivity for the query mean is:  0.0026633143424987793
Sensitivity for the query mean is:  0.0025628209114074707
Sensitivity for the query mean is:  0.002914607524871826
Sensitivity for the query mean is:  0.0026884078979492188
Sensitivity for the query mean is:  0.002613067626953125
Sensitivity for the query mean is:  0.0026884078979492188
Sensitivity for the query mean is:  0.0027889609336853027
Sensitivity for the query mean is:  0.0025628209114074707
Sensitivity for the query mean is:  0.0027889609336853027
Sensitivity for the query mean is:  0.0026884078979492188


In [None]:
from statistics import mean
# Average the empirical sensitivity of the mean query over 100 random 200-entry databases.
sensitivities = [calculate_sensitivity(mean_query, 200).item() for _ in range(100)]
mean(sensitivities)

0.0026711082458496095

As a result, the average sensitivity of the mean query is approximately the average entry value divided by num_entries (here about 0.5 / 200 = 0.0025).

## Calculate the L1-Sensitivity of Threshold Function

Compute the sum over the database and return whether the sum is greater than a certain threshold.

Then create a database of size 10 and calculate the sensitivity for the threshold function

Reinitialize the DB 10 times and find the sensitivity.

In [None]:
def query(db, threshold=5):
    """Threshold query: 1.0 if the sum of ``db`` is greater than ``threshold``, else 0.0.

    The result is returned as a float tensor.
    """
    exceeds_threshold = db.sum() > threshold
    return exceeds_threshold.float()

# 10-entry database plus its parallel databases for the threshold experiment.
db, pdbs = create_db_and_parallels(10)

# Compare the sum of the full database with the sum of each parallel database.
print('Sum of db: ', db.sum())
for pdb in pdbs:
    print('Sum: ', pdb.sum())

Sum of db:  tensor(7)
Sum:  tensor(6)
Sum:  tensor(7)
Sum:  tensor(6)
Sum:  tensor(6)
Sum:  tensor(6)
Sum:  tensor(7)
Sum:  tensor(6)
Sum:  tensor(7)
Sum:  tensor(6)
Sum:  tensor(6)


In [None]:
# Apply the threshold query (default threshold 5) to every parallel database.
for pdb in pdbs:
    print('Threshold for 5 is:', query(pdb))

Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)
Threshold for 5 is: tensor(1.)


In [None]:
for _ in range(5):
    # Each call draws a fresh random 10-entry database. Capture and print the
    # returned sensitivity so the cell shows the value it is meant to demonstrate.
    sensitivity = calculate_sensitivity(query, num_entries=10, verbose=True)
    print('Sensitivity: ', sensitivity)
    print('=========================================================================================================================')

Centralized data is:  tensor([0, 1, 1, 0, 1, 0, 0, 1, 0, 0], dtype=torch.uint8)
Parallel data are:  [tensor([1, 1, 0, 1, 0, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 1, 0, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 0, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 1, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 1, 0, 1, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 1, 0, 0, 0, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 1, 0, 0, 1, 0], dtype=torch.uint8), tensor([0, 1, 1, 0, 1, 0, 0, 1, 0], dtype=torch.uint8)]
Centralized result:  tensor(0.)
Parallel results:  [tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.)]
Centralized data is:  tensor([0, 0, 1, 1, 1, 0, 0, 0, 0, 0], dtype=torch.uint8)
Parallel data are:  [tensor([0, 1, 1, 1, 0, 0, 0, 0, 0], dtype=torch.uint8), tensor([0,

The sensitivity is variable here: it depends on the sum, and hence on the elements in the database.