In [15]:
!pip install syft==0.2.9 >/dev/null

# Local and Global Differential Privacy

### Local DP

* Adding noise directly to the database or having individuals add noise to their own data before putting it into the database.

* Protection is happening at a local level.

### Global DP

* Adds noise to the output of the query to the database

* More accurate results to the query

* But requires the DB administrator or central server to be trustworthy.


## How much Noise should be added
### Randomized Response:
https://www.youtube.com/watch?v=0QuwEoesV9Q

Added privacy comes at the cost of accuracy.

### Goal of Differential Privacy:
* How to get the most accurate results with the greatest amount of privacy.
* Greatest fit with trust models in the actual world - flexible DP strategy

### Implementing Randomized Response using Plausible Deniability

* Flip coins using a binary rand generator
* For each entry in the DB, flip two coins
* If the first coin is Heads, leave the entry in the DB as it is. If it is Tails, replace that entry with the outcome of the second coin flip.
* Then, perform a query on the original DB and on the modified DB according to the coin flips
* Perform the mean query on both these DB
* Study how much the noise changes w.r.t the size of the DB.




In [16]:
import torch

class CalculateSensitivity:
    "This is a class that contains reusable methods for initializing parallel dbs \
with number of entries. Use classObject.create_db_and_parallels(num_entries) to \
get db and parallel db list. Use classObject.sensitivity(query, num_entries, verbose) \
to find the sensitivity of the query function"
    # NOTE: the docstring text above is kept unchanged on purpose because the
    # notebook displays it verbatim through `helper.__doc__`.

    def get_parallel_db(self, db, remove_index):
        """Return a new tensor equal to ``db`` without the entry at ``remove_index``."""
        return torch.cat((db[0:remove_index], db[remove_index+1:]))

    def get_parallel_dbs(self, db):
        """Return a list with one parallel database per entry of ``db``.

        The i-th element is ``db`` with its i-th entry removed.
        """
        return [self.get_parallel_db(db, index) for index in range(len(db))]

    def create_db_and_parallels(self, num_entries):
        """Create a random binary database and all of its parallel databases.

        Returns:
            (db, pdbs): ``db`` is a uint8 tensor of ``num_entries`` fair coin
            flips; ``pdbs`` is the list produced by ``get_parallel_dbs(db)``.
        """
        db = torch.rand(num_entries).gt(0.5).to(torch.uint8)
        pdbs = self.get_parallel_dbs(db)
        return db, pdbs

    @staticmethod
    def _as_signed(result):
        """Return ``result`` as a tensor that can be subtracted safely.

        uint8 results would wrap around on subtraction (0 - 1 == 255) and bool
        results cannot be subtracted at all, so both are promoted to int64.
        Every other dtype is returned unchanged.
        """
        result = torch.as_tensor(result)
        if result.dtype in (torch.uint8, torch.bool):
            return result.to(torch.int64)
        return result

    def sensitivity(self, query, num_entries, verbose=False):
        """Empirically measure the L1 sensitivity of ``query``.

        A random database with ``num_entries`` entries is created, the query is
        run on it and on each parallel database, and the largest absolute
        difference between the full result and a parallel result is returned.

        Args:
            query: callable taking a database tensor and returning a scalar.
            num_entries: size of the random database to generate.
            verbose: when true, print the databases and the query results.

        Returns:
            A 0-dim tensor holding the maximum distance. It is always a tensor,
            including when every distance is zero.
        """
        db, pdbs = self.create_db_and_parallels(num_entries)

        if verbose:
            print('Centralized data is: ', db)
            print('Parallel data are: ', pdbs)

        centralized_result = query(db)
        parallel_results = [query(temp_db) for temp_db in pdbs]

        if verbose:
            print('Centralized result: ', centralized_result)
            print('Parallel results: ', parallel_results)

        central = self._as_signed(centralized_result)
        # Start from a tensor zero so the return type does not depend on the data.
        max_distance = torch.zeros_like(central)
        for parallel_result in parallel_results:
            distance = torch.abs(central - self._as_signed(parallel_result))
            if max_distance < distance:
                max_distance = distance
        return max_distance

# sens = CalculateSensitivity()
# sens.sensitivity(lambda x:x.sum(), num_entries=10, verbose=True)

In [17]:
# Create one CalculateSensitivity instance and display its class docstring.
helper = CalculateSensitivity()
helper.__doc__

'This is a class that contains reusable methods for initializing parallel dbs with number of entries. Use classObject.create_db_and_parallels(num_entries) to get db and parallel db list. Use classObject.sensitivity(query, num_entries, verbose) to find the sensitivity of the query function'

In [18]:
# Build a random 100-entry binary database (its parallel databases are returned too) and display it.
db, pdbs = helper.create_db_and_parallels(num_entries=100)
db

tensor([0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        0, 0, 1, 0], dtype=torch.uint8)

In [19]:
# Mean of the unmodified database, computed on a float copy of the uint8 tensor.
true_result = db.float().mean()
true_result

tensor(0.4600)

In [20]:
# The comparison yields a boolean tensor; .float() turns it into 0./1. coin flips.
first_coin_flip = torch.rand(len(db)).gt(0.5).float()
first_coin_flip

tensor([1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1.,
        0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0.,
        1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
        1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1.,
        1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1.,
        0., 1., 1., 0., 1., 0., 0., 0., 1., 0.])

In [21]:
# Second, independent fair coin per entry, again encoded as 0./1. floats.
second_coin_flip = torch.rand(len(db)).gt(0.5).float()
second_coin_flip

tensor([0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0.,
        0., 1., 0., 1., 1., 0., 0., 0., 0., 0.])

In [22]:
# Keep the original db value where the first coin is 1; otherwise take the second coin's value.
augmented_db = [
    value.item() if first_coin.item() == 1.0 else second_coin.item()
    for value, first_coin, second_coin in zip(db.float(), first_coin_flip, second_coin_flip)
]
torch.FloatTensor(augmented_db)

tensor([0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
        0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.,
        0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 0., 1., 0.])

In [23]:
# Vectorized form of the same selection (idea credited to Andrew Trask):
# the first coin acts as a 0/1 mask that picks either the true entry or the second coin.
kept_entries = db.float() * first_coin_flip
replaced_entries = (1 - first_coin_flip) * second_coin_flip
augmented_database = kept_entries + replaced_entries
augmented_database

tensor([0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
        0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.,
        0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 0., 1., 0.])

### So creating a db, masking with local differential privacy and evaluating with mean

In [24]:
# Fresh 100-entry random database; the parallel databases are not needed here and are discarded.
num_entries = 100
db, _ = helper.create_db_and_parallels(num_entries=num_entries)

def randomized_response(db):
    """Apply two-coin randomized response to every entry of ``db``.

    For each entry a first fair coin decides whether the true value is kept (1)
    or replaced (0) by the outcome of a second fair coin. Returns a float tensor
    with the same length as ``db``.
    """
    size = len(db)
    keep_mask = torch.rand(size).gt(0.5).float()
    random_answers = torch.rand(size).gt(0.5).float()
    truthful_part = db.float() * keep_mask
    randomized_part = (1 - keep_mask) * random_answers
    return truthful_part + randomized_part

augmented_database = randomized_response(db) # already a FloatTensor: randomized_response converts db with .float()

In [25]:
# Compare the mean of the real data with the mean of its randomized-response version.
true_result = db.float().mean()
augmented_result = augmented_database.mean()
[true_result, augmented_result]

[tensor(0.5400), tensor(0.4700)]

In [26]:
# Creating experiment function to try with multiple num_entries

def local_differential_privacy_exp(num_entries):
    """Run one randomized-response experiment on a fresh random database.

    Args:
        num_entries: number of binary entries in the generated database.

    Returns:
        Tuple ``(true result, augmented result)``: the mean of the database and
        the mean of its randomized-response version.
    """
    # Only the database itself is needed, so it is sampled directly with the same
    # recipe as CalculateSensitivity.create_db_and_parallels. That method would also
    # build num_entries parallel databases of num_entries - 1 entries each
    # (O(n^2) memory, roughly 10 GB for 100000 entries).
    db = torch.rand(num_entries).gt(0.5).to(torch.uint8)
    augmented_database = randomized_response(db)
    return (torch.mean(db.float()), torch.mean(augmented_database))

local_differential_privacy_exp(100)

(tensor(0.4900), tensor(0.4500))

In [27]:
# Repeat the experiment for each database size and print the (true, augmented) pair of means.
db_sizes = (10, 100, 1000, 10000, 100000)
for db_size in db_sizes:
    experiment_result = local_differential_privacy_exp(db_size)
    print(experiment_result)

(tensor(0.4000), tensor(0.3000))
(tensor(0.5500), tensor(0.5300))
(tensor(0.5000), tensor(0.5100))
(tensor(0.5064), tensor(0.5011))
(tensor(0.4988), tensor(0.4989))


### So as the number of items in the DB increases, the accuracy will also increase for a fixed level of privacy despite the added noise

* Local differential privacy is data hungry and will only work well in terms of accuracy if the database is large.
* If you want privacy but have less data, it will be better to add noise to the output of the query rather than to the data itself.

## How to understand such a result:
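Suppose the query on the DP data returns a mean of .6. Each reported answer is the true answer half of the time and a fair coin flip (mean .5) the other half, so the reported mean (.6) is the average of the true mean x and .5. We can therefore solve for x and infer that the actual mean is .7.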

Solving

(.5+x)/2 = .6

.5 + x = 1.2

x = .7

Thus, if we collect a bunch of samples and it turns out that 60% of people answer yes, then we know that the TRUE distribution is actually centered around 70%, because 70% averaged with 50% (a coin flip) is 60%, which is the result we obtained.

## Think of it this way
True distribution's mean is say .7, ie 70% of the people said yes

Our custom noise distribution's mean is 0.5, ie coin flip 

Then the resulting augmented distribution's mean will be the average of the true and the custom (noise) distribution, i.e. avg(0.7, 0.5), which is 0.6. Hence, if the result of the augmented query is 0.6, we should de-skew the result to recover 0.7 by computing 0.6 (augmented result) * 2 - 0.5 (noise mean). The same is done in my query function below:

## Varying the Amount of Randomness/noise
In other words, change the probability of the coin flip. Then also rebalance the result of the query on the DP dataset so that, in expectation, we get the same result we would have gotten by giving the query to a non-DP dataset.

Augment the query to allow for varying amount of noise added to the dataset. Then bias the coin flip and make the dataset DP. 

In [28]:
# Mean query under randomized response with two fair coins.
# Returns (de-skewed result on the DP data, true result).
def query(db):
    """Return ``(private_mean, true_mean)`` for the binary database ``db``.

    Every entry is kept with probability 0.5 and otherwise replaced by a fair
    coin flip; the noisy mean is then de-skewed with ``mean * 2 - 0.5``.
    """
    values = db.float()
    size = len(db)
    keep_mask = torch.rand(size).gt(0.5).float()
    random_answers = torch.rand(size).gt(0.5).float()

    noisy_values = values * keep_mask + (1 - keep_mask) * random_answers
    private_mean = noisy_values.mean() * 2 - 0.5  # undo the pull toward 0.5

    return private_mean, values.mean()

In [29]:
# query function with noise hyperparameter
def query(db, noise=0.5):
    """Mean query under randomized response with a tunable amount of noise.

    Each entry is replaced by a fair coin flip with probability ``noise`` and
    kept otherwise, so the noisy mean is ``(1 - noise) * true + noise * 0.5``.
    That relation is inverted to de-skew the result.

    Args:
        db: binary database tensor.
        noise: probability in ``[0, 1)`` that an entry is replaced.

    Returns:
        Tuple ``(private_result, true_result)`` of 0-dim float tensors.

    Raises:
        ValueError: if ``noise`` is outside ``[0, 1)``. At ``noise == 1`` no
            information about the data survives, so the estimate is undefined.
    """
    if not 0 <= noise < 1:
        raise ValueError('noise must satisfy 0 <= noise < 1, got {}'.format(noise))

    values = db.float()
    true_result = values.mean()
    first_coin_flip = (torch.rand(len(db)) > noise).float()
    second_coin_flip = (torch.rand(len(db)) > 0.5).float()

    augmented_database = values*first_coin_flip + (1-first_coin_flip)*second_coin_flip
    skewed_result = augmented_database.mean()
    # Algebraically equal to ((skewed/noise) - 0.5) * noise/(1-noise), but it
    # never divides by `noise`, so noise == 0 returns the exact mean, not NaN.
    private_result = (skewed_result - 0.5*noise) / (1 - noise)

    return private_result, true_result

In [36]:
# Sweep every noise level against every database size and print the private and true means.
noise_set = [tenths / 10 for tenths in range(1, 10)] #0.1, 0.2, ...
database_size_set = [50, 100, 500, 1000]

for noise in noise_set:
    print('*******************************')
    print('Setting noise as ', noise)
    for database_size in database_size_set:
        print('For size of dataset: ', database_size)
        db, _ = helper.create_db_and_parallels(num_entries=database_size)
        private_result, true_result = query(db, noise=noise)
        print('Result with noise: ', private_result)
        print('True Result: ', true_result)
        print('................................')
    print(' ')

*******************************
Setting noise as  0.1
For size of dataset:  50
Result with noise:  tensor(0.5889)
True Result:  tensor(0.5600)
................................
For size of dataset:  100
Result with noise:  tensor(0.5667)
True Result:  tensor(0.5200)
................................
For size of dataset:  500
Result with noise:  tensor(0.4978)
True Result:  tensor(0.5000)
................................
For size of dataset:  1000
Result with noise:  tensor(0.4778)
True Result:  tensor(0.4820)
................................
 
*******************************
Setting noise as  0.2
For size of dataset:  50
Result with noise:  tensor(0.4250)
True Result:  tensor(0.4800)
................................
For size of dataset:  100
Result with noise:  tensor(0.4875)
True Result:  tensor(0.4600)
................................
For size of dataset:  500
Result with noise:  tensor(0.4900)
True Result:  tensor(0.4940)
................................
For size of dataset:  1000
Res

## The more people participate, the easier it gets to protect and increase the privacy of each person. It also gets easier to learn from the people without violating the participants' privacy.