# Generate Parallel Database

In [1]:
import torch 

def remove_by_index(db, index):
    return torch.cat((db[0:index], db[index+1:]),0)

def get_parallel_dbs(db):
    
    dbs = []
    for n in range(len(db)):
        parallel_db = remove_by_index(db, n)
        dbs.append(parallel_db)
    return dbs

def get_db_and_parallels(num_entries):
    db = torch.empty(num_entries).random_(2)
    pdbs = get_parallel_dbs(db)
    return db, pdbs

In [2]:
dbs = get_db_and_parallels(2000)

In [3]:
dbs

(tensor([1., 0., 1.,  ..., 1., 1., 0.]),
 [tensor([0., 1., 1.,  ..., 1., 1., 0.]),
  tensor([1., 1., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1., 0.]),
  tensor([1., 0., 1.,  ..., 1., 1.,

# Towards Evaluating The Differential Privacy of Function


In [4]:
db, pdbs = get_db_and_parallels(20)

In [5]:
def query(db):
    return db.sum()

In [6]:
full_db_result = query(db)

In [7]:
# This is called the "L1 Sensitivity" or "Sensitivity" for short
# L1 because it's using absolute

"""
    SENSITIVTY: The maximum amount that the query 
                changes when removing an individual
                from the database.
                
                It allows us to understand how much
                a certain query leak or do not leak
                information.
"""
sensitivity = 0

for pdb in pdbs:
    pdb_result = query(pdb)
    
    # absolute becasuse we don't care if negative
    db_dinstance = torch.abs(full_db_result - pdb_result)
    
    if db_dinstance > sensitivity:
        sensitivity = db_dinstance
print(sensitivity)

tensor(1.)


In [8]:
# Consolodation all the code above to a single function

def get_sensitivity(query, n_entries):
    db, pdbs = get_db_and_parallels(n_entries)
    
    full_db_result = query(db)
    
    sensitivity = 0

    for pdb in pdbs:
        pdb_result = query(pdb)

        # absolute becasuse we don't care if negative
        db_dinstance = torch.abs(full_db_result - pdb_result)

        if db_dinstance > sensitivity:
            sensitivity = db_dinstance
            
    return sensitivity, db, pdbs

In [9]:
get_sensitivity(query, 100)[0]

tensor(1.)

In [10]:
"""
    Note that the purpose of the sensitivity function 
    is not to find how the function changes when a 
    value is removed, but about how much it changes
    when all the value that corespond to a certain 
    person are removed. Person, not individual value.
"""

mean_query = lambda x: x.float().mean()

get_sensitivity(mean_query, 100)[0]



tensor(0.0051)

# Calculate L1 Sensitivity For Threshold

In [11]:
"""
    Steps:
        1. Create the query() function
        2. Create 10 databases of size 10
        3. Query each database with a 
           threshold of 5 (calculate
           sensitivity)
        4. Print out the sensitivity of 
           each database
"""

# Query function
def query(db, threshold=5):
    return (db.sum() > threshold).float()

# Step 2-3
for i in range(0,9):
    print(f"Sensitivity of DB {i+1}")
    print(get_sensitivity(query, 10))

Sensitivity of DB 1
(0, tensor([0., 1., 0., 1., 1., 1., 1., 0., 0., 0.]), [tensor([1., 0., 1., 1., 1., 1., 0., 0., 0.]), tensor([0., 0., 1., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 1., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 0., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 1., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 1., 0., 0.]), tensor([0., 1., 0., 1., 1., 1., 1., 0., 0.])])
Sensitivity of DB 2
(0, tensor([0., 0., 1., 1., 0., 0., 1., 1., 0., 1.]), [tensor([0., 1., 1., 0., 0., 1., 1., 0., 1.]), tensor([0., 1., 1., 0., 0., 1., 1., 0., 1.]), tensor([0., 0., 1., 0., 0., 1., 1., 0., 1.]), tensor([0., 0., 1., 0., 0., 1., 1., 0., 1.]), tensor([0., 0., 1., 1., 0., 1., 1., 0., 1.]), tensor([0., 0., 1., 1., 0., 1., 1., 0., 1.]), tensor([0., 0., 1., 1., 0., 0., 1., 0., 1.]), tensor([0., 0., 1., 1., 0., 0., 1., 0., 1.]), tensor([0., 0., 1., 1

So far, the sensitivity function that we've implmented is a **non-data coditioned sensitivity** function. That means that the sensitivity is based on the fucntion and what we know about the data. In this case, from a theoretical stand point, we know that maximum value or a maximum range of a sensitivity on this dataset is 1 (inclusing a threshold). Theoretically, this is what we want do during first time implementing differential privacy.

Note: I'm assuming that Andrew (the lecturer) means that we could now this by not actually looking at the database. 

However, if we take a peek at the data, we know that sometimes it's going to be one and sometimes it's not one. There are futher (more advance) sensitivity function, which is called **data conditioned sensitivity** that involves taking a peek at the dataset. So the sensitiviy can be calculated not just based on the range that we know that data could take, but also the actual values in the database.

# Perform a  Differencing on Row 10

Exlosing the value of a person represented by Row 10 in the database

Function to use:
- Sum
- Mean
- Threshodl

In [12]:
# Original DB
db, _= get_db_and_parallels(100)

# DB with Row 10 removed
pdb = remove_by_index(db, 10)

In [13]:
len(pdb), len(db)

(99, 100)

In [14]:
# Differencing attack via sum function

sum(db) - sum(pdb)

tensor(1.)

In [15]:
# Differencing attack via mean function

(sum(db).float() / len(db)) - (sum(pdb).float() / len(pdb) )

tensor(0.0059)

In [16]:
# Differencing attack via threshold function

(sum(db).float() > 50).float() - (sum(pdb).float() > 50 ).float()

tensor(0.)

Differencing attack is near to the heart behind intuition of differencial privacy. Developing differencial privacy techniques involces the need of immunity against these kinds of attack.

# Simple Local Differential Privacy

Steps:
1. Create a coin flipper (1: heads, 0: tails)
2. Create database with size 10, 100, 1000, 10000
3. Modified for all database:
        for all value in database:
            flip1 -> flip coin
            flip2 -> flip coin
            if flip1 == 1, let value as it is
            otherwise, set value as flip2

In [None]:
# Create a coin flipper (1: heads, 0: tails)
import random
def coin_flip():
    return random.choice([0,1])

# Create database with size 10, 100, 1000, 10000
ten, _ = get_db_and_parallels(10)
hundred, _ = get_db_and_parallels(100)
thousand, _ = get_db_and_parallels(1000)
ten_thousand, _ = get_db_and_parallels(10000)

# Function to add noise
def noise_db(db):
    new_db = []
    
    for i in range(len(db)):
        flip_1 = coin_flip()
        flip_2 = coin_flip()

        if flip_1 == 1:
            new_db.append(db[i])
        else:
            new_db.append(flip_2)
    return torch.FloatTensor(new_db)

# Cereate noised db
noised_ten = noise_db(ten)
noised_hundred = noise_db(hundred)
noised_thousand = noise_db(thousand)
noised_ten_thousand = noise_db(ten_thousand)

In [None]:
def mean_query(db):
    return (db.sum() / len(db))

def print_db_noised_comparison(original_db, noised_db):
    original = mean_query(original_db)
    noised = mean_query(noised_db)
    output = f"DB of Size {len(original_db)}\nQuery of Original {original}\nQuery of Noised: {noised}\n"
    print(output)

In [None]:
print_db_noised_comparison(ten, noised_ten)
print_db_noised_comparison(hundred, noised_hundred)
print_db_noised_comparison(thousand, noised_thousand)
print_db_noised_comparison(ten_thousand, noised_ten_thousand)

Above is my approach in solving the challenge. However, I'd suggest the following solution using mostly PyTorch by Andrew in the course.

In [None]:
db, _ = get_db_and_parallels(100)

flip_1 = (torch.rand(len(db)) > 0.5).float()
flip_2 = (torch.rand(len(db)) > 0.5).float()

augmented_db = db.float() * flip_1 + (1 - flip_1) * flip_2

Further explantion for the code above:

`db.float() * flip_1`: Here we want to get an array that contains which value will be the actual value and which one will be the value of the second flip of the coin. `flip_1` acts as a mask for the db values.

`(1 - flip_1)`: Contains the values where we want to put our random values of `flip_2`. All the 1s are the where we want the random values.

`(1 - flip_2) * flip_2`: All the values that are being sampled randomly.

Note: Based on the course, the man of the augmented database is 60%, which we can get by adding the "actual" statistic (in this case 70%) with 50% and diving the result by two. However, this might be different since our initialization of the database is random. In my case, the mean of the augmented database is 47%, and the "actual" statistic is 44%.

In [None]:
torch.mean(augmented_db.float()) 

In [None]:
db_result = torch.mean(augmented_db.float()) * 2 - 0.5
db_result

In [None]:
# Wrap it in to a single query function

def query(db):
    
    true_result = torch.mean(db.float())
    flip_1 = (torch.rand(len(db)) > 0.5).float()
    flip_2 = (torch.rand(len(db)) > 0.5).float()

    augmented_db = db.float() * flip_1 + (1 - flip_1) * flip_2
    db_result = torch.mean(augmented_db.float()) * 2 - 0.5
    
    return db_result, true_result

In [None]:
db, _ = get_db_and_parallels(10)
db_result, true_result = query(db)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

In [None]:
db, _ = get_db_and_parallels(100)
db_result, true_result = query(db)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

In [None]:
db, _ = get_db_and_parallels(1000)
db_result, true_result = query(db)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

In [None]:
db, _ = get_db_and_parallels(10000)
db_result, true_result = query(db)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

As we can see here, it seems tha the mean with and without the noise tend to be similar as the number of our dataste grows. This make sense since we are corupting the data points by adding noises. 

As local differential privacy is data hungry, the approach is suitable when we need to protect data at an individual level, with the condition that it is sufficiency large (not sure how large, but based on this implementaion it seems in the magnitude of tens of thousans suffice). 

On the other hand, we have global differential privacy which only add noise on the output, and this is suitable we have smaller dataset and still need to protect it.

# Varying Amount of Noise


In this challenge, we are supposed to add a parameter in the query function. The parameter will be the amount of noise that we want to apply to the first coin flip, meaning that there are going to be more 1s and 0s or vice versa.

The folowing is my solution before seeing the actual solution.

In [17]:
def query(db, flip_1_noise=0.5):
    
    true_result = torch.mean(db.float())
    flip_1 = (torch.rand(len(db)) > flip_1_noise).float()
    flip_2 = (torch.rand(len(db)) > 0.5).float()

    augmented_db = db.float() * flip_1 + (1 - flip_1) * flip_2
    db_result = torch.mean(augmented_db.float()) * 2 - 0.5
    
    return db_result, true_result

In [18]:
print("##### Noised with 0.7 probability for First Flip #####\n")

db, _ = get_db_and_parallels(10)
db_result, true_result = query(db, flip_1_noise=0.7)
print("DB of Size 10")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, flip_1_noise=0.7)
print("DB of Size 100")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(1000)
db_result, true_result = query(db, flip_1_noise=0.7)
print("DB of Size 1000")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(10000)
db_result, true_result = query(db, flip_1_noise=0.7)
print("DB of Size 10000")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

##### Noised with 0.7 probability for First Flip #####

DB of Size 10
With Noise: 0.8999999761581421
Without Noise: 0.5
DB of Size 100
With Noise: 0.48000001907348633
Without Noise: 0.6000000238418579
DB of Size 1000
With Noise: 0.5219999551773071
Without Noise: 0.492000013589859
DB of Size 10000
With Noise: 0.5046000480651855
Without Noise: 0.49059998989105225


In [19]:
print("##### Noised with 0.2 probability for First Flip #####\n")

db, _ = get_db_and_parallels(10)
db_result, true_result = query(db, flip_1_noise=0.2)
print("DB of Size 10")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, flip_1_noise=0.2)
print("DB of Size 100")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(1000)
db_result, true_result = query(db, flip_1_noise=0.2)
print("DB of Size 1000")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

db, _ = get_db_and_parallels(10000)
db_result, true_result = query(db, flip_1_noise=0.2)
print("DB of Size 10000")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

##### Noised with 0.2 probability for First Flip #####

DB of Size 10
With Noise: 0.10000002384185791
Without Noise: 0.4000000059604645
DB of Size 100
With Noise: 0.5199999809265137
Without Noise: 0.5099999904632568
DB of Size 1000
With Noise: 0.531999945640564
Without Noise: 0.503000020980835
DB of Size 10000
With Noise: 0.5055999755859375
Without Noise: 0.5023999810218811


Looks like everything runs well. It seems noe matter how much we add noise, the database with the nose and tihout seems to have similar means as the size of the dataset gets larger. 

**However**, there is a part that I actually missed after checking out the solution by Andrew. Specifically, it was this part

```
db_result = torch.mean(augmented_db.float()) * 2 - 0.5
```

How so? The thing is that in the previous implementation when the noise of the first flip is 0.5 is applied to the dataset; in other words, we want to apply the result of the second coin flip, where all the first coin flip values are 0s. By doing that, we know that 1/2 of the dataset's value are answered honestly, while the other 1/2 are all values of the second coin flip. 

based on the course, The "true" statistics was 70% (mine was 44%). This means that 1/2 of the dataset are 1s 70% of the time (equivalent t answering yes), while the other 1/2 has values of 1s 50% of the time. This is the case we know the "actual" statistic.

However, what we are doing here is pretending that we don't actually know the real statistic (the 70%). And we get is only the noised database mean, which was 60% based on the lecture. From that 60%, we know that one 1/2 of the dataset contained the true statistics, while the other one are 50/50 coin flips. Therefore, in order to get the actual statistic, we just need to do the following
```
0.6 * 2 - 0.5 = 70 
```

since 
```
(70 + 50) / 2 = 60 
```

The denoising method we used by averaging is sound since we know that the nosie of the dataset is 50% since it's a 50/50 coin flip. However, when we have a noise that is not necessarily 50/50, this method is not the best approach since the distribution has been applied a noise so that the mean is no longer around 0.5. Therefore, we need to use a differenr denoising method.

In the course, Andrew mentioned that we can see the averaging method above as the following
```
(true_db_mean * noise) + (noised_db_mean * (1 - noise))
```
If we insert the noise of 0.5 and do a little bit of algebra, then we actually have the same as averaging the true_db_mean and noised_db_mean by 2

```
noise = 0.5
(true_db_mean * noise) + (noised_db_mean * (1 - noise))
= 0.5 * true_db_mean  + 0.5 * noised_db_mean
= 1/2 * true_db_mean  + 1/2 * noised_db_mean
= (true_db_mean + noised_db_mean) / 2
```

And if we set `true_db_mean` as 70% and `noised_db_mean` as 50%, then we have
```
(0.7 * 0.5) + (0.5 * 0.5) = 0.6 = augmented_db_mean
```

Now we know that in order to get the mean of the augmented database, we can use the following
```
augmented_db_mean = (true_db_mean * noise) + (noised_db_mean * (1 - noise))
```

That means, with a little bit of algebra, we can find the formula to find the true mean of the database 
```
true_db_mean = ((augmented_db_mean / noise) - 0.5) * noise / (1 - noise)
```

Knowing this, now we can modify the query function we created before.

In [20]:
def query(db, noise=0.5):
    
    true_result = torch.mean(db.float())
    flip_1 = (torch.rand(len(db)) > noise).float()
    flip_2 = (torch.rand(len(db)) > 0.5).float()

    augmented_db = db.float() * flip_1 + (1 - flip_1) * flip_2
    
    sk_result = augmented_db.float().mean()
    db_result = ((sk_result / noise) - 0.5) * noise / (1 - noise)
    
    return db_result, true_result

In [21]:
db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, noise=0.1)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

With Noise: 0.5222222208976746
Without Noise: 0.5099999904632568


In [22]:
db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, noise=0.2)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

With Noise: 0.4125000238418579
Without Noise: 0.46000000834465027


In [23]:
db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, noise=0.4)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

With Noise: 0.5999999642372131
Without Noise: 0.6100000143051147


In [24]:
db, _ = get_db_and_parallels(100)
db_result, true_result = query(db, noise=0.8)
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

With Noise: 0.34999993443489075
Without Noise: 0.47999998927116394


In [25]:
db, _ = get_db_and_parallels(10000)
db_result, true_result = query(db, noise=0.8)
print("DB of Size 10")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

DB of Size 10
With Noise: 0.4870000183582306
Without Noise: 0.49459999799728394


In [26]:
db, _ = get_db_and_parallels(10000)
db_result, true_result = query(db, noise=0.4)
print("DB of Size 10")
print(f"With Noise: {db_result}")
print(f"Without Noise: {true_result}")

DB of Size 10
With Noise: 0.5085000395774841
Without Noise: 0.49810001254081726


There's something interesting going on here. As we can see, we started of by using  a dataset of size 100 with a noise of 0.1. With such a small noise we can see that that the average are almost similar. However, as we increase the noise, we see that the mean starts to have a larger difference. To counter this, we can add the number of dataset size (of 10,000) and we have difference of the two means to be smaller again. And if we decrease the noise of the larger dataset, we get an even smaller difference between the means.

The main ponint here is that by having larger dataset, we can add more noise to it. This counter intutitive since it applies that the more data we have, the more protection we can add to the user. In the context of society, this is even more counter intuitive since making a data private means giving some form of restrictions to statisticians. However, differential privacy allows us to perform statistical analysis while also making sure that each individual data points are protected.

That means, theoretically, differential privacy allows us to analyze certain patterns of a dataset without having to access private information from a certain person. It acts as a filter that prevent certain information leakage while also making sure that we can analyze statistical properties of dataset. It needs to be noted that this requires large amount of dataset since the goals is to recgnize repeating statistical infomation.

# Create a Differentialy Private Query

Apply the Laplacian noise to the sum and mean query.

In [27]:
import numpy as np
def sum_query(db):
    return db.float().sum()

def mean_query(db):
    return db.float().mean()

In [28]:
# Apply noise by the sum query

epsilon = 0.5
sensitivity_sum_query, db_true, _ = get_sensitivity(sum_query, 100)
beta = sensitivity_sum_query/epsilon
noise = torch.tensor(np.random.laplace(0., beta, 1))
sum_noised_output = sum_query(db_true) * (noise)
print(sum_noised_output)

tensor([39.6289], dtype=torch.float64)


In [29]:
sum_query(db_true)

tensor(50.)

In [30]:
# Apply noise by the mean query

epsilon = 0.5
sensitivity_mean_query, db_true, _ = get_sensitivity(mean_query, 100)
beta = sensitivity_mean_query/epsilon
noise = torch.tensor(np.random.laplace(0., beta, 1))

mean_noised_output = mean_query(db_true) * noise
print(mean_noised_output)

tensor([-0.0054], dtype=torch.float64)


Above us my solution to the challenge. The following is Andrew's solution.

In [31]:
# A dedicated function for the laplacian mechanism

epsilon = 0.5
def laplacian_mechanism(db ,query, sensitivity):
    
    beta = sensitivity / epsilon
    noise = torch.tensor(np.random.laplace(0., beta, 1))

    return query(db) + noise

In [32]:
db, pdb = get_db_and_parallels(100)

In [33]:
laplacian_mechanism(db, sum_query, 1) # since sum query will always have a sensitivity of 1

tensor([68.3103], dtype=torch.float64)

In [34]:
laplacian_mechanism(db, mean_query, 1/100) # 1/100 since the sensitivity of mean is the sensitivity of sum devivid by 100

tensor([0.6165], dtype=torch.float64)

Something to note is that the output of the laplacian mechanism of the mean query is much smaller than that of the sume query. This make sense since we know that the sensitivity of the mean query is much smaller than the sume query, which always has a sensitivity of 1. It means that the mean query is a query function that is less sensitive; in other words, the nature of the function does not leak too much information.

Futhermore, as mentioned before, epsilon acts as a privacy budget to know how much noise we should add via the laplacian mechanism. If we make epsilon really small (say 0.0001), then we would have a noised output that is greatly larger then the one with larger epsilon. This follows the intuition that the smaller information that we wish to be leaked, the more noise we should add.

# Health Neural Network (Example Scenario)

In [35]:
num_labels = 10
num_samples = 10000
range_labels = 10

In [36]:
multiple_preds = torch.tensor(np.random.randint(range_labels, size=(num_samples,num_labels)))

In [37]:
def max_query(db):
    return db.mode().values

def sensitivity(query, db):
    pdbs = get_parallel_dbs(db)
    
    full_db_result = query(db)
    
    sensitivity = 0

    for pdb in pdbs:
        pdb_result = query(pdb)

        # absolute becasuse we don't care if negative
        db_dinstance = torch.abs(full_db_result - pdb_result)

        if db_dinstance > sensitivity:
            sensitivity = db_dinstance
            
    return sensitivity, db, pdbs

def laplacian_mechanism(db ,query, db_sensitivity, epsilon=0.5):
    
    beta = db_sensitivity / epsilon
    noise = torch.tensor(np.random.laplace(0., beta, 1))

    return query(db) * noise

In [38]:
max_query_results = torch.tensor(list(map(max_query, multiple_preds)))

In [39]:
laplacian_results = []

for i in range(num_samples):
    db = multiple_preds[i]
    db_sensitivity = sensitivity(max_query, db)[0]
    noise = laplacian_mechanism(db, max_query, db_sensitivity)
    laplacian_results.append(noise)
laplacian_results = torch.tensor(laplacian_results)

In [40]:
laplacian_results

tensor([  0.0000,   0.0000,   0.0000,  ...,   0.0000, -12.6953,   0.0000],
       dtype=torch.float64)

In [41]:
len(max_query_results) == len(laplacian_results)

True

In [42]:
labels_to_apply = max_query_results + laplacian_results

In [43]:
labels_to_apply

tensor([  2.0000,   1.0000,   0.0000,  ...,   2.0000, -10.6953,   8.0000],
       dtype=torch.float64)

Above is my approach to the example that was provide to predict tumors. However, upon seeig the approach that Andrew did, it seems that I seem to misnderstood something.

So the idea is that I had to apply the Laplacian noise to the count of each label range. So this means that we need to count how much each label class (0 - 9) occurs within a set of teachers (since we have 10 hospitals). For example,

[0, 9, 0, 9, 7, 3, 9, 7, 1, 1]

So we have to count how many of each label class occurs to get something like

[2, 2, 0, 1, 0, 0, 0, 2, 0, 3]

So (from the left) 0 occurs 2 times, 1 occurs 2 times, 2 occurs 0 times, and so on. We need to apply the noise to these values instead of the actual final label for the image.

Here is the approach from the Andrew.

In [44]:
new_labels = []
for an_image in multiple_preds:
    epsilon = 0.1
    beta = 1/epsilon

    for i in range(num_labels):
        an_image[i] += np.random.laplace(0, beta, 1)
        
    new_label = np.argmax(an_image)
    new_labels.append(new_label)

In [45]:
new_labels

[tensor(4),
 tensor(3),
 tensor(2),
 tensor(5),
 tensor(1),
 tensor(4),
 tensor(7),
 tensor(1),
 tensor(5),
 tensor(4),
 tensor(5),
 tensor(4),
 tensor(2),
 tensor(1),
 tensor(7),
 tensor(4),
 tensor(3),
 tensor(2),
 tensor(5),
 tensor(8),
 tensor(5),
 tensor(0),
 tensor(3),
 tensor(8),
 tensor(2),
 tensor(7),
 tensor(9),
 tensor(6),
 tensor(5),
 tensor(1),
 tensor(7),
 tensor(5),
 tensor(0),
 tensor(8),
 tensor(7),
 tensor(7),
 tensor(9),
 tensor(4),
 tensor(6),
 tensor(4),
 tensor(3),
 tensor(9),
 tensor(8),
 tensor(3),
 tensor(7),
 tensor(8),
 tensor(3),
 tensor(3),
 tensor(6),
 tensor(2),
 tensor(6),
 tensor(8),
 tensor(6),
 tensor(9),
 tensor(4),
 tensor(3),
 tensor(0),
 tensor(5),
 tensor(9),
 tensor(6),
 tensor(2),
 tensor(0),
 tensor(1),
 tensor(9),
 tensor(1),
 tensor(2),
 tensor(0),
 tensor(7),
 tensor(7),
 tensor(5),
 tensor(8),
 tensor(8),
 tensor(8),
 tensor(8),
 tensor(8),
 tensor(7),
 tensor(5),
 tensor(1),
 tensor(5),
 tensor(9),
 tensor(7),
 tensor(7),
 tensor(6),
 ten

Now the question still stands on how much information did we actually leaked? Note that have the epsilon to give some measurement on answering that question. Howver, since we did multiple queries, then we need to add up the epsilons for everytime we use to apply the laplacian noise.

Since we have around 10000 data points, these epsilons can be in the order of 10000 which is really large and not basically what we really wanst. That is where PATE comes in!

## PATE Analysis

PATE (Private Aggregation of Teacher Ensembles) is a way to take a peak and the label and try to figure how much does one entity that provides the data to create the model agree or disagree with each other. it provides a formal set of mechaism that is capable of computing an Epsilon that is conditioned on the level of aggrement. 

Why we want to do this? Well the reason is that our goal to achieve perfect privacy. And in the deep learning context, this means that no matter which data point we removed from the dataset, we should get the same model. Remember, that what we want to protect is the privacy of people and not the datapoint itself.

Now we don't actually have access to the dataset, but we can use the lables as a proxy to the datapoint. Note that his only allows to PATE if we know that each individual data point is not the result of the same person, meaning that the person who participated in the making of the model, participated in only on model and not contributing any kind of the data to the other model.

In [54]:
from syft.frameworks.torch.dp import pate

In [66]:
num_labels = 10
num_samples = 100
num_teachers = 100

fake_preds = np.random.randint(num_labels, size=(num_teachers, num_samples))
true_preds = np.random.randint(num_labels, size=(num_samples))

In [67]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=fake_preds, indices=true_preds, noise_eps=0.1, delta=1e-5)

print(f"Data Independent Epsilon {data_ind_eps}")
print(f"Data Dependent Epsilon {data_dep_eps}")

Data Independent Epsilon 11.756462732485115
Data Dependent Epsilon 11.756462732485105


In [68]:
# Make model more agreeable by
# making certain predictions 
# the same
fake_preds[:, 0:5] *= 0

In [69]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=fake_preds, indices=true_preds, noise_eps=0.1, delta=1e-5)

print(f"Data Independent Epsilon {data_ind_eps}")
print(f"Data Dependent Epsilon {data_dep_eps}")

Data Independent Epsilon 11.756462732485115
Data Dependent Epsilon 7.016363646145221


In [70]:
fake_preds[:, 0:50] *= 0

In [71]:
data_dep_eps, data_ind_eps = pate.perform_analysis(teacher_preds=fake_preds, indices=true_preds, noise_eps=0.1, delta=1e-5)

print(f"Data Independent Epsilon {data_ind_eps}")
print(f"Data Dependent Epsilon {data_dep_eps}")

Data Independent Epsilon 11.756462732485115
Data Dependent Epsilon 1.52655213289881


As we can see above, at first we see that the the independent epsilon is slightly hight then the dependent epsilon. This make sense since out data is randomly generated. However, as we make more of the prediction the similar between some of them, we have lower dependent epsilon.

Th intuition here is that this allows the model to learn the generic information of the data, thus result in less privacy leakage. This is because we want to force the model to not overfit and finding the true signal in oppose to the noise.