In [1]:
#Import necessary libraries

import numpy as np
import pandas as pd
import seal
from seal import *
import sourmash as smsh
import matplotlib.pyplot as plt
import pickle
import time

In [2]:
#Data Owner loads sketches of private data and loads anchor sketches sent by Model Owner
#NOTE: pickle.load can execute arbitrary code while deserializing; only load an
#anchor sketch file that comes from a trusted Model Owner.
#The context manager closes the file handle instead of leaking it.
with open('data/public/anchor_sketches.dump', 'rb') as anchor_file:
    anchor_sketches = pickle.load(anchor_file)

test_data_file = 'data/public/test_data'
def load_data(path=None):
    """Read the labelled test sequences and return one record per sample.

    The file is read as alternating lines: a label line (even index) followed
    by its sequence line (odd index). Sequences are stripped of surrounding
    whitespace and kept at their original length (no padding is applied).

    Args:
        path: Optional file to read. Defaults to the module-level
            ``test_data_file`` so existing ``load_data()`` calls are unchanged.

    Returns:
        A list of ``[type_index, virus_number, sequence]`` lists, where
        ``type_index`` is the position of the matching prefix in ``types`` and
        ``virus_number`` is the second underscore-separated field of the label.

    Raises:
        ValueError: If a label line has no matching sequence line, or a label
            does not start with one of the known type prefixes.
    """
    if path is None:
        path = test_data_file

    with open(path, "r") as f:
        lines = f.readlines()

    # Even-indexed lines are labels, odd-indexed lines are the sequences.
    labels = lines[0::2]
    sequences = [line.strip() for line in lines[1::2]]

    if len(labels) != len(sequences):
        raise ValueError(
            f"Bad entry: {len(labels)} label lines but {len(sequences)} sequence lines"
        )

    types = [">B.1.526", ">B.1.1.7", ">B.1.427", ">P.1"]

    dataframe = []

    for i, label in enumerate(labels):
        # 2021/08/02: re-replaced use of match-case (Python 3.10) for backwards compatibility
        for j, prefix in enumerate(types):
            if label.startswith(prefix):
                virus_number = label.split("_")[1].strip()
                dataframe.append([j, virus_number, sequences[i]])
                break
        else:
            # Raising a plain string is itself a TypeError in Python 3, so raise
            # a real exception that names the offending label.
            raise ValueError(f"Bad entry: unrecognized label {label.strip()!r} (record {i})")

    return dataframe

data = load_data()

#Key preprocessing step:
#Every character that is not A, C, G or T is replaced by one of those four bases,
#drawn uniformly at random (one np.random.randint call per replaced character).
start = time.time()
data_Nrand = []

base_dict = {0:'A',1:'C',2:'G',3:'T'}

for type_index, virus_number, sequence in data:
    cleaned_bases = []
    for base in sequence:
        if base in ('A', 'C', 'G', 'T'):
            cleaned_bases.append(base)
        else:
            cleaned_bases.append(base_dict[np.random.randint(0,4)])
    data_Nrand.append([type_index, virus_number, ''.join(cleaned_bases)])

end = time.time()
print(f'Time to Replace unknowns: {(end-start):.3f}s')

#Sketch parameters settled on for this model: N hashes per sketch, k-mer size K.
#One MinHash sketch is formed per preprocessed sample.
N = 5000
K = 33

start = time.time()
sketches = []

for _type_index, _virus_number, sequence in data_Nrand:
    mh = smsh.MinHash(n=N,ksize=K)
    mh.add_sequence(sequence)
    sketches.append(mh)

end = time.time()
print(f'Time to form sketches: {(end-start):.3f}s')

test_sketches = pd.DataFrame(sketches)

Time to Replace unknowns: 9.272s
Time to form sketches: 3.464s


In [3]:
#DATA OWNER preprocessing
#The Model Owner has sent the Data Owner sketches of the anchor samples.
#For each test sample the Data Owner computes a vector of distances,
#one entry per anchor (12 anchors).
#These vectors are what the encryption will hide.

#Jaccard similarity, rounded to 4 decimals, of every test sketch against every anchor sketch.
jacc_sim = np.zeros((1000,12))
for i, sketch in enumerate(test_sketches[0]):
    for j, anchor in enumerate(anchor_sketches[0]):
        jacc_sim[i,j] = round(sketch.jaccard(anchor),4)

#Convert each similarity s into the distance log(1+s) - log(2s).
dist_data = np.zeros((1000,12))
for i, j in np.ndindex(dist_data.shape):
    similarity = jacc_sim[i,j]
    dist_data[i,j] = np.log(1+similarity) - np.log(2*similarity)

In [4]:
#DATA OWNER preprocessing
#Each sample takes a 24-slot stride (12 distance values followed by 12 empty slots),
#so 341 samples are batched into a single 8192-slot plaintext.
#The 1000 samples are placed into 3 large vectors;
#batch_data[2] has extra 0's at the end.
batch_data = np.zeros((3,8192))
for sample_idx in range(1000):
    batch_idx, slot_idx = divmod(sample_idx, 341)
    batch_data[batch_idx][24*slot_idx:24*slot_idx+12] = dist_data[sample_idx]

In [5]:
#DATA OWNER
#Set the parameters of the encryption context.
#In a real deployment, the Model Owner would communicate the parameters poly_modulus_degree,
#the list of prime sizes in coeff_modulus, and scale,
#based on the number of rescalings in their evaluation.

parms = EncryptionParameters(scheme_type.ckks)
poly_modulus_degree = 16384
parms.set_poly_modulus_degree(poly_modulus_degree)
parms.set_coeff_modulus(CoeffModulus.Create(poly_modulus_degree, [60, 40, 40, 40, 40, 40, 60]))
#The prime sizes sum to a 320-bit coeff modulus Q (60 + 5*40 + 60).
#NOTE(review): SEAL's default security table for N=16384 appears to list 438 bits for
#128-bit security and 305 bits (not 300) for 192-bit security -- confirm against the SEAL
#version in use. At 320 bits, Q is within the 128-bit bound but above the 192-bit bound.
scale = 2.0**40
context = SEALContext(parms)
#print_parameters(context)

#Encoder used to pack vectors of reals into CKKS plaintext slots.
ckks_encoder = CKKSEncoder(context)
slot_count = ckks_encoder.slot_count()

#Key material. The public, Galois and relinearization keys are later serialized into the
#request payload; the secret key is only used locally by the decryptor.
keygen = KeyGenerator(context)
public_key = keygen.create_public_key()
secret_key = keygen.secret_key()
galois_keys = keygen.create_galois_keys()
relin_keys = keygen.create_relin_keys()

decryptor = Decryptor(context, secret_key)

encryptor = Encryptor(context, public_key)
#NOTE(review): evaluator is not used by any cell visible in this notebook.
evaluator = Evaluator(context)

In [6]:
#DATA OWNER
#Encode each batched distance vector as a plaintext at the chosen scale, then encrypt it.
pt_data = [ckks_encoder.encode(batch, scale) for batch in batch_data]
ct_data = [encryptor.encrypt(plaintext) for plaintext in pt_data]

In [7]:
import tempfile
import base64
import json

def _seal_to_base64(seal_object):
    """Serialize an object via its ``save(path)`` method and return base64 text.

    The object is written to a file inside a private temporary directory and
    read back as bytes. A directory is used instead of re-opening a
    ``NamedTemporaryFile`` by name, because re-opening an open named temporary
    file is not supported on every platform.

    Args:
        seal_object: Any object exposing ``save(path)`` that writes a file.

    Returns:
        The saved file's bytes, base64-encoded and decoded as a UTF-8 string.
    """
    import os  # local import: only needed to build the temporary file path

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "seal_object.bin")
        seal_object.save(tmp_path)
        with open(tmp_path, 'rb') as infile:
            return base64.b64encode(infile.read()).decode('utf8')


#Request payload: every entry is a base64 string so the whole dict is JSON-serializable.
payload = {}

#The three encrypted batches.
for i in range(3):
    payload[f"ct_{i}"] = _seal_to_base64(ct_data[i])

#Encryption parameters and the public/evaluation keys (the secret key is never included).
payload["IDASH_parms"] = _seal_to_base64(parms)
payload["IDASH_pubkey"] = _seal_to_base64(public_key)
payload["IDASH_galkeys"] = _seal_to_base64(galois_keys)
payload["IDASH_relinkeys"] = _seal_to_base64(relin_keys)

#The scale is sent pickled, as before, to keep the wire format unchanged.
#NOTE(review): presumably the receiver unpickles this value; unpickling data from an
#untrusted peer is unsafe -- consider sending the float as plain JSON instead.
payload["IDASH_scale"] = base64.b64encode(pickle.dumps(scale)).decode('utf8')

In [8]:
import requests

#Send the encrypted batches, parameters and public keys to the Model Owner's service.
response = requests.post("http://localhost:5000", headers={'content-type': 'application/json'}, data=json.dumps(payload))
#Fail fast on an HTTP error status instead of trying to parse an error page as JSON.
response.raise_for_status()
result = json.loads(response.text)

In [9]:
#Import necessary libraries

import numpy as np
import pandas as pd
import seal
from seal import *
import time
import pickle

In [10]:
#Initialize and load ciphertexts from Model Owner, Step 3


import json
import tempfile
import base64


import os  # used only to build the temporary file path below

#Ciphertexts returned by the Model Owner, one per batch.
results = []
for i in range(3):
    #A ciphertext object is needed to call load() on, so a zero is encrypted as a placeholder
    #and its contents are then overwritten by the loaded result.
    pt_init = ckks_encoder.encode(0.,scale)
    ct_init = encryptor.encrypt(pt_init)
    #Write the decoded bytes into a private temporary directory rather than re-opening a
    #NamedTemporaryFile by name, which is not supported on every platform.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ct_path = os.path.join(tmp_dir, f"ct_result_{i}.bin")
        with open(ct_path, 'wb') as outfile:
            outfile.write(base64.b64decode(result[f"IDASH_ct_results_{i}"].encode('utf8')))
        ct_init.load(context, ct_path)
    results.append(ct_init)

In [11]:
#Decrypt each returned ciphertext and decode the plaintext back into its slot values
final_scores = [
    ckks_encoder.decode(decryptor.decrypt(ciphertext))
    for ciphertext in results
]

In [12]:
#DATA OWNER postprocessing of results
#Each sample owns a 24-slot stride inside its batch; the first 4 entries of that stride
#are the correct entries of the matrix-vector product, the rest is discarded.
final_junk_removed = []
for sample_idx in range(1000):
    batch_idx, slot_idx = divmod(sample_idx, 341)
    offset = 24*slot_idx
    final_junk_removed.append(final_scores[batch_idx][offset:offset+4])

final_junk_removed

[array([ 0.00194938,  0.00689902,  1.04717489, -0.00498989]),
 array([-0.00231386, -0.00210003,  0.99214667, -0.0043789 ]),
 array([-3.38082120e-03, -2.29144119e-04,  1.03443809e+00, -5.00984479e-03]),
 array([ 0.00420689,  0.01762228,  0.9662041 , -0.00433947]),
 array([ 4.79388892e-04,  7.39155109e-03,  1.05376478e+00, -4.94500276e-03]),
 array([ 0.00602182,  0.0071881 ,  1.0290112 , -0.00500979]),
 array([-0.00499033,  0.01639629,  1.067951  , -0.00499471]),
 array([ 0.00171445,  0.00826121,  1.04367892, -0.00499914]),
 array([ 0.00666582,  0.01650302,  0.96376901, -0.00457514]),
 array([ 0.02899279,  0.0137223 ,  0.87289083, -0.00434266]),
 array([ 0.00968272,  0.0099318 ,  0.99789282, -0.00498743]),
 array([ 0.0081428 ,  0.01659205,  0.96253192, -0.00473647]),
 array([ 0.00510911,  0.01248804,  0.98284647, -0.0044509 ]),
 array([-0.00461784,  0.01412676,  1.03307321, -0.00275672]),
 array([ 0.00932164,  0.01648391,  0.92397229, -0.00364299]),
 array([ 0.01718922,  0.01520613,  0.9

In [13]:
#DATA OWNER postprocessing
def conv_to_pred(score_array):
    """Convert per-sample score vectors into a list of label predictions.

    Each score vector is rounded element-wise and dotted with ``[0, 1, 2, 3]``,
    so the result is only a valid label when every score rounds to 0 or 1 and
    exactly one of them rounds to 1.

    Known limitations of this very simple rule:
      * a vector that rounds to all 0's is reported as category 0;
      * if more than one category rounds to 1, the reported category is the
        sum of their indices;
      * scores above 1.5 or below -0.5 are not clamped (a possible update).

    Args:
        score_array: Iterable of 4-element score vectors.

    Returns:
        A list of ``int`` predictions, one per score vector.
    """
    category_index = np.array([0,1,2,3])
    return [
        int(np.dot(np.round(scores), category_index))
        for scores in score_array
    ]

In [14]:
from sklearn.metrics import classification_report,confusion_matrix

In [15]:
def load_data(path=None):
    """Read the labelled test sequences and return one record per sample.

    NOTE(review): this cell re-defines ``load_data``, which is already defined
    in the data-loading cell near the top of the notebook; the later definition
    shadows the earlier one.

    The file is read as alternating lines: a label line (even index) followed
    by its sequence line (odd index). Sequences are stripped of surrounding
    whitespace and kept at their original length (no padding is applied).

    Args:
        path: Optional file to read. Defaults to the module-level
            ``test_data_file`` so existing ``load_data()`` calls are unchanged.

    Returns:
        A list of ``[type_index, virus_number, sequence]`` lists, where
        ``type_index`` is the position of the matching prefix in ``types`` and
        ``virus_number`` is the second underscore-separated field of the label.

    Raises:
        ValueError: If a label line has no matching sequence line, or a label
            does not start with one of the known type prefixes.
    """
    if path is None:
        path = test_data_file

    with open(path, "r") as f:
        lines = f.readlines()

    # Even-indexed lines are labels, odd-indexed lines are the sequences.
    labels = lines[0::2]
    sequences = [line.strip() for line in lines[1::2]]

    if len(labels) != len(sequences):
        raise ValueError(
            f"Bad entry: {len(labels)} label lines but {len(sequences)} sequence lines"
        )

    types = [">B.1.526", ">B.1.1.7", ">B.1.427", ">P.1"]

    dataframe = []

    for i, label in enumerate(labels):
        # 2021/08/02: re-replaced use of match-case (Python 3.10) for backwards compatibility
        for j, prefix in enumerate(types):
            if label.startswith(prefix):
                virus_number = label.split("_")[1].strip()
                dataframe.append([j, virus_number, sequences[i]])
                break
        else:
            # Raising a plain string is itself a TypeError in Python 3, so raise
            # a real exception that names the offending label.
            raise ValueError(f"Bad entry: unrecognized label {label.strip()!r} (record {i})")

    return dataframe

#Re-read the labelled test data; column 0 of each record holds the sample's type index.
data = load_data()
data_df = pd.DataFrame(data)
test_labels = data_df[0]

In [16]:
#Convert the decrypted 4-entry score vectors into label predictions using the rounding rule.
pred = conv_to_pred(final_junk_removed)

In [17]:
#NOTE(review): an earlier note here reported 21 samples classified incorrectly (about 2%
#error), likely all coming from the application of the threshold polynomial. The output
#recorded for this cell shows no off-diagonal entries, so that note appears to predate
#the current results -- re-check after re-running.

#confusion_matrix expects (y_true, y_pred): rows are true labels, columns are predictions.
print(confusion_matrix(test_labels, pred))

[[238   0   0   0]
 [  0 232   0   0]
 [  0   0 265   0]
 [  0   0   0 265]]


In [18]:
#DATA OWNER postprocessing
def conv_to_pred2(score_array):
    """Convert per-sample score vectors into label predictions via argmax.

    Args:
        score_array: Iterable of score vectors.

    Returns:
        A list with, for each score vector, the index of its maximum score
        (as returned by ``np.argmax``).
    """
    return [np.argmax(scores) for scores in score_array]

In [19]:
#NOTE(review): an earlier note here said the argmax method fails for every sample in
#category 0 but works for the other categories, and that the cause was still to be
#figured out. The output recorded for this cell shows no off-diagonal entries, so that
#note appears to predate the current results -- re-check after re-running.
#But for now use conv_to_pred

pred2 = conv_to_pred2(final_junk_removed)
#confusion_matrix expects (y_true, y_pred): rows are true labels, columns are predictions.
print(confusion_matrix(test_labels, pred2))

[[238   0   0   0]
 [  0 232   0   0]
 [  0   0 265   0]
 [  0   0   0 265]]
