In [2]:
import torch

import numpy as np
import pandas as pd
import tenseal as ts
import base64

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from time import time
from tqdm import tqdm_notebook


1. sum  
2. mean  
3. var  
4. Linear Regression
5. Decision Tree

## Load, Preprocess Dataset - hmeq.csv (Before operation)

In [3]:
# Loading the dataset
df = pd.read_csv("data/hmeq.csv", sep=",")

# Replacement of NaN variables
df['MORTDUE'].fillna(value=df['MORTDUE'].mean(), inplace=True)
df['VALUE'].fillna(value=df['VALUE'].mean(), inplace=True)
df["REASON"].fillna(value="DebtCon", inplace=True)
df["JOB"].fillna(value="Other", inplace=True)
df["YOJ"].fillna(value=df['YOJ'].mean(), inplace=True)
df["DEROG"].fillna(value=0, inplace=True)
df["DELINQ"].fillna(value=0, inplace=True)
df['CLAGE'].fillna(value=df['CLAGE'].mean(), inplace=True)
df['NINQ'].fillna(value=df['NINQ'].mean(), inplace=True)
df['CLNO'].fillna(value=df['CLNO'].mean(), inplace=True)
df['CLNO'].fillna(value=df['CLNO'].mean(), inplace=True)
df['DEBTINC'].fillna(value=df['DEBTINC'].mean(), inplace=True)
df.drop(columns=["BAD", "JOB", "REASON"])

# Checking if there is anything left out
assert np.array_equal(df.isnull().sum(), [0] * len(df.isnull().sum()))

In [4]:
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,33.779915
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,33.779915
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,33.779915
3,1,1500,73760.8172,101776.048741,DebtCon,Other,8.922268,0.0,0.0,179.766275,1.186055,21.296096,33.779915
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,33.779915


In [5]:
df.shape

(5960, 13)

In [6]:
# Removing the features BAD, JOB, REASON from the input features set
# NUM = 1000
x_basic = df.drop(columns=["BAD", "JOB", "REASON"])
y = df["BAD"]

In [7]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(np.array(x_basic))
y_scaled = scaler.fit_transform(np.array(y).reshape(-1, 1))

In [8]:
outliers = []
temp = x_scaled.T
for i in range(10):
    outliers = outliers + np.where(abs(temp[i]) > 5)[0].tolist()
outliers = np.unique(np.array(outliers))

In [9]:
x_scaled = np.delete(x_scaled, outliers, axis=0)
y_scaled = np.delete(y_scaled, outliers, axis=0)

In [10]:
x_scaled = np.round(x_scaled, 3)
y_scaled = np.round(y_scaled, 3)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, 
                                                    y_scaled, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    stratify=y_scaled, 
                                                    random_state=42)

In [12]:
print(x_train.shape)
print(y_train.shape)

(4592, 10)
(4592, 1)


## Initialize Model

In [14]:
# To get initial weight and bias
# You can also randomly initialize it
# But it is comfortable to use torch to get (weight, bias) pair 

class LR(torch.nn.Module):
    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)
        
    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out

## Compile

In [15]:
def write_data(file_name, file_content):
    if type(file_content) == bytes:
        file_content = base64.b64encode(file_content)
        
    with open(file_name, 'wb') as f:
        f.write(file_content)

def read_data(file_name):
    with open(file_name, 'rb') as f:
        file_content = f.read()
    
    return base64.b64decode(file_content)

In [16]:
# this function have to be executed in local (before encrypting)

def compile(
    x, y, write_data # x, y are splitted before premodeling 
):  
    print("Making context")
    poly_mod_degree = pow(2, 13)
    coeff_mod_bit_sizes = [60, 40, 40, 60]
    ctx = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
    ctx.global_scale = 2 ** 40
    ctx.generate_galois_keys()
    
    print("Saving context")
    secret_context = ctx.serialize(save_secret_key=True)
    write_data(file_name = "D:/data/customer/secret_context.txt", 
               file_content = secret_context)
    
    ctx.make_context_public()
    public_context = ctx.serialize()
    write_data(file_name = "D:/data/server/public_context.txt", 
               file_content = public_context)
    
    print("Encrypting datas")
    start_x = time()
    enc_x = [ts.CKKSVector(ctx, data.tolist()) for _, data in tqdm_notebook(enumerate(x))]
    encoding_x = time() - start_x
    
    start_y = time()
    enc_y = [ts.CKKSVector(ctx, data.tolist()) for _, data in tqdm_notebook(enumerate(y))]
    encoding_y = time() - start_y
    
    print(f"Data encrypting time x: {encoding_x}, y: {encoding_y} when the data size is {len(enc_x)}")
    
    print("Saving datas")
    start_saving = time()
    for i, data in tqdm_notebook(enumerate(enc_x)):
        write_data(file_name="D:/data/server/enc_x/enc_x_"+str(i)+".txt",
                   file_content=data.serialize())
        
    for i, data in tqdm_notebook(enumerate(enc_y)):
        write_data(file_name="D:/data/server/enc_y/enc_y_"+str(i)+".txt",
                   file_content=data.serialize())
    end_saving = time()
    print(f"Data saving time: {end_saving-start_saving} when the data size is {len(enc_x)}")
    
    del ctx, secret_context, public_context, enc_x, enc_y

In [17]:
compile(x=x_train, y=y_train, write_data=write_data)

Making context
Saving context
Encrypting datas


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Data encrypting time x: 22.790180921554565, y: 20.663437366485596 when the data size is 4592
Saving datas


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Data saving time: 109.86565828323364 when the data size is 4592


## Send to Server and Customer  
(The rest is ommitted)

### Train, Test code

In [25]:
def test(
    model: EncryptedLR, name: str, x_test: np.array, y_test: np.array
):
    print(f"Testing {name}")
    
    test_start = time()
    accuracy = model.plain_accuracy(x_test, y_test)
    test_end = time()
    
    print(f"Testing time is {test_end - test_start}")
    
    return accuracy

In [None]:
# pylint: disable=too-many-locals


def evaluate(
    model_class, name, x, y, test_size=0.33, show_circuit=False, predict_in_fhe=True, n_bits=None
):
    """Function to evaluate a model class on a given (x, y). This returns different metrics, notably
    in simulate and FHE for Concrete ML models, as well as execution times."""

    print(f"Evaluating {name}")

    # Splitting the data into test and train sets. Remark the use of stratify, to make sure that
    # the testset contains some representative class distribution in our targets
    x_local_tr, x_local_te, y_local_tr, y_local_te = train_test_split(
        x, y, stratify=y, test_size=test_size, random_state=1
    )
    len_x_local_te = len(x_local_te)

    # With a normalization
    model = Pipeline(
        [
            ("preprocessor", StandardScaler()),
            ("model", model_class()),
        ]
    )

    # Training
    model.fit(x_local_tr, y_local_tr)

    # Predicting
    before_time = time.time()
    y_local_pre = model.predict(x_local_te)
    local_t = (time.time() - before_time) / len_x_local_te

    local_a = accuracy_score(y_local_te, y_local_pre)
    local_f = f1_score(y_local_te, y_local_pre, average="macro")
    local_p = precision_score(y_local_te, y_local_pre, average="macro")
    local_r = recall_score(y_local_te, y_local_pre, average="macro")

    max_bit_width = None
    local_a_simulate = None
    local_a_fhe = None
    local_t_simulate = None
    local_t_fhe = None

    # For Concrete ML models
    if getattr(model_class(), "_is_a_public_cml_model", False):
        circuit = model["model"].compile(x)  # pylint: disable=no-member

        # To see the circuit
        if show_circuit:
            print(circuit)

        # Max bitwidth of the circuit
        max_bit_width = circuit.graph.maximum_integer_bit_width()

        # Prediction in simulation
        before_time = time.time()
        y_local_pre_simulate = model.predict(x_local_te, fhe="simulate")
        local_t_simulate = (time.time() - before_time) / len_x_local_te

        local_a_simulate = accuracy_score(y_local_te, y_local_pre_simulate)

        # Prediction in FHE
        if predict_in_fhe:
            before_time = time.time()
            y_local_pre_fhe = model.predict(x_local_te, fhe="execute")
            local_t_fhe = (time.time() - before_time) / len_x_local_te

            local_a_fhe = accuracy_score(y_local_te, y_local_pre_fhe)

    ans = (
        name,
        local_a,
        local_a_simulate,
        local_a_fhe,
        local_f,
        local_p,
        local_r,
        max_bit_width,
        local_t,
        local_t_simulate,
        local_t_fhe,
        len_x_local_te,
        n_bits,
    )

    return ans


In [None]:
poly_mod_degree = pow(2, 13)
coeff_mod_bit_sizes = [60, 40, 40, 60]
ctx = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
ctx.global_scale = 2 ** 40
ctx.generate_galois_keys()

In [4]:
df

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5955,0,88900,57264.0,90185.0,DebtCon,Other,16.0,0.0,0.0,221.808718,0.0,16.0,36.112347
5956,0,89000,54576.0,92937.0,DebtCon,Other,16.0,0.0,0.0,208.692070,0.0,15.0,35.859971
5957,0,89200,54045.0,92924.0,DebtCon,Other,15.0,0.0,0.0,212.279697,0.0,15.0,35.556590
5958,0,89800,50370.0,91861.0,DebtCon,Other,14.0,0.0,0.0,213.892709,0.0,16.0,34.340882


In [14]:
sk = ctx.secret_key()
pk = ctx.public_key()

### Data Preprocessing

In [15]:
df = df.dropna()
label = df['Survived']
df = df.drop('Survived', axis=1)

     PassengerId  Survived  Pclass  \
1              2         1       1   
3              4         1       1   
6              7         0       1   
10            11         1       3   
11            12         1       1   
..           ...       ...     ...   
871          872         1       1   
872          873         0       1   
879          880         1       1   
887          888         1       1   
889          890         1       1   

                                                  Name     Sex   Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
6                              McCarthy, Mr. Timothy J    male  54.0      0   
10                     Sandstrom, Miss. Marguerite Rut  female   4.0      1   
11                            Bonnell, Miss. Elizabeth  female  58.0      0   
..                                                 ...     ...   ... 

Unnamed: 0,"Name_Allen, Miss. Elisabeth Walton","Name_Allison, Master. Hudson Trevor","Name_Allison, Miss. Helen Loraine","Name_Allison, Mrs. Hudson J C (Bessie Waldo Daniels)","Name_Anderson, Mr. Harry","Name_Andrews, Miss. Kornelia Theodosia","Name_Andrews, Mr. Thomas Jr","Name_Appleton, Mrs. Edward Dale (Charlotte Lamson)","Name_Astor, Mrs. John Jacob (Madeleine Talmadge Force)","Name_Aubart, Mme. Leontine Pauline",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
obj_cols = [col for col in df.columns if df[col].dtypes=='object']
encoded_df = pd.get_dummies(df[obj_cols])
df = df.drop(obj_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)
df = torch.tensor(df.values)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 1 to 889
Columns: 454 entries, PassengerId to Embarked_S
dtypes: float64(2), int64(4), uint8(448)
memory usage: 90.1 KB
None


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(label)
label = le.transform(label)
label = torch.tensor(label).unsqueeze(1).float()

In [6]:
class LR(torch.nn.Module):

    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)
        
    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out

In [7]:
class EncryptedLR:
    def __init__(self, torch_lr):
        self.weight = torch_lr.lr.weight.data.tolist()[0]  # shape이 1xN이 되서 N으로 바꿔줌
        self.bias = torch_lr.lr.bias.data.tolist()

        self._delta_w = 0
        self._delta_b = 0
        self._count = 0

    def forward(self, enc_x):
        # forward propagation
        enc_out = enc_x.dot(self.weight) + self.bias
        print(type(enc_out))
        enc_out = EncryptedLR.sigmoid(enc_out)
        return enc_out

    def backward(self, enc_x, enc_out, enc_y):
        # backward propagation
        out_minus_y = (enc_out - enc_y)
        self._delta_w += enc_x * out_minus_y
        self._delta_b += out_minus_y
        self._count += 1

    def update_parameters(self):
        # optimizer
        if self._count == 0:
            raise RuntimeError("You should at least run one forward iteration")

        self.weight -= self._delta_w * (1 / self._count) + self.weight * 0.05  # regularization
        # 1 / self._count => means learning_rate decay in optimization
        self.bias -= self._delta_b * (1 / self._count)
        self._delta_w = 0
        self._delta_b = 0
        # self._count = 0 # count?

    @staticmethod
    def sigmoid(enc_x):
        return enc_x.polyval([0.5, 0.197, 0, -0.004])

    def plain_accuracy(self, x_test, y_test):
        w = torch.tensor(self.weight)
        b = torch.tensor(self.bias)
        out = torch.sigmoid(x_test.matmul(w) + b).reshape(-1, 1)
        correct = torch.abs(y_test - out) < 0.5
        return correct.float().mean()

    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)

    def decrypt(self):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df, label, test_size=0.2, random_state=42, stratify=label)

NameError: name 'label' is not defined

In [None]:
start_x = time()
enc_x_train = [ts.CKKSVector(ctx, x.tolist()) for i, x in tqdm_notebook(enumerate(x_train))]
end_x = time()

encryption_time_x = end_x - start_x

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


0it [00:00, ?it/s]

In [1]:
start_y = time()
enc_y_train = [ts.CKKSVector(ctx, [y]) for i, y in tqdm_notebook(enumerate(y_train))]
end_y = time()

encryption_time_y = end_y - start_y

NameError: name 'time' is not defined

In [73]:
x_test = x_test.to(torch.float32)
y_test = y_test.to(torch.float32)

In [76]:
eelr = EncryptedLR(LR(x_train.shape[1]))
accuracy = eelr.plain_accuracy(x_test, y_test)
print(f"Accuracy at epoch #0 is {accuracy}")

times = []
for epoch in range(5):
    # if you want to keep an eye on the distribution to make sure
    # the function approximation is still working fine
    # WARNING: this operation is time consuming
    # encrypted_out_distribution(eelr, enc_x_train)
    
    t_start = time()
    for enc_x, enc_y in zip(enc_x_train, enc_y_train):
        enc_out = eelr.forward(enc_x)
        eelr.backward(enc_x, enc_out, enc_y)
    eelr.update_parameters()
    t_end = time()
    times.append(t_end - t_start)
    
    eelr.decrypt()


accuracy = eelr.plain_accuracy(x_test, y_test)
print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
print(f"Final accuracy is {accuracy}")

Accuracy at epoch #0 is 0.11281055212020874
<class 'tenseal.tensors.ckksvector.CKKSVector'>


ValueError: end of modulus switching chain reached

In [12]:
secret_context = ctx.serialize(save_secret_key=True, save_public_key=False)

In [14]:
with open("secret_context.bin", "wb") as file:
    file.write(secret_context)

In [16]:
with open("secret_context.bin", "rb") as file:
    loaded_context = file.read()

In [17]:
new_context = ctx.load(loaded_context)

In [18]:
new_context.secret_key()

<tenseal.enc_context.SecretKey at 0x1894e7d3898>