In [2]:
import torch

import numpy as np
import pandas as pd
import tenseal as ts
import base64

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from time import time
from tqdm import tqdm_notebook


1. sum  
2. mean  
3. var  
4. Linear Regression
5. Decision Tree

## Load, Preprocess Dataset - hmeq.csv (Before operation)

In [3]:
# Loading the dataset
df = pd.read_csv("data/hmeq.csv", sep=",")

# Replacement of NaN variables
df['MORTDUE'].fillna(value=df['MORTDUE'].mean(), inplace=True)
df['VALUE'].fillna(value=df['VALUE'].mean(), inplace=True)
df["REASON"].fillna(value="DebtCon", inplace=True)
df["JOB"].fillna(value="Other", inplace=True)
df["YOJ"].fillna(value=df['YOJ'].mean(), inplace=True)
df["DEROG"].fillna(value=0, inplace=True)
df["DELINQ"].fillna(value=0, inplace=True)
df['CLAGE'].fillna(value=df['CLAGE'].mean(), inplace=True)
df['NINQ'].fillna(value=df['NINQ'].mean(), inplace=True)
df['CLNO'].fillna(value=df['CLNO'].mean(), inplace=True)
df['CLNO'].fillna(value=df['CLNO'].mean(), inplace=True)
df['DEBTINC'].fillna(value=df['DEBTINC'].mean(), inplace=True)
df.drop(columns=["BAD", "JOB", "REASON"])

# Checking if there is anything left out
assert np.array_equal(df.isnull().sum(), [0] * len(df.isnull().sum()))

In [4]:
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,33.779915
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,33.779915
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,33.779915
3,1,1500,73760.8172,101776.048741,DebtCon,Other,8.922268,0.0,0.0,179.766275,1.186055,21.296096,33.779915
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,33.779915


In [5]:
df.shape

(5960, 13)

In [6]:
# Removing the features BAD, JOB, REASON from the input features set
# NUM = 1000
x_basic = df.drop(columns=["BAD", "JOB", "REASON"])
y = df["BAD"]

In [7]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(np.array(x_basic))
y_scaled = scaler.fit_transform(np.array(y).reshape(-1, 1))

In [8]:
outliers = []
temp = x_scaled.T
for i in range(10):
    outliers = outliers + np.where(abs(temp[i]) > 5)[0].tolist()
outliers = np.unique(np.array(outliers))

In [9]:
x_scaled = np.delete(x_scaled, outliers, axis=0)
y_scaled = np.delete(y_scaled, outliers, axis=0)

In [10]:
x_scaled = np.round(x_scaled, 3)
y_scaled = np.round(y_scaled, 3)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, 
                                                    y_scaled, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    stratify=y_scaled, 
                                                    random_state=42)

In [12]:
print(x_train.shape)
print(y_train.shape)

(4592, 10)
(4592, 1)


## Initialize Model

In [14]:
# To get initial weight and bias
# You can also randomly initialize it
# But it is comfortable to use torch to get (weight, bias) pair 

class LR(torch.nn.Module):
    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)
        
    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out

## Compile

In [15]:
def write_data(file_name, file_content):
    if type(file_content) == bytes:
        file_content = base64.b64encode(file_content)
        
    with open(file_name, 'wb') as f:
        f.write(file_content)

def read_data(file_name):
    with open(file_name, 'rb') as f:
        file_content = f.read()
    
    return base64.b64decode(file_content)

In [16]:
# this function have to be executed in local (before encrypting)

def compile(
    x, y, write_data # x, y are splitted before premodeling 
):  
    print("Making context")
    poly_mod_degree = pow(2, 13)
    coeff_mod_bit_sizes = [60, 40, 40, 60]
    ctx = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
    ctx.global_scale = 2 ** 40
    ctx.generate_galois_keys()
    
    print("Saving context")
    secret_context = ctx.serialize(save_secret_key=True)
    write_data(file_name = "D:/data/customer/secret_context.txt", 
               file_content = secret_context)
    
    ctx.make_context_public()
    public_context = ctx.serialize()
    write_data(file_name = "D:/data/server/public_context.txt", 
               file_content = public_context)
    
    print("Encrypting datas")
    start_x = time()
    enc_x = [ts.CKKSVector(ctx, data.tolist()) for _, data in tqdm_notebook(enumerate(x))]
    encoding_x = time() - start_x
    
    start_y = time()
    enc_y = [ts.CKKSVector(ctx, data.tolist()) for _, data in tqdm_notebook(enumerate(y))]
    encoding_y = time() - start_y
    
    print(f"Data encrypting time x: {encoding_x}, y: {encoding_y} when the data size is {len(enc_x)}")
    
    print("Saving datas")
    start_saving = time()
    for i, data in tqdm_notebook(enumerate(enc_x)):
        write_data(file_name="D:/data/server/enc_x/enc_x_"+str(i)+".txt",
                   file_content=data.serialize())
        
    for i, data in tqdm_notebook(enumerate(enc_y)):
        write_data(file_name="D:/data/server/enc_y/enc_y_"+str(i)+".txt",
                   file_content=data.serialize())
    end_saving = time()
    print(f"Data saving time: {end_saving-start_saving} when the data size is {len(enc_x)}")
    
    del ctx, secret_context, public_context, enc_x, enc_y

In [17]:
compile(x=x_train, y=y_train, write_data=write_data)

Making context
Saving context
Encrypting datas


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Data encrypting time x: 22.790180921554565, y: 20.663437366485596 when the data size is 4592
Saving datas


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

Data saving time: 109.86565828323364 when the data size is 4592


## Send to Server and Customer  
(The rest is ommitted)