In [1]:
# Let's start with a simple example of the scikit lib use

import sys
sys.path.append('./privatizers')
sys.path.append('./dataProviders')
sys.path.append('./adapters')

from sklearn import tree
from sklearn.datasets import fetch_covtype
from adapters import generalAdapter as adapter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

dp = fetch_covtype()
print(dp.target)
X = dp.data
Y = dp.target

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

print(X_train)


[5 5 2 ... 3 3 3]
[[2.703e+03 3.440e+02 1.600e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.916e+03 4.200e+01 6.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.606e+03 2.960e+02 2.400e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [2.982e+03 2.850e+02 1.100e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [3.040e+03 1.280e+02 1.300e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.972e+03 0.000e+00 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]]


In [2]:
## Dataset
data_set = {
    'samples': X,
    'labels': Y
}

## Size adjustment
MAX_SIZE = len(data_set['samples'])
desired_size = 80000
size = min(desired_size, MAX_SIZE)

data_set['samples'] = data_set['samples'][:size]
data_set['labels'] = data_set['labels'][:size]

In [3]:
## Decision tree without privatization
clf = tree.DecisionTreeClassifier()
# clf = clf.fit(data_set['samples'], data_set['labels'])

# tree.plot_tree(clf.fit(data_set['samples'], data_set['labels']))
clf.fit(data_set['samples'], data_set['labels'])
# tree.export_graphviz(clf, out_file='tree.dot')

y_pred = clf.predict(X_test)

print("Accuracy is", accuracy_score(y_test,y_pred)*100)

Accuracy is 71.09360657242519


In [4]:
## Privatization: Laplace privatizer
from privatizers import laplacePrivatizer
ad = adapter.GeneralAdapter(10, 1)
scale = 0.3
privatizer = laplacePrivatizer.LaplacePrivatizer(scale)
privateData = privatizer.privatize(ad.fromRaw(data_set['samples']))

privateTargetsFloat = privatizer.privatize(ad.fromRaw(data_set['labels']))
privateTargets = ad.toDiscreteValue(privateTargetsFloat)

clf = tree.DecisionTreeClassifier()
clf.fit(privateData, privateTargets)
y_pred = clf.predict(X_test)

# tree.export_graphviz(clf, out_file='tree_laplace s=0,03 size=100.dot') 

print("Accuracy is", accuracy_score(y_test,y_pred)*100)

Accuracy is 60.96934092160823


In [None]:
## Privatization: Exponential privatizer
from privatizers import exponentialPrivatizer

scale = 0.03
privatizer = exponentialPrivatizer.ExponentialPrivatizer(scale)
privateData = privatizer.privatize(ad.fromRaw(data_set['samples']))

privateTargetsFloat = privatizer.privatize(ad.fromRaw(data_set['labels']))
privateTargets = ad.toDiscreteValue(privateTargetsFloat)

clf = tree.DecisionTreeClassifier()

tree.plot_tree(clf.fit(privateData, privateTargets))
clf.fit(privateData, privateTargets)
y_pred = clf.predict(X_test)

# tree.export_graphviz(clf, out_file='tree_exponential s=0,03 size=100.dot') 

print("Accuracy is", accuracy_score(y_test,y_pred)*100)


Accuracy is 40.279052689553886
