In [1]:
# Let's start with a simple example of using the scikit-learn library

import sys
sys.path.append('./privatizers')
sys.path.append('./dataProviders')
sys.path.append('./adapters')

from sklearn import tree
from sklearn.datasets import fetch_covtype
from adapters import generalAdapter as adapter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

# Fetch the Covertype dataset and pull out the feature matrix and target vector.
dp = fetch_covtype()
X, Y = dp.data, dp.target

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)


In [2]:
## Dataset: cap the training split at the desired number of rows
desired_size = 10000
MAX_SIZE = len(X_train)
# Never ask for more rows than the training split actually holds.
size = min(desired_size, MAX_SIZE)

# Keep only the first `size` samples together with their matching labels.
data_set = {
    'samples': X_train[:size],
    'labels': y_train[:size],
}

In [3]:
## Baseline: decision tree trained on the non-privatized subset
# `fit` returns the fitted estimator, so construction and training can be chained.
# To inspect the fitted tree, call tree.plot_tree(clf) or
# tree.export_graphviz(clf, out_file='tree.dot').
clf = tree.DecisionTreeClassifier().fit(
    data_set['samples'],
    data_set['labels'],
)

# Score the baseline on the held-out test split.
y_pred = clf.predict(X_test)

print("Accuracy is", accuracy_score(y_test,y_pred)*100)

Accuracy is 72.42231962548192


In [12]:
## Privatization: Laplace privatizer
# For each privacy budget, privatize the training samples and labels, train a
# fresh decision tree on the privatized data, and report its accuracy on the
# untouched test split.
from privatizers import laplacePrivatizer

# NOTE(review): the meaning of the two GeneralAdapter arguments (7, 1) is not
# visible from this notebook -- confirm against adapters/generalAdapter.
ad = adapter.GeneralAdapter(7, 1)

# Privacy budgets to sweep, from largest to smallest. Each value appears exactly
# once (the previous list repeated 60.0 and 50.0, training those budgets twice).
epsilon = [
    100.0, 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 20.0, 10.0,
    5.0, 4.0, 3.0, 2.0, 1.0, 0.5, 0.01, 0.001,
]

for eps in epsilon:
    privatizer = laplacePrivatizer.LaplacePrivatizer(eps)

    # The adapter conversion is repeated per iteration on purpose: it is not
    # visible here whether `privatize` mutates its input, so nothing is shared
    # between budgets.
    privateData = privatizer.privatize(ad.fromRaw(data_set['samples']))

    # Labels are privatized too, then mapped back to discrete values by the
    # adapter so they can be used as classifier targets.
    privateTargetsFloat = privatizer.privatize(ad.fromRaw(data_set['labels']))
    privateTargets = ad.toDiscreteValue(privateTargetsFloat)

    clf = tree.DecisionTreeClassifier()
    clf.fit(privateData, privateTargets)
    y_pred = clf.predict(X_test)

    print("Accuracy for epsilon = ", eps," is", accuracy_score(y_test,y_pred)*100)

Accuracy for epsilon =  0.5  is 47.167018542316875
Accuracy for epsilon =  0.01  is 36.58148981090509
Accuracy for epsilon =  0.001  is 48.5794932990637
