In [458]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import math

import sys,os
DPCOMP_PATH = '/nfs/avid/data1/miklau/dpcomp-parent/dpcomp_core_op'
sys.path.append(DPCOMP_PATH)
os.environ['DPCOMP_CORE']= DPCOMP_PATH

In [459]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (20.0, 10.0)
sns.set_style("whitegrid")

In [460]:
from dpcomp_core.algorithm import *
from dpcomp_core import dataset
from dpcomp_core import util
from dpcomp_core import workload

In [461]:
import errors

In [462]:
# number of bins
# domain = (256,)
domain = 256

epsilon = 0.1

nickname = 'HEPTH'
# nickname = 'BIDS-ALL'

# number of data points from sample_to_scale data generation
sample = 1e4

# ESPairs = [(0.1, 1e4), (0.01, 1e5), (0.001, 1e6)]

seed = 1

seeds = range(5)

# Instantiate dataset
data = dataset.DatasetSampledFromFile(nickname=nickname, 
                                     sample_to_scale=sample, 
                                     reduce_to_dom_shape=domain, 
                                     seed=111)

# Instantiate workload
# w = workload.Identity(domain_shape=domain)
w = workload.Prefix1D(domain_shape_int=domain)


# Instantiate algorithms
Identity_ = identity.identity_engine()
HB_ = HB.HB_engine()
MWEM_ = mwemND.mwemND_engine()
DAWA_ = dawa.dawa_engine()

algorithms = [(Identity_, "Identity"), (HB_, "HB"), (MWEM_, "MWEM"), (DAWA_, "DAWA")]

In [466]:
# original data
dat = data.payload

df = pd.DataFrame(dat)
x = df.index.values
y = df.values.flatten()

clrs = ['red' if (k == max(y)) else 'black' for k in y ]
orig = sns.barplot(x=x, y=y, palette=clrs)

maxValOrig = max(y)
maxValIndexOrig = y.argmax()

print "Max: %d, Index: %d" % (maxVal, maxValIndex)

orig.set_xticks([k for k in range(domain+20) if k % 20 == 0])
orig.set_xticklabels([k for k in range(domain+20) if k % 20 == 0])
# orig.set_ylim([0,12000])
orig.set(xlabel='bins', ylabel='counts')
orig.set_title('Original %s \n Epsilon = %s, Domain = %s, Sample = %s' % (nickname, str(epsilon), str(domain), str(sample)))
# sns.plt.show()
# plt.gcf().clear()
plt.close()

Max: 123, Index: 223


In [464]:
datCDF= np.cumsum(dat)
fig, cdf = plt.subplots()
cdf.plot(datCDF)
cdf.set_xticks([k for k in range(domain+20) if k % 20 == 0])
cdf.set_xticklabels([k for k in range(domain+20) if k % 20 == 0])
# cdf.set_ylim([0, 12000])
cdf.set(xlabel='bins', ylabel='counts')
cdf.set_title('Original %s (CDF) \n Epsilon = %s, Domain = %s, Sample = %s' % (nickname, str(epsilon), str(domain), str(sample)))
# sns.plt.show()
# plt.gcf().clear()
plt.close()

In [465]:
# Calculate and plot noisy estimates for x
for seed in seeds:
    print seed
    for alg in algorithms:
        x_hat = alg[0].Run(w, dat, epsilon, seed)
        df_hat = pd.DataFrame(x_hat)
        x_hat_data = df_hat.index.values
        y_hat_data = df_hat.values.flatten()
    #     print sum (y_hat_data)

        # normalized non-negative rounding post-processing
        negSum = sum(y_hat_data)
        posSum = 0.00
        for i in y_hat_data:
            if i >= 0:
                posSum += i
        y_hat_data = [x*(negSum/posSum) if x >= 0 else 0 for x in y_hat_data]
    #     print sum (y_hat_data)

        maxVal = max(y_hat_data)
        maxValIndex = y_hat_data.index(max(y_hat_data))
        
        # clone y_hat_data by slicing
        y_hat_data_copy = y_hat_data[:]
        y_hat_data_copy.remove(maxVal)
        
        second_maxVal = max(y_hat_data_copy)
        second_maxValIndex = y_hat_data_copy.index(max(y_hat_data_copy))

        clrs = ['red' if (k == maxVal) or (k == second_maxVal) else 'black' for k in y_hat_data ]
        
#         graph = sns.barplot(x=x_hat_data, y=y_hat_data, palette = clrs)
#         graph.set_xticks([k for k in range(domain+20) if k % 20 == 0])
#         graph.set_xticklabels([k for k in range(domain+20) if k % 20 == 0])
#     #     graph.set_ylim([0,12000])
#         graph.set(xlabel='bins', ylabel='counts')
#         graph.set_title('%s %s \n Epsilon = %s, Domain = %s, Sample = %s' % (alg[1], nickname, str(epsilon), str(domain), str(sample)))
#         sns.plt.show()
#         plt.gcf().clear()
        plt.close()

        datCDF_noisy = np.cumsum(y_hat_data)

#         fig, cdf = plt.subplots()
#         cdf.plot(datCDF)
#         cdf.plot(datCDF_noisy)
#         cdf.set_xticks([k for k in range(domain+20) if k % 20 == 0])
#         cdf.set_xticklabels([k for k in range(domain+20) if k % 20 == 0])
#     #     cdf.set_ylim([0, 1200000])
#         cdf.set(xlabel='bins', ylabel='counts')
#         cdf.set_title('%s %s (CDF) \n Epsilon = %s, Domain = %s, Sample = %s' % (alg[1], nickname, str(epsilon), str(domain), str(sample)))
#         sns.plt.show()
#         plt.gcf().clear()
        plt.close()


0
1
2
3
4


In [290]:
errors = {}
for alg in algorithms:
    errors[alg[1]] = []
    
second_errors = {}
for alg in algorithms:
    second_errors[alg[1]] = []


In [379]:
# Calculate and plot noisy estimates for x
for seed in seeds:
    print seed
    for alg in algorithms:
        x_hat = alg[0].Run(w, dat, epsilon, seed)
        df_hat = pd.DataFrame(x_hat)
        x_hat_data = df_hat.index.values
        y_hat_data = df_hat.values.flatten()

        # normalized non-negative rounding post-processing
        negSum = sum(y_hat_data)
        posSum = 0.00
        for i in y_hat_data:
            if i >= 0:
                posSum += i
        y_hat_data = [x*(negSum/posSum) if x >= 0 else 0 for x in y_hat_data]

        maxVal = max(y_hat_data)
        maxValIndex = y_hat_data.index(max(y_hat_data))
        
        # clone y_hat_data by slicing
        y_hat_data_copy = y_hat_data[:]
        y_hat_data_copy.remove(maxVal)
        
        second_maxVal = max(y_hat_data_copy)
        second_maxValIndex = y_hat_data_copy.index(max(y_hat_data_copy))

#         print "Max: %d, Index: %d" % (maxVal, maxValIndex)

        errors[alg[1]].append(calculateError(maxVal, maxValIndex, maxValOrig, maxValIndexOrig))
        second_errors[alg[1]].append(calculateError(second_maxVal, second_maxValIndex, maxValOrig, maxValIndexOrig))

0


TypeError: 'module' object has no attribute '__getitem__'

In [457]:
reload(errors)
a = errors.DataCompare(y, y_hat_data)
print a.maxOrig
print a.maxOrigIndex
print a.calcError("M1")

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [421]:
for i in errors:
    print i, sum(errors[i])/len(errors[i])
print "\n"
for i in second_errors:
    print i, sum(second_errors[i])/len(second_errors[i])

TypeError: 'module' object is not iterable