### Evaluation Testing
- Anonymeter Method
- DOMIAS Method
- GDA Score Method
- SDV Data Analysis

In [1]:
# IMPORTS AND DATA
import pandas as pd
import math

#for Anonymeter
from scipy.stats import norm
from math import sqrt

#for DOMIAS
from sklearn.metrics import accuracy_score, roc_auc_score

#for GDA
from statistics import mean, stdev

#for SDV
import sdv
from sdv.metadata import SingleTableMetadata
import graphviz
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import get_column_pair_plot

train_df = pd.read_parquet('datasets/adults_train.parquet')
synth_df = pd.read_parquet('test_data/adults_syn_ctgan.parquet')
# Table metadata for the training data, loaded from its JSON description.
metadata = SingleTableMetadata.load_from_json('datasets/adults_train.json')

# --- DOMIAS attack results ---
# Columns read from this file in the notebook: 'y_true', 'y_pred', 'mia_scores'.
domias_guesses_df = pd.read_parquet('test_data/temp_domias_attack.parquet')

# Convert truthy/falsy labels to 1/0.
# Iterating a column walks its rows by position, so this works whatever index
# the parquet file was written with (label lookups like series[i] only work
# on a default 0..n-1 index).
domias_y_true = [1 if flag else 0 for flag in domias_guesses_df['y_true']]
domias_y_pred = [1 if flag else 0 for flag in domias_guesses_df['y_pred']]
# 1 only where the label and the prediction are both 1.
domias_y_success = [
    1 if truth == 1 and guess == 1 else 0
    for truth, guess in zip(domias_y_true, domias_y_pred)
]

# --- Anonymeter attack results: one parquet file per secret column ---
columns = train_df.columns
anon_guesses = {}
for secret in columns:
    anon_guesses[secret] = pd.read_parquet('test_data/temp_anon_attack_'+secret+'.parquet')

# Turn each guess table into 1/0 lists:
# 1. y_pred is always 1
# 2. y_true is 1 if the guessed value matches the real value, otherwise 0
anon_y_true = {}
anon_y_pred = {}
for secret in columns:
    guesses = anon_guesses[secret]
    anon_y_true[secret] = [
        1 if truth == guess else 0
        for truth, guess in zip(guesses['y_true'], guesses['y_pred'])
    ]
    anon_y_pred[secret] = [1] * len(guesses)

# using test data results
anon_n_attacks = 1000
anon_n_success = 134
anon_n_baseline = 83
domias_n_attacks = 39032
domias_n_success = 19516

In [13]:
# ANONYMETERS Method
# Success rate and error margin for each attack, computed as a Wilson score
# interval: "Rate" is the centre of the interval, "Error" its half-width.
confidence_level = 0.95
n_attacks = [anon_n_attacks, domias_n_attacks]
n_success = [anon_n_success, domias_n_success]

# z only depends on the confidence level, so it is computed once for both attacks
z = norm.ppf(0.5 * (1.0 + confidence_level))
z_squared = z * z

for header, attacks, successes in zip(("Anonymeter::", "\nDomias::"), n_attacks, n_success):
    n_success_var = successes * (attacks - successes) / attacks
    denominator = attacks + z_squared
    rate = (successes + 0.5 * z_squared) / denominator
    error = (z / denominator) * sqrt(n_success_var + 0.25 * z_squared)
    print(header)
    print("Rate: " + str(rate))
    print("Error: " + str(error))

Anonymeter::
Rate: 0.1354005936057132
Error: 0.021119517611586765

Domias::
Rate: 0.5
Error: 0.004960053062872045


In [None]:
# Anonymeter Utility Score
# TODO More Research Needed
# Score 1 - marginal distribution similarity: Jensen-Shannon divergence (categorical) or Kolmogorov-Smirnov statistic (continuous)
# Score 2 - pairwise scores: computed from the absolute error between the original and synthetic statistics
# Score 3 - query counts similarity: create a number of random queries and measure how many rows in the original and synthetic data match each query,
#    then compute the score as the correlation coefficient between the original and synthetic query response sizes

In [3]:
# DOMIAS
# DOMIAS metrics need y_true and y_pred:
# y_true: whether original data exists in synthetic data
# y_pred: prediction of if original data exists in synthetic data
# The raw Anonymeter results do not come in that form, so the per-column
# anon_y_true / anon_y_pred lists (1 = guess matched, y_pred always 1) are used instead.


def _gda_error_metrics(y_true, y_pred):
    """Summarise GDA-style error lists for one set of attack guesses.

    Pairs whose prediction is 0 are skipped (the simple relative error divides
    by the prediction). For every remaining pair three errors are recorded:

        absolute:         abs(pred - true)
        simple relative:  true / pred
        relative:         abs(pred - true) / max(pred, true)

    Args:
        y_true: sequence of 0/1 ground-truth values.
        y_pred: sequence of 0/1 predictions, paired with y_true by position.

    Returns:
        dict with keys 'mins', 'maxs', 'avgs', 'stdevs' (rounded to 5 places)
        and 'mse'. Each value is a 3-item list ordered
        [absolute, simple relative, relative].

    Raises:
        ValueError: fewer than two pairs remain after skipping, so the sample
            standard deviation cannot be computed.
    """
    absErrorList = []
    simpleRelErrorList = []
    relErrorList = []
    for truth, guess in zip(y_true, y_pred):
        if guess == 0:
            continue
        abs_error = abs(guess - truth)
        absErrorList.append(abs_error)
        simpleRelErrorList.append(truth / guess)
        relErrorList.append(abs_error / max(guess, truth))

    if len(absErrorList) < 2:
        raise ValueError(
            "need at least two guesses with a non-zero prediction, got "
            + str(len(absErrorList))
        )

    error_lists = [absErrorList, simpleRelErrorList, relErrorList]

    # mean of the squared errors, accumulated item by item
    mse_result = []
    for errors in error_lists:
        squared_total = 0
        for item in errors:
            squared_total += item * item
        mse_result.append(squared_total / len(errors))

    return {
        'mins': [min(errors) for errors in error_lists],
        'maxs': [max(errors) for errors in error_lists],
        'avgs': [mean(errors) for errors in error_lists],
        'stdevs': [round(stdev(errors), 5) for errors in error_lists],
        'mse': mse_result,
    }


# Accuracy as defined by DOMIAS
# use sklearn's accuracy_score function, which uses y_true and y_pred
domias_acc = accuracy_score(domias_y_true, domias_y_pred)

# AUCROC as defined by DOMIAS
# use sklearn's roc_auc_score function, which uses y_true and y_scores
# TODO for Anonymeter, our y_scores would be the distance measured
# aucroc also is measured on continuous datasets, so this may not fit
domias_rocauc = roc_auc_score(domias_y_true, domias_guesses_df['mia_scores'])

# after using DOMIAS metrics, we can also use the Accuracy algorithm from GDA's utility score
# to further measure attack accuracy (see _gda_error_metrics above):
# 1. Create the three error lists, skipping entries where the prediction ("Anon") is 0
# 2. Reduce each error list to min, max, avg, stddev and mean squared error
print("::Anonymeter Results::")
gda_acc = {}
for secret in columns:
    y_true = anon_y_true[secret]
    y_pred = anon_y_pred[secret]
    # accuracy score from DOMIAS first, then the GDA error metrics
    gda_acc[secret] = {
        'acc': accuracy_score(y_true, y_pred),
        **_gda_error_metrics(y_true, y_pred),
    }
    print(str(secret).ljust(15, ' '), " --> ", gda_acc[secret])

print("\n\n::DOMIAS Results::")
gda_acc['domias'] = {
    # accuracy and rocauc scores from DOMIAS
    'acc': round(domias_acc, 5),
    'rocauc': round(domias_rocauc, 5),
    # then the GDA error metrics
    **_gda_error_metrics(domias_y_true, domias_y_pred),
}
print(gda_acc['domias'])



::Anonymeter Results::
age              -->  {'acc': 0.04, 'mins': [0, 0.0, 0.0], 'maxs': [1, 1.0, 1.0], 'avgs': [0.96, 0.04, 0.96], 'stdevs': [0.2, 0.2, 0.2], 'mse': [0.96, 0.04, 0.96]}
type_employer    -->  {'acc': 0.16, 'mins': [0, 0.0, 0.0], 'maxs': [1, 1.0, 1.0], 'avgs': [0.84, 0.16, 0.84], 'stdevs': [0.37417, 0.37417, 0.37417], 'mse': [0.84, 0.16, 0.84]}
fnlwgt           -->  {'acc': 0.0, 'mins': [1, 0.0, 1.0], 'maxs': [1, 0.0, 1.0], 'avgs': [1, 0.0, 1.0], 'stdevs': [0.0, 0.0, 0.0], 'mse': [1.0, 0.0, 1.0]}
education        -->  {'acc': 0.16, 'mins': [0, 0.0, 0.0], 'maxs': [1, 1.0, 1.0], 'avgs': [0.84, 0.16, 0.84], 'stdevs': [0.37417, 0.37417, 0.37417], 'mse': [0.84, 0.16, 0.84]}
education_num    -->  {'acc': 0.24, 'mins': [0, 0.0, 0.0], 'maxs': [1, 1.0, 1.0], 'avgs': [0.76, 0.24, 0.76], 'stdevs': [0.43589, 0.43589, 0.43589], 'mse': [0.76, 0.24, 0.76]}
marital          -->  {'acc': 0.2, 'mins': [0, 0.0, 0.0], 'maxs': [1, 1.0, 1.0], 'avgs': [0.8, 0.2, 0.8], 'stdevs': [0.40825, 0.40

In [11]:
# GDA Scores - Defense
# The GDA defense score as published needs every guess plus a confidence for each guess,
# which we cannot fully provide:
# - Anonymeter can provide guesses if we rewrite the attack code ourselves, but no confidence
# - DOMIAS provides guesses and something similar to confidence (MIA scores), but its
#   "guesses" are all of the original data
# The values below are our approximation from success counts.

# Confidence Improvement
# Anonymeter attack: CI = (C-S)/(1-S) where C = n_success/n_attacks and S = n_baseline/n_attacks
# DOMIAS attack: C = n_success/n_attacks; a DOMIAS-specific baseline is still an open question
# (count(True)/total?), so the Anonymeter S is used as the baseline confidence.
# A candidate that was tried and left unused:
#   S_domias = sum(domias_y_success)/(len(domias_y_success)+1)
S_anon = anon_n_baseline/anon_n_attacks
C_anon = anon_n_success/anon_n_attacks
C_domias = domias_n_success/domias_n_attacks

CI_anon, CI_domias = (
    (confidence-S_anon)/(1-S_anon)
    for confidence in (C_anon, C_domias)
)

print("CI Anon: ", CI_anon)
print("CI Domias: ", CI_domias)

# Claim Probability
# Defense & Confidence are basically the same in our interpretation; the defense score is
# looked up with getInterpolatedValue(CI, CP, defenseGrid).
# CP is defined as the ratio of attempts to claims (claims made / claim trials). "Claims made"
# is most likely the number of claims that met the confidence threshold, which we do not
# have, so success/total is used instead.
CP_anon = C_anon  # same ratio as the Anonymeter confidence: n_success/n_attacks
# NOTE(review): the +1 in the denominator makes this slightly smaller than sum/len and is
# not applied to CP_anon -- confirm it is intended.
CP_domias = sum(domias_y_pred)/(len(domias_y_pred)+1)

print("Claim Probability Anon: ", CP_anon)
print("Claim Probability Domias: ", CP_domias)

# Defense grid as (confidence improvement, claim probability, score) tuples.
# Each row below is one confidence-improvement level; its scores line up with
# the claim probabilities in _grid_claim_probs.
_grid_claim_probs = (1, .01, .001, .0001, .00001)
_grid_rows = (
    (1,   (0, .1, .3, .7, 1)),
    (.95, (.1, .3, .7, .8, 1)),
    (.90, (.3, .6, .8, .9, 1)),
    (.75, (.7, .9, .95, 1, 1)),
    (.50, (.95, .95, 1, 1, 1)),
    (0,   (1, 1, 1, 1, 1)),
)
defenseGrid1 = [
    (improvement, claim_prob, score)
    for improvement, scores in _grid_rows
    for claim_prob, score in zip(_grid_claim_probs, scores)
]

def getInterpolatedValue(val0, val1, scoreGrid):
    """Look up a score for (val0, val1) by interpolating over a grid.

    `scoreGrid` is a list of `(val0, val1, score)` tuples, listed with val0
    and val1 in descending order (see `defenseGrid1` for an example; the
    routine comes from gdaScore.py, where the grid is called `_defenseGrid1`).

    Two grid entries bracket the input: the last entry in the list whose
    coordinates are both >= the input ("above") and the first entry whose
    coordinates are both <= the input ("below"). The result is interpolated
    between their scores. If only one of them exists its score is returned
    unchanged, so inputs above the highest grid values get the first entry's
    score and inputs below the lowest get the last entry's score. If neither
    exists the result is None. A grid score of -1 is treated as "no entry".
    """
    # Forward pass: keep overwriting, so the last dominating entry wins.
    scoreAbove = -1
    for entry in scoreGrid:
        grid0, grid1, gridScore = entry[0], entry[1], entry[2]
        if val0 <= grid0 and val1 <= grid1:
            upper0, upper1, scoreAbove = grid0, grid1, gridScore

    # Backward pass: the entry kept is the earliest one in list order that
    # the input dominates.
    scoreBelow = -1
    for entry in reversed(scoreGrid):
        grid0, grid1, gridScore = entry[0], entry[1], entry[2]
        if val0 >= grid0 and val1 >= grid1:
            lower0, lower1, scoreBelow = grid0, grid1, gridScore

    noAbove = scoreAbove == -1
    noBelow = scoreBelow == -1
    if noAbove and noBelow:
        return None
    if noAbove:
        return scoreBelow
    if noBelow or scoreAbove == scoreBelow:
        return scoreAbove

    # Interpolate along the straight line between the two bracketing grid
    # points (val1 on the x axis, val0 on the y axis): the fraction is the
    # distance from the lower point to the input over the distance between
    # the two points.
    distFull = math.sqrt(((upper1 - lower1) ** 2) + ((upper0 - lower0) ** 2))
    distPart = math.sqrt(((val1 - lower1) ** 2) + ((val0 - lower0) ** 2))
    return scoreBelow - ((distPart / distFull) * (scoreBelow - scoreAbove))

# Defense score per attack: interpolate (CI, CP) on the defense grid,
# Anonymeter first, then DOMIAS.
defense_anon, defense_domias = (
    getInterpolatedValue(improvement, claim_prob, defenseGrid1)
    for improvement, claim_prob in ((CI_anon, CP_anon), (CI_domias, CP_domias))
)

print("Defense Anon: ", defense_anon)
print("Defense Domias: ", defense_domias)

CI Anon:  0.05561613958560523
CI Domias:  0.45474372955288983
Claim Probability Anon:  0.134
Claim Probability Domias:  0.4999871903261343
Defense Anon:  0.9938733466563375
Defense Domias:  0.9698633539680862


In [45]:
# GDA Scores - Utility
# https://github.com/gda-score/utility/blob/master/gdaUtility.py
# We are going to write our own code for utility
# Utility has 2 scores: Accuracy and Coverage

# Accuracy
# 1. Create Error Lists:
#     ** skip all Anon where 0 before computing
#     Absolute Error: Absolute(Anon - Raw)
#     Simple Relative Error: Raw / Anon
#     Relative Error: Absolute(Anon - Raw) / Max(Anon, Raw)
# 2. Convert each error list into summary metrics (min, max, avg, stddev, ...)
# We can also use this to measure attack accuracy
# For Accuracy for Utility, this is NOT possible to measure because
#   the algorithm is meant only for anonymized datasets, not synthetic

# Coverage
# We only need the original and synthetic datasets
# https://github.com/gda-score/utility/blob/master/gdaUtility.py
# For each column ...
# 1. line 261: build dicts for original and synthetic data counting all distinct values
# 2. line 362: count values noColumnCountOnerawDb, noColumnCountMorerawDb, and valuesInBoth
# 3. Coverage is calculated as valuesInBoth/noColumnCountMorerawDb

coverage_scores = []
for col in train_df:
    # check if col is being covered
    if col not in synth_df:
        print("Not ", col)
        continue

    # line 230: if the column has continuous data, coverage = numAnonRows/numRawRows
    if(metadata.columns[col]['sdtype'] == "numerical"):
        entry = {}
        entry['column'] = col
        entry['coverage'] = synth_df[col].count()/train_df[col].count()
        coverage_scores.append(entry)
        continue

    # line 216 (how much of a CATEGORICAL column is NULL) is not ported: columns are
    # not skipped for being empty once '?' values are removed.

    # line 250 / 261: count every distinct value once per column.
    # value_counts() leaves out missing values, and '?' is skipped as well.
    # Zero counts (unused categories of a categorical column) are skipped so only
    # values that actually occur are counted.
    rawRowsDict = {
        val: count
        for val, count in train_df[col].value_counts().items()
        if count > 0 and val != '?'
    }
    anonRowsDict = {
        val: count
        for val, count in synth_df[col].value_counts().items()
        if count > 0 and val != '?'
    }

    # line 362: count values
    # raw values that occur exactly once vs. more than once
    noColumnCountOnerawDb = sum(1 for count in rawRowsDict.values() if count == 1)
    noColumnCountMorerawDb = len(rawRowsDict) - noColumnCountOnerawDb
    # synthetic values that also occur more than once in the raw data
    valuesInBoth = sum(1 for val in anonRowsDict if rawRowsDict.get(val, 0) > 1)
    valuesanonDb = len(anonRowsDict)

    entry = {}
    entry['column'] = col
    entry['colCountOneRawDb'] = noColumnCountOnerawDb
    entry['colCountManyRawDb'] = noColumnCountMorerawDb
    entry['valuesInBothRawAndAnonDb'] = valuesInBoth
    entry['totalValCntAnonDb'] = valuesanonDb
    if noColumnCountMorerawDb == 0:
        # no raw value occurs more than once, so coverage is undefined
        entry['coverage'] = None
    else:
        # final calculation: coverage = valuesInBoth/noColumnCountMorerawDb
        entry['coverage'] = valuesInBoth/noColumnCountMorerawDb

    coverage_scores.append(entry)

coverage_scores_dict = {}
for entry in coverage_scores:
    print(entry)

{'column': 'age', 'coverage': 0.0019215003074400491}
{'column': 'type_employer', 'colCountOneRawDb': 0, 'colCountManyRawDb': 8, 'valuesInBothRawAndAnonDb': 6, 'totalValCntAnonDb': 6, 'coverage': 0.75}
{'column': 'fnlwgt', 'coverage': 0.0019215003074400491}
{'column': 'education', 'colCountOneRawDb': 0, 'colCountManyRawDb': 16, 'valuesInBothRawAndAnonDb': 15, 'totalValCntAnonDb': 15, 'coverage': 0.9375}
{'column': 'education_num', 'coverage': 0.0019215003074400491}
{'column': 'marital', 'colCountOneRawDb': 0, 'colCountManyRawDb': 7, 'valuesInBothRawAndAnonDb': 6, 'totalValCntAnonDb': 6, 'coverage': 0.8571428571428571}
{'column': 'occupation', 'colCountOneRawDb': 0, 'colCountManyRawDb': 14, 'valuesInBothRawAndAnonDb': 14, 'totalValCntAnonDb': 14, 'coverage': 1.0}
{'column': 'relationship', 'colCountOneRawDb': 0, 'colCountManyRawDb': 6, 'valuesInBothRawAndAnonDb': 6, 'totalValCntAnonDb': 6, 'coverage': 1.0}
{'column': 'race', 'colCountOneRawDb': 0, 'colCountManyRawDb': 5, 'valuesInBothRaw

In [7]:
# SDV Data Analysis
# Every SDV call below takes the same three inputs: original data, synthetic data, metadata.
sdv_inputs = dict(real_data=train_df, synthetic_data=synth_df, metadata=metadata)

# quality check
print("=== Quality Report ===")
quality_report = evaluate_quality(**sdv_inputs)

print("=== Diagnostic Report ===")
diagnostic_report = run_diagnostic(**sdv_inputs)

# distribution analysis: one plot per column, then the two columns as a pair
print("=== Column Report ===")
for column_name in ('age', 'fnlwgt'):
    fig = get_column_plot(column_name=column_name, **sdv_inputs)
    fig.show()

fig = get_column_pair_plot(column_names=['age', 'fnlwgt'], **sdv_inputs)
fig.show()

# TODO: Wonder how this would look as column pair plot of a column and the y_pred

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 128.57it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:03<00:00, 30.97it/s]

Overall Quality Score: 56.11%

Properties:
- Column Shapes: 61.7%
- Column Pair Trends: 50.53%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 378.72it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 1493.59it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:01<00:00,  1.36s/it]

Diagnostic Results:

SUCCESS:
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data

! The synthetic data is missing more than 10% of the categories present in the real data
! The synthetic data is missing more than 10% of the numerical ranges present in the real data
=== Column Report ===
