In [1]:
import numpy as np
import pandas as pd
import os
from sqlalchemy import types, create_engine
import cx_Oracle
# import diffprivlib as dp
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display
from ipywidgets import Layout, interact, IntSlider, widgets


In [76]:
# Toy data set: one direct identifier, two quasi-identifiers (QID) and a
# sensitive attribute (political party).
data = {
    'Name (ID)': [
        'Bill', 'Julia', 'Rose', 'Gwen', 'Peter', 'James', 'Anita',
        'Rob', 'Jess', 'Edith', 'Catherin', 'Ellie', 'Andrew', 'Ruth',
        'Barry', 'Hope', 'Ruby', 'Sian', 'Edward', 'James', 'Amanda',
    ],
    'Age (QID)': [
        23, 19, 22, 25, 30, 18, 18,
        16, 28, 29, 21, 25, 18, 27,
        19, 13, 25, 19, 18, 19, 23,
    ],
    'Gender (QID)': [
        'M', 'F', 'F', 'F', 'M', 'M', 'F',
        'M', 'F', 'M', 'M', 'F', 'M', 'F',
        'F', 'M', 'F', 'F', 'M', 'F', 'M',
    ],
    'Political Party': [
        'Green Party', 'Red Party', 'Red Party', 'Green Party',
        'Green Party', 'Red Party', 'Red Party',
        'Green Party', 'Red Party', 'Green Party', 'Green Party',
        'Green Party', 'Green Party', 'Red Party',
        'Green Party', 'Red Party', 'Green Party', 'Red Party',
        'Red Party', 'Red Party', 'Green Party',
    ],
}

df = pd.DataFrame(data)

# Mask the direct identifier: every name is replaced by '*'.
df['Name (ID)'] = '*'


## Metrics functions

In [3]:
def k_annonymity(df, qid):
    """returns the k annonymity value given the quasi-identifier field titles

    The k value is the size of the smallest non-empty group of rows that
    share the same combination of quasi-identifier values.

    Parameters
    ----------
    df : DataFrame
        dataframe for which k value is to be calculated
    qid : list
        list of quasi-identifier columns

    Returns
    -------
    int
        k value; 0 when there is no populated group (e.g. ``df`` is empty)
    """
    group_sizes = df.groupby(qid).size()
    # Ignore zero-sized groups (groupby can report them, e.g. for unobserved
    # combinations of categorical keys).
    populated = group_sizes[group_sizes > 0]
    if populated.empty:
        # Without this guard .min() would yield NaN, which is not an int.
        return 0
    return int(populated.min())


def l_diversity(df, qid, sa):
    """returns the l diversity value given the quasi-identifier and
    sensitive attribute field titles

    Parameters
    ----------
    df : DataFrame
        dataframe for which l value is to be calculated
    qid : list
        list of quasi-identifier columns
    sa : list
        list of sensitive attribute columns (only one is supported)

    Returns
    -------
    int
        l value, i.e. the smallest number of distinct sensitive values
        found in any quasi-identifier group
    """
    # Distinct sensitive values per quasi-identifier group.
    distinct_per_group = df.groupby(qid)[sa].nunique()
    populated = distinct_per_group[distinct_per_group > 0]
    # squeeze() collapses the single SA column, so this only yields a scalar
    # when exactly one sensitive attribute is given (l-diversity is generally
    # defined for a single SA).
    collapsed = populated.squeeze()
    return collapsed.min()
    
def GIL(uq, lq, u, l):
    """returns the ratio of a generalised range to the original range

    Parameters
    ----------
    uq : number
        upper bound of the generalised range
    lq : number
        lower bound of the generalised range
    u : number
        upper bound of the original range
    l : number
        lower bound of the original range

    Returns
    -------
    float
        (uq - lq) / (u - l)
    """
    generalised_width = uq - lq
    original_width = u - l
    return generalised_width / original_width
    

# Record suppression

In [91]:
# Slider for the minimum group size k (1 to 5) used by the suppression demo.
k = widgets.IntSlider(
    value=1,
    min=1,
    max=5,
    description='k value:',
    layout=Layout(width='400px'),
)

def f_supression(k):
    """prints the percentage of rows lost when small groups are dropped

    Rows of the module-level ``df`` are grouped by age and gender; groups
    with fewer than ``k`` rows count as suppressed.

    Parameters
    ----------
    k : int
        minimum group size a row's group must reach to be kept
    """
    quasi_identifiers = ['Age (QID)', 'Gender (QID)']

    total_rows = len(df)
    kept = df.groupby(quasi_identifiers).filter(lambda grp: len(grp) >= k)
    suppressed_fraction = (total_rows - len(kept)) / total_rows
    print("%d%% of data supressed" % (np.round(suppressed_fraction * 100)))

def f_supression_data(k):
    """displays the data with under-sized groups blanked out

    Rows of the module-level ``df`` are grouped by age and gender; every
    row whose group has fewer than ``k`` members is overwritten with a
    placeholder string in all columns. ``df`` itself is not modified.

    Parameters
    ----------
    k : int
        minimum group size a row's group must reach to stay visible
    """
    QID = ['Age (QID)', 'Gender (QID)']
    # astype returns a copy; casting to object lets the placeholder string be
    # written into the integer age column without an incompatible-dtype
    # upcast (deprecated by pandas).
    dfs = df.astype(object)
    undersized = dfs.groupby(QID).filter(lambda x: len(x) < k).index
    # `undersized` holds index *labels*, so it must be used with .loc; .iloc
    # would only hit the right rows for a default 0..n-1 index.
    dfs.loc[undersized] = '----------'
    display(dfs)

# Bind both suppression callbacks to the k slider, then show the slider
# followed by the two output areas.
out1 = widgets.interactive_output(f_supression, dict(k=k))
out2 = widgets.interactive_output(f_supression_data, dict(k=k))

display(k, out1, out2)

IntSlider(value=1, description='k value:', layout=Layout(width='400px'), max=5, min=1)

Output()

Output()

# Generalisation

In [5]:
# Slider for the width (1 to 40 years) of the age bins used for generalisation.
step_size = widgets.IntSlider(
    value=1,
    min=1,
    max=40,
    description='Step size:',
    layout=Layout(width='400px'),
)

def label_age(step_size):
    """prints a one-line caption stating the current age step size

    Parameters
    ----------
    step_size : int
        width of the age bins in years
    """
    caption = 'Ages changed to %d year steps' % (step_size)
    print(caption)

def f_ranging(step_size):
    """displays the data with ages replaced by 'lower-upper' bin labels

    Works on a copy of the module-level ``df``; each age is floored to a
    multiple of ``step_size`` and shown as ``lower-(lower + step_size)``.

    Parameters
    ----------
    step_size : int
        width of the age bins in years
    """
    dfs = df.copy()
    # Lower edge of the bin each age falls into.
    lower = (np.floor(dfs['Age (QID)'] / step_size) * step_size).astype(int)
    upper = lower + step_size
    dfs['Age (QID)'] = lower.astype(str) + '-' + upper.astype(str)
    display(dfs)


def f_ranging_k(step_size):
    """prints k-anonymity and information loss after age generalisation

    Works on a copy of the module-level ``df``: ages are binned into
    ``step_size``-wide ranges, then the k value over age and gender and the
    GIL ratio are printed.

    Parameters
    ----------
    step_size : int
        width of the age bins in years
    """
    qid = ['Age (QID)', 'Gender (QID)'] # define quasi-identifyer groups

    dfs = df.copy()
    ages = dfs['Age (QID)']
    # Lower and upper edge of the bin each age falls into.
    lower = (np.floor(ages / step_size) * step_size).astype(int)
    upper = lower + step_size

    # Must be computed from the raw ages, i.e. before the column is replaced.
    # NOTE(review): the numerator is the span covered by all bins, not a
    # per-record bin width -- confirm this is the intended loss measure.
    gil = GIL(upper.max(), lower.min(), ages.max(), ages.min())

    dfs['Age (QID)'] = lower.astype(str) + '-' + upper.astype(str)
    # Use the shared metric helper instead of repeating its logic inline.
    k_val = k_annonymity(dfs, qid)
    print('k-annonymity has a value of k = %d' %k_val)
    print("Generalised information loss = %.3f" %gil)
    

# Bind the three age-ranging callbacks to the step-size slider, then show the
# slider followed by caption, metrics and the generalised table.
out1 = widgets.interactive_output(label_age, dict(step_size=step_size))
out2 = widgets.interactive_output(f_ranging_k, dict(step_size=step_size))
out3 = widgets.interactive_output(f_ranging, dict(step_size=step_size))

display(step_size, out1, out2, out3)



IntSlider(value=1, description='Step size:', layout=Layout(width='400px'), max=40, min=1)

Output()

Output()

Output()

# Combined

In [110]:
# Fresh sliders for the combined generalisation + suppression demo.
k = widgets.IntSlider(
    value=1,
    min=1,
    max=5,
    description='k value:',
    layout=Layout(width='400px'),
)
step_size = widgets.IntSlider(
    value=1,
    min=1,
    max=40,
    description='Step size:',
    layout=Layout(width='400px'),
)

# Module-level working copy of the data.
dfs = df.copy()

def meassure_supression(k, step_size):
    """prints the percentage of rows suppressed after age generalisation

    Ages in a copy of the module-level ``df`` are binned into
    ``step_size``-wide ranges; rows whose age/gender group has fewer than
    ``k`` members count as suppressed.

    Parameters
    ----------
    k : int
        minimum group size a row's group must reach to be kept
    step_size : int
        width of the age bins in years
    """
    quasi_identifiers = ['Age (QID)', 'Gender (QID)']

    generalised = df.copy()
    # Lower edge of the bin each age falls into.
    lower = (np.floor(generalised['Age (QID)'] / step_size) * step_size).astype(int)
    generalised['Age (QID)'] = lower.astype(str) + '-' + (lower + step_size).astype(str)

    total_rows = len(df)
    kept = generalised.groupby(quasi_identifiers).filter(lambda grp: len(grp) >= k)
    suppressed_fraction = (total_rows - len(kept)) / total_rows
    print("%d%% of data supressed" % (np.round(suppressed_fraction * 100)))

def generalise_supress(k, step_size):
    """displays the data after age generalisation and record suppression

    Ages in a copy of the module-level ``df`` are binned into
    ``step_size``-wide ranges; every row whose age/gender group has fewer
    than ``k`` members is then overwritten with a placeholder string in all
    columns.

    Parameters
    ----------
    k : int
        minimum group size a row's group must reach to stay visible
    step_size : int
        width of the age bins in years
    """
    QID = ['Age (QID)', 'Gender (QID)']

    dfs = df.copy()
    # Lower edge of the bin each age falls into.
    lower = (np.floor(dfs['Age (QID)'] / step_size) * step_size).astype(int)
    dfs['Age (QID)'] = lower.astype(str) + '-' + (lower + step_size).astype(str)

    undersized = dfs.groupby(QID).filter(lambda x: len(x) < k).index
    # `undersized` holds index *labels*, so it must be used with .loc; .iloc
    # would only hit the right rows for a default 0..n-1 index.
    dfs.loc[undersized] = '----------'
    display(dfs)

    
def label_age(step_size):
    """prints a one-line caption stating the current age step size

    Same definition as in the earlier age-ranging cell; repeated here so
    this cell can run on its own.

    Parameters
    ----------
    step_size : int
        width of the age bins in years
    """
    caption = 'Ages changed to %d year steps' % (step_size)
    print(caption)
    

def meassure_k(k, step_size):
    """prints k-anonymity and information loss after generalising and suppressing

    Ages in a copy of the module-level ``df`` are binned into
    ``step_size``-wide ranges, groups with fewer than ``k`` rows are
    dropped, and the resulting k value and GIL ratio are printed.

    Parameters
    ----------
    k : int
        minimum group size a row's group must reach to be kept
    step_size : int
        width of the age bins in years
    """
    # Defined before first use: the original referenced an undefined `QID`
    # in the filter step and only worked with leftover kernel state.
    qid = ['Age (QID)', 'Gender (QID)'] # define quasi-identifyer groups

    dfs = df.copy()
    ages = dfs['Age (QID)']
    # Lower and upper edge of the bin each age falls into.
    lower = (np.floor(ages / step_size) * step_size).astype(int)
    upper = lower + step_size

    # Must be computed from the raw ages, i.e. before the column is replaced.
    # NOTE(review): the numerator is the span covered by all bins, not a
    # per-record bin width -- confirm this is the intended loss measure.
    gil = GIL(upper.max(), lower.min(), ages.max(), ages.min())

    dfs['Age (QID)'] = lower.astype(str) + '-' + upper.astype(str)

    # Suppress (drop) every group that does not reach the requested k.
    dfs = dfs.groupby(qid).filter(lambda x: len(x) >= k)

    k_val = k_annonymity(dfs, qid)
    # If suppression removed every row the minimum may be NaN; report 0.
    if np.isnan(k_val):
        k_val = 0
    print('k-annonymity has a value of k = %d' %k_val)
    print("Generalised information loss = %.3f" %gil)

    
# Callbacks that depend on both sliders.
out1 = widgets.interactive_output(meassure_supression, dict(k=k, step_size=step_size))
out2 = widgets.interactive_output(generalise_supress, dict(k=k, step_size=step_size))

# Caption depends on the step size only; metrics depend on both sliders.
out3 = widgets.interactive_output(label_age, dict(step_size=step_size))
out4 = widgets.interactive_output(meassure_k, dict(k=k, step_size=step_size))

# Layout: k slider + suppression rate, step slider + caption, then the
# metrics and the resulting table, separated by blank lines.
display(k, out1)

print('\n')

display(step_size, out3)

print('\n')

display(out4, out2)


IntSlider(value=1, description='k value:', layout=Layout(width='400px'), max=5, min=1)

Output()





IntSlider(value=1, description='Step size:', layout=Layout(width='400px'), max=40, min=1)

Output()





Output()

Output()