In [1]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

import pandas as pd
import matplotlib.pyplot as plt
import os


In [2]:
# Source CSV that the synthesizer learns from.
input_data = 'Sandwich/J334772Truncated2.csv'

# DataSynthesizer mode this notebook works in.
mode = 'correlated_attribute_mode'

# The two output files: the learned dataset description and the generated synthetic data.
description_file, synthetic_data = 'description.json', 'synthetic_data.csv'

In [3]:

# Domain-size cutoff: an attribute with fewer distinct values than this is treated as categorical.
threshold_value = 20

# Attributes explicitly flagged as categorical.
# Other attributes previously noted as possible additions (not currently enabled):
#   RT13V3, RT13V4, RT13V11, RT13V12, RT13V13, RT13V14, RT13V15, RT13V18, RT13V21,
#   RT13V24, RT13V27, RT13V28, RT13V29, RT13V30, RT13V31, RT13V32, RT13V33
categorical_attributes = dict.fromkeys(['RT13V7', 'RT13V8'], True)

# Attributes that are candidate keys of the input dataset.
candidate_keys = dict(RT13V2=True)

# Differential-privacy parameter. Roughly: removing one row of the input dataset changes the
# probability of any given output by at most a multiplicative factor of exp(epsilon).
# Larger epsilon means less injected noise; epsilon=0 turns differential privacy off.
epsilon = 1

# Upper bound on the number of parents (incoming edges) per node in the Bayesian network.
degree_of_bayesian_network = 2

# How many tuples the synthetic dataset should contain.
num_tuples_to_generate = 9063

# Values passed to DataDescriber as `null_values`.
null_values = [99999, 999999999]

In [4]:
# Runs one iteration of creating the Bayesian network, generating the data and comparing it
# to the original data.
def _save_current_figure(photo):
    """Save the current matplotlib figure to `photo` (PNG, 300 dpi) and close it."""
    plt.savefig(photo, dpi=300, bbox_inches='tight')
    plt.close()


def runDataSynth(degree, epsilon, gennum, threshold_value, null_values, description_file, synthetic_data):
    """Describe the input dataset, generate synthetic data from it, and save comparison plots.

    Reads the notebook-level names `input_data`, `categorical_attributes` and
    `candidate_keys` in addition to its arguments.

    Parameters
    ----------
    degree : passed to the describer as `k` (Bayesian-network degree).
    epsilon : differential-privacy parameter passed to the describer.
    gennum : number of synthetic tuples to generate.
    threshold_value : passed to DataDescriber as `category_threshold`.
    null_values : passed to DataDescriber as `null_values`.
    description_file : path the dataset description is written to and read back from.
    synthetic_data : path the synthetic CSV is written to and read back from.

    Returns
    -------
    The DataDescriber instance used for this run.

    Side effects: overwrites `description_file` and `synthetic_data`, and writes one PNG per
    synthetic column plus `heatmap.png` under `datasynthplt/epsilon<epsilon>degree<degree>/`.
    """
    print("Data generated for epsilon " + str(epsilon) + " and degree " + str(degree))

    # Build the Bayesian network / dataset description.
    describer = DataDescriber(category_threshold=threshold_value, null_values=null_values)
    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file=input_data,
        epsilon=epsilon,
        k=degree,
        attribute_to_is_categorical=categorical_attributes,
        attribute_to_is_candidate_key=candidate_keys,
    )
    describer.save_dataset_description_to_file(description_file)

    # Display the Bayesian network.
    print("Bayesian for epsilon " + str(epsilon) + " and degree " + str(degree))
    display_bayesian_network(describer.bayesian_network)

    # Generate the synthetic data and save it.
    # Fix: honour the `gennum` argument instead of silently reading the global
    # `num_tuples_to_generate`.
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(gennum, description_file)
    generator.save_synthetic_data(synthetic_data)

    # Read both datasets and the attribute description back from disk for comparison.
    input_df = pd.read_csv(input_data, skipinitialspace=True)
    synthetic_df = pd.read_csv(synthetic_data)
    attribute_description = read_json_file(description_file)['attribute_description']
    inspector = ModelInspector(input_df, synthetic_df, attribute_description)

    print("Comparison for epsilon " + str(epsilon) + " and degree " + str(degree))

    # One output directory per (epsilon, degree) combination; exist_ok avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    path = os.path.join('datasynthplt', "epsilon" + str(epsilon) + "degree" + str(degree))
    os.makedirs(path, exist_ok=True)

    for attribute in synthetic_df.columns:
        inspector.compare_histograms(attribute)
        _save_current_figure(os.path.join(path, str(attribute) + '.png'))

    inspector.mutual_information_heatmap()
    _save_current_figure(os.path.join(path, 'heatmap.png'))

    return describer
    

In [5]:
# Single baseline run with network degree 1 and the notebook-level settings.
# Left as a bare expression so the cell displays the returned describer.
runDataSynth(
    degree=1,
    epsilon=epsilon,
    gennum=num_tuples_to_generate,
    threshold_value=threshold_value,
    null_values=null_values,
    description_file=description_file,
    synthetic_data=synthetic_data,
)

Data generated for epsilon 1 and degree 1
Adding ROOT RT13V33
Adding attribute RT13V39
Adding attribute RT13V45
Adding attribute RT13V42
Adding attribute RT13V36
Adding attribute RT13V21
Adding attribute RT13V40
Adding attribute RT13V19
Adding attribute RT13V51
Adding attribute RT13V49
Adding attribute RT13V14
Adding attribute RT13V34
Adding attribute RT13V16
Adding attribute RT13V22
Adding attribute RT13V48
Adding attribute RT13V4
Adding attribute RT13V8
Adding attribute RT13V3
Adding attribute RT13V32
Adding attribute RT13V31
Adding attribute RT13V30
Adding attribute RT13V52
Adding attribute RT13V27
Adding attribute RT13V13
Adding attribute RT13V29
Adding attribute RT13V9
Adding attribute RT13V24
Adding attribute RT13V10
Adding attribute RT13V46
Adding attribute RT13V18
Adding attribute RT13V15
Adding attribute RT13V11
Adding attribute RT13V37
Adding attribute RT13V12
Adding attribute RT13V28
Adding attribute RT13V7
Adding attribute RT13V43
Adding attribute RT13V25
Bayesian for epsil

<DataSynthesizer.DataDescriber.DataDescriber at 0x3156107a0>

In [8]:
# Run DataSynth over several epsilon values with the Bayesian-network degree set to 0
# (per the original note, this makes the Bayesian network get generated automatically).
epsilon_vals = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
describers0 = []
# Loop variable is `eps`, not `epsilon`, so the notebook-level `epsilon` setting is not overwritten.
for eps in epsilon_vals:
    describer0 = runDataSynth(0, eps, num_tuples_to_generate, threshold_value, null_values, description_file, synthetic_data)
    # Fix: collect this iteration's describer; the original appended the unrelated name `describer`.
    describers0.append(describer0)

Data generated for epsilon 0.05 and degree 0
Adding ROOT RT13V33
Adding attribute RT13V29
Adding attribute RT13V39
Adding attribute RT13V31
Adding attribute RT13V27
Adding attribute RT13V19
Adding attribute RT13V34
Adding attribute RT13V21
Adding attribute RT13V48
Adding attribute RT13V51
Adding attribute RT13V15
Adding attribute RT13V42
Adding attribute RT13V24
Adding attribute RT13V28
Adding attribute RT13V49
Adding attribute RT13V4
Adding attribute RT13V7
Adding attribute RT13V3
Adding attribute RT13V43
Adding attribute RT13V37
Adding attribute RT13V45
Adding attribute RT13V52
Adding attribute RT13V32
Adding attribute RT13V14
Adding attribute RT13V30
Adding attribute RT13V9
Adding attribute RT13V22
Adding attribute RT13V10
Adding attribute RT13V46
Adding attribute RT13V16
Adding attribute RT13V13
Adding attribute RT13V11
Adding attribute RT13V36
Adding attribute RT13V18
Adding attribute RT13V25
Adding attribute RT13V8
Adding attribute RT13V40
Adding attribute RT13V12
Bayesian for ep

KeyboardInterrupt: 

In [36]:
# Close all matplotlib figures that are still open.
plt.close('all')

NameError: name 'fig' is not defined

In [7]:
# Run DataSynth over several epsilon values with the Bayesian-network degree set to 2.
epsilon_vals = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
describers2 = []
# Loop variable is `eps`, not `epsilon`, so the notebook-level `epsilon` setting is not overwritten
# (the original left it at 1.6 after this cell ran).
for eps in epsilon_vals:
    describer2 = runDataSynth(2, eps, num_tuples_to_generate, threshold_value, null_values, description_file, synthetic_data)
    describers2.append(describer2)
    
    


Data generated for epsilon 0.05 and degree 2
Adding ROOT RT13V33
Adding attribute RT13V29
Adding attribute RT13V39
Adding attribute RT13V31
Adding attribute RT13V27
Adding attribute RT13V19
Adding attribute RT13V34
Adding attribute RT13V21
Adding attribute RT13V48
Adding attribute RT13V51
Adding attribute RT13V15
Adding attribute RT13V42
Adding attribute RT13V24
Adding attribute RT13V28
Adding attribute RT13V49
Adding attribute RT13V4
Adding attribute RT13V7
Adding attribute RT13V3
Adding attribute RT13V43
Adding attribute RT13V37
Adding attribute RT13V45
Adding attribute RT13V52
Adding attribute RT13V32
Adding attribute RT13V14
Adding attribute RT13V30
Adding attribute RT13V9
Adding attribute RT13V22
Adding attribute RT13V10
Adding attribute RT13V46
Adding attribute RT13V16
Adding attribute RT13V13
Adding attribute RT13V11
Adding attribute RT13V36
Adding attribute RT13V18
Adding attribute RT13V25
Adding attribute RT13V8
Adding attribute RT13V40
Adding attribute RT13V12
Bayesian for ep

In [6]:
# Run DataSynth over several epsilon values with the Bayesian-network degree set to 3.
epsilon_vals = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
# Fix: create the result list once, before the loop. The original re-created it on every
# iteration, so only the last describer survived.
describers = []
# Loop variable is `eps`, not `epsilon`, so the notebook-level `epsilon` setting is not overwritten.
for eps in epsilon_vals:
    describer = runDataSynth(3, eps, num_tuples_to_generate, threshold_value, null_values, description_file, synthetic_data)
    describers.append(describer)

Data generated for epsilon 0.05 and degree 3
Adding ROOT RT13V33
Adding attribute RT13V29
Adding attribute RT13V39
Adding attribute RT13V31
Adding attribute RT13V27
Adding attribute RT13V19
Adding attribute RT13V34
Adding attribute RT13V21
Adding attribute RT13V48
Adding attribute RT13V51
Adding attribute RT13V15
Adding attribute RT13V42
Adding attribute RT13V24
Adding attribute RT13V28
Adding attribute RT13V49
Adding attribute RT13V4
Adding attribute RT13V7
Adding attribute RT13V3
Adding attribute RT13V43
Adding attribute RT13V37
Adding attribute RT13V45
Adding attribute RT13V52
Adding attribute RT13V32
Adding attribute RT13V14
Adding attribute RT13V30
Adding attribute RT13V9
Adding attribute RT13V22
Adding attribute RT13V10
Adding attribute RT13V46
Adding attribute RT13V16
Adding attribute RT13V13
Adding attribute RT13V11
Adding attribute RT13V36
Adding attribute RT13V18
Adding attribute RT13V25
Adding attribute RT13V8
Adding attribute RT13V40
Adding attribute RT13V12
Bayesian for ep

In [None]:
# Run DataSynth over several epsilon values with the Bayesian-network degree set to 4.
# (DID NOT RUN DUE TO RUNTIME)
epsilon_vals = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
# Keep the returned describers, as the other sweep cells do.
describers4 = []
# Loop variable is `eps`, not `epsilon`, so the notebook-level `epsilon` setting is not overwritten.
for eps in epsilon_vals:
    describers4.append(
        runDataSynth(4, eps, num_tuples_to_generate, threshold_value, null_values, description_file, synthetic_data)
    )
