Author: Dr C Joshi

#### This notebook is based on the work DataSynthesizer: Privacy-Preserving Synthetic Datasets
https://faculty.washington.edu/billhowe/publications/pdfs/ping17datasynthesizer.pdf

The numerical code presented in this notebook is heavily based on the open-source code available here:
https://github.com/DataResponsibly/DataSynthesizer

In [None]:
# The DataSynthesizer directory holds the three high-level parent modules:
#   * DataDescriber  - investigates the data types, correlations and
#                      distributions of the attributes in the private dataset
#                      and produces a data summary, adding noise to the
#                      distributions to preserve privacy.
#   * DataGenerator  - samples from the summary computed by DataDescriber and
#                      outputs synthetic data.
#   * ModelInspector - shows an intuitive description of that summary so the
#                      data owner can evaluate the accuracy of the
#                      summarization and adjust any parameters, if desired.
import os
import sys

# Make the modules inside ./DataSynthesizer/ importable from this notebook.
synthesizer_dir = os.getcwd() + '/DataSynthesizer/'
sys.path.append(synthesizer_dir)

In [None]:
from DataDescriber import DataDescriber
from DataGenerator import DataGenerator
from ModelInspector import ModelInspector
from lib.utils import read_json_file, display_bayesian_network
from lib.utils import pairwise_attributes_mutual_information
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
%matplotlib inline

In [None]:
# As noted in the original article above, DataSynthesizer can operate in one
# of three modes:
#   * correlated attribute mode  - learn a differentially private Bayesian
#     network capturing the correlation structure between attributes, then
#     draw samples from this model to construct the result dataset.
#   * independent attribute mode - for cases where the correlated mode is too
#     computationally expensive or there is insufficient data to derive a
#     reasonable model: a histogram is derived for each attribute, noise is
#     added to it to achieve differential privacy, and samples are then drawn
#     for each attribute.
#   * random mode                - for extremely sensitive data: simply
#     generates type-consistent random values for each attribute.

# Short keys the user may type, mapped to the full mode names used below.
mode_dict = {
    "rand": 'random_mode',
    "inde": 'independent_attribute_mode',
    "corr": 'correlated_attribute_mode',
}

mode_key = input('Please choose a mode: ').strip()

# Fail once, with an actionable message, instead of printing a warning and
# then hitting a bare KeyError on the lookup below.
if mode_key not in mode_dict:
    raise ValueError(
        "The mode {!r} does not exist; choose one of: {}".format(
            mode_key, ', '.join(mode_dict)
        )
    )

mode = mode_dict[mode_key]
print("Chosen mode of synthetic data generation pipeline is ", mode)

In [None]:

    
# Remove the output directory of the chosen mode, if one is already present.
stale_output_dir = f'./out/{mode}/'
if os.path.isdir(stale_output_dir):
    shutil.rmtree(stale_output_dir)

In [None]:
# Location of the training dataset.
input_data = './data/dataset.csv'

# Everything produced for the chosen mode lives under its own directory;
# create it (and any missing parents) before anything is written there.
output_dir = f'./out/{mode}/'
Path(output_dir).mkdir(parents=True, exist_ok=True)

# The two output files: the dataset description and the synthetic data.
# (The spelling of the CSV file name is kept exactly as it was.)
description_file = f'{output_dir}description.json'
synthetic_data = f'{output_dir}sythetic_data.csv'

In [None]:
# Raw data can have lots of nuances, so one has to investigate different
# techniques to make sure it is ready for training the model.

# Load the training dataset, then show its column dtypes followed by its
# (rows, columns) shape.
df = pd.read_csv(input_data)
print(df.dtypes, df.shape, sep='\n')

In [None]:
# Different column selections can be tried here, e.g. keeping only the
# chosen datatypes:
#df=df.select_dtypes(include='int64')

# All variables are used, but some columns might be dirty.

# Keep only the columns that hold at least one value different from zero.
has_nonzero_value = df.ne(0).any(axis=0)
df = df.loc[:, has_nonzero_value]
print('after removing all zero columns, the number of entries are {}'.format(df.shape))

# Drop the columns in which every value is NaN; one could try something a
# little more clever here too.
df = df.dropna(how='all', axis=1)
print('after removing nans, the number of entries are {}'.format(df.shape))



In [None]:
#df=df.loc[:,(df.sum(axis=0)==0)]


# Correlated attribute mode is expensive to compute, so it is run on a random
# subset of at most 8 columns. Capping the sample size at the number of
# available columns keeps narrow datasets from raising in DataFrame.sample.
if mode == 'correlated_attribute_mode':
    df = df.sample(n=min(8, df.shape[1]), axis=1)

# Save the clean(er) dataset in the data directory with a chosen name.
input_data = './data/dataset_short.csv'

# index=False: without it the row index is written as an extra unnamed
# column, which is then read back from the CSV as if it were a real attribute
# of the dataset.
df.to_csv(input_data, index=False)

In [None]:
# The domain of an attribute is the set of its legal values. 
# The data type is an important ingredient of the attribute
# domain. DataSynthesizer supports four data types. The 
# system allows users to explicitly specify attribute 
# data types. If an attribute data type is not specified by the user,
# it is inferred by the DataDescriber. For each attribute, 
# DataDescriber first detects whether it is numerical, and if so — whether
# it is an integer or a float. If the attribute is non-numerical,
# DataDescriber attempts to parse it as datetime. Any attribute 
# that is neither numerical nor datetime is considered a string.



In [None]:

# An attribute is categorical if its domain size is less than this threshold.
# DataSynthesizer allows users to specify a data type, and to state whether an
# attribute is categorical, overriding the defaults on a per-attribute basis.
# For more details refer to the paper above.
# The threshold can easily be modified to adapt to the user's needs.
threshold_value = 30

# A parameter in Differential Privacy. It roughly means that removing a row in
# the input dataset will not change the probability of getting the same output
# by more than a multiplicative difference of exp(epsilon).
# Increase epsilon to reduce the injected noise. Set epsilon=0 to turn off
# differential privacy.
# DP has a massive role when generating synthetic data in the correlated mode.
epsilon = 0.1

# The maximum number of parents in the Bayesian network, i.e. the maximum
# number of incoming edges. The larger this number, the more computationally
# intensive the calculations.
degree_of_bayesian_network = 1

# Number of tuples generated in the synthetic dataset: a quarter of the input
# rows here, but it can be set to any other number as well.
# Integer floor division replaces `np.int(...)`: the `np.int` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
num_tuples_to_generate = df.shape[0] // 4
# It can be set to any other number as well.


In [None]:
## Initiate the DataDescriber module


In [None]:
describer = DataDescriber(category_threshold=threshold_value)

# Forward the privacy / network settings configured earlier in the notebook;
# previously only the dataset path was passed, so changing `epsilon` or
# `degree_of_bayesian_network` had no effect on the description.
# NOTE(review): assumes the upstream DataSynthesizer signatures, where the
# independent and correlated describers accept `epsilon`, only the correlated
# one accepts `k`, and the random one accepts neither - confirm against the
# vendored copy in ./DataSynthesizer/.
describe_kwargs = {'dataset_file': input_data}
if mode in ('independent_attribute_mode', 'correlated_attribute_mode'):
    describe_kwargs['epsilon'] = epsilon
if mode == 'correlated_attribute_mode':
    describe_kwargs['k'] = degree_of_bayesian_network

# The describer method is selected by the name of the chosen mode.
getattr(describer, f'describe_dataset_in_{mode}')(**describe_kwargs)

In [None]:

# Write the dataset description computed by the describer to its JSON file.
describer.save_dataset_description_to_file(description_file)



In [None]:
#display_bayesian_network: use when using the correlated attribute
#display_bayesian_network(getattr(describer, 'bayesian_network'))

In [None]:
## Initiate the DataGenerator module

In [None]:
# Generate the synthetic data from the saved description and write it to disk.
generator = DataGenerator()

# The generator method is selected by the name of the chosen mode.
generate_method_name = f'generate_dataset_in_{mode}'

# NOTE(review): upstream DataSynthesizer names the independent-mode generator
# `generate_dataset_in_independent_mode` (no "_attribute"), so the name built
# from the mode may not exist on the generator. Fall back to the upstream name
# only in that case; confirm against the vendored copy in ./DataSynthesizer/.
if (mode == 'independent_attribute_mode'
        and not hasattr(generator, generate_method_name)):
    generate_method_name = 'generate_dataset_in_independent_mode'

getattr(generator, generate_method_name)(num_tuples_to_generate, description_file)

generator.save_synthetic_data(synthetic_data)

In [None]:
# Load the input dataset and the synthetic dataset with pandas.
synthetic_df = pd.read_csv(synthetic_data)
input_df = pd.read_csv(input_data, skipinitialspace=True)

# Pull the per-attribute description out of the dataset description file.
dataset_description = read_json_file(description_file)
attribute_description = dataset_description['attribute_description']

# The inspector is built from both datasets and the attribute description.
inspector = ModelInspector(input_df, synthetic_df, attribute_description)

In [None]:

# Report the (rows, columns) shape of the real and the synthetic data.
actual_shape = input_df.shape
synthetic_shape = synthetic_df.shape
print('shape of the actual data {}'.format(actual_shape))
print('shape of the synthetic data {}'.format(synthetic_shape))


In [None]:
# Reduce the number of columns used in the comparisons (for visual aid only).

# DO TWEAK THIS BLOCK DEPENDING ON THE SIZE OF YOUR DATASET AND CHOSEN
# MODE OF SYNTHETIC DATA GENERATION

# Positions of every 15th column of the input data, starting from the first.
chosen_colmns = list(range(0, input_df.shape[1], 15))

if mode != 'correlated_attribute_mode':
    # Keep only the chosen column positions, in both datasets.
    input_df_few_cols = input_df.iloc[:, chosen_colmns]
    synthetic_df_few_cols = synthetic_df.iloc[:, chosen_colmns]
else:
    # In correlated attribute mode both datasets are used in full.
    input_df_few_cols = input_df
    synthetic_df_few_cols = synthetic_df

print('shape of the reduced input data {}'.format(input_df_few_cols.shape))
print('shape of the reduced synthetic data {}'.format(synthetic_df_few_cols.shape))

In [None]:
# Run the inspector's histogram comparison for every column kept in the
# reduced synthetic dataset.
for column_name in synthetic_df_few_cols.columns:
    inspector.compare_histograms(column_name)

In [None]:



def _label_heatmap_axes(ax, labels):
    """Put one tick per label on both axes of *ax* and tilt the x labels."""
    tick_positions = range(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor')


# Pairwise mutual information of the real and of the synthetic columns.
private_mi = pairwise_attributes_mutual_information(input_df_few_cols)
synthetic_mi = pairwise_attributes_mutual_information(synthetic_df_few_cols)

# Two side-by-side heat maps: real data on the left, synthetic on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 10))
fig.suptitle('Pairwise Mutual Information Comparison (Real vs Synthetic) using {}'.format(mode), fontsize=20)

im1 = ax1.matshow(private_mi, cmap="YlGnBu")
im2 = ax2.matshow(synthetic_mi, cmap="YlGnBu")

# Formatting for heat map 1.
columns1 = list(private_mi.columns)
_label_heatmap_axes(ax1, columns1)

# Formatting for heat map 2.
columns2 = list(synthetic_mi.columns)
_label_heatmap_axes(ax2, columns2)

# The layout is tightened first; one colour bar per heat map is added after.
fig.tight_layout()
fig.colorbar(im1, fraction=0.045, pad=0.05, ax=ax1)
fig.colorbar(im2, fraction=0.045, pad=0.05, ax=ax2)


