# Example: Generating Complex Tabular Data
In this notebook we will generate datasets of arbitrary sizes and patterns by chaining multiple synthetic data generators.

# Environment

## Library Imports

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

## Jupyter-specific Imports and Settings

In [2]:
# set printing options
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

get_ipython().run_line_magic('autoreload', '2')
from importlib import reload

## Library Imports

In [3]:
import synthesis.bayes_synthesis
from synthesis.hist_synthesis import HistSynthesizer
from synthesis.bayes_synthesis import PrivBayes, PrivBayesFix

# Load Data
Let's load an example dataset to show how we can generate a synthetic version of it.

Since we're hoping to use these algorithms to generate synthetic versions of the Netherlands Cancer Registry (NCR), we aim to use a dataset that has similar features that we would like to preserve, namely:
- Mix of categorical and continuous data
- Dataset contains >10k records and >10 columns
- Date Sequences: 2 or more dates that should occur in a certain order, for example date of birth - date of incidence - date of death
- Max one record per individual: one record per patient or tumor (in the latter we exclude multiple tumors) - this will make it easier to develop differentially private algorithms, which measure the sensitivity when one person is removed

There are very few publicly available health data resources that contain all these features. However, we found a dataset from a slightly different domain, i.e. Crimes in Chicago, that does meet most of these requirements. The only caveat is that we have to assume that each record belongs to a unique individual, since we do not have a way to identify which crimes are committed by the same person.

Source: https://www.kaggle.com/currie32/crimes-in-chicago

In [4]:
df_crimes = pd.read_csv('../data/input/chicago_crimes_2012_2017.csv')
df_crimes.head()

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,3,10508693,HZ250496,05/03/2016 11:40:00 PM,013XX S SAWYER AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,1022,10.0,24.0,29.0,08B,1154907.0,1893681.0,2016,05/10/2016 03:56:50 PM,41.864073,-87.706819,"(41.864073157, -87.706818608)"
1,89,10508695,HZ250409,05/03/2016 09:40:00 PM,061XX S DREXEL AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,313,3.0,20.0,42.0,08B,1183066.0,1864330.0,2016,05/10/2016 03:56:50 PM,41.782922,-87.604363,"(41.782921527, -87.60436317)"
2,197,10508697,HZ250503,05/03/2016 11:31:00 PM,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,False,1524,15.0,37.0,25.0,24,1140789.0,1904819.0,2016,05/10/2016 03:56:50 PM,41.894908,-87.758372,"(41.894908283, -87.758371958)"
3,673,10508698,HZ250424,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIMPLE,SIDEWALK,False,False,1532,15.0,28.0,25.0,08B,1143223.0,1901475.0,2016,05/10/2016 03:56:50 PM,41.885687,-87.749516,"(41.885686845, -87.749515983)"
4,911,10508699,HZ250455,05/03/2016 10:00:00 PM,003XX N LOTUS AVE,820,THEFT,$500 AND UNDER,RESIDENCE,False,True,1523,15.0,28.0,25.0,06,1139890.0,1901675.0,2016,05/10/2016 03:56:50 PM,41.886297,-87.761751,"(41.886297242, -87.761750709)"


In [61]:
df_crimes.shape

(1456714, 23)

The following call takes a long time to run given the dataset size. We suggest not re-running it and just observing the output.

In [60]:
df_crimes.describe(include='all')

Unnamed: 0.1,Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
count,1456714.0,1456714.0,1456713,1456714,1456714,1456714.0,1456714,1456714,1455056,1456714,1456714,1456714.0,1456713.0,1456700.0,1456674.0,1456714.0,1419631.0,1419631.0,1456714.0,1456714,1419631.0,1419631.0,1419631
unique,,,1456598,582146,32774,365.0,33,342,142,2,2,,,,,26.0,,,,959,,,368286
top,,,HZ140230,01/01/2012 12:01:00 AM,001XX N STATE ST,820.0,THEFT,SIMPLE,STREET,False,False,,,,,6.0,,,,02/04/2016 06:33:39 AM,,,"(41.883500187, -87.627876698)"
freq,,,6,166,3634,136036.0,329460,150600,330471,1079242,1236660,,,,,329460.0,,,,908366,,,2096
mean,3308606.0,9597550.0,,,,,,,,,,1150.644,11.2592,22.87027,37.45632,,1164398.0,1885523.0,2013.897,,41.84147,-87.67224,
std,1235350.0,808350.5,,,,,,,,,,691.6466,6.904691,13.80589,21.44029,,18508.35,34247.75,1.449584,,0.09430126,0.06661726,
min,3.0,20224.0,,,,,,,,,,111.0,1.0,1.0,0.0,,0.0,0.0,2012.0,,36.61945,-91.68657,
25%,2698636.0,9002709.0,,,,,,,,,,613.0,6.0,10.0,23.0,,1152544.0,1858762.0,2013.0,,41.76787,-87.71528,
50%,3063654.0,9605776.0,,,,,,,,,,1024.0,10.0,23.0,32.0,,1166021.0,1891502.0,2014.0,,41.85797,-87.66613,
75%,3428849.0,10225770.0,,,,,,,,,,1711.0,17.0,34.0,56.0,,1176363.0,1908713.0,2015.0,,41.90529,-87.62813,


For experimentation we'll use a small subset to get fast results.

In [10]:
# NOTE: .loc slices by label and includes the end label, so this keeps the rows
# labelled 0..1000, i.e. 1001 records (not 1000).
df_crimes_sub = df_crimes.loc[:1000, :]

# Synthetic Data Generation
In this section we'll generate data based on the loaded input dataset. We'll define distinct variable clusters and show how we can generate them independently. Afterwards we'll create one multi-synthesis pipeline that combines all these steps in one go.

In [12]:
import synthesis.evaluation.visual
reload(synthesis.bayes_synthesis)
reload(synthesis.evaluation.visual)
from synthesis.evaluation.visual import compare_synthetic_data
from synthesis.bayes_synthesis import PrivBayes, PrivBayesFix, NodeParentPair


<module 'synthesis.bayes_synthesis' from 'C:\\Users\\dkn1904.51564\\DataScience\\Projects\\synthetic_data_generation\\synthetic_data_generation\\synthesis\\bayes_synthesis.py'>

<module 'synthesis.evaluation.visual' from 'C:\\Users\\dkn1904.51564\\DataScience\\Projects\\synthetic_data_generation\\synthetic_data_generation\\synthesis\\evaluation\\visual.py'>

## Variable Clustering
Since the full dataset will likely be too large to synthesize in one go, we'll group variables together for which we would like to retain strong statistical relations.

In [6]:
# Variable clusters that are each synthesized by their own model.
# Some of these location columns can likely be inferred from the others and
# would then not need to be generated, which helps keep them consistent.
# NOTE(review): 'Location' is listed here and also in `latlang` below, which is
# described as not being generated - confirm which one is intended.
crime_location = ['IUCR', 'Block', 'Location', 'District', 'Beat', 'Ward', 'Community Area']
# IUCR appears in both clusters and acts as the bridging variable between them.
crime_details = ['IUCR', 'Location Description', 'Arrest', 'Domestic', 'FBI Code']
# Date columns that are handled together as one sequence.
date_sequence = ['Date', 'Updated On']



# The following variable groups won't be generated by a synthesizer:
identifiers = ['ID', 'Case Number']  # no statistical pattern to preserve
# Cardinality is too high and the values can be inferred from Block; we likely
# don't want to publish exact locations anyway.
latlang = ['Latitude', 'Longitude', 'Location', 'X Coordinate', 'Y Coordinate']
# Columns (values) that can be inferred from another column (key).
infer_columns = {
    'IUCR': ['Primary Type', 'Description'],  # IUCR is a unique code that combines both columns
    'Date': ['Year']
}



In [16]:
import synthesis.tools.utils
import synthesis.tools.dp_utils
from synthesis.tools.dp_utils import dp_conditional_distribution

In [None]:
dp_conditional_distribution(df_crimes_sub[['IUCR', 'Community Area', 'Beat']], conditioned_variables='Beat')

In [24]:
columns = ['IUCR', 'Beat']
X = df_crimes_sub[columns]
counts = X.fillna('nan').groupby(columns).size().astype(float)


In [26]:
from itertools import product

# Build the full value space of the grouped columns: every combination of the
# observed level values, including combinations that never occur in the data.
# MultiIndex.from_product builds this directly from the levels, without first
# materializing every tuple in Python.
full_space_index = pd.MultiIndex.from_product(counts.index.levels,
                                              names=counts.index.names)
print(full_space_index)

MultiIndex([('0110',  111),
            ('0110',  112),
            ('0110',  121),
            ('0110',  122),
            ('0110',  123),
            ('0110',  124),
            ('0110',  131),
            ('0110',  132),
            ('0110',  133),
            ('0110',  211),
            ...
            ('5111', 2515),
            ('5111', 2521),
            ('5111', 2523),
            ('5111', 2524),
            ('5111', 2525),
            ('5111', 2531),
            ('5111', 2532),
            ('5111', 2533),
            ('5111', 2534),
            ('5111', 2535)],
           names=['IUCR', 'Beat'], length=30996)


In [28]:
# Expand the observed counts to the full value space; combinations that never
# occur get a count of 0. reindex does this in one vectorized step instead of
# calling Python's max() once per index entry.
contingency_table_ = counts.reindex(full_space_index, fill_value=0.0)
contingency_table_

IUCR  Beat
0110  111     0.0
      112     0.0
      121     0.0
      122     0.0
      123     0.0
             ... 
5111  2531    0.0
      2532    0.0
      2533    0.0
      2534    0.0
      2535    0.0
Length: 30996, dtype: float64

# Crime Location

In [7]:
# Total privacy budget for the notebook.
# NOTE(review): epsilon = inf presumably means no differential-privacy noise is
# added - confirm against the PrivBayes implementation.
epsilon = float(np.inf)
# Budget handed to each individual synthesizer. Later cells reference
# `local_epsilon` but it was never defined; their recorded outputs show
# epsilon=inf, so it is set equal to the total budget here.
local_epsilon = epsilon
epsilon

inf

In [13]:
pb_location = PrivBayes(degree_network=2, epsilon=epsilon)
pb_location.fit(df_crimes_sub[crime_location])

1/7 - Degree of network (k): 2

Root of network: IUCR

2/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 6
Selected node: Community Area - with parents: ('IUCR',)

3/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 5
Selected node: Beat - with parents: ('IUCR', 'Community Area')

4/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 12
Selected node: District - with parents: ('Community Area', 'Beat')

5/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 18
Selected node: Block - with parents: ('IUCR', 'District')

6/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 20
Selected node: Ward - with parents: ('IUCR', 'Beat')

7/7 - Evaluating next node to add to network
Number of NodeParentPair candidates: 15
Selected node: Location - with parents: ('Block', 'IUCR')

Learned Network Structure

Learning conditional probabilities: Beat - with p

KeyboardInterrupt: 

# Crime Details

In [50]:
pb = PrivBayes(degree_network=2, epsilon =local_epsilon)
pb.fit(df_crimes_sub[crime_details])

1/5 - Degree of network (k): 2

Root of network: FBI Code

2/5 - Evaluating next node to add to network
Sampled node: IUCR - with parents: ('FBI Code',)

3/5 - Evaluating next node to add to network
Sampled node: Arrest - with parents: ('IUCR', 'FBI Code')

4/5 - Evaluating next node to add to network
Sampled node: Location Description - with parents: ('Arrest', 'IUCR')

5/5 - Evaluating next node to add to network
Sampled node: Domestic - with parents: ('FBI Code', 'Location Description')

Learned Network Structure

Learning conditional probabilities: Arrest - with parents ('IUCR', 'FBI Code')
Learning conditional probabilities: Location Description - with parents ('Arrest', 'IUCR')
Learning conditional probabilities: Domestic - with parents ('FBI Code', 'Location Description')
Learning conditional probabilities: IUCR - with parents ('FBI Code',)
Learning conditional probabilities: FBI Code - with parents None


PrivBayes(epsilon=inf, theta_usefulness=None)

In [40]:
df_crimes_sub.shape

(1001, 23)

In [51]:
df_synth_details = pb.transform(df_crimes_sub[crime_details])
df_synth_details.head()

Number of records generated: 1001 / 1001
 Synthetic Data Generated


Unnamed: 0,IUCR,Location Description,Arrest,Domestic,FBI Code
0,486,APARTMENT,True,True,08B
1,1152,ATM (AUTOMATIC TELLER MACHINE),False,False,11
2,486,RESIDENCE,False,True,08B
3,2014,SIDEWALK,True,False,18
4,820,STREET,False,False,06


In [69]:
df_synth.head()

Unnamed: 0,age,sex,education,workclass,income,marital-status,relationship,native-country,race
0,29,Male,Some-college,Private,<=50K,Divorced,Not-in-family,England,White
1,36,Male,Bachelors,Private,>50K,Married-civ-spouse,Husband,United-States,White
2,41,Male,Assoc-voc,Self-emp-not-inc,<=50K,Never-married,Not-in-family,United-States,White
3,47,Male,Assoc-voc,Private,<=50K,Divorced,Not-in-family,United-States,White
4,23,Female,Assoc-acdm,State-gov,<=50K,Never-married,Own-child,United-States,Black


## Date Sequences

In [79]:
# A submodule must be imported before it can be reloaded; on a fresh kernel
# `synthesis.preprocessing.dates` may not yet be an attribute of `synthesis`.
import synthesis.preprocessing.dates
reload(synthesis.preprocessing.dates)
from synthesis.preprocessing.dates import GeneralizeDateSequence

<module 'synthesis.preprocessing.dates' from 'C:\\Users\\dkn1904.51564\\DataScience\\Projects\\synthetic_data_generation\\synthetic_data_generation\\synthesis\\preprocessing\\dates.py'>

In [56]:
df_crimes_sub[date_sequence].head()

Unnamed: 0,Date,Updated On
0,05/03/2016 11:40:00 PM,05/10/2016 03:56:50 PM
1,05/03/2016 09:40:00 PM,05/10/2016 03:56:50 PM
2,05/03/2016 11:31:00 PM,05/10/2016 03:56:50 PM
3,05/03/2016 10:10:00 PM,05/10/2016 03:56:50 PM
4,05/03/2016 10:00:00 PM,05/10/2016 03:56:50 PM


In [77]:
gds = GeneralizeDateSequence(date_sequence)
gds.fit(df_crimes_sub[date_sequence])
df_generalized_dates = gds.transform(df_crimes_sub[date_sequence])
df_generalized_dates.head()

GeneralizeDateSequence(date_sequence=['Date', 'Updated On'])

Unnamed: 0,Date,Updated On
0,2016-05,6
1,2016-05,6
2,2016-05,6
3,2016-05,6
4,2016-05,6


In [70]:
pb_dates = PrivBayes(local_epsilon).set_network([NodeParentPair('Date', None), NodeParentPair('Updated On', ['Date'])])
pb_dates.fit(df_generalized_dates)
df_generalized_synth_dates = pb_dates.transform(df_generalized_dates)
df_generalized_synth_dates.head()

1/2 - Degree of network (k): 1

1/2 - init node Date - with parents: None
2/2 - init node Updated On - with parents: ['Date']
Learned Network Structure

Learning conditional probabilities: Updated On - with parents ['Date']
Learning conditional probabilities: Date - with parents None


PrivBayes(epsilon=inf, theta_usefulness=None)

Number of records generated: 1001 / 1001
 Synthetic Data Generated


Unnamed: 0,Date,Updated On
0,2016-05,6
1,2016-04,12
2,2016-05,7
3,2015-04,404
4,2016-03,70


In [86]:
df_crimes_sub[date_sequence]['Updated On'].value_counts()

05/11/2016 03:50:55 PM    543
05/12/2016 03:48:29 PM    248
05/10/2016 03:56:50 PM    163
05/11/2016 03:48:18 PM     35
05/12/2016 03:50:15 PM     10
05/23/2016 03:48:54 PM      2
Name: Updated On, dtype: int64

In [80]:
# Map the generalized synthetic dates back to concrete dates.
# (The argument name was previously misspelled, which raised a NameError.)
df_synth_dates = gds.inverse_transform(df_generalized_synth_dates)
df_synth_dates.head()

Unnamed: 0,Date,Updated On
0,2016-05-17,2016-05-23
1,2016-04-04,2016-04-16
2,2016-05-24,2016-05-31
3,2015-04-01,2016-05-09
4,2016-03-27,2016-06-05


In [81]:
compare_synthetic_data(df_crimes_sub[date_sequence], df_synth_dates)

Date
                        real  synthetic
01/01/2012 12:00:00 AM   2.0        NaN
01/01/2014 09:00:00 AM   1.0        NaN
01/01/2015 12:00:00 AM   2.0        NaN
01/01/2016 12:00:00 AM   1.0        NaN
01/01/2016 12:01:00 AM   2.0        NaN
...                      ...        ...
2016-05-27               NaN       16.0
2016-05-28               NaN       23.0
2016-05-29               NaN       18.0
2016-05-30               NaN       24.0
2016-05-31               NaN       16.0

[848 rows x 2 columns]
Updated On
                         real  synthetic
05/10/2016 03:56:50 PM  163.0        NaN
05/11/2016 03:48:18 PM   35.0        NaN
05/11/2016 03:50:55 PM  543.0        NaN
05/12/2016 03:48:29 PM  248.0        NaN
05/12/2016 03:50:15 PM   10.0        NaN
05/23/2016 03:48:54 PM    2.0        NaN
2016-04-13                NaN        2.0
2016-04-15                NaN        1.0
2016-04-16                NaN        1.0
2016-04-17                NaN        1.0
2016-04-18                NaN

## Chaining all these components together in one Pipeline
As we're chaining multiple synthesizers, we need to split our privacy budget 'epsilon'.

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Combine the per-cluster synthesizers into a single transformer; each entry is
# (name, synthesizer, columns it is applied to).
multi_synthesizer = ColumnTransformer(
    [
        ('crime_location', PrivBayes(epsilon=local_epsilon), crime_location)
        # TODO: second synthesizer, currently disabled. When enabling it, add a
        # comma after the entry above; set_network must be called with parentheses.
#         ('crime_details', PrivBayesFix(epsilon=local_epsilon).set_network([NodeParentPair('IUCR', None)]), crime_details)
    ]
)
# NOTE(review): this fits on the full df_crimes, whereas the earlier cells used
# df_crimes_sub - confirm this is intended given the runtime.
df_synth = multi_synthesizer.fit_transform(df_crimes)

Unfortunately, sklearn returns numpy arrays. Naturally we'd like to retain the functionality of pandas, thus we need to map the result back to a DataFrame.

In [9]:
# NOTE(review): synth1_cols and synth2_cols are not defined anywhere in the
# visible notebook - define them (or derive the column list) before running.
combined_cols = synth1_cols + synth2_cols
# Wrap the synthesizer output in a DataFrame with named columns.
df_synth = pd.DataFrame(df_synth, columns=combined_cols)

# Evaluate results

In [10]:
from scipy.spatial.distance import cdist

In [11]:
df_adult_subset = df_adult[combined_cols]

In [16]:
distance = cdist(df_adult_subset['age'].values.reshape(-1,1), df_synth['age'].values.reshape(-1,1), metric='jensenshannon')

In [19]:
df_adult_subset['age'].values.reshape(-1,1).shape

(32561, 1)

In [22]:
import scipy

In [23]:
def jensen_shannon_distance(p, q):
    """
    Compute the Jensen-Shannon distance between two discrete distributions.

    Both inputs are normalized to sum to 1 (along axis 0) before the mixture
    distribution is formed, so raw counts or other unnormalized non-negative
    weights give the same result as scipy.spatial.distance.jensenshannon.

    Parameters
    ----------
    p, q : array-like
        Non-negative weights of the same shape, defined over the same bins.

    Returns
    -------
    float or numpy.ndarray
        Jensen-Shannon distance (natural logarithm): a scalar for 1-D input,
        one value per column for 2-D input.
    """
    # imported here so the function does not rely on a bare `import scipy`
    # having loaded the scipy.stats submodule
    from scipy.stats import entropy

    # convert the inputs into float arrays in case they aren't already
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # normalize first: the mixture must be the average of the two probability
    # distributions, not of the raw (possibly differently scaled) weights
    p = p / p.sum(axis=0, keepdims=True)
    q = q / q.sum(axis=0, keepdims=True)

    # mixture distribution
    m = (p + q) / 2

    # Jensen-Shannon divergence: mean KL divergence of p and q to the mixture
    divergence = (entropy(p, m) + entropy(q, m)) / 2

    # the distance is the square root of the divergence; clamp at 0 to guard
    # against tiny negative values caused by floating-point rounding
    distance = np.sqrt(np.maximum(divergence, 0.0))

    return distance

jensen_shannon_distance(df_adult_subset['age'].astype(int).values.reshape(-1,1), df_synth['age'].astype(int).values.reshape(-1,1))

array([0.17361559])

In [31]:
scipy.spatial.distance.jensenshannon(df_adult_subset['age'].astype(int), df_synth['age'].astype(int))

0.17361558745338193

In [32]:
scipy.special.kl_div(df_adult_subset['age'].astype(int).values.reshape(-1,1), df_synth['age'].astype(int).values.reshape(-1,1)).sum()

155764.36809310576

# old

Call and fit synthesizer object on our data. 

In this example we use a simple Schema Synthesizer. This method models all the values in each column separately and samples on a columnar basis. Thus it does not take into account the relationships that exist between columns.

Optional: set a value of epsilon to obtain a differentially private model of the data.

In [120]:
synthesizer = SchemaSynthesizer(epsilon=1)
synthesizer.fit(df_adult)

fit completed


After our synthesizer has fitted the structure of the original data source, we can now use it to generate a new dataset.

In [121]:
df_adult_synth = synthesizer.generate()

Column sampled: age
Column sampled: workclass
Column sampled: fnlwgt
Column sampled: education
Column sampled: education-num
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: capital-gain
Column sampled: capital-loss
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


We now obtained a new dataset which looks very similar to the original one.

In [122]:
df_adult_synth.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,35,Private,188467,Some-college,8,Never-married,Adm-clerical,Husband,White,Male,0,0,30,United-States,<=50K
1,64,Private,178142,Bachelors,11,Married-civ-spouse,Other-service,Own-child,White,Male,0,0,48,United-States,<=50K
2,47,Private,50122,HS-grad,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,United-States,<=50K
3,49,Self-emp-not-inc,175360,HS-grad,5,Married-civ-spouse,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K
4,52,Private,108183,HS-grad,4,Married-civ-spouse,Exec-managerial,Unmarried,White,Male,0,0,40,United-States,<=50K


## Optional: Other ways we can interact with the Schema_Synthesizer

We could save the model we have created into a json file, so that we can share it with others or resample from it again in the future.

In [123]:
synthesizer.save_model(path='models/adult.json')

Instead of fitting the model on the data again, we can simply load a model we have saved earlier.

Note: we do have to specify the number of records we would like since we cannot infer this from the original dataset.

In [124]:
synthesizer_prefit = SchemaSynthesizer()
synthesizer_prefit.load_model(path='models/adult.json')

df_adult_synth_prefit = synthesizer_prefit.generate(num_records=10000)
df_adult_synth_prefit.head(3)

Column sampled: age
Column sampled: workclass
Column sampled: fnlwgt
Column sampled: education
Column sampled: education-num
Column sampled: marital-status
Column sampled: occupation
Column sampled: relationship
Column sampled: race
Column sampled: sex
Column sampled: capital-gain
Column sampled: capital-loss
Column sampled: hours-per-week
Column sampled: native-country
Column sampled: income


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,42,State-gov,133060,7th-8th,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,3137,0,40,United-States,<=50K
1,37,Federal-gov,111795,HS-grad,10,Never-married,Prof-specialty,Wife,White,Female,0,0,38,United-States,<=50K
2,43,Private,142566,Assoc-acdm,9,Married-spouse-absent,Machine-op-inspct,Not-in-family,White,Male,0,0,10,United-States,<=50K


This also means that one can load an arbitrary schema format directly into the Schema_Synthesizer that does not have to be based on real data. Hence, we can generate data with public information, as long as it follows this structure:

{

    "col_name":{
    
        "column_values": [ list of values ],
        "counts": [ list of counts ],
        "probs": [ probabilities ]
        
        },
    "col_name_2":{
    
        "column_values": [ list of values ],
        "counts": [ list of counts ],
        "probs": [ probabilities ]
        
        },
    ...
    "col_name_n": {
   
        "column_values": [ list of values ],
        "counts": [ list of counts ],
        "probs": [ probabilities ]
      
        }
}


"probs" can be chosen freely if not known; they just need to follow the basic probability rule that their total sum equals 1.

In [135]:
example_schema =  {
    "tumor_icd10": {
        "column_values": ["C18" , "C19", "C20", "C21"],
        "probs": [0.25, 0.25, 0.25, 0.25]
    },
    "gender": {
        "column_values": ["male", "female"],
        "probs": [0.5, 0.5]
    },
    "5year_survival": {
        "column_values": ["0", "1"],
        "probs": [0.7, 0.3]
    }
}

In [138]:
synthesizer_schemabased = SchemaSynthesizer()
synthesizer_schemabased.model_ = example_schema

df_synth_schemabased = synthesizer_schemabased.generate(num_records=10000)
df_synth_schemabased.head()

Column sampled: tumor_icd10
Column sampled: gender
Column sampled: 5year_survival


Unnamed: 0,tumor_icd10,gender,5year_survival
0,C20,female,0
1,C21,female,1
2,C20,male,0
3,C21,female,0
4,C21,male,0


We can inspect whether the marginal distributions are according to our input specifications, note that 5year survival indeed follows a 70/30 distribution.

In [141]:
for c in df_synth_schemabased.columns:
    df_synth_schemabased[c].value_counts()

C21    2544
C18    2490
C19    2487
C20    2479
Name: tumor_icd10, dtype: int64

female    5083
male      4917
Name: gender, dtype: int64

0    7014
1    2986
Name: 5year_survival, dtype: int64