## GSD 


In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import utils
from snsynth import Synthesizer
from snsynth.gsd import GSDSynthesizer
import time

from load_data import load_data
from sklearn.model_selection import train_test_split



For GSD support, please install jax: pip install --upgrade  "jax[cuda11_cudnn82]==0.4.6" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html


In [2]:
# Fixed seed for the train/test split so that the evaluation cells further down
# see the same split after every kernel restart.
SEED = 42

adult_path = 'adult.csv'
datasets = load_data(['adult'])

adult_df = datasets['adult']['data']

target = datasets['adult']['target']
categorical_columns = datasets['adult']['categorical_columns'].split(',')
print(adult_df.columns)
print(categorical_columns)


# Create the per-column config that is later passed to the synthesizer as meta data.
# Note that we know the lower bound of each ordinal feature is 0.
# Let's assume the column upper bound is known (here it is read off the data itself).
ordinal_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt']
continuous_columns = []
config = {}
for c in adult_df.columns:
    if c in categorical_columns:
        config[c] = {'type': 'string'}
    else:
        # Every non-categorical column is described as an int in [0, observed max].
        config[c] = {'type': 'int', 'lower': 0, 'upper': adult_df[c].max()}


# Split into train/test sets for machine learning evaluation.
# random_state pins the shuffle; without it each run produces a different split.
adult_df_train, adult_df_test = train_test_split(adult_df, test_size=0.2, random_state=SEED)

loading downloaded_datasets/adult.csv
Memory consumed by adult:4167808
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'earning-class'],
      dtype='object')
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'earning-class']


# Adult

In [None]:
# TODO (carried over from the original cell): still need to implement.
epsilon = 10.0

# Fit the GSD synthesizer on the training split, using the column config as meta data.
synth = GSDSynthesizer(epsilon, 1e-5, tree_height=13, verbose=True)
synth.fit(adult_df_train, meta_data=config, N_prime=5000)

# Largest absolute gap between the statistics evaluated on synth.data and on synth.sync_data
# (presumably the fitted real data vs. the generated data -- confirm against snsynth).
data_stats = synth.stat_fn(synth.data.to_numpy())
sync_stats = synth.stat_fn(synth.sync_data.to_numpy())
max_error = np.abs(data_stats - sync_stats).max()
print('Statistical error:', max_error)

adult_sync_df = synth.sample()

# Persist the sample; the file name carries epsilon formatted with two decimals.
os.makedirs('downloaded_datasets', exist_ok=True)
sync_csv_path = f'downloaded_datasets/adult_sync_{epsilon:.2f}.csv'
adult_sync_df.to_csv(sync_csv_path)

privacy budgets: First moments = 0.096362. Second moments = 1.686334
Cond.Marginal= ['age'] . Sigma=0.0011. Top.Level=7. Max.Size=None
Cond.Marginal= ['fnlwgt'] . Sigma=0.0015. Top.Level=13. Max.Size=None
Cond.Marginal= ['education-num'] . Sigma=0.0010. Top.Level=5. Max.Size=None
Cond.Marginal= ['capital-gain'] . Sigma=0.0015. Top.Level=13. Max.Size=None
Cond.Marginal= ['capital-loss'] . Sigma=0.0015. Top.Level=13. Max.Size=None
Cond.Marginal= ['hours-per-week'] . Sigma=0.0011. Top.Level=7. Max.Size=None
	Total size=49716
       age.tree_height = 5. Thresholds=58
    fnlwgt.tree_height = 5. Thresholds=54
education-num.tree_height = 4. Thresholds=20
capital-gain.tree_height = 2. Thresholds=6
capital-loss.tree_height = 2. Thresholds=4
hours-per-week.tree_height = 4. Thresholds=25
Cond.Marginal= ['age', 'fnlwgt'] . Sigma=0.0010. Top.Level=5. Max.Size=None
Cond.Marginal= ['age', 'education-num'] . Sigma=0.0010. Top.Level=5. Max.Size=None
Cond.Marginal= ['age', 'capital-gain'] . Sigma=0.001

In [None]:


# adult_sync_df['Type'] = 'Sync'
# adult_df_copy = adult_df.sample(n=5000).copy()
# adult_df_copy['Type'] = 'Real'

# df = pd.concat([adult_sync_df, adult_df_copy])

# """
# Plot subgroups distributions:
# """
# g = sns.FacetGrid(data=df, col='Type',  hue='earning-class', sharey=False)
# g.map(sns.histplot, 'capital-gain')
# g.add_legend()
# plt.show()

In [5]:
# Load a previously saved synthetic dataset; the first CSV column becomes the index.
# NOTE(review): the synthesis cell in this notebook sets epsilon = 10.0 and therefore writes
# 'adult_sync_10.00.csv', whereas this cell reads the '1.00' file -- confirm which run is intended.
data_path = 'downloaded_datasets/adult_sync_1.00.csv'
adult_sync_df = pd.read_csv(filepath_or_buffer=data_path, index_col=0)


In [8]:
# Display the synthetic frame without the helper 'Type' column.
# errors='ignore' keeps this cell from raising KeyError when the loaded CSV has no
# 'Type' column (only the commented-out plotting cell ever adds it).
# The result is only displayed; adult_sync_df itself is not modified.
adult_sync_df.drop(columns=['Type'], errors='ignore')

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earning-class
0,46.0,4,150718.0,9,13.0,2,13,0,4,1,0.0,0.0,40.0,39,0
1,29.0,4,90389.0,2,9.0,4,7,1,4,1,0.0,0.0,40.0,39,0
2,26.0,4,101454.0,0,6.0,4,1,3,4,1,0.0,1.0,40.0,10,0
3,29.0,4,279856.0,9,13.0,4,12,1,4,1,1.0,0.0,48.0,39,0
4,30.0,4,62500.0,9,13.0,2,3,0,4,1,0.0,0.0,40.0,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,37.0,4,225175.0,15,9.0,3,13,4,4,1,0.0,1.0,40.0,39,1
4996,47.0,4,195474.0,11,9.0,5,8,1,4,1,0.0,0.0,40.0,39,0
4997,47.0,4,193871.0,11,9.0,5,12,5,2,0,0.0,230.0,47.0,39,1
4998,47.0,4,160602.0,11,9.0,2,4,0,4,1,0.0,1889.0,44.0,39,0


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.metrics import f1_score

# Column holding the prediction target.
label = 'earning-class'

# Work on copies so the column lists defined earlier are left untouched;
# the label is taken out of the categorical feature list.
numeric_features = list(ordinal_columns)
categorical_features = list(categorical_columns)
categorical_features.remove(label)

# Numeric columns: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encoding (unknown categories ignored), then keep the
# top 50 percent of the encoded features ranked by chi2.
categorical_steps = [
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=50)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by a gradient-boosting classifier with default settings.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier()),
    ]
)

In [13]:
# Display the column names of the training split.
adult_df_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'earning-class'],
      dtype='object')

In [5]:
# Baseline: fit the pipeline on the real training split and score it on the real test split.
feature_columns = categorical_features + numeric_features

X_real, y_real = adult_df_train[feature_columns], adult_df_train[label]
X_test, y_test = adult_df_test[feature_columns], adult_df_test[label]

clf.fit(X_real, y_real)

real_accuracy = clf.score(X_test, y_test)
real_macro_f1 = f1_score(y_test, clf.predict(X_test), average='macro')

print('Train on real:')
print(f"Accuracy={real_accuracy:.4f}")
print(f"F1-score = {real_macro_f1:.4f}")

Train on real:
Accuracy=0.8715
F1-score = 0.8073


In [6]:
# Train on the synthetic data and evaluate on the real test split.
# Note: this re-fits the shared `clf` pipeline in place, so after this cell it no
# longer holds the model trained on the real data.
X_sync = adult_sync_df[categorical_features+numeric_features]
y_sync = adult_sync_df[label]


clf.fit(X_sync, y_sync)

# The header previously said "Train on real:", which mislabelled these results.
print('Train on synthetic:')
print(f"Accuracy={clf.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf.predict(X_test), average='macro'):.4f}")

Train on real:
Accuracy=0.8446
F1-score = 0.7619


# End