In [2]:
# # ----------------------------------------------
# # DIFFERENTIAL PRIVACY
# # ----------------------------------------------

import string
import numpy as np
import pandas as pd 
# !pip install smartnoise-synth
from snsynth import QUAILSynthesizer
from snsynth.pytorch.nn import PATEGAN, PATECTGAN
from snsynth.pytorch import PytorchDPSynthesizer

# !pip install diffprivlib
# from diffprivlib.models import GaussianNB, LinearRegression


In [3]:
# Load the prepared dataset (the CSV lives in datasets/).
dataframe = pd.read_csv("final_data.csv", index_col=None)

# Feature frame: drop both candidate target columns and PUMA.
df = dataframe.drop(columns=['LABEL', 'INCOME_PRED', "PUMA"])

# Plain NumPy view of the feature frame.
nf = df.to_numpy()

# Active target; the alternative is kept for reference.
# target_column = 'INCOME_PRED'
target_column = 'LABEL'


In [4]:
def discritize_data(df):
  """Replace raw numeric codes/measures with named category labels.

  Every column listed below is bucketed with ``pd.cut`` (RAC1P uses a plain
  membership test instead) and the labelled result is stored under the same
  column name.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns POVPIP, AGEP, CIT, COW, JWMNP, RELP, SCHL,
      WKHP, OCCP, POBP, RAC1P, DREM, MIG, MIL, ESP and ESR. Any other column
      is passed through unchanged.

  Returns
  -------
  pandas.DataFrame
      A new frame holding the discretized columns. The input frame is left
      untouched, so the function can be re-run on the same raw data.

  Notes
  -----
  * ``pd.cut`` is used with its default right-closed bins, so a value equal
    to the lowest edge (e.g. 0) or above the highest edge matches no bin and
    comes back as NaN.
  * The equal-width edges for JWMNP, RELP and WKHP are derived from the
    column maximum and truncated to integers (``dtype=int``); a very small
    maximum can therefore yield duplicate edges, which ``pd.cut`` rejects.
  """
  # Work on a copy: assigning into the caller's frame would overwrite the raw
  # codes and make a second call operate on already-labelled columns.
  df = df.copy()

  def _equal_width_edges(column, n_bins):
    # Integer-truncated edges from 0 up to the column maximum.
    return np.linspace(0, df[column].max(), n_bins + 1, dtype=int)

  df['POVPIP'] = pd.cut(
      df['POVPIP'],
      bins=[0, 62.5, 125.0, 187.5, 250, 375, 500],
      labels=["Extreme-Poverty", "In-Poverty", "Borderline-Poverty",
              "Escaped-Poverty", "Not-In-Poverty", "Above-Median"])

  df['AGEP'] = pd.cut(
      df['AGEP'],
      bins=[0, 20, 50, 70, 100],
      labels=['Young', 'Middle-Aged', 'Retirement', 'Old'])

  # Codes 1..5 map one-to-one onto the five labels.
  df['CIT'] = pd.cut(
      df['CIT'],
      bins=range(6),
      labels=["Born", "Outside", "BAAP", "Nat", "Not"])

  # Codes 1..8 map one-to-one onto the letters A..H.
  df['COW'] = pd.cut(
      df['COW'],
      bins=range(9),
      labels=list(string.ascii_uppercase)[:8])

  df['JWMNP'] = pd.cut(
      df['JWMNP'],
      bins=_equal_width_edges('JWMNP', 4),
      labels=['Low', 'Moderate', 'Manageable', 'High'])

  df['RELP'] = pd.cut(
      df['RELP'],
      bins=_equal_width_edges('RELP', 3),
      labels=['Low', 'Moderate', 'High'])

  df['SCHL'] = pd.cut(
      df['SCHL'],
      bins=[0, 16, 21, 24],
      labels=['School', 'College', 'Higher'])

  df['WKHP'] = pd.cut(
      df['WKHP'],
      bins=_equal_width_edges('WKHP', 5),
      labels=['Low', 'Moderate', 'Manageable', 'High', 'Extreme'])

  df['OCCP'] = pd.cut(
      df['OCCP'],
      bins=[0, 490, 750, 960, 1240, 1555, 1980, 2060, 2555, 3550, 3655,
            3960, 4160, 4655, 4950, 6130, 6765, 8990, 9760, 9830, 9920],
      labels=['MGR', 'BUS', 'FIN', 'CMM', 'ENG', 'SCI', 'CMS', 'EDU', 'MED',
              'HLS', 'PRT', 'EAT', 'PRS', 'SAL', 'OFF', 'CON', 'PRD', 'TRN',
              'MIL', 'UNEMP'])

  # The top edge is data-dependent so that every code above 56 lands in the
  # second bin.
  df['POBP'] = pd.cut(
      df['POBP'],
      bins=[0, 56, df['POBP'].max() + 1],
      labels=['US', "International"])

  # Any code outside this set (including missing values) is labelled "Multiple".
  single_race_codes = {1, 2, 3, 4, 6, 7, 8}
  df['RAC1P'] = ["Single" if code in single_race_codes else "Multiple"
                 for code in df['RAC1P']]

  df['DREM'] = pd.cut(
      df['DREM'],
      bins=[0, 1, 2],
      labels=['Yes', "No"])

  df['MIG'] = pd.cut(
      df['MIG'],
      bins=[0, 1, 2, 3],
      labels=['Same', "Outside", "US"])

  df['MIL'] = pd.cut(
      df['MIL'],
      bins=[0, 1, 3, 4],
      labels=['Active', "Yes", "No"])

  # Several bins share a label, which pd.cut only accepts with ordered=False.
  df['ESP'] = pd.cut(
      df['ESP'],
      bins=[0, 1, 3, 4, 5, 6, 7, 8], ordered=False,
      labels=['Both', "Single", "None", "Single", "Other", "Single", "Other"])

  # NOTE(review): "Umemployed" looks like a typo for "Unemployed"; the spelling
  # is kept because it is a runtime label other code may match on.
  df['ESR'] = pd.cut(
      df['ESR'],
      bins=[0, 2, 3, 5, 6], ordered=False,
      labels=['Employed', "Umemployed", "AF", "Umemployed"])

  return df


In [5]:
# Show every column when a wide frame is rendered.
pd.set_option("display.max_columns", None)

# Discretize the feature frame and preview the first rows.
df1 = discritize_data(df)
df1.head()

Unnamed: 0,AGEP,CIT,COW,DEAR,DEYE,DREM,HINS2,JWMNP,MAR,MIG,MIL,RELP,SCHL,SEX,WKHP,ANC,DIS,ESP,ESR,NATIVITY,OCCP,POBP,POVPIP,RAC1P,RACAIAN,RACASN,RACBLK,RACNH,RACPI,RACSOR,RACWHT
0,Middle-Aged,Born,A,2,2,No,2,Moderate,5,US,No,High,College,1,Manageable,1,2,Both,Employed,1,EAT,US,Borderline-Poverty,Single,0,0,0,0,0,0,1
1,Middle-Aged,Born,A,2,2,No,2,Manageable,5,US,No,High,School,1,Manageable,1,2,Both,Employed,1,CON,US,Borderline-Poverty,Single,0,0,1,0,0,0,0
2,Middle-Aged,Born,A,2,2,No,2,Moderate,5,US,No,High,School,1,Manageable,2,2,Both,Employed,1,TRN,US,Extreme-Poverty,Multiple,1,0,0,1,0,0,0
3,Middle-Aged,Born,A,2,2,No,2,Moderate,5,Same,No,High,School,1,Manageable,1,2,Both,Employed,1,EAT,US,Borderline-Poverty,Single,0,0,1,0,0,0,0
4,Middle-Aged,Born,A,2,2,No,2,Moderate,5,Same,No,High,School,1,Manageable,1,2,Both,Employed,1,EAT,US,Borderline-Poverty,Single,0,0,1,0,0,0,0


In [6]:
# Cast each column to str, one at a time, writing the result back into df1.
# NOTE(review): astype(str) presumably renders missing values as the literal
# text 'nan', which would then count as a category of its own - confirm.
for column_name in list(df1.columns):
    df1[column_name] = df1[column_name].astype(str)


In [7]:
# Fit a PATECTGAN (regularization='dragan') wrapped in PytorchDPSynthesizer on
# the stringified frame, declaring every column categorical, then draw a few
# synthetic rows.
# The first positional argument (1.0) is presumably the privacy budget epsilon
# and the third (None) the preprocessor - TODO confirm against the
# smartnoise-synth version in use.
# NOTE(review): the recorded run of this cell ended in a ValueError whose cause
# is not visible here. One thing to check: categorical_columns receives a
# pandas Index rather than a plain list of column names.
synth = PytorchDPSynthesizer(1.0, PATECTGAN(regularization='dragan'), None)
synth.fit(df1, categorical_columns=df1.columns)

sample = synth.sample(10) # synthesize 10 rows
print(sample)


ValueError: ignored

In [None]:
# Inspect the column labels of the discretized frame.
df1.columns

In [None]:
# Inspect the per-column dtypes of the discretized frame.
df1.dtypes

In [None]:
# List the distinct values present in the ESP column of `df`
# (presumably a sanity check on the ESP binning - confirm).
np.unique(df["ESP"])