# Smartnoise Synth

In [1]:
from lomas_client import Client
from snsynth import Synthesizer # Optional

In [2]:
APP_URL = "http://lomas_server"
USER_NAME = "Dr. Antartica"
DATASET_NAME = "PENGUIN"
client = Client(url=APP_URL, user_name = USER_NAME, dataset_name = DATASET_NAME)

In [3]:
penguin_metadata = client.get_dataset_metadata()
penguin_metadata

{'max_ids': 1,
 'row_privacy': True,
 'censor_dims': False,
 'columns': {'species': {'type': 'string',
   'cardinality': 3,
   'categories': ['Adelie', 'Chinstrap', 'Gentoo']},
  'island': {'type': 'string',
   'cardinality': 3,
   'categories': ['Torgersen', 'Biscoe', 'Dream']},
  'bill_length_mm': {'type': 'float', 'lower': 30.0, 'upper': 65.0},
  'bill_depth_mm': {'type': 'float', 'lower': 13.0, 'upper': 23.0},
  'flipper_length_mm': {'type': 'float', 'lower': 150.0, 'upper': 250.0},
  'body_mass_g': {'type': 'float', 'lower': 2000.0, 'upper': 7000.0},
  'sex': {'type': 'string',
   'cardinality': 2,
   'categories': ['MALE', 'FEMALE']}}}

In [4]:
Synthesizer.list_synthesizers()

['mwem', 'dpctgan', 'patectgan', 'mst', 'pacsynth', 'dpgan', 'pategan', 'aim']

In [81]:
cost = client.estimate_smartnoise_synth_cost(
    synth_name="dpctgan",
    epsilon= 2.0,
    delta = 0.0001,
    select_cols = ["bill_length_mm"],
    mul_matrix = [],
    synth_params = {},
    nullable = True,
    table_transformer_style = "gan",
)
cost

{'epsilon_cost': 2.0, 'delta_cost': 0.0001}

In [82]:
res = client.smartnoise_synth_query(
    synth_name="dpctgan",
    epsilon= 2.0,
    delta = 0.0001,
    select_cols = ["bill_length_mm"],
    mul_matrix = [],
    synth_params = {
        "embedding_dim": 128, 
        "generator_dim": (256, 256), 
        "discriminator_dim": (256, 256),
        "batch_size": 2
    },
    nullable = True,
    table_transformer_style = "gan",
    dummy = True
)
res

ReadTimeout: HTTPConnectionPool(host='lomas_server', port=80): Read timed out. (read timeout=50)

In [49]:
query_json_pums = {
    "dataset_name": "PENGUIN",
    "synth_name": "dpctgan",
    "epsilon": 0.1,
    "delta": 0.00001,
    "synth_params": {},
    "select_cols": [],
    "mul_matrix": [],
    "nullable": True,
    "condition": None,
    "nb_samples": None,
    "table_transformer_style": "gan",
}
query_json

NameError: name 'query_json' is not defined

In [9]:
import pandas as pd
from snsynth import Synthesizer
from snsynth.transform import (
    AnonymizationTransformer,
    BinTransformer,
    ChainTransformer,
    #DateTimeTransformer,
    LabelTransformer,
    MinMaxTransformer,
    OneHotEncoder,
)
from snsynth.transform.table import TableTransformer

from typing import Dict, List, Optional

In [10]:
Synthesizer.list_synthesizers()

['mwem', 'dpctgan', 'patectgan', 'mst', 'pacsynth', 'dpgan', 'pategan', 'aim']

In [18]:
from pydantic import BaseModel, Field
class SmartnoiseSynthModel(BaseModel):
    """Model input for a smarnoise-synth query"""

    dataset_name: str
    synth_name: str
    epsilon: float = 0.0
    delta: float = 0.0
    select_cols: List = []
    synth_params: dict = {}
    mul_matrix: List = []
    nullable: bool = True
    condition: Optional[str] = None
    nb_samples: Optional[int] = None
    table_transformer_style: str = "gan"

In [19]:
from enum import StrEnum
class SSynthSynthesizer(StrEnum):
    """Synthesizer models for smartnoise synth"""

    # Marginal Synthesizer
    AIM = "aim"
    MWEM = "mwem"
    MST = "mst"
    PAC_SYNTH = "pacsynth"

    # Neural Network Synthesizer
    DP_CTGAN = "dpctgan"
    PATE_CTGAN = "patectgan"
    PATE_GAN = "pategan"  # no documentation
    DP_GAN = "dpgan"  # no documentation

    # Hybrid Synthesizer
    QUAIL = "quail"


class SSynthTableTransStyle(StrEnum):
    """Transformer style for smartnoise synth"""

    GAN = "gan"
    CUBE = "cube"


class SSynthColumnType(StrEnum):
    """Type of columns for SmartnoiseSynth transformer pre-processing"""

    PRIVATE_ID = "private_id"
    CATEGORICAL = "categorical"
    CONTINUOUS = "continuous"
    ORDINAL = "ordinal"
    DATETIME = "datetime"


SSYNTH_PRIVATE_COLUMN = "uuid"
SSYNTH_DEFAULT_NB_SAMPLES = 100

In [20]:
def _preprocess_data(
        private_data: pd.DataFrame,
        query_json: dict,
    ) -> pd.DataFrame:
        """
        Preprocess the data based on the query parameters.

        Args:
            private_data (pd.DataFrame): Private data to be preprocessed
            query_json (dict): (SmartnoiseSynthModelCost): JSON request object
                select_cols (List[str]): List of columns to select
                mul_matrix (List): Multiplication matrix for columns aggregations

        Returns:
            pd.DataFrame: Preprocessed private data
        """
        if query_json.select_cols:
            try:
                private_data = private_data[query_json.select_cols]
            except KeyError as e:
                raise ValueError(
                    "Error while selecting provided select_cols: " + str(e)
                ) from e

        if query_json.mul_matrix:
            try:
                np_matrix = np.array(query_json.mul_matrix)
                mul_private_data = private_data.to_numpy().dot(np_matrix.T)
                private_data = pd.DataFrame(mul_private_data)
            except ValueError as e:
                raise ValueError(
                    f"Failed to multiply provided mul_matrix: {(str(e))}"
                ) from e
        return private_data

In [21]:
def _categorize_column(data: dict) -> str:
    """
    Categorize the column based on its metadata.

    Args:
        data (dict): Metadata of the column.

    Returns:
        str: Category of the column.
    """
    match data["type"]:
        case "string" | "boolean":
            return SSynthColumnType.CATEGORICAL
        case "int" | "float":
            if "lower" in data.keys():
                return SSynthColumnType.CONTINUOUS
            if "cardinality" in data.keys():
                return SSynthColumnType.CATEGORICAL
            return SSynthColumnType.ORDINAL
        case "datetime":
            return SSynthColumnType.DATETIME
        case _:
            raise ValueError(
                f"Unknown column type in metadata: {data['type']}"
            )

def _get_column_by_types(
    metadata,
    select_cols: List[str],
) -> Dict[str, List[str]]:
    """
    Sort the column in categories based on their types and metadata

    Args:
        metadata (Metadata): Metadata of the dataset
        select_cols (List[str]): List of columns to select

    Returns:
        Dict[str, List[str]]: Dictionnary of list of columns by categories
    """
    col_categories: Dict[str, List[str]] = {
        SSynthColumnType.CATEGORICAL: [],
        SSynthColumnType.CONTINUOUS: [],
        SSynthColumnType.DATETIME: [],
        SSynthColumnType.ORDINAL: [],
        SSynthColumnType.PRIVATE_ID: [],
    }
    for col_name, data in metadata["columns"].items():
        if select_cols and col_name not in select_cols:
            continue
        
        if "private_id" in data.keys():
            col_categories[SSynthColumnType.PRIVATE_ID].append(col_name)
            continue

        # Sort the column in categories based on their types and metadata
        category = _categorize_column(data)
        col_categories[category].append(col_name)

    return col_categories

def _prepare_data_transformer(
    metadata,
    private_data: pd.DataFrame,
    query_json: dict,
):
    """
    Creates the transformer based on the metadata
    The transformer is used to transform the data before synthesis and then
    reverse the transformation after synthesis.
    See https://docs.smartnoise.org/synth/transforms/index.html for documentation
    See https://github.com/opendp/smartnoise-sdk/blob/main/synth/snsynth/
        transform/type_map.py#L40 for get_transformer() method taken as basis.

    Args:
        metadata (Metadata): Metadata of the dataset
        private_data
        query_json (SmartnoiseSynthModelCost): JSON request object for the query
            select_cols (List[str]): List of columns to select
            nullable (bool): True is the data can have Null values, False otherwise
            table_transformer_style: 'gan' or 'cube'

    Returns:
        table_tranformer (TableTransformer) to pre and post-process the data
    """
    col_categories = _get_column_by_types(metadata, query_json.select_cols)
    style = query_json.table_transformer_style
    nullable = query_json.nullable

    constraints = {}
    for col in col_categories[SSynthColumnType.PRIVATE_ID]:
        constraints[col] = AnonymizationTransformer(SSYNTH_PRIVATE_COLUMN)

    if style == SSynthTableTransStyle.GAN:
        for col in col_categories[SSynthColumnType.CATEGORICAL]:
            constraints[col] = ChainTransformer(
                [LabelTransformer(nullable=nullable), OneHotEncoder()]
            )
        for col in col_categories[SSynthColumnType.CONTINUOUS]:
            constraints[col] = MinMaxTransformer(
                lower=metadata["columns"][col]["lower"],
                upper=metadata["columns"][col]["upper"],
                nullable=nullable,
            )
        for col in col_categories[SSynthColumnType.DATETIME]:
            constraints[col] = ChainTransformer(
                [
                    DateTimeTransformer(),
                    MinMaxTransformer(
                        lower=metadata["columns"][col]["lower"],
                        upper=metadata["columns"][col]["upper"],
                        nullable=nullable
                    ),
                ]
            )
        for col in col_categories[SSynthColumnType.ORDINAL]:
            constraints[col] = ChainTransformer(
                [LabelTransformer(nullable=nullable), OneHotEncoder()]
            )
    else:
        for col in col_categories[SSynthColumnType.CATEGORICAL]:
            constraints[col] = LabelTransformer(nullable=nullable)
        for col in col_categories[SSynthColumnType.CONTINUOUS]:
            constraints[col] = BinTransformer(nullable=nullable)
        for col in col_categories[SSynthColumnType.DATETIME]:
            constraints[col] = ChainTransformer(
                [
                    DateTimeTransformer(),
                    BinTransformer(bins=20, nullable=nullable),
                ]
            )
        for col in col_categories[SSynthColumnType.ORDINAL]:
            constraints[col] = LabelTransformer(nullable=nullable)
    print(constraints)
    return TableTransformer.create(
        data=private_data,
        style=style,
        nullable=nullable,
        constraints=constraints,
    )

## Synthesize data

In [63]:
query_json_pums = {
    "dataset_name": "PENGUIN",
    "synth_name": "dpctgan",
    "epsilon": 0.1,
    "delta": 0.00001,
    "synth_params": {},
    "select_cols": [],
    "mul_matrix": [],
    "nullable": True,
    "condition": None,
    "nb_samples": None,
    "table_transformer_style": "gan",
}
query_json_pums = SmartnoiseSynthModel.model_validate(query_json_pums)

### PUMS

In [23]:
pums_private_data = pd.read_csv("https://raw.githubusercontent.com/opendp/smartnoise-sdk/main/datasets/PUMS.csv")
print(pums_private_data.head())
pums_metadata = {
    "columns": {
        "age": {'type': 'int', 'lower': 18, 'upper': 70},
        "sex": {'type': 'boolean'},
        "educ": {'type': 'int', 'cardinality': 14},
        "race": {'type': 'int', 'cardinality': 6},
        "income": {'type': 'float', 'lower': 0.0, 'upper': 500_000},
        "married": {'type': 'boolean'},
    }
}

   age  sex  educ  race   income  married
0   59    1     9     1      0.0        1
1   31    0     1     3  17000.0        0
2   36    1    11     1      0.0        1
3   54    1    11     1   9100.0        1
4   39    0     5     3  37000.0        0


In [24]:
pums_private_data = _preprocess_data(pums_private_data, query_json_pums)

In [25]:
pums_transformer = _prepare_data_transformer(pums_metadata, pums_private_data, query_json_pums)
pums_transformer

{'sex': <snsynth.transform.chain.ChainTransformer object at 0x7fc01a48fa90>, 'educ': <snsynth.transform.chain.ChainTransformer object at 0x7fc01b142a90>, 'race': <snsynth.transform.chain.ChainTransformer object at 0x7fc01a4943d0>, 'married': <snsynth.transform.chain.ChainTransformer object at 0x7fc01a494a90>, 'age': <snsynth.transform.minmax.MinMaxTransformer object at 0x7fc01a494310>, 'income': <snsynth.transform.minmax.MinMaxTransformer object at 0x7fc01c400110>}


<snsynth.transform.table.TableTransformer at 0x7fc01ac2cc50>

In [26]:
pums_encoded = pums_transformer.fit_transform(pums_private_data, epsilon=0.0)
pums_transformer.odometer.spent

(0.0, 0.0)

In [27]:
model = Synthesizer.create(
    synth=query_json_pums.synth_name,
    epsilon=query_json_pums.epsilon,
    verbose=True,
    **query_json_pums.synth_params,
)

In [28]:
model.fit(
    data=pums_private_data,
    transformer=pums_transformer, #pums_transformer,
    preprocessor_eps=0.0, #0.0,  # will error if not 0.
    #nullable=query_json_pums.nullable,
)

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


Epoch 1, Loss G: 0.6392, Loss D: 1.3874
epsilon is 0.08429801659035999, alpha is 63.0


In [29]:
model

<snsynth.pytorch.nn.dpctgan.DPCTGAN at 0x7fc01a3941d0>

In [30]:
samples = model.sample(5)
print(samples)

         age  sex  educ  race         income  married
0  40.995024    1    12     4  292920.205742        0
1  28.858799    0    13     6  190039.295703        0
2  43.161426    0    10     6  125412.203372        1
3  29.997472    1     6     2  155969.545245        0
4  40.092684    1    11     3  211686.365306        1


### Penguins

In [35]:
penguin_private_data = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv", index_col=None)
print(penguin_private_data.head())
penguin_metadata = {
 'max_ids': 1,
 'row_privacy': True,
 'censor_dims': False,
 'columns': {
   'species': {'type': 'string', 'cardinality': 3,'categories': ['Adelie', 'Chinstrap', 'Gentoo']},
   'island': {'type': 'string', 'cardinality': 3, 'categories': ['Torgersen', 'Biscoe', 'Dream']},
   'bill_length_mm': {'type': 'float', 'lower': 30.0, 'upper': 65.0},
   'bill_depth_mm': {'type': 'float', 'lower': 13.0, 'upper': 23.0},
   'flipper_length_mm': {'type': 'float', 'lower': 150.0, 'upper': 250.0},
   'body_mass_g': {'type': 'float', 'lower': 2000.0, 'upper': 7000.0},
   'sex': {'type': 'string', 'cardinality': 2,'categories': ['MALE', 'FEMALE']}
  }
}

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    MALE  
1       3800.0  FEMALE  
2       3250.0  FEMALE  
3          NaN     NaN  
4       3450.0  FEMALE  


In [64]:
query_json_penguin = {
    "dataset_name": "PENGUIN",
    "synth_name": "dpctgan",
    "epsilon": 0.1,
    "delta": 0.00001,
    "select_cols": ["bill_length_mm"],
    "synth_params": {
        "embedding_dim": 128, 
        "generator_dim": (256, 256), 
        "discriminator_dim": (256, 256),
        "batch_size": 50
    },
    "mul_matrix": [],
    "nullable": True,
    "table_transformer_style": "gan",
}
query_json_penguin = SmartnoiseSynthModel.model_validate(query_json_penguin)

In [66]:
model = Synthesizer.create(
    synth=query_json_penguin.synth_name,
    epsilon=query_json_penguin.epsilon,
    verbose=True,
    **query_json_penguin.synth_params,
)
model

<snsynth.pytorch.nn.dpctgan.DPCTGAN at 0x7fbf6ac208d0>

In [44]:
penguin_private_data = _preprocess_data(penguin_private_data, query_json_penguin)

In [45]:
penguin_transformer = _prepare_data_transformer(penguin_metadata, penguin_private_data, query_json_penguin)
penguin_transformer.transformers

{'bill_length_mm': <snsynth.transform.minmax.MinMaxTransformer object at 0x7fbf610ddcd0>}


[<snsynth.transform.minmax.MinMaxTransformer at 0x7fbf610ddcd0>]

In [46]:
penguin_encoded = penguin_transformer.fit_transform(penguin_private_data, epsilon=0.0)
penguin_transformer.odometer.spent

(0.0, 0.0)

In [47]:
model = Synthesizer.create(
    synth=query_json_penguin.synth_name,
    epsilon=query_json_penguin.epsilon,
    **query_json_penguin.synth_params,
)

In [48]:
model.fit(
    data=penguin_private_data,
    transformer=penguin_transformer,
    preprocessor_eps=0.0, #0.0,  # will error if not 0.
    nullable=query_json_penguin.nullable,
)

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


Epoch 1, Loss G: 0.6890, Loss D: 1.4445
epsilon is 0.0584807515514389, alpha is 63.0


In [42]:
samples = model.sample(5)
print(samples)

   bill_length_mm
0       46.495817
1       52.195935
2       36.965119
3       40.554339
4       48.738249


## Serialise

In [45]:
model

<snsynth.pytorch.nn.dpctgan.DPCTGAN at 0x7f9962742590>

In [57]:
import pickle
p_obj = pickle.dumps(model)

In [59]:
d_p_model = pickle.loads(p_obj)

In [60]:
samples = d_p_model.sample(5)
print(samples)

   bill_length_mm
0       46.956818
1       45.232080
2       55.124942
3       50.191850
4       57.324367


In [63]:
import json
j_obj = json.dumps(model)

TypeError: Object of type DPCTGAN is not JSON serializable

In [46]:
import base64
import json
from base64 import b64encode
pickled_model = b64encode(pickle.dumps(model))
depickled_model = base64.b64decode(pickled_model)
json_model = json.loads(depickled_model)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [None]:
samples = json_model.sample(100)
print(samples)