# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [None]:
# Import libraries
import polars as pl
import opendp.prelude as dp

# Opt in to the "contrib" and "honest-but-curious" OpenDP features used by this notebook
dp.enable_features("contrib", "honest-but-curious")

: 

## 1. Data Domain Preparation
We first need to declare the metadata of the input domain of the LazyFrame.

### Define domain with bounds

In [2]:
# Lower and upper bounds declared for the income column
income_lower_bound = 1_000.0
income_upper_bound = 60_000.0

In [3]:
# Describe every column: six integer columns plus a bounded float "income" column
integer_columns = ["region", "eco_branch", "profession", "education", "age", "sex"]
income_atoms = dp.atom_domain(bounds=(income_lower_bound, income_upper_bound), T=float)

column_domains = [
    dp.series_domain(column, dp.atom_domain(T=int)) for column in integer_columns
]
column_domains.append(dp.series_domain("income", income_atoms))

df_domain = dp.lazyframe_domain(column_domains)
df_domain

LazyFrameDomain(region: i32, eco_branch: i32, profession: i32, education: i32, age: i32, sex: i32, income: f64)

### Add margin metadata
Here, we add the counts that we will need.

In [4]:
# Overall count (no grouping column)
total_counts = pl.LazyFrame(
    {"counts": [2_032_543]},
    schema_overrides={"counts": pl.UInt32},
)

# Counts per sex
sex_counts = pl.LazyFrame(
    {
        "sex": [0, 1],
        "counts": [634_720, 1_397_823],
    },
    schema_overrides={"sex": pl.Int32, "counts": pl.UInt32},
)

# Counts per region
region_counts = pl.LazyFrame(
    {
        "region": [1, 2, 3, 4, 5, 6, 7],
        "counts": [352_001, 474_690, 267_304, 366_879, 284_638, 210_800, 76_231],
    },
    schema_overrides={"region": pl.Int32, "counts": pl.UInt32},
)

# Counts per (sex, region) pair: regions 1-7 for sex 0, then regions 1-7 for sex 1
sex_region_counts = pl.LazyFrame(
    {
        "sex": [0] * 7 + [1] * 7,
        "region": [1, 2, 3, 4, 5, 6, 7] * 2,
        "counts": [
            113_367, 148_265, 83_326, 113_715, 87_668, 64_357, 24_022,     # sex 0
            238_634, 326_425, 183_978, 253_164, 196_970, 146_443, 52_209,  # sex 1
        ],
    },
    schema_overrides={"sex": pl.Int32, "region": pl.Int32, "counts": pl.UInt32},
)

#### Define input domain and input space

In [5]:
# Register each counts frame on the domain, one after the other.
# NOTE: this rebinds `df_domain`, so re-running the cell applies the counts again.
for margin_counts in (total_counts, sex_counts, region_counts, sex_region_counts):
    df_domain = df_domain.with_counts(margin_counts)
df_domain

LazyFrameDomain(region: i32, eco_branch: i32, profession: i32, education: i32, age: i32, sex: i32, income: f64; margins=[{"region", "sex"}, {}, {"sex"}, {"region"}])

## 2. Data scanning

### Dataset input metric

In [6]:
# Dataset metric paired with the domains in the cells below
metric = dp.symmetric_distance()
metric

SymmetricDistance()

### CsvDomain to scan data

In [7]:
# Build a CsvDomain from the LazyFrame domain declared above
csv_domain = dp.csv_domain(df_domain)
csv_domain

CsvDomain(region: i32, eco_branch: i32, profession: i32, education: i32, age: i32, sex: i32, income: f64)

In [8]:
# Declare the Transformation that scans a CSV; it is called with a file path in the next cell
csv_scanner = (csv_domain, metric) >> dp.t.then_scan_csv()
csv_scanner

<opendp.mod.Transformation at 0x7f52bc0567c0>

In [9]:
# Relative path to the CSV file holding the income data
DATA_PATH = "../data/income_synthetic_data.csv"
# Apply the scanner to the path to get the CSV data as a LazyFrame
data = csv_scanner(DATA_PATH)
data

In [10]:
# The scanner's output domain (a LazyFrameDomain) is the input domain of the analyses below
lf_domain = csv_scanner.output_domain
lf_domain

LazyFrameDomain(region: i32, eco_branch: i32, profession: i32, education: i32, age: i32, sex: i32, income: f64; margins=[{"region", "sex"}, {}, {"sex"}, {"region"}])

## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

### Compute the mean
With the `make_private_mean_expr` measurement (used here through its `then_private_mean_expr` form)

#### On all rows of the column (select operator)

In [17]:
# Expression domain for the "select" context
expr_domain = dp.expr_domain(lf_domain, context="select")

# Private mean of the income column.
# No explicit clamp step is applied here (unlike the per-group version below);
# presumably the bounds declared on the domain are relied upon - TODO confirm.
income_mean_expr = (
    (expr_domain, metric)
    >> dp.t.then_col(col_name="income")
    >> dp.m.then_private_mean_expr(scale=0.5)
)

mean_lazy_all = (
    (lf_domain, metric)
    >> dp.m.then_private_select(dp.c.make_basic_composition([income_mean_expr]))
    >> dp.t.make_collect(lf_domain, metric)
)

mean_lazy_all(data)

income
f64
7054.439756


#### On partitions (groupby and agg operators)

In [18]:
def partition_mean(lf_domain, metric, active_column, grouping_columns, bounds, scale):
    """Build the pipeline that releases a private mean of one column per group.

    Args:
        lf_domain: LazyFrame domain of the input data.
        metric: Dataset metric paired with ``lf_domain``.
        active_column: Name of the column whose mean is released.
        grouping_columns: Column names to group by.
        bounds: ``(lower, upper)`` pair passed to ``then_clamp_expr``.
        scale: Passed to ``then_private_mean_expr`` as ``scale``.

    Returns:
        The object obtained by chaining the stable groupby, the private
        aggregation and the collect step; call it on the data to run it.
    """
    # Expression-level space: grouped expression domain with the L1 version of the metric
    grouped_space = (
        dp.expr_domain(lf_domain, grouping_columns=grouping_columns),
        dp.l1(metric),
    )

    # column -> clamp to bounds -> private mean
    clamped_mean = (
        grouped_space
        >> dp.t.then_col(col_name=active_column)
        >> dp.t.then_clamp_expr(bounds)
        >> dp.m.then_private_mean_expr(scale=scale)
    )

    # Frame-level stages
    groupby_stage = dp.t.then_groupby_stable(grouping_columns=grouping_columns)
    agg_stage = dp.m.then_private_agg(dp.c.make_basic_composition([clamped_mean]))
    collect_stage = dp.t.make_collect(lf_domain, metric)

    return (lf_domain, metric) >> groupby_stage >> agg_stage >> collect_stage

In [19]:
# One-level partition: private mean income per sex
mean_one_level_partition = partition_mean(
    lf_domain=lf_domain,
    metric=dp.symmetric_distance(),
    active_column="income",
    grouping_columns=["sex"],
    bounds=(income_lower_bound, income_upper_bound),
    scale=0.5,
)
mean_one_level_partition(data)

sex,income
i32,f64
0,6363.060526
1,7366.282274


In [20]:
# Multi-level partition: private mean income per (sex, region) pair
mean_multi_level_partitions = partition_mean(
    lf_domain=lf_domain,
    metric=dp.symmetric_distance(),
    active_column="income",
    grouping_columns=["sex", "region"],
    bounds=(income_lower_bound, income_upper_bound),
    scale=0.5,
)
mean_multi_level_partitions(data).sort(["region", "sex"])

sex,region,income
i32,i32,f64
0,1,6470.466979
1,1,7480.28327
0,2,6240.920249
1,2,7230.395263
0,3,6565.734997
1,3,7578.334403
0,4,6545.059519
1,4,7588.845886
0,5,6208.029746
1,5,7187.171853


### Compute the quantiles

With `make_private_quantile` (used here through its `then_private_quantile` form)

In [40]:
# Candidate values for the quantile: 2000.0 up to 12500.0 in steps of 500.0
candidates = [float(value) for value in range(2_000, 13_000, 500)]
candidates

[2000.0,
 2500.0,
 3000.0,
 3500.0,
 4000.0,
 4500.0,
 5000.0,
 5500.0,
 6000.0,
 6500.0,
 7000.0,
 7500.0,
 8000.0,
 8500.0,
 9000.0,
 9500.0,
 10000.0,
 10500.0,
 11000.0,
 11500.0,
 12000.0,
 12500.0]

#### On all rows of the column (select operator)

In [32]:
# Expression domain for the "select" context
expr_domain = dp.expr_domain(lf_domain, context="select")

# Private median of the income column (alpha = 0.5).
# Stored under its own name so the mean measurement `mean_lazy_all`
# defined earlier is not silently overwritten by a quantile measurement.
median_lazy_all = (lf_domain, metric) >> dp.m.then_private_select(
    dp.c.make_basic_composition([
        (expr_domain, metric)
        >> dp.t.then_col(col_name = "income")
        >> dp.m.then_private_quantile(
            candidates = candidates,
            temperature = 0.1,
            alpha = 0.5
        )
    ])
) >> dp.t.make_collect(lf_domain, metric)

median_lazy_all(data)

income
f64
6500.0


#### On partitions (groupby and agg operators)

In [34]:
def partition_quantiles(domain, metric, active_column, grouping_columns, candidates, temperature, alpha):
    """Build the pipeline that releases a private quantile of one column per group.

    Arguments are forwarded to the OpenDP constructors by keyword, matching
    ``partition_mean`` and the select-context cell, so the call does not
    depend on the positional order of the constructors' parameters.

    Args:
        domain: LazyFrame domain of the input data.
        metric: Dataset metric paired with ``domain``.
        active_column: Name of the column whose quantile is released.
        grouping_columns: Column names to group by.
        candidates: Candidate values, passed to ``then_private_quantile``.
        temperature: Passed to ``then_private_quantile`` as ``temperature``.
        alpha: Passed to ``then_private_quantile`` as ``alpha``
            (the notebook uses 0.5 for the median).

    Returns:
        The object obtained by chaining the stable groupby, the private
        aggregation and the collect step; call it on the data to run it.
    """
    # Groupby Expr domain
    expr_domain = dp.expr_domain(domain, grouping_columns=grouping_columns)

    # Groupby metric
    expr_metric = dp.l1(metric)

    # Create expression
    meas_lazy = (
        (domain, metric)
        >> dp.t.then_groupby_stable(grouping_columns=grouping_columns)
        >> dp.m.then_private_agg(
            dp.c.make_basic_composition(
                [
                    (expr_domain, expr_metric)
                    >> dp.t.then_col(col_name=active_column)
                    >> dp.m.then_private_quantile(
                        candidates=candidates,
                        temperature=temperature,
                        alpha=alpha,
                    )
                ]
            )
        )
        >> dp.t.make_collect(domain, metric)
    )
    return meas_lazy

In [35]:
# One-level partition: private median (alpha = 0.5) of income per sex
q50_one_level_partition = partition_quantiles(
    domain=lf_domain,
    metric=dp.symmetric_distance(),
    active_column="income",
    grouping_columns=["sex"],
    candidates=candidates,
    temperature=1.0,
    alpha=0.5,
)
q50_one_level_partition(data)

sex,income
i32,list[f64]
0,[6000.0]
1,[7000.0]


In [37]:
# One-level partition: private 0.9-quantile (alpha = 0.9) of income per sex
q90_one_level_partition = partition_quantiles(
    domain=lf_domain,
    metric=dp.symmetric_distance(),
    active_column="income",
    grouping_columns=["sex"],
    candidates=candidates,
    temperature=1.0,
    alpha=0.9,
)
q90_one_level_partition(data)

sex,income
i32,list[f64]
0,[9000.0]
1,[10500.0]


In [39]:
# Multi-level partition: private 0.9-quantile (alpha = 0.9) of income per (sex, region) pair
multi_level_partition = partition_quantiles(
    domain=lf_domain,
    metric=dp.symmetric_distance(),
    active_column="income",
    grouping_columns=["sex", "region"],
    candidates=candidates,
    temperature=1.0,
    alpha=0.9,
)
multi_level_partition(data).sort(["region", "sex"])

sex,region,income
i32,i32,list[f64]
0,1,[9500.0]
1,1,[11000.0]
0,2,[9000.0]
1,2,[10500.0]
0,3,[9500.0]
1,3,[11000.0]
0,4,[9500.0]
1,4,[11000.0]
0,5,[9000.0]
1,5,[10500.0]
