# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate the how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [91]:
!pip install opendp==0.10.0

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
!pip install polars

Collecting polars
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-0.20.31
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [92]:
!pip list

Package                   Version        Editable project location
------------------------- -------------- -------------------------
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.15.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2024.2.2
cffi                      1.16.0
charset-normalizer        3.3.2
comm                      0.2.2
contourpy                 1.2.1
cycler                    0.12.1
debugpy                   1.8.1
decorator                 5.1.1
defusedxml                0.7.1
executing                 2.0.1
fastjsonschema            2.19.1
fonttools                 4.52.4
fqdn                      1.5.1
h11                       0.14.0
httpcore                  1.0.5
httpx                     0.27.0
idna                

In [10]:
# Import library
import polars as pl
import opendp.prelude as dp

dp.enable_features("contrib")

## 1. Data Domain Preparation
We first need to declare the metadata of the input domain of the Lazyframe.

In [52]:
def seed(schema):
    return pl.DataFrame(None, schema, orient="row").lazy()  # type: ignore[attr-defined]

### Define domain with bounds

In [11]:
# Income bounds
income_lower_bound, income_upper_bound = 1_000.0, 60_000.0

In [23]:
lf_domain = dp.lazyframe_domain([
    dp.series_domain("region", dp.atom_domain(T=dp.i32)),
    dp.series_domain("eco_branch", dp.atom_domain(T=dp.i32)),
    dp.series_domain("profession", dp.option_domain(dp.atom_domain(T=dp.i32))),
    dp.series_domain("education", dp.atom_domain(T=dp.i32)),
    dp.series_domain("age", dp.atom_domain(T=dp.i32)),
    dp.series_domain("sex", dp.atom_domain(T=dp.i32)),
    dp.series_domain("income", dp.atom_domain(bounds=(income_lower_bound, income_upper_bound), T=dp.f64))
])

### Add margin metadata: 
Here, we add the counts the we will need.

In [24]:
lf_domain = dp.with_margin(lf_domain, by=["sex"], public_info="keys", max_partition_length=50)
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="keys", max_partition_length=50)
lf_domain = dp.with_margin(lf_domain, by=["sex", "region"], public_info="keys", max_partition_length=50)
#public_info="lengths",
#max_partition_length=50,
#max_num_partitions=3,
#max_partition_contributions=2,
#max_influenced_partitions=1,

### Input metrics and output measures

In [25]:
input_metric = dp.symmetric_distance()
output_measure = dp.max_divergence(T=dp.f64)

## 2. Get income df in polars lazyframe format

In [84]:
path = "income_synthetic.csv"

In [85]:
lf = pl.scan_csv(path)
lf.head()

In [86]:
#lf.collect()

## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

### Mean income

#### Overall

#### Per partition

In [89]:
expr = pl.col("Income").fill_null(0.0).dp.mean((1_000.0, 60_000.0), scale=0.0)
plan = seed(lf.schema).group_by("sex").agg(expr).sort("sex")

In [90]:
m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain, 
    input_metric=dp.symmetric_distance(), 
    output_measure=dp.max_divergence(T=float), 
    lazyframe=plan
)

df_release = m_lf(lf).collect()
df_release

OpenDPException: 
  FFI("Error when deserializing LazyFrame. This may be due to mismatched polars versions. Semantic(None, "unknown variant `GroupBy`, expected one of `Selection`, `Cache`, `Scan`, `DataFrameScan`, `Projection`, `Aggregate`, `Join`, `HStack`, `Distinct`, `Sort`, `Slice`, `MapFunction`, `Union`, `HConcat`, `ExtContext`, `Sink`")")

In [None]:
plan = lf_dummy.group_by("sex").agg([
    pl.col("income").dp.mean(bounds=(0.0, 1.0), scale=2.),
])

In [17]:
expr_domain = dp.expr_domain(lf_domain, context="select")

mean_lazy_all = (lf_domain, metric) >> dp.m.then_private_select(
    dp.c.make_basic_composition([
        (expr_domain, metric) 
        >> dp.t.then_col(col_name = "income") 
        # >> dp.t.then_clamp_expr((income_lower_bound, income_upper_bound)) 
        >> dp.m.then_private_mean_expr(scale = 0.5)
    ])
) >> dp.t.make_collect(lf_domain, metric)

mean_lazy_all(data)

income
f64
7054.439756


#### On partitions (groupby and agg operators)

In [18]:
def partition_mean(lf_domain, metric, active_column, grouping_columns, bounds, scale):
    # Groupby Expr domain
    expr_domain = dp.expr_domain(lf_domain, grouping_columns = grouping_columns)
    
    # Groupby metric
    expr_metric = dp.l1(metric)
    
    # Create expression
    meas_lazy = (
        (lf_domain, metric)
        >> dp.t.then_groupby_stable(grouping_columns = grouping_columns)
        >> dp.m.then_private_agg(
            dp.c.make_basic_composition(
                [
                    (expr_domain, expr_metric)
                    >> dp.t.then_col(col_name = active_column)
                    >> dp.t.then_clamp_expr(bounds)
                    >> dp.m.then_private_mean_expr(scale = scale)
                ]
            )
        )
        >> dp.t.make_collect(lf_domain, metric)
    )
    return meas_lazy

In [19]:
# One level partition
mean_one_level_partition = partition_mean(
    lf_domain = lf_domain,
    metric = dp.symmetric_distance(),
    active_column = "income", 
    grouping_columns = ["sex"], 
    bounds = (income_lower_bound, income_upper_bound),
    scale = 0.5
)
mean_one_level_partition(data)

sex,income
i32,f64
0,6363.060526
1,7366.282274


In [20]:
# Multi level partitions
mean_multi_level_partitions = partition_mean(
    lf_domain = lf_domain,
    metric = dp.symmetric_distance(),
    active_column = "income",
    grouping_columns = ["sex", "region"],
    bounds = (income_lower_bound, income_upper_bound),
    scale = 0.5
)
mean_multi_level_partitions(data).sort(['region', 'sex'])

sex,region,income
i32,i32,f64
0,1,6470.466979
1,1,7480.28327
0,2,6240.920249
1,2,7230.395263
0,3,6565.734997
1,3,7578.334403
0,4,6545.059519
1,4,7588.845886
0,5,6208.029746
1,5,7187.171853


### Compute the quantiles

With ´make_private_quantile´

In [40]:
# Candidates
candidates = [x * 500.0 for x in range(4, 26)]
candidates

[2000.0,
 2500.0,
 3000.0,
 3500.0,
 4000.0,
 4500.0,
 5000.0,
 5500.0,
 6000.0,
 6500.0,
 7000.0,
 7500.0,
 8000.0,
 8500.0,
 9000.0,
 9500.0,
 10000.0,
 10500.0,
 11000.0,
 11500.0,
 12000.0,
 12500.0]

#### On all rows of the column (select operator)

In [32]:
expr_domain = dp.expr_domain(lf_domain, context="select")

# On the median (alpha = 0.5)
mean_lazy_all = (lf_domain, metric) >> dp.m.then_private_select(
    dp.c.make_basic_composition([
        (expr_domain, metric) 
        >> dp.t.then_col(col_name = "income") 
        >> dp.m.then_private_quantile(
            candidates = candidates,
            temperature = 0.1,
            alpha = 0.5
        )
    ])
) >> dp.t.make_collect(lf_domain, metric)

mean_lazy_all(data)

income
f64
6500.0


### On partitions (groupby and agg operators)

In [34]:
def partition_quantiles(domain, metric, active_column, grouping_columns, candidates, temperature, alpha):
    # Groupby Expr domain
    expr_domain = dp.expr_domain(domain, grouping_columns=grouping_columns)
    
    # Groupby metric
    expr_metric = dp.l1(metric)
    
    # Create expression
    meas_lazy = (
        (domain, metric)
        >> dp.t.then_groupby_stable(grouping_columns)
        >> dp.m.then_private_agg(
            dp.c.make_basic_composition(
                [
                    (expr_domain, expr_metric)
                    >> dp.t.then_col(active_column)
                    >> dp.m.then_private_quantile(candidates, temperature, alpha)
                ]
            )
        )
        >> dp.t.make_collect(domain, metric)
    )
    return meas_lazy

In [35]:
# One level partition
q50_one_level_partition = partition_quantiles(
    domain = lf_domain,
    metric = dp.symmetric_distance(),
    active_column = "income", 
    grouping_columns = ["sex"],
    candidates = candidates,
    temperature = 1.0,
    alpha = 0.5
)
q50_one_level_partition(data)

sex,income
i32,list[f64]
0,[6000.0]
1,[7000.0]


In [37]:
q90_one_level_partition = partition_quantiles(
    domain = lf_domain,
    metric = dp.symmetric_distance(),
    active_column = "income", 
    grouping_columns = ["sex"], 
    candidates = candidates,
    temperature = 1.0,
    alpha = 0.9
)
q90_one_level_partition(data)

sex,income
i32,list[f64]
0,[9000.0]
1,[10500.0]


In [39]:
# Multi level partitions
multi_level_partition = partition_quantiles(
    domain = lf_domain,
    metric = dp.symmetric_distance(),
    active_column = "income", 
    grouping_columns = ["sex", "region"], 
    candidates = candidates,
    temperature = 1.0,
    alpha = 0.9
)
multi_level_partition(data).sort(['region', 'sex'])

sex,region,income
i32,i32,list[f64]
0,1,[9500.0]
1,1,[11000.0]
0,2,[9000.0]
1,2,[10500.0]
0,3,[9500.0]
1,3,[11000.0]
0,4,[9500.0]
1,4,[11000.0]
0,5,[9000.0]
1,5,[10500.0]
