# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate the how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [1]:
!pip install polars==0.20.16

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import polars as pl
import opendp.prelude as dp

dp.enable_features("contrib")

In [3]:
pl.__version__

'0.20.16'

## 1. Data Domain Preparation
We first need to declare the metadata of the input domain of the Lazyframe.

In [4]:
def seed(schema):
    return pl.DataFrame(None, schema, orient="row").lazy()  # type: ignore[attr-defined]

### Define domain with bounds

In [5]:
# Income bounds
income_lower_bound, income_upper_bound = 1_000.0, 60_000.0

In [23]:
lf_domain = dp.lazyframe_domain([
    dp.series_domain("region", dp.atom_domain(T=dp.i64)),
    dp.series_domain("eco_branch", dp.atom_domain(T=dp.i64)),
    dp.series_domain("profession", dp.option_domain(dp.atom_domain(T=dp.i64))),
    dp.series_domain("education", dp.atom_domain(T=dp.i64)),
    dp.series_domain("age", dp.atom_domain(T=dp.i64)),
    dp.series_domain("sex", dp.atom_domain(T=dp.i64)),
    dp.series_domain("income", dp.atom_domain(bounds=(income_lower_bound, income_upper_bound), T=dp.f64))
])

In [24]:
lf_domain = dp.with_margin(lf_domain, by=["sex"], public_info="keys", max_partition_length=50)
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="keys", max_partition_length=50)
lf_domain = dp.with_margin(lf_domain, by=["sex", "region"], public_info="keys", max_partition_length=50)

In [25]:
dp.expr_domain(lf_domain)

ExprDomain { lazy_frame_domain: FrameDomain(region: i64, eco_branch: i64, profession: i64, education: i64, age: i64, sex: i64, income: f64; margins=[{"sex"}, {"region", "sex"}, {"region"}]) }

## 2. Get income df in polars lazyframe format

In [26]:
path = "income_synthetic.csv"

In [27]:
lf = pl.scan_csv(path)
lf.head()

In [28]:
lf.collect()

region,eco_branch,profession,education,age,sex,income
i64,i64,i64,i64,i64,i64,f64
5,25,34,6,63,0,5878.776292
5,25,34,6,63,0,4816.668743
5,25,34,6,63,0,5597.575301
5,25,34,6,63,0,6058.278538
5,25,34,6,63,0,5520.935521
…,…,…,…,…,…,…
6,75,52,5,22,1,5296.288157
6,75,52,5,22,1,3720.089657
6,75,52,5,22,1,3695.666881
6,75,52,5,22,1,6712.725122


## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

### a. Mean income

In [29]:
# Overall
plan = seed(lf.schema).select(
    pl.col("income").dp.sum(bounds=(income_lower_bound, income_upper_bound)).dp.noise(0.0)
)
m_lf = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), dp.max_divergence(T=float), plan
)
mean_income = m_lf(lf).collect()
mean_income

OpenDPException: 
  MakeTransformation("Expr is not recognized at this time: col("income").clip([1000.0, 60000.0]).sum()./usr/local/lib/python3.11/site-packages/opendp/lib/opendp.abi3.so:noise(). ""If you would like to see this supported, please file an issue.")

In [30]:
# Grouped by "sex"
expr = pl.col("Income").fill_null(0.0).dp.mean((1_000.0, 60_000.0), scale=0.0)
plan = seed(lf.schema).group_by("sex").agg(expr).sort("sex")

In [31]:
m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain, 
    input_metric=dp.symmetric_distance(), 
    output_measure=dp.max_divergence(T=float), 
    lazyframe=plan
)

df_release = m_lf(lf).collect()
df_release

RuntimeError: BindingsError: "Value(\"the enum variant LogicalPlan::Error cannot be serialized\")"

### b. Quantile income

In [34]:
candidates = list(range(1_000, 100_000, 1000))

In [None]:
# Over

In [35]:
# Grouped by sex
plan = seed(lf.schema).group_by("sex").agg([
    pl.col("income").dp.quantile(candidates, alpha=.75, scale=1.),
])

m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain, 
    input_metric=dp.symmetric_distance(), 
    output_measure=dp.max_divergence(T=float), 
    lazyframe=plan, 
    scale=1.
)

TypeError: DPExpr.quantile() got multiple values for argument 'alpha'