# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [1]:
!pip install polars==0.20.16
!pip install pyarrow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import polars as pl
import opendp.prelude as dp

# Opt in to OpenDP's "contrib" feature flag before any OpenDP constructor is
# called in the cells below.
dp.enable_features("contrib")

## 1. Get income df in polars lazyframe format

In [3]:
# Location of the synthetic income CSV, relative to the notebook.
path = "income_synthetic.csv"
# scan_csv builds a LazyFrame: the file is only read when a query is collected.
lf = pl.scan_csv(source=path)

## 2. Data Domain Preparation
We first need to declare the metadata of the input domain of the LazyFrame.

In [4]:
def seed(schema):
    """Return an empty LazyFrame that carries the given schema.

    The frame is built from no data (``None``), so it only describes column
    names and types. The notebook writes its query plans on top of it before
    handing them to OpenDP.
    """
    empty_frame = pl.DataFrame(data=None, schema=schema, orient="row")
    return empty_frame.lazy()  # type: ignore[attr-defined]

In [8]:
# Bounds for the income column, passed to the overall mean query below.
income_lower_bound = 1_000
income_upper_bound = 100_000

In [9]:
def get_lf_domain():
    """Build the OpenDP LazyFrame domain describing the income dataset.

    Declares one series domain per column: "income" is f64 and every other
    column is i64, with "profession" additionally wrapped in an option domain.
    A margin over the empty grouping (``by=[]``) is attached with public
    lengths and a maximum partition length of 50.

    Returns:
        The LazyFrame domain with the ungrouped margin applied.
    """
    def int_column(name):
        # Series domain for a plain i64 column.
        return dp.series_domain(name, dp.atom_domain(T=dp.i64))

    series_domains = [
        int_column("region"),
        int_column("eco_branch"),
        # NOTE(review): option domain presumably because this column can hold
        # nulls — confirm against the data.
        dp.series_domain("profession", dp.option_domain(dp.atom_domain(T=dp.i64))),
        int_column("education"),
        int_column("age"),
        int_column("sex"),
        dp.series_domain("income", dp.atom_domain(T=dp.f64)),
    ]
    return dp.with_margin(
        dp.lazyframe_domain(series_domains),
        by=[],
        public_info="lengths",
        max_partition_length=50,
    )

## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

### a. Mean 

In [10]:
# Overall
lf_domain = get_lf_domain()

# Mean of "income" using the declared income bounds.
# The noise scale must be strictly positive: a scale of 0.0 adds no noise, so
# the exact mean would be released with no privacy protection. 1.0 matches the
# scale used by the other queries in this notebook.
plan = seed(lf.schema).select(
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
)
# Turn the plan into a measurement over the declared domain.
m_lf = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), dp.max_divergence(T=float), plan
)
# Run the measurement on the real data and materialise the release.
mean_income = m_lf(lf).collect()
mean_income

income
f64
7052.657818


In [12]:
# Grouped by "sex"
# Start from the base domain and add a margin over "sex" with public lengths.
lf_domain = dp.with_margin(
    get_lf_domain(),
    by=["sex"],
    public_info="lengths",
    max_partition_length=50,
)  # works for grouped mean

# NOTE(review): these bounds differ from income_lower_bound/income_upper_bound
# (1_000, 100_000) used for the overall mean — confirm this is intentional.
plan = (
    seed(lf.schema)
    .group_by("sex")
    .agg([pl.col("income").dp.mean(bounds=(1_000.0, 60_000.0), scale=1.0)])
)
m_lf = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), dp.max_divergence(T=float), plan
)
# Run the measurement on the real data; one row per "sex" value.
by_sex_mean = m_lf(lf).collect()
by_sex_mean

sex,income
i64,f64
1,7365.936164
0,6362.735208


### b. Quantile income

In [13]:
# Candidate values for the quantile queries: 1_000, 2_000, ..., 99_000
# (the stop value 100_000 is excluded by range).
candidates = [*range(1_000, 100_000, 1_000)]

In [14]:
# Overall
lf_domain = get_lf_domain()

# 0.75-quantile of "income", with `candidates` supplied as the candidate values.
plan = seed(lf.schema).select(
    pl.col("income").dp.quantile(alpha=0.75, candidates=candidates, scale=1.0)
)
m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain,
    input_metric=dp.symmetric_distance(),
    output_measure=dp.max_divergence(T=float),
    lazyframe=plan,
)
# Run the measurement on the real data and materialise the release.
quantile_income = m_lf(lf).collect()
quantile_income

income
i64
29000


In [14]:
# Grouped by sex
# Base domain plus a margin over "sex"; here only the keys are declared public.
lf_domain = dp.with_margin(
    get_lf_domain(),
    by=["sex"],
    public_info="keys",
    max_partition_length=50,
)

# 0.75-quantile of "income" within each "sex" group.
plan = (
    seed(lf.schema)
    .group_by("sex")
    .agg([pl.col("income").dp.quantile(alpha=0.75, candidates=candidates, scale=1.0)])
)
m_lf = dp.m.make_private_lazyframe(
    lf_domain, dp.symmetric_distance(), dp.max_divergence(T=float), plan
)
# Run the measurement on the real data; one row per "sex" value.
by_sex_quantile_income = m_lf(lf).collect()
by_sex_quantile_income

sex,income
i64,i64
0,25000
1,29000
