# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate the how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [1]:
!pip install polars==0.20.16
!pip install pyarrow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
import polars as pl
import opendp.prelude as dp

dp.enable_features("contrib")

## 1. Get income df in polars lazyframe format

In [14]:
path = "../data/income_synthetic_data.csv"
lf = pl.scan_csv(path)

## 2. Data Domain Preparation
We first need to declare the metadata of the input domain of the Lazyframe.

In [15]:
NB_SEX = 2
NB_REGIONS = 7

In [16]:
income_lower_bound, income_upper_bound = 1_000, 100_000

In [17]:
def get_lf_domain():
    lf_domain = dp.lazyframe_domain([
        dp.series_domain("region", dp.atom_domain(T=dp.i64)),
        dp.series_domain("eco_branch", dp.atom_domain(T=dp.i64)),
        dp.series_domain("profession", dp.option_domain(dp.atom_domain(T=dp.i64))),
        dp.series_domain("education", dp.atom_domain(T=dp.i64)),
        dp.series_domain("age", dp.atom_domain(T=dp.i64)),
        dp.series_domain("sex", dp.atom_domain(T=dp.i64)),
        dp.series_domain("income", dp.atom_domain(T=dp.f64))
    ])
    lf_domain = dp.with_margin(lf_domain, by=[], public_info="lengths", max_partition_length=2_032_543)
    return lf_domain

## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

In [18]:
def seed(schema):
    return pl.DataFrame(None, schema, orient="row").lazy()

In [19]:
type(seed)

function

In [20]:
def opendp_measurement(plan):
    return dp.m.make_private_lazyframe(lf_domain, dp.symmetric_distance(), dp.max_divergence(T=float), plan)

### a. Mean 

In [21]:
print(f"type of ls: {type(lf)}")
print(f"type of seed: {type(seed(lf.schema))}")


type of ls: <class 'polars.lazyframe.frame.LazyFrame'>
type of seed: <class 'polars.lazyframe.frame.LazyFrame'>


  print(f"type of seed: {type(seed(lf.schema))}")


In [22]:
# Overall
lf_domain = get_lf_domain()

plan = seed(lf.schema).select(
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=0.0)
)
m_lf = opendp_measurement(plan)

mean_income = m_lf(lf).collect()
mean_income

  plan = seed(lf.schema).select(


income
f64
7052.657818


In [23]:
serialized_plan = plan.serialize()

In [24]:
type(m_lf(lf))

opendp.polars.OnceFrame

In [25]:
# Grouped by "sex"
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["sex"], public_info="lengths", max_partition_length=NB_SEX)

plan = seed(lf.schema).group_by("sex").agg([
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_mean = m_lf(lf).collect()
by_sex_mean

  plan = seed(lf.schema).group_by("sex").agg([


sex,income
i64,f64
0,6362.735212
1,7365.936164


In [37]:
# Grouped by "region"
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="lengths", max_partition_length=NB_REGIONS)

plan = seed(lf.schema).group_by("region").agg([
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_mean = m_lf(lf).collect()
by_sex_mean

m_lf.map(1)

  plan = seed(lf.schema).group_by("region").agg([


OpenDPException: 
  FailedFunction("max_num_partitions must be known when the metric is not sensitive to ordering (SymmetricDistance)")

In [36]:
# Grouped by "region"
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="lengths", max_partition_length=2_032_543,)

plan = seed(lf.schema).group_by("region").agg([
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)


  plan = seed(lf.schema).group_by("region").agg([


In [39]:
# Grouped by "region"
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="lengths", max_partition_length=2_032_543, max_num_partitions=NB_REGIONS)


plan = seed(lf.schema).group_by("region").agg([
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)

m_lf.map(1)

  plan = seed(lf.schema).group_by("region").agg([


1284.2445919122538

In [None]:
plan = seed(lf.schema).group_by("region").agg([
    pl.col("income").dp.mean(bounds=(income_lower_bound, income_upper_bound), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_mean = m_lf(lf).collect()
by_sex_mean

In [27]:
# Grouped by sex and region
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["sex", "region"], public_info="lengths", max_partition_length=NB_SEX*NB_REGIONS)

plan = seed(lf.schema).group_by(["sex", "region"]).agg([
    pl.col("income").dp.mean(bounds=(1_000.0, 60_000.0), scale=1.0)
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_mean = m_lf(lf).collect()
by_sex_mean

  plan = seed(lf.schema).group_by(["sex", "region"]).agg([


sex,region,income
i64,i64,f64
0,7,5243.486691
1,7,6057.757329
0,5,6208.044992
0,2,6240.956862
0,1,6470.330028
…,…,…
1,2,7230.468348
1,1,7480.217519
1,6,7536.12794
1,3,7578.416355


### b. Quantile income

In [12]:
candidates = list(range(1_000, 100_000, 1000))

In [23]:
# Overall
lf_domain = get_lf_domain()

plan = seed(lf.schema).select(
    pl.col("income").dp.quantile(alpha=.75, candidates=candidates, scale=1.)
)
m_lf = opendp_measurement(plan)

quantile_income = m_lf(lf).collect()
quantile_income

income
i64
8000


In [21]:
# Grouped by sex
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["sex"], public_info="keys", max_partition_length=NB_SEX)

plan = seed(lf.schema).group_by("sex").agg([
    pl.col("income").dp.quantile(alpha=.75, candidates=candidates, scale=1.),
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_quantile_income = m_lf(lf).collect()
by_sex_quantile_income

sex,income
i64,i64
0,34000
1,77000


In [20]:
# Grouped by region
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["region"], public_info="keys", max_partition_length=NB_REGIONS)

plan = seed(lf.schema).group_by("region").agg([
    pl.col("income").dp.quantile(alpha=.75, candidates=candidates, scale=1.),
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_quantile_income = m_lf(lf).collect()
by_sex_quantile_income

region,income
i64,i64
7,22000
2,26000
1,29000
3,30000
6,32000
4,80000
5,84000


In [19]:
# Grouped by sex and region
lf_domain = get_lf_domain()
lf_domain = dp.with_margin(lf_domain, by=["sex", "region"], public_info="keys", max_partition_length=NB_SEX*NB_REGIONS)

plan = seed(lf.schema).group_by(["sex", "region"]).agg([
    pl.col("income").dp.quantile(alpha=.75, candidates=candidates, scale=1.),
]).sort("income")
m_lf = opendp_measurement(plan)

by_sex_quantile_income = m_lf(lf).collect()
by_sex_quantile_income

sex,region,income
i64,i64,i64
0,7,16000
1,7,20000
0,2,22000
0,3,22000
0,1,23000
…,…,…
1,5,27000
1,1,28000
1,3,28000
1,4,29000
