# Differential Privacy on Income Data

We will use the Synthetic Swiss Income Dataset to demonstrate how to release differentially private information on tabular data with OpenDP.
The main library used in the background is `Polars`.

In [21]:
!pip install polars==0.20.16

Collecting polars==0.20.16
  Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.3/26.3 MB 2.3 MB/s eta 0:00:00
Installing collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.20.31
    Uninstalling polars-0.20.31:
      Successfully uninstalled polars-0.20.31
Successfully installed polars-0.20.16
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from opendp.mod import _EXPECTED_POLARS_VERSION


In [22]:
import polars as pl
import opendp.prelude as dp

# Opt in to OpenDP's "contrib" feature set before any dp constructors are called below.
dp.enable_features("contrib")

## 1. Data Domain Preparation
We first declare the metadata of the LazyFrame's input domain: column names, types, bounds, and margins.

In [23]:
def seed(schema):
    """Return a LazyFrame that carries `schema` but is built from no data.

    The frame is constructed with `None` as its data, so it only serves as a
    starting point for building query plans with the right column names/types.
    """
    empty_frame = pl.DataFrame(None, schema, orient="row")
    return empty_frame.lazy()  # type: ignore[attr-defined]

### Define domain with bounds

In [24]:
# Income bounds: lower and upper limits declared for the "income" column's domain.
income_lower_bound = 1_000.0
income_upper_bound = 60_000.0

In [25]:
# Per-column domains, listed in the order the columns are declared.
# All categorical/demographic columns are 32-bit integers; "profession" is the
# only one wrapped in an option domain, and "income" is a bounded 64-bit float.
column_domains = [
    dp.series_domain("region", dp.atom_domain(T=dp.i32)),
    dp.series_domain("eco_branch", dp.atom_domain(T=dp.i32)),
    dp.series_domain("profession", dp.option_domain(dp.atom_domain(T=dp.i32))),
    dp.series_domain("education", dp.atom_domain(T=dp.i32)),
    dp.series_domain("age", dp.atom_domain(T=dp.i32)),
    dp.series_domain("sex", dp.atom_domain(T=dp.i32)),
    dp.series_domain(
        "income",
        dp.atom_domain(bounds=(income_lower_bound, income_upper_bound), T=dp.f64),
    ),
]

# Combine the column domains into the domain of the whole LazyFrame.
lf_domain = dp.lazyframe_domain(column_domains)

In [26]:
# Attach one margin per grouping: by sex, by region, and by both together.
# NOTE(review): max_partition_length=50 presumably caps the number of rows per
# group — confirm this matches the real dataset size.
for margin_columns in (["sex"], ["region"], ["sex", "region"]):
    lf_domain = dp.with_margin(
        lf_domain,
        by=margin_columns,
        public_info="keys",
        max_partition_length=50,
    )

In [27]:
# Display the expression domain built from lf_domain as a sanity check on the
# declared columns and margins.
dp.expr_domain(lf_domain)

ExprDomain { lazy_frame_domain: FrameDomain(region: i32, eco_branch: i32, profession: i32, education: i32, age: i32, sex: i32, income: f64; margins=[{"sex"}, {"region", "sex"}, {"region"}]) }

## 2. Load the income data as a Polars LazyFrame

In [28]:
# Relative path to the income CSV that is scanned in the next step.
path = "income_synthetic.csv"

In [29]:
# Scan the CSV into `lf`, the frame that is later passed to the measurements.
lf = pl.scan_csv(path)
# NOTE(review): `lf` is presumably a LazyFrame, so head() likely displays a query
# plan rather than actual rows — confirm this is the intended preview.
lf.head()

In [30]:
#lf.collect()

## 3. Private Data Analysis
We will analyse the income column: compute the private mean and quantiles.

### a. Mean income

In [31]:
# Overall
# Build the query on a data-free frame with the same schema as `lf`.
# NOTE(review): the noise scale is 0.0, which presumably means no noise is added
# and the release is not actually private — confirm the intended scale.
mean_expr = pl.mean("income").dp.noise(0.0)
plan = seed(lf.schema).select(mean_expr)

# Turn the plan into a measurement over the declared domain.
m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain,
    input_metric=dp.symmetric_distance(),
    output_measure=dp.max_divergence(T=float),
    lazyframe=plan,
)

# Apply the measurement to the real data and materialize the result.
mean_income = m_lf(lf).collect()
mean_income

OpenDPException: 
  FFI("Error when deserializing LazyFrame. This may be due to mismatched polars versions. Semantic(None, "unknown variant `Select`, expected one of `Selection`, `Cache`, `Scan`, `DataFrameScan`, `Projection`, `Aggregate`, `Join`, `HStack`, `Distinct`, `Sort`, `Slice`, `MapFunction`, `Union`, `HConcat`, `ExtContext`, `Sink`")")

In [17]:
# Grouped by "sex"
# The column is declared as lowercase "income" in lf_domain and is referenced that
# way in the overall-mean query, so the expression must use the same spelling
# ("Income" would not match the declared column).
# The clamping bounds reuse the constants declared with the domain instead of
# repeating the literals, so the two cannot drift apart.
# NOTE(review): scale=0.0 presumably means no noise is added — confirm the
# intended scale before treating this release as private.
expr = (
    pl.col("income")
    .fill_null(0.0)
    .dp.mean((income_lower_bound, income_upper_bound), scale=0.0)
)
# One aggregated row per value of "sex", ordered by "sex".
plan = seed(lf.schema).group_by("sex").agg(expr).sort("sex")

In [18]:
# Wrap the grouped plan in a measurement over the declared domain.
m_lf = dp.m.make_private_lazyframe(
    lf_domain,
    dp.symmetric_distance(),
    dp.max_divergence(T=float),
    plan,
)

# Apply the measurement to the real data, then materialize and display the result.
private_lf = m_lf(lf)
df_release = private_lf.collect()
df_release

OpenDPException: 
  FFI("Error when deserializing LazyFrame. This may be due to mismatched polars versions. Semantic(None, "unknown variant `GroupBy`, expected one of `Selection`, `Cache`, `Scan`, `DataFrameScan`, `Projection`, `Aggregate`, `Join`, `HStack`, `Distinct`, `Sort`, `Slice`, `MapFunction`, `Union`, `HConcat`, `ExtContext`, `Sink`")")

### b. Quantile income