### Setup

In [1]:
from datafusionsm.datasets import load_online_survey, load_tv_panel

# Variable to be fused, and the columns later passed to the model as `critical`.
target = "income"
critical_cells = ["age", "gender"]

# Example datasets provided by datafusionsm.datasets.
survey, panel = load_online_survey(), load_tv_panel()

# Linking variables: every column name that appears in both datasets.
linking = survey.columns.intersection(panel.columns).tolist()

#### Predictive Mean Matching

In [2]:
from datafusionsm.implicit_model import PMM

# Build the model for the variable being fused. `target` is defined in the
# setup cell, so changing it there updates every cell that follows.
pmm = PMM(target)

# `linking` holds the columns shared by both datasets; `critical_cells` is
# passed as `critical` (presumably restricting matches to records that agree
# on those columns -- confirm against the datafusionsm documentation).
pmm.fit(survey, panel, linking=linking, critical=critical_cells)
fused_pmm = pmm.transform(survey, panel)

# Preview the fused result: identifier, linking variables and the fused target.
fused_pmm[["panelist-id"] + linking + [target]].head()

Unnamed: 0,panelist-id,age,gender,children,marital,income
0,61913,45-64,M,0,married,Top 25%
1,17173,45-64,M,0,married,Mid 50%
2,62118,45-64,M,0,married,Prefer not to say
3,72817,45-64,M,0,married,Mid 50%
4,92177,45-64,M,0,married,Mid 50%


#### Evaluate Results

In [3]:
from datafusionsm.evaluation import compare_dists

# Relative frequency of each category of the target variable. Counts are
# divided by the total number of rows, so rows with a missing value (which
# value_counts drops by default) still count in the denominator.
measured_inc = survey[target].value_counts() / len(survey)
fused_pmm_inc = fused_pmm[target].value_counts() / len(fused_pmm)

# Compare the measured (survey) distribution with the fused one.
compare_dists(measured_inc, fused_pmm_inc)

KL-Divergence               0.004046
Hellinger Distance          0.031722
Total Variation Distance    0.032417
Overlap                     0.967583
dtype: float64

In [4]:
# Mean of the scores recorded for the target variable on the fitted model
# (presumably per-record match scores -- confirm against the datafusionsm docs).
scores = pmm.results[target]["scores"]
sum(scores) / len(scores)

0.03879369957427439