In [None]:
using CorrectMatch
using StatsBase
using CSV
using CodecZlib
using DataFrames
using Distributions

In [None]:
# Read the CSV file into a DataFrame.
# For a gzipped file, decompress the bytes with `transcode` (CodecZlib) first, e.g.
#   CSV.read(transcode(GzipDecompressor, read("adults.csv.gz")), DataFrame)
df = CSV.read("adults.csv", DataFrame)

In [None]:
# Keep only the attributes used in the analysis and convert them to an integer
# matrix with one row per record and one column per attribute.
# NOTE(review): `Matrix{Int}` assumes every selected column is already
# integer-coded — confirm against the CSV contents.
selected_columns = [
    :age, :sex, :workclass, :relationship, Symbol("marital-status"), :race
]
df_sub = df[:, selected_columns]
data = Matrix{Int}(df_sub)
N = size(data, 1)  # number of rows (records)
M = size(data, 2)  # number of columns (attributes)

## Estimating population uniqueness

In [67]:
# True population uniqueness: computed directly on the full `data` matrix,
# with no model involved. Serves as the reference for the estimates below.
u = uniqueness(data)
println("True population uniqueness: $u")

True population uniqueness: 0.10853475016123583


In [68]:
# Fit a Gaussian copula model to the full dataset, then estimate uniqueness on
# N records sampled from the fitted model instead of on `data` itself.
# NOTE(review): `exact_marginal=true` presumably makes the fit use the empirical
# marginals rather than fitted parametric ones — confirm in the CorrectMatch docs.
# NOTE: no RNG seed is set in the visible cells, so the estimate may differ
# between runs. `u` is overwritten and no longer holds the true uniqueness.
G = fit_mle(GaussianCopula, data; exact_marginal=true)
u = uniqueness(rand(G, N))
println("Estimated population uniqueness: $u")

Estimated population uniqueness: 0.12794447344983262


In [None]:
# Repeat the estimate with a model fitted on a random subsample only:
# 325 records (1% of the original data), drawn without replacement.
n_sample = 325
ix = sample(1:N, n_sample; replace=false)
subsample = data[ix, :]
G = fit_mle(GaussianCopula, subsample; exact_marginal=false)

# Uniqueness is still measured on N synthetic records, i.e. at population scale.
synthetic = rand(G, N)
u = uniqueness(synthetic)
println("Estimated population uniqueness (1% sample): $u")

## Estimating individual uniqueness

In [None]:
"""
    extract_marginal_ordered(col::AbstractVector)

Return the empirical marginal distribution of `col` as a `Categorical`.

The `k`-th probability is the relative frequency of the `k`-th smallest distinct
value observed in `col`, so category indices follow the ascending order of the
values. The values in `col` must therefore be sortable.

# Throws
- `ArgumentError` if `col` is empty.
"""
function extract_marginal_ordered(col::AbstractVector)
    isempty(col) && throw(ArgumentError("`col` must not be empty"))
    counts = countmap(col; alg=:dict)
    # A `Dict` iterates in hash order, not key order: sort the keys explicitly so
    # that category `k` really is the `k`-th smallest value.
    levels = sort!(collect(keys(counts)))
    freqs = [counts[level] for level in levels]
    return Categorical(freqs ./ sum(freqs))
end

# One empirical marginal per column (attribute) of `data`.
marginals = map(extract_marginal_ordered, eachcol(data))

In [None]:
# Fit the Gaussian copula again, this time passing the per-column marginals
# computed above explicitly. `G` replaces the model fitted in the earlier cells.
G = fit_mle(GaussianCopula, marginals, data)

### Likely unique individual

In [None]:
indiv = data[1, :]  # first record: 39-year-old male with non Asian/Black/White race

In [None]:
# The marginals are indexed from 1, so each attribute value is currently
# re-based by subtracting its column minimum and adding 1
# (see suggestions below for improving this).
col_mins = vec(minimum(data; dims=1))
shifted_indiv = @. indiv - col_mins + 1
individual_uniqueness(G, shifted_indiv, N)

### Unlikely unique individual

In [None]:
indiv = data[12, :]  # twelfth record: 30-year-old white male

In [None]:
# Re-base each attribute value to a 1-based index (value - column minimum + 1)
# before querying the model.
col_mins = vec(minimum(data; dims=1))
shifted_indiv = @. indiv - col_mins + 1
individual_uniqueness(G, shifted_indiv, N)