In [4]:
using CorrectMatch: Copula, Uniqueness, Individual
using StatsBase
using CSV, CodecZlib
using Distributions

In [9]:
df = CSV.read(open("adults.csv"))



32561

In [65]:
df_sub = df[[:age, :sex, :workclass, :relationship, Symbol("marital-status"), :race]];
data = Array{Int}(df_sub)
N, M = size(data)

(32561, 6)

In [66]:
N, M = size(data)

(32561, 6)

## Estimating population uniqueness

In [67]:
# True population uniqueness
u = uniqueness(data)
println("True population uniqueness: $u")

True population uniqueness: 0.10853475016123583


In [68]:
# Fit model and estimate uniqueness
G = fit_mle(GaussianCopula, data; exact_marginal=true)
u = uniqueness(rand(G, N))
println("Estimated population uniqueness: $u")

Estimated population uniqueness: 0.12794447344983262


In [69]:
# Fit model on 325 records (1% of the original data) and estimate uniqueness
ix = sample(1:N, 325; replace=false);
G = fit_mle(GaussianCopula, data[ix, :]; exact_marginal=false)
u = uniqueness(rand(G, N))
println("Estimated population uniqueness (1% sample): $u")

Estimated population uniqueness (1% sample): 0.12435121771444366


## Estimating individual uniqueness

In [70]:
function extract_marginal_ordered(row::AbstractVector)
  cm = collect(values(countmap(row; alg=:dict)))
  Categorical(cm / sum(cm))
end

marginals = [extract_marginal_ordered(data[:, i]) for i=1:M];

In [71]:
G = fit_mle(GaussianCopula, marginals, data);

### Likely unique individual

In [120]:
indiv = data[1, :] # 39 years old male with non Asian/Black/White race

6-element Array{Int64,1}:
 39
  1
  7
  1
  4
  4

In [118]:
shifted_indiv = indiv - minimum(data, 1)[:] + 1
Individual.individual_uniqueness(G, shifted_indiv, N)

0.9962405323678928

### Unlikely unique individual

In [121]:
indiv = data[12, :] # 30 years old white male

6-element Array{Int64,1}:
 30
  1
  7
  0
  2
  1

In [113]:
shifted_indiv = indiv - minimum(data, 1)[:] + 1
Individual.individual_uniqueness(G, shifted_indiv, N)

0.0002859441553556916