In [1]:
import correctmatch
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

Detected IPython. Loading juliacall extension. See https://juliapy.github.io/PythonCall.jl/stable/compat/#IPython


# Tests with a basic dataset

In [2]:
arr = rng.integers(1, 5, size=(1000, 5))
arr[:5, :]

array([[1, 4, 3, 2, 2],
       [4, 1, 3, 1, 1],
       [3, 4, 3, 4, 3],
       [4, 3, 1, 4, 2],
       [3, 2, 1, 4, 4]])

In [3]:
correctmatch.uniqueness(arr)

0.368

In [4]:
fitted_model = correctmatch.fit_model(arr)
fitted_model

GaussianCopula{Float64}(
Σ: [0.9999999999999994 -9.159943640501903e-17 … 0.08251117589321237 0.11970137945121229; -9.159943640501903e-17 1.0000000000000002 … -1.404227530302166e-16 -3.838590064615645e-17; … ; 0.08251117589321237 -1.404227530302166e-16 … 0.9999999999999989 -1.9815663327929788e-16; 0.11970137945121229 -3.838590064615645e-17 … -1.9815663327929788e-16 1.0]
marginals: Distributions.Distribution{Distributions.Univariate, Distributions.Discrete}[Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.258, 0.254, 0.251, 0.237]), Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.259, 0.253, 0.246, 0.242]), Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.277, 0.264, 0.234, 0.225]), Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.26, 0.258, 0.253, 0.229]), Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.261, 0.256, 0.247, 0.236])]
c

In [5]:
# Covariance matrix
fitted_model.Σ

5×5 PDMats.PDMat{Float64, Matrix{Float64}}:
  1.0          -9.15994e-17  -1.13927e-16   0.0825112     0.119701
 -9.15994e-17   1.0           0.0842077    -1.40423e-16  -3.83859e-17
 -1.13927e-16   0.0842077     1.0           0.0862887     0.0314026
  0.0825112    -1.40423e-16   0.0862887     1.0          -1.98157e-16
  0.119701     -3.83859e-17   0.0314026    -1.98157e-16   1.0

In [6]:
# Fitted marginals
fitted_model.marginals

5-element Vector{Distributions.Distribution{Distributions.Univariate, Distributions.Discrete}}:
 Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.258, 0.254, 0.251, 0.237])
 Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.259, 0.253, 0.246, 0.242])
 Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.277, 0.264, 0.234, 0.225])
 Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.26, 0.258, 0.253, 0.229])
 Distributions.Categorical{Float64, Vector{Float64}}(support=Base.OneTo(4), p=[0.261, 0.256, 0.247, 0.236])

In [7]:
synthetic_arr = correctmatch.sample_model(fitted_model, 1000)
synthetic_arr[1:5, :]

array([[2, 3, 4, 4, 4],
       [4, 2, 1, 3, 3],
       [1, 1, 1, 2, 1],
       [2, 4, 1, 3, 3]])

In [8]:
correctmatch.uniqueness(synthetic_arr)

0.351

# Individual-level metrics

CorrectMatch can estimate uniqueness and correctness for a specific individual, not just the population.

In [9]:
# Pick a specific individual from the dataset
individual = arr[0]
print(f"Individual record: {individual}")

# Estimate individual uniqueness and correctness
ind_uniqueness = correctmatch.individual_uniqueness(fitted_model, individual, 1000)
ind_correctness = correctmatch.individual_correctness(fitted_model, individual, 1000)

print(f"Individual uniqueness: {ind_uniqueness:.3f}")
print(f"Individual correctness: {ind_correctness:.3f}")

Individual record: [1 4 3 2 2]
Individual uniqueness: 0.379
Individual correctness: 0.644


# Working with pandas DataFrames

CorrectMatch supports pandas DataFrames with integer, categorical, and string columns.

In [10]:
# Create a DataFrame with categorical columns
df_cat = pd.DataFrame(
    {
        "color": pd.Categorical(rng.choice(["red", "green", "blue"], size=100)),
        "size": pd.Categorical(rng.choice(["S", "M", "L", "XL"], size=100)),
        "shape": pd.Categorical(rng.choice(["circle", "square", "triangle"], size=100)),
    },
)
df_cat.head()

Unnamed: 0,color,size,shape
0,red,L,square
1,red,L,square
2,green,S,triangle
3,blue,M,triangle
4,blue,S,square


In [11]:
# Compute metrics directly on the DataFrame
print(f"Uniqueness: {correctmatch.uniqueness(df_cat):.3f}")
print(f"Correctness: {correctmatch.correctness(df_cat):.3f}")

Uniqueness: 0.060
Correctness: 0.350


In [12]:
# Fit a model and compute individual metrics in a smaller population of 50 individuals
model_cat = correctmatch.fit_model(df_cat)
individual_cat = df_cat.iloc[0]

print(f"Individual: {individual_cat.to_dict()}")
print(f"Individual uniqueness: {correctmatch.individual_uniqueness(model_cat, individual_cat, 50):.3f}")
print(f"Individual correctness: {correctmatch.individual_correctness(model_cat, individual_cat, 50):.3f}")

Individual: {'color': 'red', 'size': 'L', 'shape': 'square'}
Individual uniqueness: 0.232
Individual correctness: 0.530


In [None]:
# Sample synthetic data from the fitted model
synthetic_df = correctmatch.sample_model(model_cat, 500)
print(f"Synthetic data shape: {synthetic_df.shape}")
synthetic_df[:5]

Synthetic data shape: (500, 3)


Unnamed: 0,1,2,3
0,green,XL,circle
1,red,S,circle
2,red,L,triangle
3,red,XL,circle
4,red,M,triangle


# Mixed column types

DataFrames with mixed column types (integers, categorical, strings) are also supported.

In [14]:
# Create a DataFrame with mixed column types
df_mixed = pd.DataFrame(
    {
        "age": rng.integers(18, 80, size=500),
        "gender": pd.Categorical(rng.choice(["M", "F"], size=500)),
        "city": rng.choice(["Paris", "London", "Berlin", "Madrid"], size=500),
    },
)
df_mixed.head()

Unnamed: 0,age,gender,city
0,22,M,London
1,79,M,Paris
2,28,M,Paris
3,44,F,Berlin
4,32,M,London


In [15]:
print(f"Column types: {df_mixed.dtypes.to_dict()}")
print(f"Uniqueness: {correctmatch.uniqueness(df_mixed):.3f}")
print(f"Correctness: {correctmatch.correctness(df_mixed):.3f}")

Column types: {'age': dtype('int64'), 'gender': CategoricalDtype(categories=['F', 'M'], ordered=False, categories_dtype=object), 'city': dtype('O')}
Uniqueness: 0.368
Correctness: 0.630


# Tests with a real dataset

In [16]:
adults = pd.read_csv("adults.csv")
adults.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,7,13,4,1,1,4,1,40,39,0
1,50,6,13,2,4,0,4,1,13,39,0
2,38,4,9,0,6,1,4,1,40,39,0
3,53,4,7,2,6,0,2,1,40,39,0
4,28,4,13,2,10,5,2,0,40,5,0


In [17]:
print(f"The dataset has {len(adults)} rows and {100*correctmatch.uniqueness(adults):.1f}% of them are unique records.")

The dataset has 32561 rows and 79.4% of them are unique records.


In [18]:
fitted_model = correctmatch.fit_model(adults)

In [19]:
for n in (100, 1000, 10000, 100000):
    fitted_adults = correctmatch.sample_model(fitted_model, n)
    print(f" - {n} rows: uniqueness={correctmatch.uniqueness(fitted_adults):.3f}, correctness={correctmatch.correctness(fitted_adults):.3f}")

 - 100 rows: uniqueness=1.000, correctness=1.000
 - 1000 rows: uniqueness=0.912, correctness=0.945
 - 10000 rows: uniqueness=0.807, correctness=0.854
 - 100000 rows: uniqueness=0.664, correctness=0.727


## Individual-level metrics

In [20]:
mini_adults = adults[["age", "education-num", "occupation"]]
print(f"Mini dataset uniqueness: {correctmatch.uniqueness(mini_adults):.3f}")

fitted_model = correctmatch.fit_model(mini_adults)

indiv = mini_adults.iloc[12]
print(f"Individual: {indiv.to_dict()}")

Mini dataset uniqueness: 0.067
Individual: {'age': 23, 'education-num': 13, 'occupation': 1}


In [21]:
for n in (1000, 10000, 100000):
    uniqueness = correctmatch.individual_uniqueness(fitted_model, indiv.to_numpy(), n)
    correctness = correctmatch.individual_correctness(fitted_model, indiv.to_numpy(), n)
    print(f" - Sample size {n}: uniqueness={uniqueness:.3f}, correctness={correctness:.3f}")

 - Sample size 1000: uniqueness=0.957, correctness=0.977
 - Sample size 10000: uniqueness=0.654, correctness=0.811
 - Sample size 100000: uniqueness=0.010, correctness=0.274


In [22]:
older_indiv = pd.Series({"age": 80, "education-num": 13, "occupation": 1})
for n in (1000, 10000, 100000):
    uniqueness = correctmatch.individual_uniqueness(fitted_model, older_indiv.to_numpy(), n)
    correctness = correctmatch.individual_correctness(fitted_model, older_indiv.to_numpy(), n)
    print(f" - Sample size {n}: uniqueness={uniqueness:.3f}, correctness={correctness:.3f}")

 - Sample size 1000: uniqueness=0.999, correctness=0.999
 - Sample size 10000: uniqueness=0.990, correctness=0.995
 - Sample size 100000: uniqueness=0.902, correctness=0.940
