In [1]:
import numpy as np
from simulator import generate_dataset_tiebreaking
from gibbs import ClaraGibbs

In [2]:
# Simulate 1000 items in which every labeler shares a single confusion matrix.
# NOTE(review): no random seed is set in the visible cells, so the simulated
# data (and the outputs shown below) may differ between runs -- confirm
# whether the simulator seeds itself.
true_theta = np.array([0.8, 0.2])  # presumably the class prevalence -- confirm
true_psi = np.array([[0.9, 0.1], [0.05, 0.95]])  # presumably the shared confusion matrix -- confirm
df = generate_dataset_tiebreaking(
    dataset_id=0, theta=true_theta, psi=true_psi, num_items=1000
)

In [3]:
# Peek at the last ten simulated items.
df.tail(n=10)

Unnamed: 0,dataset,id,labelers,ratings,true_rating
990,0,0_990,"[0, 0, 0]","[1, 0, 1]",0
991,0,0_991,"[0, 0]","[0, 0]",0
992,0,0_992,"[0, 0]","[0, 0]",0
993,0,0_993,"[0, 0, 0]","[1, 0, 0]",0
994,0,0_994,"[0, 0]","[0, 0]",0
995,0,0_995,"[0, 0]","[0, 0]",0
996,0,0_996,"[0, 0]","[0, 0]",0
997,0,0_997,"[0, 0]","[0, 0]",0
998,0,0_998,"[0, 0]","[0, 0]",0
999,0,0_999,"[0, 0]","[0, 0]",0


In [4]:
# Fit the Gibbs-sampling model to the simulated ratings.
# burn_in / num_samples / sample_lag are sampler settings (presumably warm-up
# draws to discard, draws to keep, and the thinning interval -- confirm
# against the gibbs module).
model = ClaraGibbs(burn_in=100, num_samples=100, sample_lag=3)
# A and R are echoed in the fit log; presumably A is the number of distinct
# confusion matrices (one, shared) and R the number of rating classes -- confirm.
observed_ratings = np.array(df["ratings"])
model.fit(A=1, R=2, ratings=observed_ratings)

INFO:gibbs:Fitting ...
INFO:gibbs: N = 1000
INFO:gibbs: R = 2
INFO:gibbs: A = 1
INFO:gibbs: C = 0
INFO:gibbs:Initializing ...
INFO:gibbs:Getting priors ...
INFO:gibbs:  theta_scale = 1.0
INFO:gibbs:  theta_mean = [0.73861022 0.26138978]
INFO:gibbs: theta_prior = dimension = 2. scale = 1.0. mean = [0.73861022 0.26138978].
INFO:gibbs:  psi_scale = [1.0, 1.0]
INFO:gibbs:  psi_mean = [[0.75, 0.25], [0.25, 0.75]]
INFO:gibbs: psi_prior = [dimension = 2. scale = 1.0. mean = [0.75 0.25]., dimension = 2. scale = 1.0. mean = [0.25 0.75].]
INFO:gibbs: phi_prior = None
INFO:gibbs:  llh = -1134.3906471317314
INFO:gibbs:  theta = prior: dimension = 2. scale = 1.0. mean = [0.73861022 0.26138978].. data: counts: {0: 781, 1: 219}. count_sum = 1000.. posterior: [0.7809576525637271, 0.21904234743627288]
INFO:gibbs:  psi[0][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 1562, 1: 129}. count_sum = 1691.. posterior: [0.9236111111111112, 0.0763888888888889]
INFO:gibbs:  psi[0]

In [5]:
# Prevalence estimate: a dict with a 'mean' entry and a 'ci' entry (95% interval).
# NOTE(review): the model is fit by Gibbs sampling, so 'ci' is presumably a
# posterior credible interval rather than a frequentist confidence interval
# -- confirm against the gibbs module.
prevalence = model.get_prevalence()
prevalence

{'mean': [0.7994591510652258, 0.20054084893477436],
 'ci': [[0.779908701514776, 0.1854509388448643],
  [0.8145490611551356, 0.2200912984852239]]}

In [6]:
# Confusion matrix of labeler 0 (the one shared by all labelers): a list with
# one {'mean', 'ci'} dict per row, where 'ci' is the 95% interval.
# NOTE(review): presumably a posterior credible interval, since the model is
# fit by Gibbs sampling -- confirm against the gibbs module.
shared_confusion = model.get_confusion_matrix(labeler_id=0)
shared_confusion

[{'mean': [0.9013498486972717, 0.09865015130272797],
  'ci': [[0.8870011594070394, 0.08286080015182554],
   [0.9171391998481744, 0.11299884059296064]]},
 {'mean': [0.08230324503978892, 0.917696754960211],
  'ci': [[0.05305545194579137, 0.8777926891615542],
   [0.12220731083844576, 0.9469445480542087]]}]