In [7]:
import numpy as np
from simulator import generate_dataset_tiebreaking
from simulator import generate_labeler_confusion_matrix
from simulator import generate_dataset_tiebreaking_different_labeler_cm
from gibbs import ClaraGibbs

In [10]:
# Simulate a dataset with 1000 items and all labelers share a single confusion matrix
df = generate_dataset_tiebreaking(
    dataset_id=0,
    theta=np.array([0.8, 0.2]),
    psi=np.array([[0.9, 0.1], [0.05, 0.95]]),
    num_items=1000,
)

In [17]:
df.tail(10)

Unnamed: 0,dataset,id,labelers,ratings,true_rating
990,0,0_990,"[0, 0]","[0, 0]",0
991,0,0_991,"[0, 0]","[0, 0]",0
992,0,0_992,"[0, 0, 0]","[0, 1, 0]",0
993,0,0_993,"[0, 0]","[0, 0]",0
994,0,0_994,"[0, 0]","[0, 0]",0
995,0,0_995,"[0, 0, 0]","[1, 0, 0]",0
996,0,0_996,"[0, 0]","[0, 0]",0
997,0,0_997,"[0, 0, 0]","[1, 0, 0]",0
998,0,0_998,"[0, 0]","[0, 0]",0
999,0,0_999,"[0, 0]","[0, 0]",0


In [12]:
# Fit the model
model = ClaraGibbs(burn_in=100, num_samples=100, sample_lag=3)
model.fit(A=1, R=2, ratings=np.array(df.ratings))

INFO:gibbs:Fitting ...
INFO:gibbs: N = 1000
INFO:gibbs: R = 2
INFO:gibbs: A = 1
INFO:gibbs: C = 0
INFO:gibbs:Initializing ...
INFO:gibbs:Getting priors ...
INFO:gibbs:  theta_scale = 1.0
INFO:gibbs:  theta_mean = [0.72735632 0.27264368]
INFO:gibbs: theta_prior = dimension = 2. scale = 1.0. mean = [0.72735632 0.27264368].
INFO:gibbs:  psi_scale = [1.0, 1.0]
INFO:gibbs:  psi_mean = [[0.75, 0.25], [0.25, 0.75]]
INFO:gibbs: psi_prior = [dimension = 2. scale = 1.0. mean = [0.75 0.25]., dimension = 2. scale = 1.0. mean = [0.25 0.75].]
INFO:gibbs: phi_prior = None
INFO:gibbs:  llh = -1153.3978848368724
INFO:gibbs:  theta = prior: dimension = 2. scale = 1.0. mean = [0.72735632 0.27264368].. data: counts: {0: 770, 1: 230}. count_sum = 1000.. posterior: [0.7699573989229163, 0.23004260107708385]
INFO:gibbs:  psi[0][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 1540, 1: 132}. count_sum = 1672.. posterior: [0.9209503885236103, 0.07904961147638973]
INFO:gibbs:  psi[0

In [13]:
# Get the prevalence estimate (mean and 95% confidence interval)
model.get_prevalence()

{'mean': [0.7939633929289102, 0.2060366070710899],
 'ci': [[0.7789484079139252, 0.18708555812004088],
  [0.8129144418799592, 0.22105159208607486]]}

In [14]:
# Get the (shared) confusion matrix (mean and 95% confidence interval)
model.get_confusion_matrix(labeler_id=0)

[{'mean': [0.8939925482315203, 0.10600745176847975],
  'ci': [[0.877224217322873, 0.09350601422319929],
   [0.9064939857768007, 0.12277578267712702]]},
 {'mean': [0.06670896246401536, 0.9332910375359846],
  'ci': [[0.03135612743490523, 0.9020246660521936],
   [0.09797533394780647, 0.9686438725650948]]}]

In [22]:
# Simulate 10 different labelers, each with a separate confusion matrix
psi = generate_labeler_confusion_matrix(
    num_labelers=10, psi_mean=[0.9, 0.8], psi_std=[0.1, 0.1])

In [9]:
# Simulate a dataset with 1000 items and 10 labelers, each labeler has a separate
# confusion matrix as in psi
df2 = generate_dataset_tiebreaking_different_labeler_cm(
    dataset_id=1, theta=np.array([0.8, 0.2]), psi=psi, num_items=1000)

In [10]:
df2.tail(10)

Unnamed: 0,dataset,id,labelers,ratings,true_rating
990,1,1_990,"[6, 8]","[0, 0]",0
991,1,1_991,"[6, 8]","[1, 1]",1
992,1,1_992,"[1, 4]","[0, 0]",0
993,1,1_993,"[0, 6]","[0, 0]",0
994,1,1_994,"[1, 6]","[0, 0]",0
995,1,1_995,"[1, 5, 9]","[0, 1, 0]",0
996,1,1_996,"[0, 9, 3]","[0, 1, 0]",0
997,1,1_997,"[5, 1, 0]","[1, 0, 1]",1
998,1,1_998,"[9, 4]","[0, 0]",0
999,1,1_999,"[5, 4]","[0, 0]",1


In [13]:
# Fit the model, A=10 as there are 10 different labelers
model = ClaraGibbs(burn_in=100, num_samples=100, sample_lag=3)
model.fit(A=10, R=2, ratings=np.array(df2.ratings))

INFO:gibbs:Fitting ...
INFO:gibbs: N = 1000
INFO:gibbs: R = 2
INFO:gibbs: A = 10
INFO:gibbs: C = 0
INFO:gibbs:Initializing ...
INFO:gibbs:Getting priors ...
INFO:gibbs:  theta_scale = 1.0
INFO:gibbs:  theta_mean = [0.72747547 0.27252453]
INFO:gibbs: theta_prior = dimension = 2. scale = 1.0. mean = [0.72747547 0.27252453].
INFO:gibbs:  psi_scale = [1.0, 1.0]
INFO:gibbs:  psi_mean = [[0.75, 0.25], [0.25, 0.75]]
INFO:gibbs: psi_prior = [dimension = 2. scale = 1.0. mean = [0.75 0.25]., dimension = 2. scale = 1.0. mean = [0.25 0.75].]
INFO:gibbs: phi_prior = None
INFO:gibbs:  llh = -1294.9583313254766
INFO:gibbs:  theta = prior: dimension = 2. scale = 1.0. mean = [0.72747547 0.27252453].. data: counts: {1: 217, 0: 783}. count_sum = 1000.. posterior: [0.7829445309373945, 0.21705546906260556]
INFO:gibbs:  psi[0][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 1566, 1: 176}. count_sum = 1742.. posterior: [0.898881239242685, 0.10111876075731498]
INFO:gibbs:  psi[0

INFO:gibbs:  psi[2][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[3][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[3][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[4][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[4][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[5][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[5][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0

INFO:gibbs:  psi[8][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[8][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[9][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[9][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs: Iter 160 / 400
INFO:gibbs:  n_changes = 73 / 1000
INFO:gibbs:  llh = -1362.3351195461764
INFO:gibbs:  theta = prior: dimension = 2. scale = 1.0. mean = [0.72747547 0.27252453].. data: counts: {1: 154, 0: 846}. count_sum = 1000.. posterior: [0.8458815938744574, 0.15411840612554262]
INFO:gibbs:  psi[0][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 1607, 1: 30

INFO:gibbs:  psi[2][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[2][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[3][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[3][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[4][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[4][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[5][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0

INFO:gibbs:  psi[7][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[8][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[8][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs:  psi[9][0] = prior: dimension = 2. scale = 1.0. mean = [0.75 0.25].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.75, 0.25]
INFO:gibbs:  psi[9][1] = prior: dimension = 2. scale = 1.0. mean = [0.25 0.75].. data: counts: {0: 0, 1: 0}. count_sum = 0.. posterior: [0.25, 0.75]
INFO:gibbs: Iter 360 / 400
INFO:gibbs:  n_changes = 91 / 1000
INFO:gibbs:  llh = -1414.4434424628855
INFO:gibbs:  theta = prior: dimension = 2. scale = 1.0. mean = [0.72747547 0.27252453].. data: counts: {1: 194, 0: 806}. count_sum = 1000.. posterior: [0.

In [14]:
# Get the prevalence estimate (mean and 95% confidence interval)
model.get_prevalence()

{'mean': [0.8244130624059257, 0.17558693759407412],
 'ci': [[0.7924100654029289, 0.1429295949367314],
  [0.8570704050632685, 0.20758993459707106]]}

In [21]:
# Get the confusion matrix for each labeler(mean and 95% confidence interval)
model.get_confusion_matrix(labeler_id=0)

[{'mean': [0.8542516807971826, 0.14574831920281758],
  'ci': [[0.8321119809558635, 0.12599380774999952],
   [0.8740061922500004, 0.16788801904413636]]},
 {'mean': [0.12161179868106414, 0.8783882013189357],
  'ci': [[0.053361260633318405, 0.8120007915748899],
   [0.1879992084251101, 0.9466387393666816]]}]