# Analysis of convergence regarding BDS and HS-DS

In [42]:
import glob

In [43]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

In [44]:
from scipy.stats import kendalltau
from scipy.stats import kendalltau, spearmanr

In [45]:
# Experimental grid iterated over by the cells below.
datasets = [
    "dog",
    "face",
    "tiny",
    "adult",
]
# Class count per dataset; aligned index-by-index with `datasets`.
classes = [4, 4, 5, 5]
# Scenario and AI-accuracy labels appear verbatim in the result file names.
scenario = ["homo", "hetero", "hybrid"]
AI_accs = ["-sigma", "mean", "+sigma", "max"]
# Values of r appearing in both the human-response and result file names.
r_values = [2, 3, 5, 10]

In [46]:
# For every (dataset, r) whose human-response CSV can be read, store the
# dataset's class count and the number of distinct values in its "worker"
# column. Unreadable combinations are reported and left out of the mapping.
variable_count = {}
for i, dataset in enumerate(datasets):
    per_r = {}
    variable_count[dataset] = per_r
    for r in r_values:
        csv_path = f"../human_responses/{dataset}_r={r}.csv"
        try:
            human_data = pl.read_csv(csv_path)
        except Exception:
            print(f"Error reading {dataset} with r={r}: {dataset} (or the experimental conditions do not exist.)")
            continue
        per_r[r] = {
            "n_classes": classes[i],
            "n_workers": human_data["worker"].n_unique(),
        }

Error reading dog with r=2: dog (or the experimental conditions do not exist.)
Error reading face with r=2: face (or the experimental conditions do not exist.)
Error reading face with r=10: face (or the experimental conditions do not exist.)
Error reading tiny with r=3: tiny (or the experimental conditions do not exist.)
Error reading tiny with r=5: tiny (or the experimental conditions do not exist.)
Error reading tiny with r=10: tiny (or the experimental conditions do not exist.)
Error reading adult with r=2: adult (or the experimental conditions do not exist.)
Error reading adult with r=10: adult (or the experimental conditions do not exist.)


## Ranking Metrics
>In MCMC, the convergence of each latent variable can be diagnosed
without access to ground-truth data using the Gelman-Rubin
statistic ($\hat{R}$). Therefore, we analyzed the aggregation results of
BDS and HS-DS_MCMC by calculating the number of latent variables
that met the convergence criterion ($\hat{R}$ < 1.1). For each experimental
condition, we investigated the relationship between convergence
and performance within these five runs. Specifically, we ranked the
five outcomes for each condition based on two metrics: the number
of converged variables and the final recall score. We then computed
the correlation between these two rankings. This rank correlation
analysis revealed a critical distinction between the two methods. For
BDS, we found no meaningful correlation between the rankings,
with average correlation coefficients being negligible (Kendall’s
$\tau$ = 0.11, Spearman’s $\rho$ = 0.12). In contrast, HS-DS exhibited a moderate
positive correlation (Kendall’s $\tau$ = 0.42, Spearman’s $\rho$ = 0.45),
indicating that a higher degree of convergence is associated with
better recall. 

In [47]:
# Build one row per (dataset, r, scenario, AI accuracy, num_ai, sampling size,
# run, method) holding:
#   uc_sum   - the row-wise sum of every "uc_*" column of the result file,
#   ucc_perc - uc_sum divided by the latent-variable count of the condition,
#   recall   - the recall reported for that run.
rows = []
for dataset in datasets:
    for r in r_values:
        for sc in scenario:
            for ai_acc in AI_accs:
                try:
                    data = pl.read_csv(f"../results/{dataset}_{sc}_{r}_{ai_acc}.csv")
                except Exception:
                    # Best effort: combinations without a readable result file are skipped.
                    continue
                uc_cols = [col for col in data.columns if col.startswith("uc_")]
                # Add a new column 'uc_sum' as the sum of uc_cols
                data = data.with_columns([
                    pl.sum_horizontal(uc_cols).alias("uc_sum")
                ])
                # Keep only rows with a non-negative sum.
                # NOTE(review): negative values are presumably a "not available"
                # sentinel for methods without convergence diagnostics - confirm.
                data = data.filter(pl.col("uc_sum") >= 0)

                for num_ai in data["num_ai"].unique():
                    # Invariant across runs, sampling sizes and methods: compute once.
                    by_num_ai = data.filter(pl.col("num_ai") == num_ai)
                    n_workers = variable_count[dataset][r]["n_workers"]
                    n_classes = variable_count[dataset][r]["n_classes"]
                    # NOTE(review): presumably one n_classes x n_classes confusion
                    # matrix per human worker / AI plus n_classes class-prior
                    # entries - confirm against the model definition.
                    n_latent = (n_workers + num_ai) * n_classes * n_classes + n_classes

                    # `run_idx` rather than `iter`, so the builtin iter() is not
                    # shadowed for the rest of the notebook session.
                    for run_idx in [0, 1, 2, 3, 4]:
                        subset = by_num_ai.filter(pl.col("iter") == run_idx)
                        for s in [1000, 2000, 3000]:
                            for method in ["BDS", "HSDS_MCMC"]:
                                # Exactly one row is expected per method/sampling
                                # size; .item() raises otherwise.
                                method_row = subset.filter(
                                    pl.col("method") == f"{method}(iter_sampling={s})"
                                )
                                uc_sum = method_row["uc_sum"].item()
                                rows.append({
                                    "dataset": dataset,
                                    "r": r,
                                    "sc": sc,
                                    "ai_acc": ai_acc,
                                    "num_ai": num_ai,
                                    "s": s,
                                    "iter": run_idx,
                                    "method": method,
                                    "uc_sum": uc_sum,
                                    "ucc_perc": uc_sum / n_latent,
                                    "recall": method_row["recall"].item(),
                                })
rdf = pl.DataFrame(rows)

In [48]:
# cc_perc is the complement of ucc_perc.
ucc_fraction = pl.col("ucc_perc")
rdf = rdf.with_columns((1 - ucc_fraction).alias("cc_perc"))

In [49]:
# Rank the rows of each condition/method by recall and by cc_perc.
# Negating the metric makes the largest value receive rank 1; ties share
# their average rank.
rank_partition = ["dataset", "r", "sc", "ai_acc", "num_ai", "s", "method"]
rdf = rdf.with_columns(
    [
        (-pl.col(metric)).rank("average").over(rank_partition).alias(f"rank_{metric}")
        for metric in ("recall", "cc_perc")
    ]
)

In [50]:
# For every condition/method group, correlate the recall ranking with the
# cc_perc ranking and collect Kendall's tau and Spearman's rho with p-values.
group_cols = ["dataset", "r", "sc", "ai_acc", "num_ai", "s", "method"]
results = []

for key, condition_df in rdf.group_by(group_cols, maintain_order=True):
    recall_ranks = condition_df.get_column("rank_recall").to_numpy()
    cc_ranks = condition_df.get_column("rank_cc_perc").to_numpy()
    # Groups with fewer than two rows are skipped.
    if len(recall_ranks) <= 1 or len(cc_ranks) <= 1:
        continue
    tau, tau_p = kendalltau(recall_ranks, cc_ranks)
    rho, rho_p = spearmanr(recall_ranks, cc_ranks)
    results.append(
        {
            **dict(zip(group_cols, key)),
            "kendall_tau": tau,
            "kendall_p_value": tau_p,
            "spearman_r": rho,
            "spearman_p_value": rho_p,
        }
    )

kendall_df = pl.DataFrame(results)

In [51]:
# Mean Kendall's tau for BDS, ignoring groups whose tau is NaN.
bds_corr = kendall_df.filter(pl.col("method") == "BDS")
bds_corr.filter(pl.col("kendall_tau").is_not_nan())["kendall_tau"].mean()

0.10708938807487796

In [52]:
# Mean Spearman's rho for BDS, ignoring groups whose rho is NaN.
bds_corr = kendall_df.filter(pl.col("method") == "BDS")
bds_corr.filter(pl.col("spearman_r").is_not_nan())["spearman_r"].mean()

0.12425438700566988

In [53]:
# Mean Kendall's tau for HSDS_MCMC, ignoring groups whose tau is NaN.
hsds_corr = kendall_df.filter(pl.col("method") == "HSDS_MCMC")
hsds_corr.filter(pl.col("kendall_tau").is_not_nan())["kendall_tau"].mean()

0.4247345070761007

In [54]:
# Mean Spearman's rho for HSDS_MCMC, ignoring groups whose rho is NaN.
hsds_corr = kendall_df.filter(pl.col("method") == "HSDS_MCMC")
hsds_corr.filter(pl.col("spearman_r").is_not_nan())["spearman_r"].mean()

0.44931502298196047

## Average Improvements
> Reinforcing this finding, under the same experimental settings, HS-DS had, on average, 34% fewer unconverged variables than BDS, while its recall was 0.43 higher.

In [55]:
# For each condition, average uc_sum, accuracy and recall per (method, num_ai)
# and record, per sampling size, the BDS-vs-HSDS_MCMC differences.
rows = []
for dataset in datasets:
    for r in r_values:
        for sc in scenario:
            for ai_acc in AI_accs:
                try:
                    data = pl.read_csv(f"../results/{dataset}_{sc}_{r}_{ai_acc}.csv")
                except Exception:
                    # Combinations without a readable result file are skipped.
                    continue
                uc_cols = [col for col in data.columns if col.startswith("uc_")]
                # Sum the uc_* columns row-wise, average per (method, num_ai),
                # then keep only groups whose average sum is non-negative.
                data = (
                    data.with_columns([pl.sum_horizontal(uc_cols).alias("uc_sum")])
                    .group_by(["method", "num_ai"])
                    .agg([
                        pl.mean("uc_sum").alias("avg_uc_sum"),
                        pl.mean("accuracy").alias("avg_accuracy"),
                        pl.mean("recall").alias("avg_recall"),
                    ])
                    .filter(pl.col("avg_uc_sum") >= 0)
                )
                for num_ai in data["num_ai"].unique():
                    subset = data.filter(pl.col("num_ai") == num_ai)
                    for s in [1000, 2000, 3000]:
                        # One aggregated row is expected per method; .item() raises otherwise.
                        bds = subset.filter(pl.col("method") == f"BDS(iter_sampling={s})")
                        hsds = subset.filter(pl.col("method") == f"HSDS_MCMC(iter_sampling={s})")

                        row = {
                            "dataset": dataset,
                            "r": r,
                            "sc": sc,
                            "ai_acc": ai_acc,
                            "num_ai": num_ai,
                            "s": s,
                        }
                        row["ucc_bds"] = bds["avg_uc_sum"].item()
                        row["ucc_hsds"] = hsds["avg_uc_sum"].item()
                        row["ucc_diff"] = row["ucc_bds"] - row["ucc_hsds"]

                        counts = variable_count[dataset][r]
                        n_latent = (
                            (counts["n_workers"] + num_ai)
                            * counts["n_classes"]
                            * counts["n_classes"]
                        ) + counts["n_classes"]
                        # The difference is normalised by the latent-variable count.
                        row["ucc_diff_perc"] = row["ucc_diff"] / n_latent

                        row["bds_accuracy"] = bds["avg_accuracy"].item()
                        row["bds_recall"] = bds["avg_recall"].item()
                        row["hsds_accuracy"] = hsds["avg_accuracy"].item()
                        row["hsds_recall"] = hsds["avg_recall"].item()
                        row["accuracy_diff"] = row["hsds_accuracy"] - row["bds_accuracy"]
                        row["recall_diff"] = row["hsds_recall"] - row["bds_recall"]
                        rows.append(row)
results_df = pl.DataFrame(rows)

In [56]:
# Mean of ucc_diff_perc over all conditions, shown as a one-element Series.
results_df.select(pl.col("ucc_diff_perc").mean()).to_series()

ucc_diff_perc
f64
0.344028


In [57]:
# Mean of recall_diff over all conditions, shown as a one-element Series.
results_df.select(pl.col("recall_diff").mean()).to_series()

recall_diff
f64
0.433607
