In [None]:
import os
import json
import jsonlines
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from copy import deepcopy
from typing import List
import matplotlib.pyplot as plt
from cycler import cycler
import math


# Shared style lists: both property cycles below step through the same seven
# colours and line styles, so the lists are written once.
_cycle_colors = ["#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7", "#F0E442"]
_cycle_linestyles = ["-", "--", "-.", ":", "-", "--", "-."]
_cycle_markers = ["4", "2", "3", "1", "+", "x", "."]

# colour + line style
line_cycler = cycler(color=_cycle_colors) + cycler(linestyle=_cycle_linestyles)

# colour + line style + marker
marker_cycler = line_cycler + cycler(marker=_cycle_markers)

# Global matplotlib defaults, applied group by group in the order listed.
for _rc_group, _rc_options in {
    "axes": {"prop_cycle": line_cycler},
    "savefig": {"dpi": 200},
    "legend": {"loc": "best", "fontsize": "medium", "fancybox": True, "framealpha": 0.5},
    "lines": {"linewidth": 2.5, "markersize": 10, "markeredgewidth": 2.5},
}.items():
    plt.rc(_rc_group, **_rc_options)

In [16]:
def load_json(path: str = "") -> dict:
    """Parse the JSON file at ``path`` and return the decoded object.

    ``path`` must name an existing regular file. This is checked with an
    ``assert``, so a missing file surfaces as ``AssertionError``.
    """
    assert os.path.isfile(path)
    with open(path, "r") as handle:
        return json.loads(handle.read())


def load_jsonl(path):
    """Read a JSON-lines file and return its records as a list, in file order."""
    with jsonlines.open(path, "r") as reader:
        return list(reader)


def get_best_row(df, metric):
    """Return the row of ``df`` with the largest ``metric`` value as a dict."""
    winner_label = df[metric].idxmax()
    # idxmax yields an index *label*, so the lookup has to be label-based
    # (.loc) rather than positional (.iloc).
    return df.loc[winner_label].to_dict()


def normalize_metric_name(metric):
    """Turn a raw metric key into a short table header.

    The text between the last ``Binary`` and the first ``()`` is kept. If the
    key contains a colon, its second colon-separated field is appended in upper
    case. ``Precision``, ``Recall`` and ``F1Score`` are then replaced by their
    LaTeX symbols.
    """
    label = metric.split("Binary")[-1].split("()")[0]
    if ":" in metric:
        label = label + " " + metric.split(":")[1].upper()

    for long_form, symbol in (("Precision", "$P$"), ("Recall", "$R$"), ("F1Score", "$F_1$")):
        label = label.replace(long_form, symbol)
    return label


def rename_model(name):
    """Map a model identifier (path, hub name or label) to its display label.

    Matching is by substring on the lower-cased identifier, and the first
    matching rule wins, so rule order matters: more specific needles such as
    "multilingual-e5-large" must come before "e5-large", and "xlm-roberta-base"
    before "roberta-base".

    Non-string inputs are returned unchanged. The short labels "LR" and the two
    SVM labels map to themselves, so already-renamed values can be passed again.
    If no rule matches, a message is printed and the lower-cased identifier is
    returned.
    """
    if not isinstance(name, str):
        return name

    name = name.lower()

    # NOTE(review): the labels containing "\text" are ordinary (non-raw) string
    # literals, so "\t" is a tab character there. They are kept exactly as they
    # are because other cells compare against labels written the same way;
    # switching to raw strings must be done in all of those places together.
    if "cross-en-de-roberta-sentence-transformer" in name:
        name = "Cross$_{en-de}$"
    elif "xlm-r-distilroberta" in name:
        name = "Para"
    elif "simcse-xlmrb-ssoar-noprefix" in name:
        name = "SoSSe-XLM-R$_{noprefix}$"
    elif "simcse-xlmrb-ssoar-prefix" in name:
        name = "SoSSe-XLM-R$_{prefix}$"
    elif "specter" in name:
        name = "SPECTER"
    elif "bert-base-multilingual" in name:
        # "uncased" contains "cased", so it has to be tested first
        if "uncased" in name:
            name = "mBERT-unca"
        elif "cased" in name:
            name = "mBERT-ca"
        else:
            name = "mBERT"
    elif "distilbert" in name:
        name = "DistilBERT"
    elif "multilingual-e5-small" in name:
        name = "mE5$_{small}$"
    elif "multilingual-e5-base" in name:
        name = "mE5$_{base}$"
    elif "multilingual-e5-large" in name:
        name = "mE5$_{large}$"
    elif "e5-large" in name:
        name = "E5$_{large}$"
    elif "e5-base" in name:
        name = "E5$_{base}$"
    elif "multilingual-minilm" in name:
        name = "mMiniLM"
    elif "minilm" in name:
        name = "MiniLM"
    elif "multilingual-mpnet" in name:
        name = "mMPNet"
    elif "t5" in name:
        name = "Sentence-T5"
    elif "ssci" in name:
        name = "SsciBERT"
    elif "scibert" in name:
        name = "SciBERT"
    elif "xlm-roberta-base" in name:
        name = "XLM-R$_{\text{base}}$"
    elif "xlm-roberta-large" in name:
        name = "XLM-R$_{\text{large}}$"
    elif "xlm-v-base" in name:
        name = "XLM-V$_{\text{base}}$"
    elif "mdeberta-v3-base" in name:
        name = "mDeBERTa$_{\text{base}}$"
    elif "deberta-v3-base" in name:
        name = "DeBERTa$_{\text{base}}$"
    elif "deberta-v3-large" in name:
        name = "DeBERTa$_{\text{large}}$"
    elif "roberta-base" in name:
        name = "RoBERTa$_{\text{base}}$"
    elif "roberta-large" in name:
        name = "RoBERTa$_{\text{large}}$"
    elif "xlmr_large" in name:
        name = "SoSci-XLM-R$_{\text{large}}$"
    elif "xlmr_steps" in name:
        name = "SoSci-XLM-R$_{\text{base}}$"
    elif "mbert_steps" in name:
        name = "SoSci-mBERT"
    elif "bert-base" in name:
        name = "BERT$_{\text{base}}$"
    elif "bert-large" in name:
        name = "BERT$_{\text{large}}$"
    elif "logisticregression" in name or name == "lr":
        # "lr" is the lower-cased form of the short label itself, so passing
        # "LR" back in keeps it as "LR" instead of falling through
        name = "LR"
    elif "linearsvm" in name or "{linear}" in name:
        name = "SVM$_{linear}$"
    elif "svm" in name or "{non-linear}" in name:
        name = "SVM$_{non-linear}$"
    elif "mistral-7b" in name:
        name = "Mistral-7B"
    elif "mixtral-8x7b" in name:
        name = "Mixtral-8x7B"
    else:
        print("No matching rules for model:", name)

    return name


def rename_metric(name):
    """Shorten a retrieval metric name for display.

    Only the first matching rule is applied, tested in the order ``recall``,
    ``map``, ``ndcg``. When none matches, a message is printed and the name is
    returned unchanged.
    """
    for needle, shown in (("recall", "r"), ("map", "MAP"), ("ndcg", "nDCG")):
        if needle in name:
            return name.replace(needle, shown)

    print("No matching rules for metric:", name)
    return name


def bold_str(s):
    """Wrap the string ``s`` in a LaTeX bold-text command."""
    return "".join(("\\textbf{", s, "}"))

In [17]:
# --- Locations ---------------------------------------------------------------
runs_dir: str = "../results/runs_sv4_journal_paper_results"   # one sub-directory per experiment run
test_path: str = "../data/sild/diff_test.tsv"                 # tab-separated test set
output_dir: str = "../results/results_15-03-2024"             # root for the generated tables

# --- Aggregation -------------------------------------------------------------
save_results: bool = True
save_tables: bool = True
group_output: bool = False
groupings: List[str] = ["cmodel", "rmodel"]
sortings: str = "weight"

# --- Metrics -----------------------------------------------------------------
metrics: List[str] = ["test_lang:en:BinaryPrecision()_mean", "test_lang:en:BinaryRecall()_mean", "test_lang:en:BinaryF1Score()_mean"]
max_metric: str = "test_lang:en:BinaryF1Score()_mean"

# --- Figures -----------------------------------------------------------------
plot_figures: bool = False
n_columns: int = 2
fontsize: float = 18.0
figure_height: int = 30
figure_width: int = 30
font: str = "serif"

# --- Table switches ----------------------------------------------------------
table_1: bool = False
table_2: bool = False
table_3: bool = False
table_35: bool = False
table_4: bool = False

In [18]:
# Aggregate every run directory below `runs_dir`:
#   * `full_df`  - one row per model (mean / std / raw list per metric), plus
#                  one row per retriever model and weight group where present
#   * `raw_df`   - the raw per-metric score lists of each model
#   * for each model directory, the files whose name contains "preds" are
#     merged into a single `test_predictions.tsv`
results_dirs = [os.path.join(runs_dir, d) for d in os.listdir(runs_dir)]

# The test set does not depend on the run directory, so it is read once.
test_df = None
if test_path:
    test_df = pd.read_csv(test_path, sep="\t").rename(columns={"sentence": "text", "is_variable": "label"})
    test_df.index = test_df["uuid"]

result_frames = []      # one DataFrame per run directory, concatenated once after the loop
full_raw_results = []

for results_dir in tqdm(results_dirs):
    print(f"Working on results dir: {results_dir}")
    # The run type is the directory name up to its first "-".
    results_dir_type = os.path.basename(results_dir).split("-")[0]

    if os.path.isfile(results_dir):
        continue

    model_dirs = [os.path.join(results_dir, d) for d in os.listdir(results_dir)]

    full_results = []

    for model_dir in model_dirs:
        if os.path.isfile(model_dir) or "figure" in model_dir or "tables" in model_dir:
            continue

        # The algorithm is inferred from the directory path; "linearsvm" must
        # be tested before the more general "svm".
        if "knn" in model_dir:
            algorithm_name = "KNN"
        elif "rnn" in model_dir:
            algorithm_name = "RNN"
        elif "kmeans" in model_dir:
            algorithm_name = "KMeans"
        elif "logisticregression" in model_dir:
            algorithm_name = "LR"
        elif "linearsvm" in model_dir:
            algorithm_name = "SVM$_{linear}$"
        elif "svm" in model_dir:
            algorithm_name = "SVM$_{non-linear}$"
        elif "mistral" in model_dir:
            algorithm_name = "Mistral"
        else:
            algorithm_name = "Unk"

        avg_results = {}
        raw_results = {}

        results_path = os.path.join(model_dir, "results.json")
        if not os.path.isfile(results_path):
            # Fixed: this used `or`, which is true for any path that does not
            # contain both words, so the exclusion never took effect.
            if "tables" not in results_path and "figures" not in results_path:
                print(f"Results file does not exist: {results_path}")
            continue

        results = load_json(results_path)

        if "rac" in results_dir_type:
            # These result files are flat: keep scalars and the first element of lists.
            _row = {"algorithm": "rac", "model_dir": model_dir, "result_type": results_dir_type, "rmodel": os.path.basename(model_dir)}
            for k, v in results.items():
                _row[k] = v[0] if isinstance(v, list) else v
            full_results.append(_row)
            continue

        config_path = os.path.join(model_dir, "config.json")
        if os.path.isfile(config_path):
            config = load_json(config_path)

            config_key = list(config.keys())[0]  # each run fold has identical parameters
            run_config = {seed: cfg["run"] for seed, cfg in config[config_key].items()}
            run_key = list(run_config.keys())[0]  # each seed has identical parameters

            cmodel = run_config[run_key]["classification_model_name_or_path"]
            ret_test = run_config[run_key]["do_retrieval_during_inference"]
        else:
            ret_test = ""
            cmodel = os.path.basename(model_dir)

        avg_ret_results = {}
        for metric, seed_results in results.items():
            if metric != "retriever_results":
                # Pool the values of all seeds before averaging.
                _results = []
                for vals in seed_results.values():
                    _results.extend(vals)

                avg_results[metric+"_mean"] = np.mean(_results)
                avg_results[metric+"_std"] = np.std(_results)
                avg_results[metric+"_raw"] = _results

                raw_results[metric] = _results
            else:
                # Nested as seed -> retriever model -> metric key -> values;
                # pool the seeds per retriever model and metric key.
                _results = defaultdict(lambda: defaultdict(list))
                for seed_values in seed_results.values():
                    for model_name, model_values in seed_values.items():
                        for ret_metric, metric_values in model_values.items():
                            _results[model_name][ret_metric].extend(metric_values)

                for model_name, metric_values_list in _results.items():
                    # Sorted so that the row order is the same on every run.
                    weight_groups = sorted({k.split("weight=")[-1] for k in metric_values_list})
                    for group_name in weight_groups:
                        model_avg_ret_results = deepcopy(avg_ret_results)
                        model_avg_ret_results["rmodel"] = model_name
                        model_avg_ret_results["weight"] = group_name
                        for ret_metric, values in metric_values_list.items():
                            if group_name in ret_metric:
                                metric_name = ret_metric.split(group_name)[0].split("_weight")[0]
                                model_avg_ret_results[metric_name+"_mean"] = np.mean(values)
                                model_avg_ret_results[metric_name+"_std"] = np.std(values)
                        full_results.append(model_avg_ret_results)

        if avg_results:
            avg_results["retrieval_test"] = ret_test
            avg_results["cmodel"] = cmodel
            avg_results["model_dir"] = model_dir
            avg_results["algorithm"] = algorithm_name
            avg_results["result_type"] = results_dir_type
            full_results.append(avg_results)

        if raw_results:
            raw_results["retrieval_test"] = ret_test
            raw_results["cmodel"] = cmodel
            raw_results["model_dir"] = model_dir
            raw_results["algorithm"] = algorithm_name
            raw_results["result_type"] = results_dir_type
            full_raw_results.append(raw_results)

        pred_files = [os.path.join(model_dir, f) for f in os.listdir(model_dir) if "preds" in f]

        if pred_files:
            # Collect every prediction made for each uuid across the files.
            preds = defaultdict(list)
            labels = {}
            for f in pred_files:
                df = pd.read_csv(f, sep="\t")
                for i in range(df.shape[0]):
                    row = df.iloc[i]
                    uuid = row["uuid"]
                    pred = row["pred"]
                    label = row["label"]
                    preds[uuid].append(str(pred))
                    labels[uuid] = label

            if test_path:
                pred_data = []
                for i in range(test_df.shape[0]):
                    row = test_df.iloc[i]
                    uuid = row["uuid"]
                    if uuid in preds:
                        # The label in the prediction file must agree with the test set.
                        assert row["label"] == labels[uuid]
                        new_row = {"uuid": uuid, "label": row["label"], "preds": ",".join(preds[uuid]), "text": row["text"]}
                        pred_data.append(new_row)

                pred_df = pd.DataFrame(pred_data)
                output_path = os.path.join(model_dir, "test_predictions.tsv")
                pred_df.to_csv(output_path, index=False, sep="\t")

    if save_results:
        # NOTE(review): this path is computed but nothing is ever written to
        # it - confirm whether a per-directory results.tsv should be saved.
        output_file = os.path.join(results_dir, "results.tsv")

    # Aggregation no longer depends on `save_results`: the later table cells
    # need `full_df` whether or not anything is saved.
    if full_results:
        result_frames.append(pd.DataFrame(full_results))

full_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
full_df = full_df.reset_index()
raw_df = pd.DataFrame(full_raw_results)

  6%|▋         | 2/31 [00:00<00:01, 18.31it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/cluster-2024-03-26_16-28
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval-2024-03-13_13-41_10
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_contextwindow-2024-04-20_20-53
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic-2024-03-11_09-20_100k


 19%|█▉        | 6/31 [00:00<00:02, 12.24it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval_mixtral-2024-03-13_13-39_0
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/cluster_tfidf-2024-05-06_15-52


 26%|██▌       | 8/31 [00:00<00:01, 14.15it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval-2024-03-13_13-41_0
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/rac-2024-03-27
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval_mixtral-2024-03-13_13-39_10
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic-2024-03-11_09-21_400k-800k


 32%|███▏      | 10/31 [00:00<00:01, 14.27it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/linear-2024-05-17_10-09
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_contextwindow-2024-05-06_17-20
Results file does not exist: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_contextwindow-2024-05-06_17-20/FacebookAI--xlm-roberta-large-finetuned_do-retrieval=False_20240510-091909/results.json


 45%|████▌     | 14/31 [00:01<00:01, 12.07it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic_tentrials-2024-05-13_21-22
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval-2024-03-13_13-41_50
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic-2024-03-29_best


 58%|█████▊    | 18/31 [00:01<00:00, 13.39it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_diff_vs_rand-2024-05-06_23-04
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/cluster_sim-2024-04-19_08-58
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval_cp-2024-03-11_21-27
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic-2024-03-11_09-26_1k


 65%|██████▍   | 20/31 [00:01<00:00, 11.35it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic_fast_v2-2024-05-13_22-55
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval-2024-03-13_13-41_100
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/test_contextwindows-2024-05-13_16-57


 77%|███████▋  | 24/31 [00:01<00:00, 12.24it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic_best_fast-2024-05-08_20-52
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic_fast-2024-05-13_19-19
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval_mixtral-2024-03-13_13-39_50


 84%|████████▍ | 26/31 [00:02<00:00, 10.38it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval_mixtral-2024-03-13_13-39_20


 90%|█████████ | 28/31 [00:02<00:00, 11.34it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic_fast_v3-2024-05-14_00-44
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/llmeval-2024-03-13_13-41_20
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/_linear-2024-03-02_10-09


 97%|█████████▋| 30/31 [00:02<00:00, 11.96it/s]

Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune_synthetic-2024-04-22_08-18
Working on results dir: ../results/runs_sv4_journal_paper_results_14-03-2024/finetune-2024-03-11_11-26


100%|██████████| 31/31 [00:02<00:00, 11.90it/s]


In [4]:
from deepsig import multi_aso


def make_stat_sig(df, metric, model_col="cmodel", return_df=False, seed=42, num_jobs=24, confidence_level=0.95, output_path=None, rename_models=True):
    """Run ``multi_aso`` over the score lists in column ``metric``.

    Each row of ``df`` contributes one entry: the value in column ``metric`` is
    its score collection, and the value in ``model_col`` is its label (passed
    through ``rename_model`` when ``rename_models`` is true). The result of
    ``multi_aso`` gets the labels as column names, and the labels are appended
    as a "Model" column that becomes the index. With ``output_path`` set, the
    table is also written there as LaTeX.

    Returns the labelled table as a DataFrame.
    """
    rows = [df.iloc[pos] for pos in range(df.shape[0])]
    all_model_scores = [row[metric] for row in rows]
    models = [rename_model(row[model_col]) if rename_models else row[model_col] for row in rows]

    scores = multi_aso(all_model_scores, confidence_level=confidence_level, return_df=return_df, seed=seed, num_jobs=num_jobs)
    label_column = np.array(models).reshape(len(models), 1)
    labelled = np.concatenate((scores, label_column), axis=1)

    table = pd.DataFrame(labelled, columns=models+["Model"]).set_index("Model")

    if output_path:
        table.to_latex(output_path)

    return table


def find_significant_rows(eps_min, p_val=0.2):
    """List the rows whose scores against all other models stay below a threshold.

    For each row, the column carrying the row's own label is ignored. The row
    is reported when the largest remaining value, cast to float, is at most
    ``p_val``.

    Returns a list of ``(row_position, row_label)`` tuples.
    """
    hits = []

    for pos, row_label in enumerate(eps_min.index):
        other_columns = eps_min.columns != row_label
        rivals = eps_min.loc[:, other_columns].iloc[pos].values.astype(np.float64)
        if float(max(rivals)) <= p_val:
            hits.append((pos, row_label))

    return hits


def mark_significant_rows(df, significant_rows):
    """Add a significance star to the flagged cells of ``df``, in place.

    ``significant_rows`` holds ``(row_label, model, column)`` triples; the
    middle element is not used. A cell without a star gets ``^*`` in front of
    its subscript, and a cell that already has one star gets ``^{**}``.

    Returns the same (mutated) DataFrame.
    """
    for row_label, _model, col in significant_rows:
        cell = df.loc[row_label, col]
        # Upgrade an existing star first so the second replace does not touch it.
        cell = cell.replace("$^*_", "$^{**}_")
        df.loc[row_label, col] = cell.replace("$_", "$^*_")

    return df


def add_significance_to_table(final_df, raw_df, raws, model_col="cmodel", p_val=0.2, output_dir=None, rename_models=True):
    """Star the cells of ``final_df`` that ``find_significant_rows`` flags.

    For every raw-score column named in ``raws``, a comparison table is built
    from ``raw_df`` with ``make_stat_sig`` (and written to
    ``<output_dir>/<metric>.tex`` when ``output_dir`` is given). The flagged
    rows are then marked in the ``final_df`` column named by
    ``normalize_metric_name(metric)``.

    Returns ``final_df``; it is returned untouched when nothing is flagged.
    """
    flagged = []
    for metric in raws:
        tex_path = os.path.join(output_dir, metric+".tex") if output_dir else None
        eps_min = make_stat_sig(raw_df, metric, model_col=model_col, return_df=True, output_path=tex_path, rename_models=rename_models)
        column = normalize_metric_name(metric)
        flagged.extend((pos, label, column) for pos, label in find_significant_rows(eps_min, p_val))

    return mark_significant_rows(final_df, flagged) if flagged else final_df

In [5]:
def pre_processing(df, metrics, model_cols, filter_criteria):
    """Filter ``df`` and reduce it to the columns needed for a results table.

    Args:
        df: frame with one row per model.
        metrics: names of the ``*mean*`` metric columns to keep; the matching
            std and raw column names are derived by replacing "mean" with
            "std" and "raw".
        model_cols: identifying columns to keep. The list is not modified.
        filter_criteria: ``(column, allowed_values)`` pairs. Criteria whose
            column is missing from ``df`` are ignored. For "weight" and
            "algorithm" a non-empty ``allowed_values`` filters the rows;
            "algorithm" and "rmodel" are also added to the kept columns.

    Returns:
        ``(filtered_df, std_column_names, raw_column_names)``.
    """
    # Work on a copy so the caller's list does not grow on every call.
    selected_cols = list(model_cols)

    for crit_name, crit_vals in filter_criteria:
        if crit_name not in df:
            continue

        if crit_name in ("weight", "algorithm") and crit_vals:
            df = df[df[crit_name].isin(crit_vals)]

        # Skip names already present so no column is selected twice.
        if crit_name in ("algorithm", "rmodel") and crit_name not in selected_cols:
            selected_cols.append(crit_name)

    stds = [m.replace("mean", "std") for m in metrics]
    raws = [m.replace("mean", "raw") for m in metrics]
    df = df[selected_cols+metrics+stds+raws]

    return df, stds, raws


def format_scores(df, metrics, stds, cols):
    """Render every metric of every row as a "mean with std subscript" string.

    For each pair of mean/std column names in ``metrics`` / ``stds``, a new
    entry keyed by ``normalize_metric_name(mean_column)`` is added to a copy of
    the row. The mean is shown as a percentage with one decimal, the standard
    deviation as a whole-number percentage in a LaTeX plus-minus subscript.

    Returns:
        ``(rows, cols_to_keep)``: the list of augmented row Series, and
        ``cols`` followed by the generated column names.
    """
    new_rows = []
    metric_cols = []
    for i in range(df.shape[0]):
        row = deepcopy(df.iloc[i])
        for m, s in zip(metrics, stds):
            name = normalize_metric_name(m)
            if name not in metric_cols:
                metric_cols.append(name)
            rounded_mean = str(round(row[m]*100, 1))
            # Format as a whole number directly. The earlier replace() chain
            # stripped digits, turning 10 into "1" and 0 into an empty string.
            rounded_std = "{:.0f}".format(row[s]*100)
            row[name] = rounded_mean+"$_{\\pm"+rounded_std+"}$"
        new_rows.append(row)
    cols_to_keep = cols + list(metric_cols)

    return new_rows, cols_to_keep


def format_table(df, model_order=None, index_col="cmodel", drop_columns=["algorithm"], merge_example_columns=False, bold_table=True):
    """Turn a scored frame into the final table layout.

    Steps: rename the values of "cmodel" with ``rename_model``, rename
    ``index_col`` to "Model" and use it as the index, order the rows, prefix
    the model with its algorithm when several algorithms are present, drop
    ``drop_columns``, and optionally bold the model names and column headers.

    Args:
        df: frame with a "cmodel" column. It needs an "algorithm" column unless
            ``index_col`` is "algorithm", and an "Examples" column when
            ``model_order`` is empty. The input frame is not modified.
        model_order: labels in the desired row order; labels missing from the
            table are skipped. When empty, rows are sorted by "Examples" and
            that column is dropped.
        index_col: column that becomes "Model".
        drop_columns: columns removed before returning (never mutated here).
        merge_example_columns: when sorting by "Examples", append the example
            count to the model name as a subscript.
        bold_table: wrap model names and column headers with ``bold_str``.

    Returns:
        The formatted DataFrame.
    """
    # Copy so the caller's frame keeps its original "cmodel" values.
    df = df.copy()
    df["cmodel"] = df["cmodel"].apply(rename_model)
    df = df.rename(columns={index_col: "Model"})
    df.index = df["Model"]
    if model_order:
        df = df.loc[[m for m in model_order if m in df.index]]
    else:
        df = df.sort_values(by="Examples")
        if merge_example_columns:
            df["Model"] = df.apply(lambda x: str(x.Model)+"$_{"+str(x.Examples)+"}$", axis=1)
        df = df.drop(columns=["Examples"])

    if index_col != "algorithm":
        if df["algorithm"].unique().shape[0] > 1:
            # Doubled braces emit literal "{" and "}" so the whole model name
            # is the subscript group. Single braces were consumed by the
            # f-string, leaving the subscript without its group.
            df["Model"] = df.apply(lambda x: f"{x.algorithm}$_{{{x.Model}}}$", axis=1)

    df = df.drop(columns=drop_columns)
    if bold_table:
        df["Model"] = df["Model"].apply(bold_str)
        df = df.rename(bold_str, axis="columns")

    return df

In [6]:
def make_table_knn(df, metrics, model_order, keep_weights=None, keep_algorithms=["KNN"], p_val=0.05, output_dir=None):
    """Build the LaTeX table for the nearest-neighbour classifiers.

    Rows are filtered to ``keep_algorithms`` (and ``keep_weights`` when given),
    scores are formatted, significance stars are added with threshold ``p_val``
    and rows are ordered by ``model_order``. With ``output_dir`` set, the
    directory is created and the table is also written to ``knn.tex`` there.

    Returns the LaTeX source with the metric names replaced by their symbols.
    """
    caption = "Nearest-neighbor classification performance across models for each language using KNN."

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    criteria = [("algorithm", keep_algorithms), ("weight", keep_weights), ("rmodel", None)]
    df, stds, raws = pre_processing(df, metrics, model_cols=["cmodel"], filter_criteria=criteria)

    new_rows, cols_to_keep = format_scores(df, metrics, stds, ["cmodel", "algorithm"])
    scored = pd.DataFrame(new_rows)
    final_df = scored[cols_to_keep].reset_index(drop=True)
    raw_df = scored[cols_to_keep+raws].reset_index(drop=True)

    final_df = add_significance_to_table(final_df, raw_df, raws, p_val=p_val, output_dir=output_dir)

    print(final_df["cmodel"].unique())

    final_df = format_table(final_df, model_order)

    if output_dir:
        final_df.to_latex(os.path.join(output_dir, "knn.tex"), index=False, caption=caption)

    latex = final_df.to_latex(index=False, caption=caption)
    for long_form, symbol in (("Precision", "$P$"), ("Recall", "$R$"), ("F1Score", "$F_1$")):
        latex = latex.replace(long_form, symbol)
    return latex

In [None]:
# KNN table for the "cluster" and "cluster_tfidf" runs.
# NOTE(review): "\text" inside these ordinary string literals begins with a tab
# escape. rename_model spells its labels the same way, so the entries still
# match; switch both to raw strings together if proper LaTeX is wanted.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT",
    "E5$_{small}$", "mE5$_{small}$", "E5$_{base}$", "mE5$_{base}$", "E5$_{large}$", "mE5$_{large}$",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_knn(
    full_df[full_df["result_type"].isin(["cluster", "cluster_tfidf"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "knn"),
))

In [None]:
# KNN table for the "cluster_sim" runs, written below <output_dir>/knn-sim.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT",
    "E5$_{small}$", "mE5$_{small}$", "E5$_{base}$", "mE5$_{base}$", "E5$_{large}$", "mE5$_{large}$",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_knn(
    full_df[full_df["result_type"].isin(["cluster_sim"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "knn-sim"),
))

In [None]:
# Same table layout for the "finetune_synthetic_best_fast" runs; printed only,
# no output directory is passed.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT",
    "E5$_{small}$", "mE5$_{small}$", "E5$_{base}$", "mE5$_{base}$", "E5$_{large}$", "mE5$_{large}$",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_knn(
    full_df[full_df["result_type"].isin(["finetune_synthetic_best_fast"])],
    metrics,
    model_order,
))

In [None]:
# NOTE(review): this repeats the "cluster_sim" table but writes it below
# <output_dir>/knn, the directory the "cluster"/"cluster_tfidf" cell also
# writes to - confirm that overwriting knn.tex there is intended.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT",
    "E5$_{small}$", "mE5$_{small}$", "E5$_{base}$", "mE5$_{base}$", "E5$_{large}$", "mE5$_{large}$",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_knn(
    full_df[full_df["result_type"].isin(["cluster_sim"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "knn"),
))

In [13]:
def make_table_linear(df, metrics, model_order, keep_weights=None, keep_algorithms=["LR", "SVM$_{linear}$", "SVM$_{non-linear}$"], p_val=0.05, output_dir=None):
    """Build the LaTeX table for the lexical (LR / SVM) classifiers.

    Works like the other table builders, except that rows are identified by
    their "algorithm" value: significance is computed per algorithm, and the
    algorithm becomes the "Model" column while "cmodel" is dropped. With
    ``output_dir`` set, the table is also written to ``linear.tex`` there.

    Returns the LaTeX source with the metric names replaced by their symbols.
    """
    caption = "Results for lexical classifiers. LR stands for logistic regression and SVM stands for support vector machine, marked with the subscripts for the linear variant and the non-linear kernel variants."

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    criteria = [("algorithm", keep_algorithms), ("weight", keep_weights), ("rmodel", None)]
    df, stds, raws = pre_processing(df, metrics, model_cols=["cmodel"], filter_criteria=criteria)

    new_rows, cols_to_keep = format_scores(df, metrics, stds, ["cmodel", "algorithm"])
    scored = pd.DataFrame(new_rows)
    final_df = scored[cols_to_keep].reset_index(drop=True)
    raw_df = scored[cols_to_keep+raws].reset_index(drop=True)

    final_df = add_significance_to_table(final_df, raw_df, raws, model_col="algorithm", p_val=p_val, output_dir=output_dir)

    final_df = format_table(final_df, model_order, index_col="algorithm", drop_columns=["cmodel"])

    if output_dir:
        final_df.to_latex(os.path.join(output_dir, "linear.tex"), index=False, caption=caption)

    latex = final_df.to_latex(index=False, caption=caption)
    for long_form, symbol in (("Precision", "$P$"), ("Recall", "$R$"), ("F1Score", "$F_1$")):
        latex = latex.replace(long_form, symbol)
    return latex

In [14]:
# Lexical-classifier table for the "linear" runs (English metrics only).
model_order = ["LR", "SVM$_{linear}$", "SVM$_{non-linear}$"]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
]
print(make_table_linear(
    full_df[full_df["result_type"].isin(["linear"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "linear"),
))

No matching rules for model: lr


Model comparisons: 100%|█████████▉| 2997/3000 [00:03<00:00, 925.31it/s] 


No matching rules for model: lr


Model comparisons: 100%|█████████▉| 2997/3000 [00:03<00:00, 958.03it/s] 


No matching rules for model: lr


Model comparisons: 100%|█████████▉| 2997/3000 [00:03<00:00, 936.26it/s] 

\begin{table}
\caption{Results for lexical classifiers. LR stands for logistic regression and SVM stands for support vector machine, marked with the subscripts for the linear variant and the non-linear kernel variants.}
\begin{tabular}{llll}
\toprule
\textbf{Model} & \textbf{$P$ EN} & \textbf{$R$ EN} & \textbf{$F_1$ EN} \\
\midrule
\textbf{LR} & 86.4$^*_{\pm4}$ & 7.8$_{\pm4}$ & 14.0$_{\pm6}$ \\
\textbf{SVM$_{linear}$} & 76.7$_{\pm1}$ & 15.3$_{\pm1}$ & 25.5$_{\pm2}$ \\
\textbf{SVM$_{non-linear}$} & 66.7$_{\pm3}$ & 19.3$^*_{\pm1}$ & 29.9$^*_{\pm1}$ \\
\bottomrule
\end{tabular}
\end{table}






In [19]:
def make_table_finetune(df, metrics, model_order, keep_weights=None, keep_algorithms=["Unk"], p_val=0.05, output_dir=None):
    """Build the LaTeX table for the fine-tuned transformer runs.

    Only rows whose renamed "cmodel" appears in ``model_order`` are kept.
    After that the steps match the other table builders: filter by
    ``keep_algorithms`` / ``keep_weights``, format the scores, add significance
    stars with threshold ``p_val`` and order the rows by ``model_order``. With
    ``output_dir`` set, the table is also written to ``finetune.tex`` there.

    The input frame is not modified.

    Returns the LaTeX source with the metric names replaced by their symbols.
    """
    caption = "Fine-tuned transformers."

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Select with a boolean mask instead of adding a helper column: callers
    # pass a slice of the full frame, and writing a column into it mutates
    # their data and triggers SettingWithCopyWarning.
    wanted = df["cmodel"].apply(rename_model).isin(model_order)
    df = df[wanted]

    df, stds, raws = pre_processing(df, metrics, model_cols=["cmodel"], filter_criteria=[("algorithm", keep_algorithms), ("weight", keep_weights), ("rmodel", None)])

    new_rows, cols_to_keep = format_scores(df, metrics, stds, ["cmodel", "algorithm"])
    final_df = pd.DataFrame(new_rows)[cols_to_keep].reset_index(drop=True)
    raw_df = pd.DataFrame(new_rows)[cols_to_keep+raws].reset_index(drop=True)

    final_df = add_significance_to_table(final_df, raw_df, raws, p_val=p_val, output_dir=output_dir)

    final_df = format_table(final_df, model_order)

    if output_dir:
        final_df.to_latex(os.path.join(output_dir, "finetune.tex"), index=False, caption=caption)

    return final_df.to_latex(index=False, caption=caption).replace("Precision", "$P$").replace("Recall", "$R$").replace("F1Score", "$F_1$")

In [None]:
# Three fine-tuning tables over the "finetune" runs; they differ only in the
# models selected and in the output sub-directory.
# NOTE(review): "\text" inside these ordinary string literals begins with a tab
# escape. rename_model spells its labels the same way, so the entries still
# match; switch both to raw strings together if proper LaTeX is wanted.
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]

model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$", "XLM-V$_{\text{base}}$",
    "mDeBERTa$_{\text{base}}$", "DeBERTa$_{\text{base}}$", "DeBERTa$_{\text{large}}$",
    "SoSci-mBERT", "SoSci-XLM-R",
    "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT", "E5", "mE5",
]
print(make_table_finetune(
    full_df[full_df["result_type"].isin(["finetune"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "finetune_mono"),
))

model_order = [
    "BERT$_{\text{base}}$", "BERT$_{\text{large}}$",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "DeBERTa$_{\text{base}}$", "DeBERTa$_{\text{large}}$",
    "SciBERT", "SPECTER", "SsciBERT",
]
print(make_table_finetune(
    full_df[full_df["result_type"].isin(["finetune"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "finetune_ssoar"),
))

model_order = [
    "mBERT", "mBERT-ca", "mBERT-unca",
    "XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$", "XLM-V$_{\text{base}}$",
    "mDeBERTa$_{\text{base}}$",
]
print(make_table_finetune(
    full_df[full_df["result_type"].isin(["finetune"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "finetune_multi"),
))

In [None]:
# Fine-tuning table over the "finetune" and "finetune_diff_vs_rand" runs.
model_order = [
    "BERT$_{\text{large}}$", "RoBERTa$_{\text{large}}$", "DeBERTa$_{\text{large}}$",
    "mBERT-unca",
    "XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$",
    "SoSci-mBERT", "SoSci-XLM-R$_{\text{base}}$",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_finetune(
    full_df[full_df["result_type"].isin(["finetune", "finetune_diff_vs_rand"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "finetune_diff_vs_rand"),
))

In [None]:
# Fine-tuning table over every row whose algorithm is "Unk", regardless of run type.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "XLM-R", "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT", "E5", "mE5",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(make_table_finetune(
    full_df[full_df["algorithm"].isin(["Unk"])],
    metrics,
    model_order,
    output_dir=os.path.join(output_dir, "finetune_unk"),
))

In [None]:
# Plain percentage table (one decimal) for the "test_contextwindows" runs.
# `model_order` is assigned here but not used by the print below.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "XLM-R", "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT", "E5", "mE5",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(
    full_df[full_df["result_type"].isin(["test_contextwindows"])][metrics]
    .apply(lambda x: round(x*100,1))
    .to_latex(index=False)
    .replace("00000", "")
)

In [23]:
# Plain percentage table (one decimal) for the "finetune_synthetic_best_fast" runs.
# `model_order` is assigned here but not used by the print below.
model_order = [
    "BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT",
    "RoBERTa$_{\text{base}}$", "RoBERTa$_{\text{large}}$",
    "XLM-R", "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT", "E5", "mE5",
]
metrics: List[str] = [
    "test_lang:en:BinaryPrecision()_mean",
    "test_lang:en:BinaryRecall()_mean",
    "test_lang:en:BinaryF1Score()_mean",
    "test_lang:de:BinaryPrecision()_mean",
    "test_lang:de:BinaryRecall()_mean",
    "test_lang:de:BinaryF1Score()_mean",
    "test_BinaryF1Score()_mean",
]
print(
    full_df[full_df["result_type"].isin(["finetune_synthetic_best_fast"])][metrics]
    .apply(lambda x: round(x*100,1))
    .to_latex(index=False)
    .replace("00000", "")
)

\begin{tabular}{rrrrrrr}
\toprule
test_lang:en:BinaryPrecision()_mean & test_lang:en:BinaryRecall()_mean & test_lang:en:BinaryF1Score()_mean & test_lang:de:BinaryPrecision()_mean & test_lang:de:BinaryRecall()_mean & test_lang:de:BinaryF1Score()_mean & test_BinaryF1Score()_mean \\
\midrule
43.6 & 26.3 & 32.5 & 49.8 & 19.6 & 27.3 & 31.1 \\
46.2 & 22.2 & 29.5 & 57.2 & 23.0 & 32.2 & 30.3 \\
20.5 & 10.1 & 13.6 & 22.4 & 10.3 & 14.1 & 13.7 \\
\bottomrule
\end{tabular}



In [None]:
def make_table_icl(df, metrics, keep_weights=None, keep_algorithms=None, p_val=0.05, output_dir=None, bold_table=False):
    """Render the in-context-learning results as a LaTeX table.

    The frame is run through ``pre_processing`` and ``format_scores``,
    annotated by ``add_significance_to_table`` and laid out by
    ``format_table``. An integer "Examples" column is taken from the last
    "_"-separated token of ``cmodel``.

    Args:
        df: Result rows to tabulate.
        metrics: Metric column names to report.
        keep_weights: Forwarded to ``pre_processing`` as the "weight" filter.
        keep_algorithms: Forwarded to ``pre_processing`` as the "algorithm" filter.
        p_val: Forwarded to ``add_significance_to_table``.
        output_dir: If truthy, the directory is created and the table is
            written to ``in-context-learning.tex`` inside it.
        bold_table: Forwarded to ``format_table``; when falsy, the "Model"
            cells and the column names are passed through ``bold_str``.

    Returns:
        The LaTeX source, with "Precision", "Recall" and "F1Score" replaced
        by ``$P$``, ``$R$`` and ``$F_1$``. The written file does not get
        these replacements.
    """
    table_caption = "In-context learning results."

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    filters = [("algorithm", keep_algorithms), ("weight", keep_weights), ("rmodel", None)]
    df, stds, raws = pre_processing(df, metrics, model_cols=["cmodel"], filter_criteria=filters)

    scored_rows, shown_cols = format_scores(df, metrics, stds, ["cmodel", "algorithm"])
    table = pd.DataFrame(scored_rows)[shown_cols].reset_index(drop=True)
    raw_table = pd.DataFrame(scored_rows)[shown_cols+raws].reset_index(drop=True)

    table = add_significance_to_table(table, raw_table, raws, p_val=p_val, output_dir=output_dir, rename_models=False)

    # The number of examples is the trailing "_<n>" token of the model id.
    table["Examples"] = table["cmodel"].apply(lambda cmodel_id: int(cmodel_id.split("_")[-1]))
    table = format_table(table, merge_example_columns=True, bold_table=bold_table)

    if not bold_table:
        table["Model"] = table["Model"].apply(lambda cell: bold_str(cell))
        table = table.rename(bold_str, axis="columns")

    if output_dir:
        tex_path = os.path.join(output_dir, "in-context-learning.tex")
        table.to_latex(tex_path, index=False, caption=table_caption)

    latex = table.to_latex(index=False, caption=table_caption)
    for long_name, symbol in (("Precision", "$P$"), ("Recall", "$R$"), ("F1Score", "$F_1$")):
        latex = latex.replace(long_name, symbol)
    return latex

In [None]:
# In-context-learning table over the "llmeval" and "llmeval_mixtral" result types.
metrics: List[str] = ["test_lang:en:BinaryPrecision()_mean", "test_lang:en:BinaryRecall()_mean", "test_lang:en:BinaryF1Score()_mean", "test_lang:de:BinaryPrecision()_mean", "test_lang:de:BinaryRecall()_mean", "test_lang:de:BinaryF1Score()_mean", "test_BinaryF1Score()_mean"]
print(make_table_icl(full_df[full_df["result_type"].isin(["llmeval", "llmeval_mixtral"])], metrics))

In [None]:
def make_table_rac(df, metrics, model_order=None, keep_algorithms=None, output_dir=None, bold_table=False):
    """Render retrieval-augmented classification scores as a LaTeX table.

    Each metric value is scaled to percent, rounded to one decimal and turned
    into a string; ``rmodel`` is renamed through ``rename_model`` and shown as
    "Model".

    Args:
        df: Result rows; must contain "rmodel" and every column in ``metrics``.
            The frame is not modified.
        metrics: Metric column names to report.
        model_order: Optional list of (renamed) model names; when given, only
            those models are kept, in that order.
        keep_algorithms: Unused; kept for signature compatibility.
        output_dir: If truthy, the directory is created and the table is
            written to ``retrieval-augmented-classification.tex`` inside it.
        bold_table: When falsy, the "Model" cells and the column names are
            passed through ``bold_str``.

    Returns:
        The LaTeX source, with "Precision", "Recall" and "F1Score" replaced
        by ``$P$``, ``$R$`` and ``$F_1$``.
    """
    caption = "Retrieval-augmented classification results."

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Work on a copy so the caller's frame keeps its numeric metric columns.
    final_df = df[["rmodel"] + metrics].copy()
    for m in metrics:
        final_df[m] = final_df[m].apply(lambda x: str(round(x*100, 1)))

    final_df["rmodel"] = final_df["rmodel"].apply(lambda x: rename_model(x))
    final_df = final_df.rename(columns={"rmodel": "Model"})

    if model_order:
        # Index by model name first; with the default integer index the
        # membership test below never matches and the table comes out empty.
        final_df.index = final_df["Model"]
        final_df = final_df.loc[[m for m in model_order if m in final_df.index]]

    if not bold_table:
        final_df["Model"] = final_df["Model"].apply(lambda x: bold_str(x))
        final_df = final_df.rename(bold_str, axis="columns")

    if output_dir:
        # Same caption as the returned table (it used to say "In-context learning results.").
        final_df.to_latex(os.path.join(output_dir, "retrieval-augmented-classification.tex"), index=False, caption=caption)

    return final_df.to_latex(index=False, caption=caption).replace("Precision", "$P$").replace("Recall", "$R$").replace("F1Score", "$F_1$")

In [None]:
# Retrieval-augmented classification table for the rows with result_type "rac".
# Unlike the neighbouring cells, these metric column names carry no "_mean" suffix.
metrics: List[str] = ["test_lang:en:BinaryPrecision()", "test_lang:en:BinaryRecall()", "test_lang:en:BinaryF1Score()", "test_lang:de:BinaryPrecision()", "test_lang:de:BinaryRecall()", "test_lang:de:BinaryF1Score()", "test_BinaryF1Score()"]
print(make_table_rac(full_df[full_df["result_type"].isin(["rac"])], metrics))

In [None]:
def make_best_run(df, metrics, stds):
    """Collect, per metric, the best rows of the best classifier.

    For every metric the overall best row (via ``get_best_row``) fixes a
    ``cmodel`` and a ``weight``. For each ``rmodel`` of that classifier two
    rows are recorded: the best one at that weight and the best one at weight
    '1.0'. Finally the classifier's first row with weight '0.0' is appended.

    Args:
        df: Result rows with "cmodel", "rmodel", "weight" and the metric columns.
        metrics: Metric column names; the last one selects the rows returned.
        stds: Std column names included in the output.

    Returns:
        DataFrame with columns "cmodel", "rmodel", "weight", the metrics and
        the stds, built from the rows collected for ``metrics[-1]``.
    """
    rows_by_metric = defaultdict(list)

    for metric in metrics:
        overall_best = get_best_row(df, metric)
        top_cmodel = overall_best["cmodel"]
        top_weight = overall_best["weight"]

        for cmodel_name, cmodel_rows in df.groupby(by="cmodel"):
            if cmodel_name != top_cmodel:
                continue
            for _, retriever_rows in cmodel_rows.groupby(by="rmodel"):
                # First the winning weight, then the fixed weight '1.0'.
                for wanted_weight in (top_weight, '1.0'):
                    at_weight = retriever_rows[retriever_rows["weight"] == wanted_weight].reset_index(drop=True)
                    rows_by_metric[metric].append(get_best_row(at_weight, metric))

        is_top_without_weight = (df["cmodel"] == top_cmodel) & (df["weight"] == '0.0')
        baseline = df[is_top_without_weight].drop_duplicates("cmodel").iloc[0]
        rows_by_metric[metric].append(baseline.to_dict())

    # Only the rows gathered for the last metric are shown.
    shown_columns = ["cmodel", "rmodel", "weight"]+metrics+stds
    return pd.DataFrame(rows_by_metric[metrics[-1]])[shown_columns]

def make_table_rac(df, max_val_metric, metrics, stds, model_order):
    """Build the LaTeX table of retrieval-augmented runs for the best base classifier.

    The base classifier is the ``cmodel`` of the best row (by
    ``max_val_metric``) among rows with weight "0.0". The weight is taken from
    the best row of that classifier. The table then lists that weight-"0.0"
    row followed by every row of the classifier at the selected weight.

    Args:
        df: Result rows with "cmodel", "rmodel", "weight", ``max_val_metric``
            and every column in ``metrics`` and ``stds``.
        max_val_metric: Column used to pick the classifier and the weight.
        metrics: Mean-score columns to report (scaled to percent).
        stds: Std columns, aligned one-to-one with ``metrics``.
        model_order: Renamed model names to keep, in display order.

    Returns:
        The LaTeX source with "Precision", "Recall" and "F1Score" replaced by
        ``$P$``, ``$R$`` and ``$F_1$``. If the notebook-level ``output_dir``
        is truthy, the same text is also written to ``best_run.tex`` there.
    """
    best_cmodel = get_best_row(df[df["weight"] == "0.0"], max_val_metric)
    cmodel = best_cmodel["cmodel"]

    best_rmodel = get_best_row(df[df["cmodel"] == cmodel], max_val_metric)
    weight = best_rmodel["weight"]
    df = df[(df["weight"] == weight) & (df["cmodel"] == cmodel)]

    df = pd.concat([pd.DataFrame([best_cmodel]), df])

    new_rows = []
    metric_cols = []
    for i in range(df.shape[0]):
        row = deepcopy(df.iloc[i])
        for m, s in zip(metrics, stds):
            # Column title: text between "Binary" and "()", plus the upper-cased language if present.
            metric = m.split("Binary")[-1].split("()")[0]
            if ":" in m:
                lang = m.split(":")[1].upper()
                name = metric+" "+lang
            else:
                name = metric
            if name not in metric_cols:
                metric_cols.append(name)
            rounded_mean = str(round(row[m]*100, 1))
            rounded_std = str(round(row[s]*100, 0))
            # Drop only a *leading* zero ("0.0" -> ".0"). A blanket replace of
            # "0." also rewrote "10.0" to "1.0".
            if rounded_std.startswith("0."):
                rounded_std = rounded_std[1:]
            rounded_std = rounded_std.replace('.00', '')
            row[name] = rounded_mean+"$_{\\pm"+rounded_std+"}$"
        new_rows.append(row)

    cols_to_keep = ["rmodel", "weight"] + list(metric_cols)

    final_df = pd.DataFrame(new_rows)[cols_to_keep]
    final_df["rmodel"] = final_df["rmodel"].apply(lambda x: rename_model(x))
    final_df["weight"] = final_df["weight"].apply(lambda x: str(x))
    final_df = final_df.rename(columns={"rmodel": "Model"})
    final_df.index = final_df["Model"]
    final_df = final_df.loc[[m for m in model_order if m in final_df.index]]
    final_df["Model"] = final_df["Model"].apply(lambda x: bold_str(x))
    final_df = final_df.rename(bold_str, axis="columns")

    caption = f"Retrieval-augmented classification using {rename_model(cmodel)} as the base classifier."
    latex = final_df.to_latex(index=False, caption=caption).replace("Precision", "$P$").replace("Recall", "$R$").replace("F1Score", "$F_1$")

    # output_dir is the notebook-level global; it is not a parameter of this function.
    if output_dir:
        # to_latex(path, ...) returns None, so chaining .replace() on it raised
        # AttributeError. Write the already-substituted text instead.
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "best_run.tex"), "w") as outfile:
            outfile.write(latex)
    return latex

In [None]:
# Best-classifier RAC table: model selection on the validation F1, reporting the test metrics.
max_val_metric = "val_eval_BinaryF1Score()_mean"
test_metrics = ["test_lang:en:BinaryPrecision()_mean", "test_lang:en:BinaryRecall()_mean", "test_lang:en:BinaryF1Score()_mean", "test_lang:de:BinaryPrecision()_mean", "test_lang:de:BinaryRecall()_mean", "test_lang:de:BinaryF1Score()_mean", "test_BinaryF1Score()_mean"]
# Derive the std columns from test_metrics, the list actually passed below.
# The global `metrics` is whatever an earlier cell left behind and need not match.
stds = [m.replace("mean", "std") for m in test_metrics]
model_order = ["BERT", "mBERT", "mBERT-ca", "mBERT-unca", "DistilBERT", "RoBERTa", "SciBERT", "MiniLM", "mMiniLM", "mMPNet", "SPECTER", "Sentence-T5", "SsciBERT", "E5", "mE5"]
print(make_table_rac(full_df, max_val_metric, test_metrics, stds, model_order))

# Task 2

In [None]:
def _subdirs(path):
    """Return the sub-directories of ``path`` as full paths, sorted by name (plain files are ignored)."""
    return sorted(
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )


def _read_run_scores(results_file):
    """Load one ``results.json`` and return ``(config, scores)``.

    ``scores`` maps every metric listed (comma-separated) in
    ``config["metrics"]`` to its value times 100, rounded to two decimals,
    followed by one ``{attribute}_{category}_{metric}`` entry for each
    fine-grained result in ``fg_results``.
    """
    with open(results_file, "r") as infile:
        data = json.load(infile)

    config = data["config"]
    metric_names = config["metrics"].split(",")

    scores = {m: round(data["results"][m]*100, 2) for m in metric_names}
    for attribute, attribute_results in data["fg_results"].items():
        for category, category_results in attribute_results.items():
            for m in metric_names:
                scores[f"{attribute}_{category}_{m}"] = round(category_results[m]*100, 2)
    return config, scores


def _pipeline_settings(config):
    """Return the retrieval-pipeline columns shared by all result tables."""
    return {
        "reranker": config["reranker_name"],
        "dense": config["do_dense"],
        "sparse": config["do_sparse"],
        "join": config["do_join"],
        "rerank": config["do_rerank"],
        "subset_datastore": config["do_subset_datastore"],
        "filter": config["do_filter"],
        "filters": ";".join(config["filters"]),
        "metadata_values": config["metadata_values"],
    }


def _write_results_tsv(rows, directory):
    """Write ``rows`` as ``results.tsv`` (tab-separated, no index) into ``directory``."""
    pd.DataFrame(rows).to_csv(os.path.join(directory, "results.tsv"), index=False, sep="\t")


def _collect_nested_rows(root):
    """Gather one row per ``results.json`` found at ``root/<day>/<sample>/<seed>/<run>/``."""
    rows = []
    for day in _subdirs(root):
        for sample in _subdirs(day):
            for seed in _subdirs(sample):
                for run in _subdirs(seed):
                    results_file = os.path.join(run, "results.json")
                    if not os.path.isfile(results_file):
                        continue
                    config, scores = _read_run_scores(results_file)
                    rows.append({
                        **scores,
                        "day": os.path.basename(day),
                        "sample": os.path.basename(sample),
                        "seed": os.path.basename(seed),
                        # Keep only the bare model id: drop path, "prefix:" and the "_epochs=..." suffix.
                        "model": config["model_name"].split("/")[-1].split(":")[-1].split("_epochs=")[0],
                        **_pipeline_settings(config),
                        "docstore": config["doc_store_type"],
                    })
    return rows


def get_results_df(results_dir):
    """Collect results from ``results_dir/<experiment>/<run>/results.json``.

    Experiment directories whose name starts with "_" are skipped, as are
    plain files (which previously made ``os.listdir`` raise). Directories are
    visited in sorted order so repeated runs give the same row order.

    Side effect: writes ``results.tsv`` into every experiment directory.

    Args:
        results_dir: Directory holding one sub-directory per experiment.

    Returns:
        Dict mapping each experiment directory path to its list of row dicts
        (scores in percent, then model/pipeline settings, seed and docstore).
    """
    run_results = {}

    for run_dir in _subdirs(results_dir):
        if os.path.basename(run_dir).startswith("_"):
            continue

        results = []
        for run in _subdirs(run_dir):
            results_file = os.path.join(run, "results.json")
            if not os.path.isfile(results_file):
                continue
            config, scores = _read_run_scores(results_file)
            results.append({
                **scores,
                "model": config["model_name"],
                **_pipeline_settings(config),
                "seed": config["seed"],
                "docstore": config["doc_store_type"],
            })

        _write_results_tsv(results, run_dir)
        run_results[run_dir] = results

    return run_results


def get_nested_results_df(results_dir):
    """Collect results from ``results_dir/<experiment>/<day>/<sample>/<seed>/<run>/results.json``.

    Experiment directories whose name starts with "_" are skipped, as are
    plain files. Day, sample and seed are taken from the directory names.

    Side effect: writes ``results.tsv`` into every experiment directory.

    Args:
        results_dir: Directory holding one sub-directory per experiment.

    Returns:
        Dict mapping each experiment directory path to its list of row dicts.
    """
    run_results = {}

    for run_dir in _subdirs(results_dir):
        if os.path.basename(run_dir).startswith("_"):
            continue

        results = _collect_nested_rows(run_dir)
        _write_results_tsv(results, run_dir)
        run_results[run_dir] = results

    return run_results


def get_nested_results_df_v2(results_dir):
    """Collect results from ``results_dir/<day>/<sample>/<seed>/<run>/results.json``.

    Same row layout as ``get_nested_results_df``, but for a single experiment
    directory.

    Side effect: writes ``results.tsv`` into ``results_dir``.

    Args:
        results_dir: The experiment directory holding the day directories.

    Returns:
        List of row dicts.
    """
    results = _collect_nested_rows(results_dir)
    _write_results_tsv(results, results_dir)
    return results

In [None]:
# Collect the per-run retrieval results; the call also writes a results.tsv into every run directory.
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

print("\n")

# Same for the results stored in the nested day/sample/seed layout.
results_dir = "../results/task2_results_simlearn-17-01-23/filtered"
ft_results = get_nested_results_df(results_dir)

In [None]:
# Retrieval metric columns used below: per-language (en/de) and overall recall, MAP and nDCG at cutoff 10.
metrics = ["lang_en_recall@10", "lang_en_map@10", "lang_en_ndcg@10", "lang_de_recall@10", "lang_de_map@10", "lang_de_ndcg@10", "recall@10", "map@10", "ndcg@10"]

In [None]:
# Metric columns of the "run_bm25_exp_filter_subset" runs.
sparse_filter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_bm25_exp_filter_subset'])[metrics]
sparse_filter_df

In [None]:
# Metric columns of the "run_bm25_exp_no-subset_no-filter" runs.
sparse_nofilter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_bm25_exp_no-subset_no-filter'])[metrics]
sparse_nofilter_df

In [None]:
# pre-trained sentence embeddings baselines

# Display order of the rows, keyed by the names rename_model() returns.
# NOTE(review): these are not raw strings, so the "\t" of "\text" is a TAB character;
# the .loc lookup below only works if rename_model() produces the same characters — confirm.
model_order = ["XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$", "SoSci-XLM-R$_{\text{base}}$", "Para", "Cross$_{en-de}$", "mE5$_{small}$", "mE5$_{base}$", "mE5$_{large}$"]

dense_filter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_multilingual_model_choice_exp_baseline'])[["model"]+metrics]
dense_filter_df["model"] = dense_filter_df["model"].apply(lambda x: rename_model(x))
# Index by the renamed model so the rows can be selected in model_order.
dense_filter_df.index = dense_filter_df["model"]
print(dense_filter_df.loc[model_order].to_latex(index=False).replace("0000", ""))

In [None]:
def get_sosse_df(sosse_path, baseline_path):
    """Combine the nested results under ``sosse_path`` with their baseline rows.

    The nested rows come from ``get_nested_results_df_v2(sosse_path)``. The
    baseline rows are read from the notebook-level ``pt_results`` under the
    key ``baseline_path``; they are tagged with sample "sample=0", their model
    name is cut down to the part after the last "/", and only models that
    also occur in the nested rows are kept.

    Args:
        sosse_path: Directory passed to ``get_nested_results_df_v2``.
        baseline_path: Key into ``pt_results``.

    Returns:
        DataFrame with the nested rows followed by the matching baseline rows.
    """
    nested_df = pd.DataFrame(get_nested_results_df_v2(sosse_path))

    baseline_df = pd.DataFrame(pt_results[baseline_path])
    baseline_df["sample"] = "sample=0"
    baseline_df["model"] = baseline_df["model"].apply(lambda full_name: full_name.split("/")[-1])

    models_in_nested = nested_df["model"].unique()
    matching_baselines = baseline_df[baseline_df["model"].isin(models_in_nested)]

    return pd.concat([nested_df, matching_baselines])

In [None]:
def make_sosse_dataset_size_plot(df, metrics, metrics_clean=None, output_dir=None, n_columns=3, fontsize=18, figsize=(30,10), y_offset=1, add_title=True):
    """Plot each metric against the training-set size, one subplot per metric.

    Per model, the mean over the rows of each sample size is drawn as a line
    with a +/- one standard deviation band. The mean of the model's
    "sample=0" rows is drawn as a gray horizontal line in the same line style.

    Args:
        df: Rows with "model", "sample" (e.g. "sample=200") and the metric columns.
        metrics: Metric column names; one subplot each, filled column by column.
        metrics_clean: Optional display names (same order as ``metrics``),
            passed through ``rename_metric`` for the subplot titles.
        output_dir: If truthy, the figure is saved as
            ``<output_dir>/<basename of output_dir>.pdf``.
        n_columns: Number of subplot columns.
        fontsize: Font size for ticks, titles, labels and legend.
        figsize: Figure size handed to ``plt.subplots``.
        y_offset: Margin added below/above the smallest/largest metric value.
        add_title: Whether to set a title on each subplot.

    Returns:
        None. The figure is shown and then closed.
    """
    n_subplots = len(metrics)
    n_rows = math.ceil(n_subplots / n_columns)
    y_min = math.floor(min(df[metrics].min().tolist())-y_offset)
    y_max = math.ceil(max(df[metrics].max().tolist())+y_offset)

    # squeeze=False always yields a 2-D array, so the same indexing works for
    # 1x1, single-row and multi-row grids.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_columns, figsize=figsize, squeeze=False)
    legend_ax = axes[0, 0]

    samples = ["sample=200", "sample=2000", "sample=20000", "sample=200000", "sample=400000"]
    x_values = [int(x.split("sample=")[-1]) for x in samples]

    for i, metric in enumerate(metrics):
        # Subplots are filled column by column.
        ax = axes[i % n_rows, i // n_rows]

        for model, model_group in df.groupby(by="model"):
            y_means = []
            y_stds = []

            for sample in samples:
                sample_scores = model_group[model_group["sample"] == sample][metric]
                y_mean = sample_scores.mean()
                y_std = sample_scores.std()

                # Keep the mean even when the std is 0 (a truthiness test on
                # both used to reset such points to 0). An undefined std
                # (fewer than two rows) is drawn as a zero-width band.
                y_means.append(y_mean)
                y_stds.append(0.0 if pd.isna(y_std) else y_std)

            y_means = np.asarray(y_means)
            y_stds = np.asarray(y_stds)

            line, = ax.plot(x_values, y_means, label=rename_model(model))
            line_style = line.get_linestyle()
            # Tie the band colour to its line explicitly.
            ax.fill_between(x_values, y_means-y_stds, y_means+y_stds, alpha=0.2, facecolor=line.get_color())

            y_baseline = [model_group[model_group["sample"] == "sample=0"][metric].mean()] * len(x_values)
            ax.plot(x_values, y_baseline, label=rename_model(model)+"$^{\\clubsuit}$", color='gray', linestyle=line_style)

        ax.tick_params(axis='x', labelsize=fontsize)
        ax.tick_params(axis='y', labelsize=fontsize)

        if add_title:
            metric_title = metric
            if metrics_clean:
                metric_title = rename_metric(metrics_clean[i])
            ax.set_title(metric_title, fontsize=fontsize)
        ax.set_xscale('log')
        ax.set_ylim(y_min, y_max)

        # The legend lives on the first subplot only.
        handles, labels = ax.get_legend_handles_labels()
        legend_ax.legend(handles, labels, borderaxespad=0.1, loc="best", fancybox=True, framealpha=0.5, fontsize=fontsize).set_title("Models",prop={'size':fontsize})

    fig.supxlabel("Dataset size", fontsize=fontsize)
    # Shared y label: last "_"-separated token of the (renamed) last metric.
    fig.supylabel(rename_metric(metric).split("_")[-1], fontsize=fontsize, x=0.005)

    plt.tight_layout()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, os.path.basename(output_dir)+".pdf")
        plt.savefig(path, bbox_inches="tight")
        print(f"Save to path: {path}")

    plt.show()
    plt.close()

In [None]:
# Dataset-size plot (English / German MAP@10) for the "filtered" runs.
sosse_path = "../results/task2_results_sosci-simlearn/filtered/run_multilingual_model_choice_exp"
baseline_path = "../results/runs_task2_results/run_multilingual_model_choice_exp_baseline"

# get_sosse_df reads the baseline rows from the global pt_results, so refresh it first.
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

filtered_df = get_sosse_df(sosse_path, baseline_path)
metrics = ["lang_en_map@10", "lang_de_map@10"]
metrics_clean = ["English", "German"]
make_sosse_dataset_size_plot(filtered_df, metrics, metrics_clean, add_title=True, n_columns=2, figsize=(20,10), output_dir=os.path.join(output_dir, "SoSSE_filtered"))

In [None]:
# Dataset-size plot (English / German MAP@10) for the "filtered_gen" runs.
# Relies on pt_results from an earlier cell (get_sosse_df reads it as a global).
sosse_path = "../results/task2_results_sosci-simlearn/filtered_gen/run_multilingual_model_choice_exp"
baseline_path = "../results/runs_task2_results/run_multilingual_model_choice_exp_baseline"

filtered_df = get_sosse_df(sosse_path, baseline_path)
metrics = ["lang_en_map@10", "lang_de_map@10"]
metrics_clean = ["English", "German"]
make_sosse_dataset_size_plot(filtered_df, metrics, metrics_clean, add_title=True, n_columns=2, figsize=(20,10), output_dir=os.path.join(output_dir, "SoSSE_filtered_gen"))

# Task 3

In [None]:
# pick best model on validation data
sosse_path = "../results/task2_results_sosci-simlearn/filtered/run_sim_data_size_choice_exp/04-04-2024"
baseline_path = "../results/runs_task2_results/run_multilingual_model_choice_exp_baseline"

# get_sosse_df reads the baseline rows from the global pt_results, so refresh it first.
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

filtered_df = get_sosse_df(sosse_path, baseline_path)
# Show the five rows with the highest overall MAP@10.
filtered_df[["recall@10", "map@10", "ndcg@10", "model", "seed", "sample"]].sort_values(by="map@10", ascending=False).head()

In [None]:
# pick best model on test data
sosse_path = "../results/task2_results_sosci-simlearn/filtered_gen/run_multilingual_model_choice_exp"
baseline_path = "../results/runs_task2_results/run_multilingual_model_choice_exp_baseline"

# get_sosse_df reads the baseline rows from the global pt_results, so refresh it first.
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

filtered_df = get_sosse_df(sosse_path, baseline_path)
# Keep only the baseline rows ("sample=0") and the rows for sample size 200000.
filtered_df = filtered_df[filtered_df["sample"].isin(["sample=0", "sample=200000"])]
filtered_df[["recall@10", "map@10", "ndcg@10", "model", "seed", "sample"]].sort_values(by="map@10", ascending=False)

In [None]:
# Same rows as the previous cell, showing the per-language MAP@10 columns, sorted by overall MAP@10.
filtered_df[["lang_en_map@10", "lang_de_map@10", "map@10", "model", "seed", "sample"]].sort_values(by="map@10", ascending=False)

In [None]:
# Hand-entered summary rows (EN / DE / Total score per model and sample size) for the LaTeX table below.
# The mE5 names are raw strings: in a normal string literal the "\t" of "\text" is a TAB character,
# which would end up in the LaTeX output as "<TAB>ext{base}".
res = [
    {"model": "Para", "EN": 57.2, "DE": 58.6, "Total": 57.6, "sample": 0},
    {"model": "Para", "EN": 58.8, "DE": 61.9, "Total": 59.7, "sample": 200000},
    {"model": "Cross", "EN": 49.1, "DE": 51.7, "Total": 49.8, "sample": 0},
    {"model": "Cross", "EN": 50.7, "DE": 58.8, "Total": 53.1, "sample": 200000},
    {"model": r"mE5$_{\text{base}}$", "EN": 57.9, "DE": 65.6, "Total": 60.2, "sample": 0},
    {"model": r"mE5$_{\text{base}}$", "EN": 63.4, "DE": 68.2, "Total": 64.9, "sample": 200000},
]

In [None]:
# Print the summary rows as LaTeX; the "00000" replace presumably trims zero-padded floats in the text.
print(pd.DataFrame(res).to_latex(index=False).replace("00000", ""))

In [None]:
def make_table_sosse(df, metrics, keep_weights=None, keep_algorithms=None, p_val=0.05, output_dir=None, bold_table=False):
    """Summarise every (model, sample) group as formatted "mean +/- std" strings.

    Args:
        df: Rows with "model", "sample" and the metric columns.
        metrics: Metric column names to summarise.
        keep_weights: Unused; kept for signature compatibility.
        keep_algorithms: Unused; kept for signature compatibility.
        p_val: Unused; kept for signature compatibility.
        output_dir: If truthy, the directory is created (nothing is written to it here).
        bold_table: Unused; kept for signature compatibility.

    Returns:
        List of dicts, one per (model, sample) group, with "model" (renamed
        via ``rename_model``), "sample" and one ``"<metric>_str"`` entry per
        metric. The entry is the mean rounded to one decimal, followed by a
        LaTeX subscript with the std unless the std is undefined (NaN).
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    rows = []
    # group by values
    for model, model_group in df.groupby("model"):
        for sample, sample_group in model_group.groupby("sample"):
            _row = {}
            _row["model"] = rename_model(model)
            _row["sample"] = sample
            for metric in metrics:
                avg_score = round(sample_group[metric].mean(), 1)
                std_score = sample_group[metric].std()

                score_str = f"{avg_score}"
                # A single-row group has no std; show the bare mean then.
                if not pd.isna(std_score):
                    # "\\pm" spells the backslash out; "\p" is an invalid escape sequence.
                    score_str += "$_{" + f"\\pm{round(std_score,1)}" + "}$"
                _row[f"{metric}_str"] = score_str
            rows.append(_row)

    return rows

In [None]:
# Print the per-model / per-sample summary as LaTeX; "sample=0" rows are shown as "-" and the
# "sample=" prefix is stripped from the remaining sample sizes.
print(pd.DataFrame(make_table_sosse(filtered_df, ["lang_en_map@10", "lang_de_map@10"])).to_latex(index=False).replace("sample=0", "-").replace("sample=", ""))

In [None]:
# Load the test split and derive an integer "label" column from "is_variable".
test_path = "../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_htest.tsv"
test_df = pd.read_csv(test_path, sep="\t")
# Rows without an "is_variable" value are dropped before the integer cast.
test_df = test_df.dropna(subset=["is_variable"])
test_df["is_variable"] = test_df["is_variable"].apply(lambda x: int(x))
test_df["label"] = test_df["is_variable"]

In [None]:
# Input files for the pipeline evaluation: classifier predictions, two retrieval runs, and the relevance judgements.
classifier_preds_path = "../results/runs_sv4_journal_paper/finetune_best-2024-03-26_15-24/FacebookAI--xlm-roberta-large-finetuned_do-retrieval=False_20240326-152437/seed=42/fold=0/test_preds.tsv"
disambiguator_bm25_preds_path = "../results/runs_task2_results/run_bm25_exp_filter_subset/20240410-183115/run.json"
disambiguator_st_preds_path = "../results/task2_results_sosci-simlearn/filtered_gen/run_best_ed_model/04-10-2024/20240410-225722/run.json"

# The qrels come from the same directory as the BM25 run.
qrels_path = "../results/runs_task2_results/run_bm25_exp_filter_subset/20240410-183115/qrels.json"

In [None]:
# Load the classifier predictions, indexed by uuid.
classifier_preds_df = pd.read_csv(classifier_preds_path, sep="\t")
classifier_preds_df.index = classifier_preds_df["uuid"]
# "pred_scores" is a ";"-separated string; keep its last value as a float.
classifier_preds_df["pred_score"] = classifier_preds_df["pred_scores"].apply(lambda x: float(x.split(";")[-1]))

# uuids the classifier predicted as 1 (stored either as int or as string).
pos_classified_uuids = classifier_preds_df[classifier_preds_df["pred"].isin([1, "1"])]["uuid"].tolist()

disambiguator_bm25_preds = load_json(disambiguator_bm25_preds_path)
# Note: the "pt" predictions are read from disambiguator_st_preds_path.
disambiguator_pt_preds = load_json(disambiguator_st_preds_path)
qrels_dict = load_json(qrels_path)

# One qrels entry per test uuid; uuids missing from qrels_dict get an empty dict.
full_qrels_dict = {uid: qrels_dict.get(uid, {}) for uid in test_df["uuid"].tolist()}

In [None]:
# Keep the retrieval output only for queries the classifier labelled positive; every other query
# gets an empty result. A set makes each membership test O(1) instead of scanning the uuid list.
pos_classified_uuid_set = set(pos_classified_uuids)
disambiguator_bm25_preds_filtered = {k:(v if k in pos_classified_uuid_set else {}) for k,v in disambiguator_bm25_preds.items()}
disambiguator_pt_preds_filtered = {k:(v if k in pos_classified_uuid_set else {}) for k,v in disambiguator_pt_preds.items()}

In [None]:
from ranx import Qrels, Run, evaluate

def get_scores(qrels_dict, run_dict, valid_uuids=None, metrics=["f1@10", "precision@10", "recall@10", "f1@10", "map@10", "ndcg@10"]):
    """Evaluate a retrieval run against relevance judgements with ranx.

    Args:
        qrels_dict: Query id -> relevance judgements, as accepted by ``Qrels``.
        run_dict: Query id -> retrieved documents with scores, as accepted by ``Run``.
        valid_uuids: Optional collection of query ids; when non-empty, both
            dicts are restricted to these ids. ``None`` or an empty collection
            means no restriction (the full dicts are scored).
        metrics: ranx metric names. The default list is never modified here.

    Returns:
        Whatever ``ranx.evaluate`` returns for the given metrics; the result
        is also printed.
    """
    if valid_uuids:
        # Set lookup keeps the filtering linear in the number of queries.
        allowed = set(valid_uuids)
        qrels_dict = {k:v for k,v in qrels_dict.items() if k in allowed}
        run_dict = {k:v for k,v in run_dict.items() if k in allowed}

    qrels = Qrels(qrels_dict)
    run = Run(run_dict)

    results = evaluate(qrels, run, metrics)
    print(results)
    return results


def compare_scores(filtered_preds, perfect_preds, qrels_dict, perfect_qrels_dict, test_df):
    """Score a filtered run and an unfiltered ("perfect") run, per language and overall.

    ``get_scores`` is called (and prints) six times: English, German and all
    queries for the filtered run, a blank separator, then the same three
    subsets for the perfect run.

    Args:
        filtered_preds: Run dict scored against ``qrels_dict``.
        perfect_preds: Run dict scored against ``perfect_qrels_dict``.
        qrels_dict: Relevance judgements for the filtered run.
        perfect_qrels_dict: Relevance judgements for the perfect run.
        test_df: Frame with "lang" and "uuid" columns used to split by language.

    Returns:
        Dict with keys "fil_en", "fil_de", "fil_full", "perf_en", "perf_de"
        and "perf_full", each holding the corresponding ``get_scores`` result.
    """
    language = test_df["lang"]
    # None selects all queries (no uuid restriction).
    subsets = [
        ("en", test_df.loc[language == "en", "uuid"].tolist()),
        ("de", test_df.loc[language == "de", "uuid"].tolist()),
        ("full", None),
    ]

    scores = {}
    for tag, uuids in subsets:
        scores[f"fil_{tag}"] = get_scores(qrels_dict, filtered_preds, uuids)

    print("\n")

    # perfect entity mention detection
    for tag, uuids in subsets:
        scores[f"perf_{tag}"] = get_scores(perfect_qrels_dict, perfect_preds, uuids)

    return scores

In [None]:
# compare scores for BM25
# Classifier-filtered run vs. unfiltered run; both are scored against the same qrels_dict.
scores = compare_scores(disambiguator_bm25_preds_filtered, disambiguator_bm25_preds, qrels_dict, qrels_dict, test_df)

In [None]:
# compare scores for best sentence transformer
# Classifier-filtered run vs. unfiltered run; both are scored against the same qrels_dict.
scores = compare_scores(disambiguator_pt_preds_filtered, disambiguator_pt_preds, qrels_dict, qrels_dict, test_df)

In [None]:
# BM25 run over the full query set, with the relevance judgements stored next to it.
full_disambiguator_bm25_preds_path = "../results/runs_task2_results/run_bm25_exp_filter_subset/20240411-080029/run.json"
# The qrels path used to point at run.json as well, which scores the run against itself.
# NOTE(review): assumes qrels.json sits beside run.json here, as it does for 20240410-183115 — confirm.
full_qrels_path = "../results/runs_task2_results/run_bm25_exp_filter_subset/20240411-080029/qrels.json"

full_disambiguator_bm25_preds = load_json(full_disambiguator_bm25_preds_path)
full_qrels = load_json(full_qrels_path)

In [None]:
# A query counts as predicted-positive when the first entry of its result dict scores at least 0.99.
# Queries with an empty result dict are never selected.
pred_pos_uuids = [
    query_id
    for query_id, doc_scores in full_disambiguator_bm25_preds.items()
    if doc_scores and next(iter(doc_scores.values())) >= 0.99
]

In [None]:
# Restrict both the run and the qrels to the queries selected in pred_pos_uuids,
# then compare against the unrestricted run and qrels.
retrieval_pt_preds = {k:v for k,v in full_disambiguator_bm25_preds.items() if k in pred_pos_uuids}
retrieval_qrels_dict = {k:v for k,v in full_qrels.items() if k in pred_pos_uuids}

scores = compare_scores(retrieval_pt_preds, full_disambiguator_bm25_preds, retrieval_qrels_dict, full_qrels, test_df)

# Analysis

In [None]:
# Reload both result collections for the analysis section (each call rewrites the results.tsv files).
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

results_dir = "../results/task2_results_simlearn-17-01-23/filtered"
ft_results = get_nested_results_df(results_dir)

In [None]:
# Per-language and overall recall / MAP / nDCG at cutoff 10 for the "run_bm25_exp_no-subset_no-filter" runs.
metrics = ["lang_en_recall@10", "lang_en_map@10", "lang_en_ndcg@10", "lang_de_recall@10", "lang_de_map@10", "lang_de_ndcg@10", "recall@10", "map@10", "ndcg@10"]
sparse_nofilter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_bm25_exp_no-subset_no-filter'])[metrics]
sparse_nofilter_df

In [None]:
def make_table_diagnostic(df, metrics, model_order, keep_weights=None, keep_algorithms=["Unk"], p_val=0.05, output_dir=None):
    """Render a diagnostic LaTeX table for the models listed in ``model_order``.

    Rows are kept when ``rename_model(cmodel)`` is in ``model_order``. They
    are then run through ``pre_processing`` and ``format_scores`` and laid out
    by ``format_table``.

    Args:
        df: Result rows with a "cmodel" column. The frame is not modified.
        metrics: Metric column names to report.
        model_order: Renamed model names to keep; also passed to ``format_table``.
        keep_weights: Forwarded to ``pre_processing`` as the "weight" filter.
        keep_algorithms: Forwarded to ``pre_processing`` as the "algorithm"
            filter. The default list is never modified here.
        p_val: Unused; kept for signature compatibility.
        output_dir: If truthy, the directory is created and the table is
            written to ``finetune.tex`` inside it.

    Returns:
        The LaTeX source, with "Precision", "Recall" and "F1Score" replaced
        by ``$P$``, ``$R$`` and ``$F_1$``.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Select via a boolean mask on a copy. Assigning a helper column to `df`
    # leaked "_model" into the caller's frame (typically a slice of full_df).
    wanted = df["cmodel"].apply(lambda x: rename_model(x)).isin(model_order)
    df = df[wanted].copy()

    df, stds, _raws = pre_processing(df, metrics, model_cols=["cmodel"], filter_criteria=[("algorithm", keep_algorithms), ("weight", keep_weights), ("rmodel", None)])

    new_rows, cols_to_keep = format_scores(df, metrics, stds, ["cmodel", "algorithm"])
    final_df = pd.DataFrame(new_rows)[cols_to_keep].reset_index(drop=True)

    final_df = format_table(final_df, model_order)

    if output_dir:
        final_df.to_latex(os.path.join(output_dir, "finetune.tex"), index=False, caption="Fine-tuned transformers.")

    return final_df.to_latex(index=False, caption="Fine-tuned transformers.").replace("Precision", "$P$").replace("Recall", "$R$").replace("F1Score", "$F_1$")

In [None]:
# performance across types and subtypes
metrics = ["test_type_short:E:BinaryF1Score()_mean", "test_type_short:I:BinaryF1Score()_mean", "test_subtype_short:Q:BinaryF1Score()_mean", "test_subtype_short:P:BinaryF1Score()_mean"]

# NOTE(review): not raw strings, so the "\t" of "\text" is a TAB character; make_table_diagnostic
# matches these against rename_model() output — confirm both sides use the same characters.
model_order = ["XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$", "SoSci-XLM-R"]
print(make_table_diagnostic(full_df[full_df["result_type"].isin(["finetune"])], metrics, model_order, output_dir=os.path.join(output_dir, "md_diagnostics")))

In [None]:
# Recall@10 per type (E, I) and subtype (Q, P) for the three mE5 baseline models.
metrics = ["type_short_E_recall@10", "type_short_I_recall@10", "subtype_short_Q_recall@10", "subtype_short_P_recall@10"]

model_order = ["mE5$_{small}$", "mE5$_{base}$", "mE5$_{large}$"]

dense_filter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_multilingual_model_choice_exp_baseline'])[["model"]+metrics]
dense_filter_df["model"] = dense_filter_df["model"].apply(lambda x: rename_model(x))
# Index by the renamed model so the rows can be selected in model_order.
dense_filter_df.index = dense_filter_df["model"]
print(dense_filter_df.loc[model_order].to_latex(index=False).replace("0000", ""))

In [None]:
# Same type/subtype recall columns for the "run_bm25_exp_filter_subset" runs.
sparse_filter_df = pd.DataFrame(pt_results['../results/runs_task2_results/run_bm25_exp_filter_subset'])[["model"]+metrics]
sparse_filter_df

In [None]:
# Rebuild filtered_df for the data-size experiment (all days under run_sim_data_size_choice_exp).
sosse_path = "../results/task2_results_sosci-simlearn/filtered/run_sim_data_size_choice_exp"
baseline_path = "../results/runs_task2_results/run_multilingual_model_choice_exp_baseline"

# get_sosse_df reads the baseline rows from the global pt_results, so refresh it first.
results_dir = "../results/runs_task2_results"
pt_results = get_results_df(results_dir)

filtered_df = get_sosse_df(sosse_path, baseline_path)

In [None]:
# Mean and std of the current `metrics` columns over the "multilingual-e5-base" rows at sample size 200000.
_df = filtered_df[(filtered_df["model"].isin(["multilingual-e5-base"])) & (filtered_df["sample"].isin(["sample=200000"]))][["model", "sample", "seed"]+metrics]
_df["model"] = _df["model"].apply(lambda x: rename_model(x))
_df[metrics].mean(), _df[metrics].std()

In [None]:
# Split the positive test sentences by how many variables they mention:
# uuid -> list of variable ids, for one-or-fewer vs. more-than-one variables.
single_item_uuids = {}
multi_item_uuids = {}

test_df["label"] = test_df["label"].fillna(0)

for i in range(test_df.shape[0]):
    row = test_df.iloc[i]

    label = int(row.get("label", 0))
    uuid = row.get("uuid")

    if label == 1:
        # "variable" holds ids separated by ";" and/or ","; a missing value (NaN) counts as no ids.
        raw_variables = row.get("variable", "")
        if not isinstance(raw_variables, str):
            raw_variables = ""
        variables = raw_variables.split(";")
        variables = [v for vs in variables for v in vs.split(",") if v]

        if len(variables) > 1:
            multi_item_uuids[uuid] = variables
        else:
            single_item_uuids[uuid] = variables

# Evaluation subsets: everything except the multi-item positives, and everything except the
# single-item positives. Each uuid appears once. Prepending the single/multi keys to these
# lists duplicated every positive, which double-counts them in the metrics.
all_test_uuids = test_df["uuid"].tolist()
single_plus_non_multi_uuids = [uid for uid in all_test_uuids if uid not in multi_item_uuids]
multi_plus_non_multi_uuids = [uid for uid in all_test_uuids if uid not in single_item_uuids]

In [None]:
from metrics import compute_metrics

def compute_results_subset(files, gold_file, uuids):
    """Score several prediction files on a subset of instances.

    Args:
        files: paths of tab-separated prediction files with `uuid` and
            `pred_scores` columns; each `pred_scores` cell holds two
            ";"-separated floats.
        gold_file: path of the tab-separated gold file with `uuid` and
            `is_variable` columns (missing labels count as 0).
        uuids: instance ids to evaluate, used to select rows via `.loc`.

    Returns:
        defaultdict(list) mapping each key returned by `compute_metrics` to
        the list of its values, one entry per file in `files`.
    """
    gold = pd.read_csv(gold_file, sep="\t")
    gold.index = gold["uuid"]
    gold["is_variable"] = gold["is_variable"].fillna(0)
    true = gold.loc[uuids, "is_variable"].astype(int).tolist()

    def _parse_pair(raw):
        # "a;b" -> [float(a), float(b)]
        parts = raw.split(";")
        return [float(parts[0]), float(parts[1])]

    results = defaultdict(list)
    for pred_file in files:
        predictions = pd.read_csv(pred_file, sep="\t")
        predictions.index = predictions["uuid"]
        predictions["pred_scores"] = predictions["pred_scores"].apply(_parse_pair)

        subset_scores = predictions.loc[uuids, "pred_scores"].tolist()

        for metric_name, value in compute_metrics([subset_scores, true]).items():
            results[metric_name].append(value)

    return results

In [None]:
best_ed_model_path = "../results/runs_sv4_journal_paper_results_14-03-2024/finetune-2024-03-11_11-26/FacebookAI--xlm-roberta-large-finetuned_do-retrieval=False_20240311-112506"
# Despite the name, each entry is the path of one seed's fold-0 test prediction file.
seed_dirs = [
    os.path.join(best_ed_model_path, seed_name, "fold=0", "test_preds.tsv")
    for seed_name in os.listdir(best_ed_model_path)
    if os.path.isdir(os.path.join(best_ed_model_path, seed_name))
]

# Both subsets are scored against the same gold file.
_gold_test_path = "../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_htest.tsv"
sresults = compute_results_subset(seed_dirs, _gold_test_path, single_plus_non_multi_uuids)
mresults = compute_results_subset(seed_dirs, _gold_test_path, multi_plus_non_multi_uuids)

In [None]:
def get_mean_std(scores):
    """Return mean and standard deviation of `scores`, scaled by 100.

    The mean is rounded to one decimal place, the standard deviation to zero
    decimal places.
    NOTE(review): the differing precision may be unintended — confirm.
    """
    scaled_mean = np.mean(scores) * 100
    scaled_std = np.std(scores) * 100
    return round(scaled_mean, 1), round(scaled_std, 0)

In [None]:
metrics = ["BinaryPrecision()", "BinaryRecall()", "BinaryF1Score()"]


def _subset_row(items_label, count, subset_results):
    """Build one table row: subset label, subset size and one 'mean +- std' LaTeX cell per metric."""
    row = {"Items": items_label, "Count": count}
    for m in metrics:
        mean, std = get_mean_std(subset_results[m])
        # Raw string: backslash-p is not a valid escape sequence in a normal
        # literal and triggers a warning on current Python versions.
        row[m] = str(mean) + r"$_{\pm" + str(std) + "}$"
    return row


# One row per subset, built from the per-seed scores computed above.
sr = _subset_row("Single", len(single_item_uuids), sresults)
mr = _subset_row("Multi", len(multi_item_uuids), mresults)

print(pd.DataFrame([sr, mr]).to_latex(index=False))

In [None]:
# NOTE(review): `get_scores` is defined in a later cell, and the three `*_path`
# variables are not defined in this part of the notebook — this cell relies on
# prior kernel state.
# NOTE(review): `disambiguator_pt_preds` is loaded from `disambiguator_st_preds_path`;
# confirm the pt/st naming refers to the same run.
disambiguator_bm25_preds = load_json(disambiguator_bm25_preds_path)
disambiguator_pt_preds = load_json(disambiguator_st_preds_path)
qrels_dict = load_json(qrels_path)

# BM25 run on single-item and multi-item queries (get_scores prints each result).
single_scores = get_scores(qrels_dict, disambiguator_bm25_preds, valid_uuids=single_item_uuids)
multi_scores = get_scores(qrels_dict, disambiguator_bm25_preds, valid_uuids=multi_item_uuids)
print("\n")

# Same for the second run; this overwrites the BM25 values held in
# `single_scores` / `multi_scores`, which then survive only in the printed output.
single_scores = get_scores(qrels_dict, disambiguator_pt_preds, valid_uuids=single_item_uuids)
multi_scores = get_scores(qrels_dict, disambiguator_pt_preds, valid_uuids=multi_item_uuids)

In [None]:
# topic-based similarity

def extract_variable_basename(variable):
    """Split a variable id at "_" into its leading id and the remaining parts.

    Returns:
        (research_id, basename) where `research_id` is the text before the
        first "_" and `basename` is the list of all following "_"-separated
        parts (empty when there is no "_").

    Raises:
        AssertionError: if the leading part does not contain "ZA".
    """
    research_id, *basename = variable.split("_")
    assert "ZA" in research_id
    return research_id, basename

def expand_variable(variable, variable_meta):
    """Unfinished stub — not usable as written.

    NOTE(review): unpacking the empty string into two names raises ValueError,
    and `expaned_variables` is never assigned, so the return statement could
    not succeed either. Neither parameter is used yet. Presumably this was
    meant to build on `extract_variable_basename` and `variable_meta` —
    confirm the intent, then implement or remove.
    """
    research_id, variable_basename = ""

    return expaned_variables

In [None]:
meta = load_jsonl("../data/gsim/survey_items.jsonl")
# Key each record's `variables` by the text after the last ":" of its `url`.
meta_json = dict(
    (record["url"].rsplit(":", 1)[-1], record["variables"])
    for record in meta
)

In [None]:
def get_clean_rdids(variables_list):
    """Collect the sorted, unique id prefixes of all variables containing "ZA".

    Each entry of `variables_list` is split at ";" and then at ","; for every
    non-empty fragment that contains "ZA", the text before its first "_" is kept.
    """
    rd_ids = set()
    for entry in variables_list:
        for group in entry.split(";"):
            for variable in group.split(","):
                if variable and "ZA" in variable:
                    rd_ids.add(variable.split("_")[0])
    return sorted(rd_ids)

# Id prefixes of the variables in test rows whose `is_variable` is 1 or "1".
# NOTE(review): an earlier cell reads the positive flag from `test_df["label"]`;
# confirm both columns exist and agree.
rd_ids = get_clean_rdids(test_df[test_df["is_variable"].isin(["1", 1])]["variable"])

In [None]:
# Collect "<id>_<name>" for every metadata variable whose name (after removing
# the "exploredata-<id>_" prefix) still contains an underscore.
expandable_vids = []
for r in rd_ids:
    variables_meta = meta_json.get(r)
    prefix = f"exploredata-{r}_"
    for vid in variables_meta:
        vid = vid.split(prefix)[-1]
        if "_" not in vid:
            continue
        expandable_vids.append(f"{r}_{vid}")

In [None]:
def get_surrounding_variables(variable, variables_list, n=5):
    """Return up to `n` list neighbours before and after `variable`.

    Args:
        variable: element to locate in `variables_list`.
        variables_list: ordered list to take the neighbours from.
        n: maximum number of neighbours on each side.

    Returns:
        (before, after): two lists in list order, excluding `variable` itself
        and shorter than `n` near either end of the list.

    Raises:
        ValueError: if `variable` is not in `variables_list`.
    """
    position = variables_list.index(variable)
    # Slicing clamps the upper bound on its own; only the lower bound needs max().
    before = variables_list[max(0, position - n):position]
    after = variables_list[position + 1:position + n + 1]
    return before, after

In [None]:
# Spot check: up to five neighbours on each side of one variable in ZA6670's variable list.
get_surrounding_variables("exploredata-ZA6670_VarFI_REG", list(meta_json['ZA6670'].keys()), 5)

In [None]:
def expand_predicted_variables(preds, n_neighbors=2, expand=False, expandable_variables=None):
    """Add the list neighbours of predicted variables to each query's predictions.

    Args:
        preds: dict mapping query id -> {variable id: score}.
        n_neighbors: number of neighbours taken on each side of a variable.
        expand: if True, every predicted variable is expanded.
        expandable_variables: optional collection of variable ids; a predicted
            variable is expanded if it is in this collection (or `expand` is True).

    Returns:
        A new dict of the same shape. Each expanded variable is followed by its
        neighbours from the module-level `meta_json` variable list of its
        study; a neighbour receives the score of the variable it was derived
        from and never overwrites an entry that is already present. Variables
        whose study or id is missing from `meta_json` are kept unexpanded.

    Raises:
        AssertionError: if the id prefix of an expanded variable lacks "ZA".
    """
    # Set membership avoids scanning the whole list for every prediction.
    expandable = set(expandable_variables) if expandable_variables else set()
    # rd_id -> ordered variable ids, built once per study instead of per prediction.
    study_variables = {}

    expanded_preds = {}
    for uid, vpreds in preds.items():
        expanded_uid_preds = {}
        for vid, score in vpreds.items():
            expanded_uid_preds[vid] = score
            if not (expand or vid in expandable):
                continue

            rd_id = vid.split("_")[0]
            assert "ZA" in rd_id
            if rd_id not in study_variables:
                # Default must be a dict: the previous list default has no .keys().
                study_variables[rd_id] = list(meta_json.get(rd_id, {}).keys())

            full_vid = "exploredata-" + vid
            if full_vid not in study_variables[rd_id]:
                # Unknown study or variable: nothing to expand with.
                continue

            pre_vars, post_vars = get_surrounding_variables(full_vid, study_variables[rd_id], n_neighbors)
            for ev in pre_vars + post_vars:  # give each expanded variable the same score as the original
                ev = ev.split("exploredata-")[-1]
                if ev not in expanded_uid_preds:
                    expanded_uid_preds[ev] = score
        expanded_preds[uid] = expanded_uid_preds
    return expanded_preds

In [None]:
# Toy example. `expandable_vids` must be passed by keyword: the second positional
# parameter of expand_predicted_variables is `n_neighbors`, so passing the list
# positionally left `expandable_variables` unset and nothing was expanded.
expanded_disambiguator_pt_preds = expand_predicted_variables({"abc": {"ZA6670_VarNL_INC": 0.8, "ZA6670_VarPH_INC": 0.5}}, expandable_variables=expandable_vids)

In [None]:
# `expandable_vids` is passed by keyword: given positionally it binds to
# `n_neighbors`, which together with `n_neighbors=1` raised
# "got multiple values for argument 'n_neighbors'".
expanded_disambiguator_pt_preds = expand_predicted_variables(disambiguator_pt_preds, expandable_variables=expandable_vids, n_neighbors=1)

In [None]:
# NOTE(review): `compare_scores` is not defined in this part of the notebook;
# presumably it contrasts the expanded run with the original one on the same qrels — confirm.
scores = compare_scores(expanded_disambiguator_pt_preds, disambiguator_pt_preds, qrels_dict, qrels_dict, test_df)

In [None]:
# hits@50 of the neighbour-expanded predictions (get_scores is defined in a later cell).
get_scores(qrels_dict, expanded_disambiguator_pt_preds, metrics=["hits@50"])

In [None]:
# hits@50 of the unexpanded predictions, for comparison with the expanded run.
get_scores(qrels_dict, disambiguator_pt_preds, metrics=["hits@50"])

In [None]:
def get_scores(qrels_dict, run_dict, valid_uuids=None, metrics=None):
    """Evaluate a run against relevance judgements, print and return the result.

    Args:
        qrels_dict: nested dict of relevance judgements, keyed by query id.
        run_dict: nested dict of predictions, keyed by query id.
        valid_uuids: optional collection of query ids; when non-empty, both
            dicts are restricted to these ids before evaluation. An empty or
            missing collection means no filtering.
        metrics: metric names passed to `evaluate`. Defaults to
            f1@10, precision@10, recall@10, map@10 and ndcg@10.

    Returns:
        Whatever `evaluate(qrels, run, metrics)` returns.
    """
    if metrics is None:
        # Built per call rather than as a shared mutable default; the old
        # default list also named "f1@10" twice.
        metrics = ["f1@10", "precision@10", "recall@10", "map@10", "ndcg@10"]

    if valid_uuids:
        qrels_dict = {k: v for k, v in qrels_dict.items() if k in valid_uuids}
        run_dict = {k: v for k, v in run_dict.items() if k in valid_uuids}

    qrels = Qrels(qrels_dict)
    run = Run(run_dict)

    results = evaluate(qrels, run, metrics)
    print(results)
    return results

In [None]:
def truncate_dict(nested_dict, top_k=10):
    """Keep only the first `top_k` entries (in insertion order) of every inner dict.

    Returns a new dict; neither the outer nor the inner dicts are modified.
    """
    return {
        outer_key: dict(list(inner.items())[:top_k])
        for outer_key, inner in nested_dict.items()
    }

def get_recall(qrels, run, top_k=None):
    """Mean per-query recall of `run` with respect to `qrels`.

    Args:
        qrels: dict mapping query id -> collection of relevant ids.
        run: dict mapping query id -> {predicted id: score}; a query missing
            from `run` scores 0.
        top_k: if truthy, only the first `top_k` predictions of each query
            are considered.

    Returns:
        The mean over all queries of (relevant ids found) / (relevant ids).

    Raises:
        ZeroDivisionError: if a query has no relevant ids.
    """
    per_query = []
    for uid, relevant in qrels.items():
        retrieved = run.get(uid, {})
        if top_k:
            retrieved = list(retrieved.keys())[:top_k]

        found = sum(1 for q in relevant if q in retrieved)
        per_query.append(found / len(relevant))

    return np.mean(per_query)

In [None]:
# Keep the first 10 predictions per query (truncate_dict's default top_k).
truncated_disambiguator_pt_preds = truncate_dict(disambiguator_pt_preds)

In [None]:
# Mean recall of the truncated predictions, before any neighbour expansion.
get_recall(qrels_dict, truncated_disambiguator_pt_preds)

In [None]:
# Recall after expanding only the variables listed in `expandable_vids`,
# for n_neighbors = 0..9.
# NOTE(review): the printed label is i+1 while the call uses n_neighbors=i,
# so the row labelled 1 is the unexpanded case — confirm this offset is intended.
for i in range(10):
    expanded_truncated_disambiguator_pt_preds = expand_predicted_variables(truncated_disambiguator_pt_preds, expandable_variables=expandable_vids, n_neighbors=i)
    score = get_recall(qrels_dict, expanded_truncated_disambiguator_pt_preds)
    print(i+1, score)

In [None]:
# Same sweep with expand=True, i.e. every predicted variable is expanded.
# NOTE(review): the printed label is i+1 while the call uses n_neighbors=i — confirm.
for i in range(10):
    expanded_truncated_disambiguator_pt_preds = expand_predicted_variables(truncated_disambiguator_pt_preds, n_neighbors=i, expand=True)
    score = get_recall(qrels_dict, expanded_truncated_disambiguator_pt_preds)
    print(i+1, score)

In [None]:
# Load the English training split and keep the rows of document 74709 for inspection.
train_df = pd.read_csv("../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_train_en.tsv", sep="\t")
train_df_74709 = train_df[train_df["doc_id"] == 74709]

In [None]:
# Unique variable ids annotated in the positive rows of document 74709.
_positive_rows_74709 = train_df_74709[train_df_74709["is_variable"].isin(["1", 1])]
variables = [
    v
    for vs in _positive_rows_74709["variable"].tolist()
    for group in vs.split(";") if group
    for v in group.split(",") if v
]
set(variables)

In [None]:
# document-level results
# str(doc_id) -> sorted unique variable ids containing "ZA" annotated in that document.
doc_var_mapping = {}

for doc_id, doc_group in test_df.groupby(by="doc_id"):
    variables = sorted({
        v
        for vs in doc_group["variable"].fillna("").tolist()
        for group in vs.split(";") if group
        for v in group.split(",")
        if v and "ZA" in v
    })

    doc_var_mapping[str(doc_id)] = variables

In [None]:
def get_pred_doc_var_mapping(df, preds, top_k):
    """Collect, per document, the variables predicted for any of its sentences.

    Args:
        df: frame with `doc_id` and `uuid` columns.
        preds: dict mapping uuid -> {variable id: score}.
        top_k: number of leading predictions taken per uuid.

    Returns:
        dict mapping str(doc_id) -> sorted list of unique predicted variable
        ids; documents without predictions map to an empty list.
    """
    mapping = {}

    for doc_id, doc_group in df.groupby("doc_id"):
        doc_vars = set()
        for uid in doc_group["uuid"].tolist():
            ranked = preds.get(uid, {})
            if ranked != {}:
                doc_vars.update(list(ranked.keys())[:top_k])

        mapping[str(doc_id)] = sorted(doc_vars)

    return mapping

In [None]:
def get_accuracy(true, pred):
    """Number of items of `pred` that occur in `true`, divided by len(true).

    Repeated items in `pred` are counted each time they occur.

    Raises:
        ZeroDivisionError: if `true` is empty.
    """
    matched = sum(1 for p in pred if p in true)
    return matched / len(true)

In [None]:
def get_document_accuracies(test_df, preds, max_k=50):
    """Document-level score of `preds` for every cut-off k = 1..max_k.

    For each k, the top-k predictions of every sentence are pooled per
    document (`get_pred_doc_var_mapping`) and compared with the module-level
    `doc_var_mapping` via `get_accuracy`. Documents without gold variables
    are skipped.

    Args:
        test_df: frame with `doc_id` and `uuid` columns.
        preds: dict mapping uuid -> {variable id: score}.
        max_k: largest cut-off evaluated (previously fixed at 50).

    Returns:
        (k_accs, k_stds): per-k mean and standard deviation over documents,
        both scaled by 100.

    Raises:
        KeyError: if a document of `doc_var_mapping` does not occur in `test_df`.
    """
    k_accs = []
    k_stds = []
    for k in range(1, max_k + 1):
        pred_doc_var_mapping = get_pred_doc_var_mapping(test_df, preds, k)
        scores = [
            get_accuracy(true_vars, pred_doc_var_mapping[doc_id])
            for doc_id, true_vars in doc_var_mapping.items()
            if len(true_vars) > 0
        ]

        k_accs.append(np.mean(scores) * 100)
        k_stds.append(np.std(scores) * 100)
    return k_accs, k_stds

In [None]:
# Document-level curves for the unfiltered and the filtered prediction sets.
# NOTE(review): `disambiguator_pt_preds_filtered` is not defined in this part of the
# notebook, and the same two calls are repeated in the plotting cell below.
gold_accs, gold_stds = get_document_accuracies(test_df, disambiguator_pt_preds)
best_accs, best_stds = get_document_accuracies(test_df, disambiguator_pt_preds_filtered)

In [None]:
def make_sosse_dataset_size_plot(models, scores, stds, plot_stds=True, output_dir=None, n_columns=3, fontsize=18, figsize=(30,10), y_offset=1, add_title=True):
    """Plot one score curve per model over k = 1..len(curve).

    Args:
        models: legend label of each curve.
        scores: one sequence of values per model.
        stds: one sequence of standard deviations per model, aligned with `scores`.
        plot_stds: if True, shade score +/- std around every curve.
        output_dir: if given, the directory is created and the figure is saved
            inside it as "<basename of output_dir>.pdf".
        fontsize: font size of tick labels, axis labels and legend.
        y_offset: margin added around the data range before the y-limits are
            clamped to [0, 100].
        n_columns, figsize, add_title: accepted but not used by this function.
    """
    all_values = [value for series in scores for value in series]
    y_min = max(math.floor(min(all_values) - y_offset), 0)
    y_max = min(math.ceil(max(all_values) + y_offset), 100)

    fig, ax = plt.subplots()

    for model, score, std in zip(models, scores, stds):
        score = np.asarray(score)
        std = np.asarray(std)
        ks = list(range(1, len(score) + 1))

        ax.plot(ks, score, label=model)
        if plot_stds:
            ax.fill_between(ks, score - std, score + std, alpha=0.2)

    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, labelsize=fontsize)

    ax.set_ylim(y_min, y_max)
    ax.set_xlabel("Top k", fontsize=fontsize)
    ax.set_ylabel("Recall", fontsize=fontsize)

    handles, labels = ax.get_legend_handles_labels()
    legend = ax.legend(handles, labels, borderaxespad=0.1, loc="best", fancybox=True, framealpha=0.5, fontsize=fontsize)
    legend.set_title("Models", prop={'size': fontsize})

    plt.tight_layout()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, os.path.basename(output_dir) + ".pdf")
        plt.savefig(path, bbox_inches="tight")
        print(f"Save to path: {path}")

    plt.show()
    plt.close()

In [None]:
# Recompute both document-level curves and plot them without std bands; the figure
# is saved under <output_dir>/document-level-recall/.
# NOTE(review): `output_dir` and `disambiguator_pt_preds_filtered` are not defined in
# this part of the notebook, and the two calls duplicate the previous cell.
gold_accs, gold_stds = get_document_accuracies(test_df, disambiguator_pt_preds)
best_accs, best_stds = get_document_accuracies(test_df, disambiguator_pt_preds_filtered)
make_sosse_dataset_size_plot(["Gold", "Best"], [gold_accs, best_accs], [gold_stds, best_stds], plot_stds=False, output_dir=os.path.join(output_dir, "document-level-recall"))

In [None]:
# context window results
# NOTE(review): the first `metrics` assignment is immediately overwritten by the
# second one, so only the type/subtype F1 columns reach the table.
metrics: List[str] = ["test_lang:en:BinaryPrecision()_mean", "test_lang:en:BinaryRecall()_mean", "test_lang:en:BinaryF1Score()_mean", "test_lang:de:BinaryPrecision()_mean", "test_lang:de:BinaryRecall()_mean", "test_lang:de:BinaryF1Score()_mean", "test_BinaryF1Score()_mean"]
metrics: List[str] = ["test_type_short:E:BinaryF1Score()_mean", "test_type_short:I:BinaryF1Score()_mean", "test_subtype_short:Q:BinaryF1Score()_mean", "test_subtype_short:P:BinaryF1Score()_mean"]
# NOTE(review): these literals are not raw strings, so the backslash-t at the start
# of "\text" is read as a TAB character. Check against the names `rename_model`
# produces before switching to raw strings, since the labels have to match.
model_order = ["SoSci-XLM-R$_{\text{base}}$", "XLM-R$_{\text{large}}$"]
print(make_table_diagnostic(full_df[full_df["result_type"].isin(["finetune_contextwindow"])], metrics, model_order))

In [None]:
# Qualitative analysis

# Load best models with different architectures and multiple seeds
# Predict on validation data
# Count the instances that most models fail on

from transformers import pipeline


def load_pipeline(path, batch_size=None):
    """Create a transformers `pipeline` for the model at `path` with the given batch size."""
    return pipeline(model=path, batch_size=batch_size)

val_path = "../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_val_en.tsv"
val_df = pd.read_csv(val_path, sep="\t")
val_df.index = val_df["uuid"]

# uuid -> model name -> list of 0/1 flags, one appended per evaluated seed.
instance_level_md_results = defaultdict(lambda: defaultdict(list))

# NOTE(review): placeholder — with an empty path the os.listdir call below fails,
# so this cell cannot run until a real results directory is filled in.
model_results_dir = ""
model_dirs = [os.path.join(model_results_dir, d) for d in os.listdir(model_results_dir)]
for model_dir in model_dirs:
    seed_dirs = [os.path.join(model_dir, d) for d in os.listdir(model_dir)]
    model_name = os.path.basename(model_dir)

    for seed_dir in seed_dirs:
        pipe = load_pipeline(seed_dir, batch_size=8)
        uuids = val_df["uuid"].tolist()
        texts = val_df["sentence"].tolist()
        labels = val_df["is_variable"].tolist()
        preds = pipe(texts)

        # NOTE(review): `pred` is compared directly with the gold label; presumably the
        # pipeline returns a structured prediction per text rather than a bare label,
        # in which case this is always 0 — confirm and extract the label first.
        for uid,true,pred in zip(uuids,labels,preds):
            instance_level_md_results[uid][model_name].append(1 if true == pred else 0)
        # The two `break`s stop after the first seed of the first model.
        break
    break

In [None]:
# show instances with mentions that all models get wrong

# show instances with mentions that models sometimes get wrong

# show instances without mentions that models get wrong

# show instances without mentions that models sometimes get wrong

In [None]:
valid_models = ["multilingual-e5"]
valid_samples = ["sample=100000", "sample=200000", "sample=400000"]
top_k = 10
# uid -> model name -> sample -> seed -> {gold entity: 1 if among the top_k predictions else 0}
instance_level_ed_results = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict())))


def _list_subdirs(parent):
    """Return the paths of the immediate sub-directories of `parent`."""
    return [os.path.join(parent, d) for d in os.listdir(parent) if os.path.isdir(os.path.join(parent, d))]


model_results_dir = "../results/task2_results_sosci-simlearn/filtered_gen/run_sim_data_size_choice_exp/04-06-2024"
date_dirs = _list_subdirs(model_results_dir)
for date_dir in date_dirs:
    sample_dirs = _list_subdirs(date_dir)

    for sample_dir in sample_dirs:
        sample = os.path.basename(sample_dir)
        seed_dirs = _list_subdirs(sample_dir)

        for seed_dir in seed_dirs:
            seed = os.path.basename(seed_dir)
            # Descend into the first entry listed inside the seed directory.
            seed_dir = os.path.join(seed_dir, os.listdir(seed_dir)[0])
            qrels_path = os.path.join(seed_dir, "qrels.json")
            run_path = os.path.join(seed_dir, "run.json")
            results_path = os.path.join(seed_dir, "results.json")

            qrels = load_json(qrels_path)
            run = load_json(run_path)
            results = load_json(results_path)

            # The configured model path must agree with the directory layout.
            model_path = results["config"]["model_name"]
            assert sample in model_path
            assert seed in model_path
            model_name = os.path.basename(model_path).split("_epochs=50")[0]

            # Only the selected model family and sample sizes are recorded.
            if sample not in valid_samples:
                continue
            if not any(vmodel in model_name for vmodel in valid_models):
                continue

            for uid, entities in qrels.items():
                preds = list(run.get(uid, {}).keys())[:top_k]
                pred_entities = {ent: (1 if ent in preds else 0) for ent in entities}
                instance_level_ed_results[uid][model_name][sample][seed] = pred_entities

In [None]:
from collections import Counter

train_path = "../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_train_en.tsv"
train_df = pd.read_csv(train_path, sep="\t")
val_path = "../results/data/vadis-prolific-3_project_2023-12-09_1251_12:53:08_val_en.tsv"
val_df = pd.read_csv(val_path, sep="\t")
eval_df = pd.concat([train_df, val_df])
eval_df.index = eval_df["uuid"]

aggregated_instance_level_ed_results = {}
aggregated_counts = {-1: 0, 0: 0, 1: 0, 10: 0, 51: 0}

# Aggregate the per-run 0/1 outcomes of `instance_level_ed_results` into one
# outcome per (instance, gold variable), pooling all models, samples and seeds:
#
#   0 or 1            every run produced that same value
#   51                runs disagree and hits (1) outnumber misses (0)
#   str(dict(counts)) runs disagree and hits do not outnumber misses (this covers
#                     ties as well as a majority of misses); tallied under key 10
#   -1                fallback when the number of distinct values is neither 1 nor 2
#
# `aggregated_counts` tallies these outcomes over all variables; the aggregated
# outcomes per instance are stored in `aggregated_instance_level_ed_results`.
for uid, res in instance_level_ed_results.items():
    variable_results = defaultdict(list)

    for model, mres in res.items():
        for sample, sares in mres.items():
            for seed, seres in sares.items():
                for vid, vval in seres.items():
                    variable_results[vid].append(vval)

    aggregated_variable_results = {}
    for vid, vvals in variable_results.items():
        counts = Counter(vvals)
        if len(counts) == 2:
            # Counter indexing yields 0 for an absent key, so this comparison
            # cannot fail on None the way counts.get() could.
            if counts[1] > counts[0]:
                aggregated_variable_results[vid] = 51
                print(51, vid, eval_df.loc[uid]["sentence"])
            else:
                aggregated_variable_results[vid] = str(dict(counts))
        elif len(counts) == 1:
            aggregated_variable_results[vid] = vvals[0]
            print(vvals[0], vid, eval_df.loc[uid]["sentence"])
        else:
            aggregated_variable_results[vid] = -1

        if aggregated_variable_results[vid] in [-1, 0, 1, 51]:
            aggregated_counts[aggregated_variable_results[vid]] += 1
        else:
            aggregated_counts[10] += 1
    print(uid, aggregated_variable_results)
    print("\n")

    aggregated_instance_level_ed_results[uid] = aggregated_variable_results

In [None]:
# Display the outcome tally filled by the aggregation loop in the previous cell.
aggregated_counts