In [1]:
import os
import itertools
from IPython.display import display

import numpy as np
import pandas as pd
import scipy.stats as sts

from tqdm import tqdm

from typing import Callable, Tuple, Any
from functools import partial

import matplotlib.pyplot as plt

# Show floats with four decimal places in every DataFrame rendered below.
pd.set_option("display.float_format", "{:.4f}".format)

In [2]:
# Locations of the pairwise score matrices (one CSV per method).
# An earlier revision read every file from ./matrix/ under different file
# names; MEDBERT is the only constant that still points at that directory.
_MATRICES_DIR = "./matrices"
_LEGACY_MATRIX_DIR = "./matrix"

STATISTIC = _MATRICES_DIR + "/new_cat_jacc_sorted.csv"
BERT = _MATRICES_DIR + "/similarity_scores.csv"
MEDBERT = _LEGACY_MATRIX_DIR + "/scores_medbert.csv"
DEEPSEEK = _MATRICES_DIR + "/scores_DeepSeek-V3_mean.csv"
MLM = _MATRICES_DIR + "/mlm_results_final.csv"
YANDEX = _MATRICES_DIR + "/yandex_prompt_matrix.csv"
DOC = _MATRICES_DIR + "/scores_llm_doc.csv"
QWEN = _MATRICES_DIR + "/scores_qwen3-235b-a22b_0.csv"

In [3]:
def _read_matrix(path):
    """Read one score matrix and use its "Unnamed: 0" column as the row index."""
    frame = pd.read_csv(path)
    return frame.set_index("Unnamed: 0", drop=True)


# One DataFrame per method, loaded in the same order as the path constants.
statistic = _read_matrix(STATISTIC)
bert = _read_matrix(BERT)
medbert = _read_matrix(MEDBERT)
deepseek = _read_matrix(DEEPSEEK)
mlm = _read_matrix(MLM)
yandex = _read_matrix(YANDEX)
doc = _read_matrix(DOC)
qwen = _read_matrix(QWEN)

In [4]:
# Registry of all loaded score matrices, keyed by method name.
matrices = dict(
    statistic=statistic,
    bert=bert,
    medbert=medbert,
    deepseek=deepseek,
    mlm=mlm,
    yandex=yandex,
    doc=doc,
    qwen=qwen,
)

# Method names used as rows (LLMS) and columns (BASELINES) of the
# correlation tables computed further down.
LLMS = [
    "deepseek",
    "qwen",
    "yandex",
    "mlm",
]
BASELINES = [
    "statistic",
    "medbert",
    "bert",
    "doc",
]

In [5]:
# Print each matrix's name and shape, then render its first rows.
for method_name in matrices:
    frame = matrices[method_name]
    print(method_name, frame.shape)
    display(frame.head())

statistic (1757, 1757)


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.04,1.0,0.0,0.0035,0.0,0.0,0.0,0.0043,0.002,0.0042,...,0.0014,0.0008,0.0015,0.001,0.0027,0.0008,0.001,0.0016,0.0009,0.0004
A03,0.0,0.0,1.0,0.0008,0.0,0.0,0.0133,0.0003,0.0021,0.0,...,0.0001,0.0001,0.0002,0.0,0.0,0.0,0.0001,0.0002,0.0,0.0001
A04,0.0,0.0035,0.0008,1.0,0.0019,0.0002,0.0014,0.0462,0.0132,0.0025,...,0.0504,0.0433,0.048,0.0659,0.0577,0.0392,0.0376,0.0336,0.0419,0.0634
A05,0.0,0.0,0.0,0.0019,1.0,0.0,0.0,0.007,0.0041,0.0,...,0.0007,0.001,0.0012,0.001,0.0024,0.0005,0.0009,0.0009,0.0006,0.0007


bert (1696, 1696)


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.8591,0.6663,0.859,0.8232,0.7387,0.8297,0.8829,0.8726,0.8202,...,0.8054,0.7709,0.7201,0.7007,0.7522,0.7798,0.7637,0.739,0.7087,0.7486
A02,0.8591,1.0,0.7781,0.9732,0.8877,0.8647,0.9421,0.9341,0.9137,0.8221,...,0.834,0.8226,0.8044,0.8205,0.8043,0.7879,0.8504,0.8559,0.8342,0.8135
A03,0.6663,0.7781,1.0,0.7914,0.7905,0.9191,0.8056,0.7389,0.7291,0.6688,...,0.7761,0.7331,0.7604,0.8092,0.7333,0.6571,0.7582,0.7807,0.8348,0.7784
A04,0.859,0.9732,0.7914,1.0,0.898,0.8693,0.9584,0.9494,0.9096,0.8111,...,0.844,0.8284,0.8417,0.8409,0.8171,0.7896,0.8612,0.8814,0.8637,0.8327
A05,0.8232,0.8877,0.7905,0.898,1.0,0.8485,0.882,0.8853,0.8774,0.8238,...,0.8873,0.8659,0.8055,0.8212,0.8168,0.7795,0.8332,0.8581,0.8606,0.8815


medbert (1757, 1757)


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.6166,0.6332,0.6998,0.7445,0.5432,0.7018,0.4359,0.6696,0.7296,...,0.6017,0.5904,0.6364,0.619,0.6008,0.6264,0.5834,0.6221,0.6086,0.6181
A02,0.6166,1.0,0.9491,0.894,0.888,0.9076,0.9227,0.8937,0.8813,0.5538,...,0.742,0.7225,0.7114,0.6653,0.7015,0.6537,0.6856,0.7082,0.7338,0.6684
A03,0.6332,0.9491,1.0,0.9109,0.8954,0.9104,0.9271,0.8952,0.9134,0.5566,...,0.7835,0.7579,0.7356,0.7313,0.7477,0.694,0.7162,0.7484,0.7726,0.7015
A04,0.6998,0.894,0.9109,1.0,0.9448,0.8352,0.9124,0.7704,0.9225,0.6792,...,0.7549,0.7353,0.7501,0.7203,0.7821,0.7133,0.6995,0.735,0.7713,0.7205
A05,0.7445,0.888,0.8954,0.9448,1.0,0.8326,0.9455,0.7555,0.9078,0.7106,...,0.7344,0.7036,0.729,0.7024,0.724,0.7377,0.6986,0.7378,0.7455,0.7107


deepseek (2046, 2046)


Unnamed: 0_level_0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.6667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.1667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


mlm (1757, 1757)


Unnamed: 0_level_0,K76,R18,K74,B19,J44,F31,F43,Z87,D69,E87,...,V45,V16,W99,P11,B56,W85,P07,A27,Y64,Y21
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
K76,1.0,-0.0101,0.0322,0.0225,-0.0438,0.0373,-0.0354,0.0613,-0.0103,0.0673,...,0.0494,-0.0321,-0.0638,-0.0201,-0.0219,0.097,-0.0385,0.0032,0.0437,0.0902
R18,-0.0101,1.0,0.0141,0.1117,-0.0041,-0.01,-0.0222,-0.1292,-0.0061,0.0417,...,0.0154,-0.0035,-0.1151,-0.0405,-0.0019,0.02,-0.0939,0.0127,-0.061,-0.0541
K74,0.0322,0.0141,1.0,-0.0253,0.0296,0.0193,0.1311,0.0386,-0.0387,-0.0299,...,0.1384,-0.0697,-0.0374,0.0364,0.0299,-0.0097,-0.0418,-0.0631,0.0735,0.0187
B19,0.0225,0.1117,-0.0253,1.0,-0.0018,0.0264,0.039,0.0423,-0.0289,0.0212,...,0.0655,-0.0218,0.0067,0.0087,-0.1204,0.0664,0.0325,0.0867,-0.0737,-0.0318
J44,-0.0438,-0.0041,0.0296,-0.0018,1.0,0.0409,-0.0069,0.0965,-0.0023,0.1058,...,0.056,-0.0064,-0.0088,0.0241,0.0249,-0.0585,0.0481,0.0353,0.0225,0.1176


yandex (1696, 1696)


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.7,1.0,0.7,0.6,0.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.7,1.0,0.7,0.6,0.7,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.6,0.7,1.0,0.9,0.0,0.7,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05,0.0,0.0,0.6,0.9,1.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


doc (1696, 1696)


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7428,0.7808,0.7677,0.7582,0.7838,0.7795,0.7592,0.7595,0.7714,...,0.6238,0.6441,0.6317,0.5498,0.6119,0.5632,0.5865,0.6073,0.651,0.5643
A02,0.7428,1.0,0.7807,0.8987,0.8605,0.7332,0.8561,0.8724,0.8412,0.7605,...,0.5522,0.5543,0.5303,0.4858,0.5738,0.5114,0.5196,0.4976,0.4986,0.4752
A03,0.7808,0.7807,1.0,0.7488,0.7348,0.8481,0.8101,0.7181,0.7162,0.7141,...,0.6193,0.5914,0.6499,0.6216,0.6201,0.5252,0.6123,0.6131,0.6108,0.5523
A04,0.7677,0.8987,0.7488,1.0,0.8766,0.7642,0.8775,0.9298,0.8976,0.791,...,0.5635,0.5647,0.5862,0.4959,0.6634,0.5506,0.5771,0.5436,0.5822,0.5037
A05,0.7582,0.8605,0.7348,0.8766,1.0,0.7277,0.8159,0.805,0.8306,0.7891,...,0.6396,0.6042,0.5617,0.486,0.5999,0.4987,0.5077,0.5138,0.5346,0.574


qwen (2032, 2032)


Unnamed: 0_level_0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Labels that occur in both the rows and the columns of every matrix.
# Built here so the cell does not rely on a name left over from an earlier
# kernel session: no other cell in this notebook defines `intersection`.
intersection = set.intersection(
    *(set(frame.index) & set(frame.columns) for frame in matrices.values())
)
# Sorted so that every matrix is later sliced into the same row/column order.
intersection = sorted(list(intersection))

In [8]:
# Number of labels kept in `intersection`.
len(intersection)

1696

In [9]:
# Cut every matrix down to the `intersection` labels, rows and columns alike,
# replacing the entries of `matrices` in place.
for method_name in list(matrices):
    full_frame = matrices[method_name]
    matrices[method_name] = full_frame.loc[intersection, intersection]

In [10]:
# Show the first three rows of each matrix after the restriction.
for method_name in matrices:
    print(method_name)
    display(matrices[method_name].head(3))

statistic


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.04,1.0,0.0,0.0035,0.0,0.0,0.0,0.0043,0.002,0.0042,...,0.0014,0.0008,0.0015,0.001,0.0027,0.0008,0.001,0.0016,0.0009,0.0004
A03,0.0,0.0,1.0,0.0008,0.0,0.0,0.0133,0.0003,0.0021,0.0,...,0.0001,0.0001,0.0002,0.0,0.0,0.0,0.0001,0.0002,0.0,0.0001


bert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.8591,0.6663,0.859,0.8232,0.7387,0.8297,0.8829,0.8726,0.8202,...,0.8054,0.7709,0.7201,0.7007,0.7522,0.7798,0.7637,0.739,0.7087,0.7486
A02,0.8591,1.0,0.7781,0.9732,0.8877,0.8647,0.9421,0.9341,0.9137,0.8221,...,0.834,0.8226,0.8044,0.8205,0.8043,0.7879,0.8504,0.8559,0.8342,0.8135
A03,0.6663,0.7781,1.0,0.7914,0.7905,0.9191,0.8056,0.7389,0.7291,0.6688,...,0.7761,0.7331,0.7604,0.8092,0.7333,0.6571,0.7582,0.7807,0.8348,0.7784


medbert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.6166,0.6332,0.6998,0.7445,0.5432,0.7018,0.4359,0.6696,0.7296,...,0.6017,0.5904,0.6364,0.619,0.6008,0.6264,0.5834,0.6221,0.6086,0.6181
A02,0.6166,1.0,0.9491,0.894,0.888,0.9076,0.9227,0.8937,0.8813,0.5538,...,0.742,0.7225,0.7114,0.6653,0.7015,0.6537,0.6856,0.7082,0.7338,0.6684
A03,0.6332,0.9491,1.0,0.9109,0.8954,0.9104,0.9271,0.8952,0.9134,0.5566,...,0.7835,0.7579,0.7356,0.7313,0.7477,0.694,0.7162,0.7484,0.7726,0.7015


deepseek


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.6667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


mlm


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.0208,0.1081,0.1088,0.063,0.0282,0.0256,0.0799,0.0677,-0.0173,...,0.0978,-0.0761,0.0894,0.0925,0.0273,0.0901,-0.0043,0.0693,0.059,0.0346
A02,0.0208,1.0,0.0616,0.0026,0.0355,0.0307,-0.0556,-0.0596,-0.1279,0.1108,...,0.0298,0.0078,0.0123,-0.0127,-0.0096,-0.0007,-0.0207,-0.0892,0.1506,-0.1049
A03,0.1081,0.0616,1.0,0.0432,-0.0119,-0.0439,0.0509,0.0006,-0.0259,-0.051,...,0.0743,0.0518,0.0443,0.0313,0.0771,-0.0249,-0.0674,0.0465,0.0371,0.0098


yandex


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.7,1.0,0.7,0.6,0.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.7,1.0,0.7,0.6,0.7,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


doc


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7428,0.7808,0.7677,0.7582,0.7838,0.7795,0.7592,0.7595,0.7714,...,0.6238,0.6441,0.6317,0.5498,0.6119,0.5632,0.5865,0.6073,0.651,0.5643
A02,0.7428,1.0,0.7807,0.8987,0.8605,0.7332,0.8561,0.8724,0.8412,0.7605,...,0.5522,0.5543,0.5303,0.4858,0.5738,0.5114,0.5196,0.4976,0.4986,0.4752
A03,0.7808,0.7807,1.0,0.7488,0.7348,0.8481,0.8101,0.7181,0.7162,0.7141,...,0.6193,0.5914,0.6499,0.6216,0.6201,0.5252,0.6123,0.6131,0.6108,0.5523


qwen


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# LLM x baseline table for Pearson's r; -1.0 is the placeholder value
# until the loop in the next cell fills each entry.
pearsonr = pd.DataFrame(
    np.full((len(LLMS), len(BASELINES)), -1.0),
    index=LLMS,
    columns=BASELINES,
)

In [34]:
# Pearson correlation between the flattened score matrices of every
# (LLM, baseline) pair.
# NOTE(review): flattening keeps the diagonal and both (i, j) / (j, i) entries.
# The previews above show a 0.0 diagonal for deepseek and qwen but 1.0 for the
# other matrices - confirm that mixing these is intended.
for llm, baseline in tqdm(itertools.product(LLMS, BASELINES)):
    print("Computing estimate for {} and {}".format(llm, baseline))
    llm_scores = matrices[llm].values.flatten()
    baseline_scores = matrices[baseline].values.flatten()
    pearson_result = sts.pearsonr(llm_scores, baseline_scores)
    pearsonr.loc[llm, baseline] = pearson_result.statistic

2it [00:00, 17.60it/s]

Computing estimate for deepseek and statistic
Computing estimate for deepseek and medbert
Computing estimate for deepseek and bert
Computing estimate for deepseek and doc


6it [00:00, 17.82it/s]

Computing estimate for qwen and statistic
Computing estimate for qwen and medbert
Computing estimate for qwen and bert
Computing estimate for qwen and doc


10it [00:00, 15.96it/s]

Computing estimate for yandex and statistic
Computing estimate for yandex and medbert
Computing estimate for yandex and bert


14it [00:00, 16.26it/s]

Computing estimate for yandex and doc
Computing estimate for mlm and statistic
Computing estimate for mlm and medbert
Computing estimate for mlm and bert


16it [00:00, 16.76it/s]

Computing estimate for mlm and doc





In [35]:
# Pearson correlation table: rows are LLMS, columns are BASELINES.
pearsonr

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0422,0.0412,-0.0231,0.1729
qwen,0.0465,0.0512,0.0327,0.127
yandex,0.5158,0.1183,0.0701,0.1819
mlm,0.3449,0.0382,0.0239,0.0532


In [11]:
# LLM x baseline tables for Spearman's rho and its p-value; every entry starts
# at the placeholder -1.0 and is overwritten by the loop in the next cell.
_table_shape = (len(LLMS), len(BASELINES))
spearmanr = pd.DataFrame(np.full(_table_shape, -1.0), index=LLMS, columns=BASELINES)
spearmanp = pd.DataFrame(np.full(_table_shape, -1.0), index=LLMS, columns=BASELINES)

In [12]:
# Spearman rank correlation (and its p-value) between the flattened score
# matrices of every (LLM, baseline) pair.
# NOTE(review): flattening keeps the diagonal and both (i, j) / (j, i) entries.
# The previews above show a 0.0 diagonal for deepseek and qwen but 1.0 for the
# other matrices - confirm that mixing these is intended.
for llm, baseline in tqdm(itertools.product(LLMS, BASELINES), total=len(LLMS) * len(BASELINES)):
    print("Computing estimate for {} and {}".format(llm, baseline))
    # Unpack into `rho`, not `statistic`: `statistic` is the DataFrame loaded
    # from STATISTIC, and rebinding it here would silently replace that frame
    # for any cell that is (re-)run afterwards.
    rho, p_value = sts.spearmanr(matrices[llm].values.flatten(), matrices[baseline].values.flatten())
    spearmanr.loc[llm, baseline] = rho
    spearmanp.loc[llm, baseline] = p_value

# Write the finished tables once instead of rewriting both files on every pair.
spearmanr.to_csv("spearman.csv")
spearmanp.to_csv("spearman_p_value.csv")

0it [00:00, ?it/s]

Computing estimate for deepseek and statistic


1it [00:00,  2.28it/s]

Computing estimate for deepseek and medbert


2it [00:00,  2.41it/s]

Computing estimate for deepseek and bert


3it [00:01,  2.39it/s]

Computing estimate for deepseek and doc


4it [00:01,  2.36it/s]

Computing estimate for qwen and statistic


5it [00:02,  2.45it/s]

Computing estimate for qwen and medbert


6it [00:02,  2.42it/s]

Computing estimate for qwen and bert


7it [00:02,  2.41it/s]

Computing estimate for qwen and doc


8it [00:03,  2.45it/s]

Computing estimate for yandex and statistic


9it [00:03,  2.44it/s]

Computing estimate for yandex and medbert


10it [00:04,  2.43it/s]

Computing estimate for yandex and bert


11it [00:04,  2.47it/s]

Computing estimate for yandex and doc


12it [00:04,  2.44it/s]

Computing estimate for mlm and statistic


13it [00:05,  2.41it/s]

Computing estimate for mlm and medbert


14it [00:05,  2.36it/s]

Computing estimate for mlm and bert


15it [00:06,  2.26it/s]

Computing estimate for mlm and doc


16it [00:06,  2.38it/s]


In [13]:
# Spearman correlation table: rows are LLMS, columns are BASELINES.
spearmanr

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0064,0.0244,-0.0217,0.139
qwen,0.0714,0.0291,0.0292,0.0929
yandex,0.0698,0.0701,0.0565,0.0925
mlm,-0.0017,0.0051,0.0027,0.0076


In [14]:
# p-values returned by sts.spearmanr for the same (LLM, baseline) pairs.
spearmanp

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0,0.0,0.0,0.0
qwen,0.0,0.0,0.0,0.0
yandex,0.0,0.0,0.0,0.0
mlm,0.0043,0.0,0.0,0.0


In [22]:
def _scorer_name(scorer) -> str:
    """Return a printable name for ``scorer`` for use in log messages.

    ``functools.partial`` objects are unwrapped to the function they wrap;
    callables without a ``__name__`` fall back to their ``repr``.
    """
    target = scorer.func if isinstance(scorer, partial) else scorer
    return getattr(target, "__name__", repr(target))


def compute_bootstrapped_score(
        y_test: np.ndarray,
        y_prob: np.ndarray,
        scorer: Callable[[np.ndarray, np.ndarray], Any],
        m_sample: int = None,
        stratum_vals: np.ndarray = None,
        rng: Any = None
    ):
    """Score one bootstrap resample of the paired arrays ``y_test`` / ``y_prob``.

    Args:
        y_test: First array; must be a NumPy array, not a ``pd.Series``.
        y_prob: Second array, paired element-wise with ``y_test``.
        scorer: Callable applied to the two resampled arrays. It may return a
            plain number or an object exposing ``.statistic`` (as the
            ``scipy.stats`` correlation functions do).
        m_sample: Resample size for the non-stratified case; defaults to
            ``len(y_test)``. Ignored when ``stratum_vals`` is given.
        stratum_vals: Optional per-element stratum labels. When given, each
            stratum is resampled with replacement at its own original size.
        rng: Optional ``np.random.Generator``. When omitted, the global
            ``np.random`` state is used, as before.

    Returns:
        The score of the resample, or ``np.nan`` (after printing a warning)
        if the scorer raised.
    """
    assert not isinstance(y_test, pd.Series), "y_test should be np.array"
    assert not isinstance(y_prob, pd.Series), "y_prob should be np.array"

    draw = np.random.choice if rng is None else rng.choice
    idx = np.arange(len(y_test))
    if m_sample is None:
        m_sample = len(y_test)  # bootstrap sample size

    if stratum_vals is not None:
        # np.asarray makes the element-wise comparison below work for lists too.
        stratum_vals = np.asarray(stratum_vals)
        parts = []
        for val in np.unique(stratum_vals):
            stratum_idx = idx[stratum_vals == val]
            parts.append(draw(stratum_idx, size=len(stratum_idx), replace=True))
        idx_bs = np.concatenate(parts) if parts else idx[:0]
    else:
        idx_bs = draw(idx, size=m_sample, replace=True)

    try:
        result = scorer(y_test[idx_bs], y_prob[idx_bs])
    except Exception as e:
        print("WARNING: Bootstrapping failed for", _scorer_name(scorer), "with error", e)
        return np.nan
    # Result objects carry the value in `.statistic`; plain numbers are returned as-is.
    return getattr(result, "statistic", result)


def compute_ci(
    y_test: np.ndarray,
    y_prob: np.ndarray,
    stratum_vals: np.ndarray = None,
    n_bootstraps: int = 1000,
    m_sample: int = None,
    scorer: Callable[[np.ndarray, np.ndarray], Any] = sts.spearmanr,
    alpha: float = 0.05, #95% CI
    verbose: int = 1,
    return_se: bool = False,
    random_state: Any = None
):
    """Bootstrap estimate of ``scorer(y_test, y_prob)`` with a normal-approximation CI.

    Args:
        y_test, y_prob: Paired NumPy arrays of equal length.
        stratum_vals: Optional stratum labels forwarded to
            ``compute_bootstrapped_score``.
        n_bootstraps: Number of bootstrap replicates.
        m_sample: Resample size forwarded to ``compute_bootstrapped_score``.
        scorer: Scoring callable (default ``scipy.stats.spearmanr``).
        alpha: Significance level; the half-width uses the ``1 - alpha/2``
            normal quantile.
        verbose: ``0`` is silent, ``>0`` prints a header and shows a progress
            bar, ``>1`` additionally plots a histogram of the scores.
        return_se: Also return the bootstrap standard error.
        random_state: Optional seed (anything ``np.random.default_rng``
            accepts) that makes the resampling reproducible. ``None`` keeps
            the previous behaviour of drawing from the global ``np.random``.

    Returns:
        ``(estimation, e_perc)`` or, with ``return_se``, ``(estimation, e_perc, se)``:
        the NaN-ignoring mean of the bootstrap scores, the CI half-width
        ``se * z`` and the NaN-ignoring standard deviation ``se``.
        All values are ``np.nan`` when more than half of the replicates
        failed (or when there are no replicates).
    """
    assert len(y_test) == len(y_prob), "y_test and y_prob should have the same lengths"

    rng = None if random_state is None else np.random.default_rng(random_state)
    name = _scorer_name(scorer)

    if verbose > 0:
        print(f"Bootstrap scores computing for {name}...")
        rounds = tqdm(range(n_bootstraps))
    else:
        rounds = range(n_bootstraps)
    scores = np.array(
        [
            compute_bootstrapped_score(
                y_test, y_prob, scorer, stratum_vals=stratum_vals, m_sample=m_sample, rng=rng
            )
            for _ in rounds
        ],
        dtype=float,
    )

    # With no replicates at all there is nothing to estimate; treat it as all-NaN.
    nans_share = float(np.mean(np.isnan(scores))) if scores.size else 1.0
    if nans_share > 0.5: #empirical threshold, you can change it if you have better solution
        print(f"WARNING: There is {nans_share*100:.0f}% NaNs in bootstrapped scores for {name}")
        # Never ask for more distinct entries than exist (a sample of 20 without
        # replacement raises on shorter inputs).
        n_show = min(20, len(y_test))
        draw = np.random.choice if rng is None else rng.choice
        random_idxs = draw(len(y_test), size=n_show, replace=False)
        print(f"       random {n_show} entries from y_test:", y_test[random_idxs])
        print(f"corresponging {n_show} entries from y_prob:", y_prob[random_idxs])
        if return_se:
            return np.nan, np.nan, np.nan
        else:
            return np.nan, np.nan

    estimation = np.nanmean(scores)
    se = np.nanstd(scores)
    perc = sts.norm.ppf(1 - alpha/2)
    e_perc = se * perc

    if verbose > 1:
        plt.figure(figsize=(4, 2.5))
        # Failed replicates are NaN; the histogram cannot bin them, so drop them.
        plt.hist(scores[~np.isnan(scores)], bins=50)
        plt.axvline(x = estimation, color = 'tab:orange', label = 'mean')
        plt.axvline(x = estimation - e_perc, color = 'tab:red', label = f'mean - e_{1-alpha:.2f}')
        plt.axvline(x = estimation + e_perc, color = 'tab:red', label = f'mean + e_{1-alpha:.2f}')
        plt.legend(fontsize="small")  # the labels above are only visible with a legend
        plt.show()

    if return_se:
        return estimation, e_perc, se
    else:
        return estimation, e_perc

In [23]:
# LLM x baseline tables for the bootstrap mean, standard error and CI
# half-width; each starts filled with the placeholder -1.0.
_bootstrap_table_shape = (len(LLMS), len(BASELINES))
spearman_mean = pd.DataFrame(np.full(_bootstrap_table_shape, -1.0), index=LLMS, columns=BASELINES)
spearman_se = pd.DataFrame(np.full(_bootstrap_table_shape, -1.0), index=LLMS, columns=BASELINES)
spearman_e_perc = pd.DataFrame(np.full(_bootstrap_table_shape, -1.0), index=LLMS, columns=BASELINES)

In [24]:
from multiprocessing import Pool

def _worker_pair(args):
    """Compute CI for a single (llm, baseline) pair. Top-level for pickling.

    Args:
        args: Tuple ``(llm, baseline, x_flat, y_flat, kwargs)`` where the two
            flat arrays are the paired scores and ``kwargs`` is forwarded to
            ``compute_ci`` (``return_se`` is set here and must not be in it).

    Returns:
        ``(llm, baseline, r, e_perc, se)`` - the pair names followed by the
        three values returned by ``compute_ci(..., return_se=True)``.
    """
    import zlib  # local import: only needed to derive the per-pair seed

    (llm, baseline, x_flat, y_flat, kwargs) = args

    # When the pool forks its workers, each one starts from a copy of the
    # parent's global NumPy RNG state, so different workers would draw the same
    # resampling indices. Seeding from the pair name gives every pair its own
    # stream and makes the result reproducible between runs.
    np.random.seed(zlib.crc32(f"{llm}|{baseline}".encode("utf-8")))

    # call your existing function (single-process bootstrap inside)
    r, e_perc, se = compute_ci(
        x_flat, y_flat,
        return_se=True,
        **kwargs
    )
    return llm, baseline, r, e_perc, se

if __name__ == "__main__":  # important for Windows / notebooks
    # Result tables (rows: LLMS, columns: BASELINES), filled with -1 as a
    # placeholder until each pair reports back.
    spearman_mean   = pd.DataFrame(-np.ones((len(LLMS), len(BASELINES))), columns=BASELINES, index=LLMS)
    spearman_se     = pd.DataFrame(-np.ones((len(LLMS), len(BASELINES))), columns=BASELINES, index=LLMS)
    spearman_e_perc = pd.DataFrame(-np.ones((len(LLMS), len(BASELINES))), columns=BASELINES, index=LLMS)

    # build tasks (pre-flatten to avoid doing it in workers repeatedly)
    common_kwargs = dict(
        scorer=sts.spearmanr,
        n_bootstraps=1000,   # adjust as you like
        m_sample=None,
        stratum_vals=None,
        alpha=0.05,
        verbose=0            # silence per-worker prints (compute_ci prints when verbose > 0)
    )
    tasks = []
    for llm, baseline in itertools.product(LLMS, BASELINES):
        x_flat = matrices[llm].values.ravel()
        y_flat = matrices[baseline].values.ravel()
        tasks.append((llm, baseline, x_flat, y_flat, common_kwargs))

    # run in parallel across pairs; results arrive in completion order, so each
    # one is written back by its (llm, baseline) labels
    n_jobs = None  # or an int like 8
    with Pool(processes=n_jobs) as pool:
        finished = pool.imap_unordered(_worker_pair, tasks, chunksize=1)
        # total= lets tqdm show real progress; the lazy iterator has no length
        for llm, baseline, r, e_perc, se in tqdm(finished, total=len(tasks)):
            spearman_mean.loc[llm, baseline]   = r
            spearman_se.loc[llm, baseline]     = se
            spearman_e_perc.loc[llm, baseline] = e_perc

    spearman_mean.to_csv("spearman_mean.csv")
    spearman_se.to_csv("spearman_se.csv")
    spearman_e_perc.to_csv("spearman_e_perc.csv")

16it [16:26, 61.63s/it]


In [25]:
# Mean of the bootstrapped Spearman scores per (LLM, baseline) pair
# (first value returned by compute_ci).
spearman_mean

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0064,0.0244,-0.0217,0.139
qwen,0.0714,0.0291,0.0293,0.0929
yandex,0.0698,0.0701,0.0565,0.0926
mlm,-0.0017,0.0052,0.0027,0.0076


In [32]:
# CI half-width per pair: bootstrap standard error times the normal quantile
# for 1 - alpha/2 (alpha = 0.05 in the run above).
spearman_e_perc

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0012,0.0012,0.0011,0.0012
qwen,0.0013,0.0013,0.0012,0.0012
yandex,0.0014,0.0012,0.0012,0.0011
mlm,0.0012,0.0012,0.0012,0.0012


In [26]:
# Bootstrap standard error per pair (np.nanstd of the bootstrap scores).
spearman_se

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.0006,0.0006,0.0006,0.0006
qwen,0.0007,0.0007,0.0006,0.0006
yandex,0.0007,0.0006,0.0006,0.0006
mlm,0.0006,0.0006,0.0006,0.0006


In [3]:
# Reload the tables saved with DataFrame.to_csv. The first CSV column holds the
# row labels, so it is restored as the index rather than left as an
# "Unnamed: 0" data column.
spearman_mean = pd.read_csv('spearman_mean.csv', index_col=0)
spearman_se = pd.read_csv('spearman_se.csv', index_col=0)