In [3]:
import os
import itertools
from IPython.display import display

import numpy as np
import pandas as pd
import scipy.stats as sts

from tqdm import tqdm

from typing import Callable, Tuple, Any
from functools import partial

import matplotlib.pyplot as plt

# Render every float in displayed frames with three decimal places.
pd.set_option("display.float_format", "{:.3f}".format)

In [4]:
# All score matrices live in one directory; only the file name differs.
MATRIX_DIR = "./matrix"

STATISTIC = f"{MATRIX_DIR}/new_cat_jacc_sorted.csv"
BERT = f"{MATRIX_DIR}/scores_bert.csv"
MEDBERT = f"{MATRIX_DIR}/scores_medbert.csv"
DEEPSEEK = f"{MATRIX_DIR}/scores_DeepSeek-V3_mean.csv"
MLM = f"{MATRIX_DIR}/scores_mlm.csv"
YANDEX = f"{MATRIX_DIR}/scores_yandex_gpt5.csv"
DOC = f"{MATRIX_DIR}/scores_yandex_doc_search.csv"
QWEN = f"{MATRIX_DIR}/scores_qwen3-235b-a22b_0.csv"

In [5]:
def _load_matrix(csv_path):
    """Read one score matrix from CSV, using its "Unnamed: 0" column as the row index."""
    frame = pd.read_csv(csv_path)
    return frame.set_index("Unnamed: 0", drop=True)


statistic = _load_matrix(STATISTIC)
bert = _load_matrix(BERT)
medbert = _load_matrix(MEDBERT)
deepseek = _load_matrix(DEEPSEEK)
mlm = _load_matrix(MLM)
yandex = _load_matrix(YANDEX)
doc = _load_matrix(DOC)
qwen = _load_matrix(QWEN)

In [6]:
# Registry of every loaded score matrix, keyed by a short method name.
matrices = dict(
    statistic=statistic,
    bert=bert,
    medbert=medbert,
    deepseek=deepseek,
    mlm=mlm,
    yandex=yandex,
    doc=doc,
    qwen=qwen,
)

# Methods under evaluation (table rows) and the references they are compared to (table columns).
LLMS = [
    "deepseek",
    "qwen",
    "yandex",
    "mlm",
]
BASELINES = [
    "statistic",
    "medbert",
    "bert",
    "doc",
]

In [7]:
# Quick look at the first rows of every matrix as loaded.
for name in matrices:
    print(name)
    display(matrices[name].head())

statistic


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.04,1.0,0.0,0.003,0.0,0.0,0.0,0.004,0.002,0.004,...,0.001,0.001,0.001,0.001,0.003,0.001,0.001,0.002,0.001,0.0
A03,0.0,0.0,1.0,0.001,0.0,0.0,0.013,0.0,0.002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.003,0.001,1.0,0.002,0.0,0.001,0.046,0.013,0.003,...,0.05,0.043,0.048,0.066,0.058,0.039,0.038,0.034,0.042,0.063
A05,0.0,0.0,0.0,0.002,1.0,0.0,0.0,0.007,0.004,0.0,...,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001


bert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.859,0.666,0.859,0.823,0.739,0.83,0.883,0.873,0.82,...,0.805,0.771,0.72,0.701,0.752,0.78,0.764,0.739,0.709,0.749
A02,0.859,1.0,0.778,0.973,0.888,0.865,0.942,0.934,0.914,0.822,...,0.834,0.823,0.804,0.821,0.804,0.788,0.85,0.856,0.834,0.814
A03,0.666,0.778,1.0,0.791,0.791,0.919,0.806,0.739,0.729,0.669,...,0.776,0.733,0.76,0.809,0.733,0.657,0.758,0.781,0.835,0.778
A04,0.859,0.973,0.791,1.0,0.898,0.869,0.958,0.949,0.91,0.811,...,0.844,0.828,0.842,0.841,0.817,0.79,0.861,0.881,0.864,0.833
A05,0.823,0.888,0.791,0.898,1.0,0.848,0.882,0.885,0.877,0.824,...,0.887,0.866,0.806,0.821,0.817,0.779,0.833,0.858,0.861,0.881


medbert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.617,0.633,0.7,0.744,0.543,0.702,0.436,0.67,0.73,...,0.602,0.59,0.636,0.619,0.601,0.626,0.583,0.622,0.609,0.618
A02,0.617,1.0,0.949,0.894,0.888,0.908,0.923,0.894,0.881,0.554,...,0.742,0.723,0.711,0.665,0.702,0.654,0.686,0.708,0.734,0.668
A03,0.633,0.949,1.0,0.911,0.895,0.91,0.927,0.895,0.913,0.557,...,0.783,0.758,0.736,0.731,0.748,0.694,0.716,0.748,0.773,0.702
A04,0.7,0.894,0.911,1.0,0.945,0.835,0.912,0.77,0.923,0.679,...,0.755,0.735,0.75,0.72,0.782,0.713,0.7,0.735,0.771,0.721
A05,0.744,0.888,0.895,0.945,1.0,0.833,0.946,0.756,0.908,0.711,...,0.734,0.704,0.729,0.702,0.724,0.738,0.699,0.738,0.746,0.711


deepseek


Unnamed: 0_level_0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.167,0.0,0.0,0.0,0.0,0.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.0,0.0,0.167,0.0,0.0,0.0,0.0,0.0,0.167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


mlm


Unnamed: 0_level_0,K76,R18,K74,B19,J44,F31,F43,Z87,D69,E87,...,V45,V16,W99,P11,B56,W85,P07,A27,Y64,Y21
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
K76,1.0,0.037,-0.139,0.031,0.061,-0.057,-0.047,-0.009,0.062,0.032,...,-0.017,-0.016,0.039,-0.023,0.02,0.041,-0.013,-0.09,-0.067,-0.041
R18,0.037,1.0,-0.078,0.086,-0.134,-0.019,0.045,-0.019,0.094,0.027,...,0.052,0.011,0.051,-0.037,-0.171,-0.011,0.05,0.031,-0.075,0.094
K74,-0.139,-0.078,1.0,0.001,0.132,0.007,0.053,0.039,0.093,0.037,...,0.033,0.065,-0.025,0.071,0.006,-0.049,-0.048,-0.056,0.046,0.023
B19,0.031,0.086,0.001,1.0,-0.079,0.082,0.09,-0.03,0.087,0.053,...,0.04,0.007,-0.066,-0.074,0.009,0.002,0.019,0.008,0.043,-0.007
J44,0.061,-0.134,0.132,-0.079,1.0,-0.035,-0.097,-0.009,-0.059,0.021,...,0.041,-0.022,0.04,0.031,-0.001,-0.03,0.027,-0.038,0.057,-0.031


yandex


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.7,1.0,0.7,0.6,0.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.7,1.0,0.7,0.6,0.7,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.6,0.7,1.0,0.9,0.0,0.7,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05,0.0,0.0,0.6,0.9,1.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


doc


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.743,0.781,0.768,0.758,0.784,0.779,0.759,0.76,0.771,...,0.624,0.644,0.632,0.55,0.612,0.563,0.587,0.607,0.651,0.564
A02,0.743,1.0,0.781,0.899,0.861,0.733,0.856,0.872,0.841,0.76,...,0.552,0.554,0.53,0.486,0.574,0.511,0.52,0.498,0.499,0.475
A03,0.781,0.781,1.0,0.749,0.735,0.848,0.81,0.718,0.716,0.714,...,0.619,0.591,0.65,0.622,0.62,0.525,0.612,0.613,0.611,0.552
A04,0.768,0.899,0.749,1.0,0.877,0.764,0.877,0.93,0.898,0.791,...,0.563,0.565,0.586,0.496,0.663,0.551,0.577,0.544,0.582,0.504
A05,0.758,0.861,0.735,0.877,1.0,0.728,0.816,0.805,0.831,0.789,...,0.64,0.604,0.562,0.486,0.6,0.499,0.508,0.514,0.535,0.574


qwen


Unnamed: 0_level_0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Labels present in every matrix on BOTH axes: the result is later used to select
# rows and columns alike, so a label missing from either axis must be excluded.
intersection = None
for frame in matrices.values():
    labels = set(frame.index) & set(frame.columns)
    intersection = labels if intersection is None else intersection & labels

# Fail loudly here rather than silently correlating empty matrices further down.
assert intersection, "matrices share no common labels"

In [9]:
# Fix a deterministic (alphabetical) order for the shared labels.
intersection = sorted(intersection)

In [10]:
# Number of row labels shared by all matrices.
len(intersection)

1696

In [11]:
# Peek at the first ten shared labels (sorted order).
intersection[:10]

['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A15']

In [12]:
# Restrict every matrix to the shared labels, on both axes and in the same order,
# so that flattened matrices line up element by element.
matrices.update({
    name: frame.loc[intersection, intersection]
    for name, frame in matrices.items()
})

In [13]:
# Re-inspect the matrices after alignment to the shared labels.
for name in matrices:
    print(name)
    display(matrices[name].head(3))

statistic


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.04,1.0,0.0,0.003,0.0,0.0,0.0,0.004,0.002,0.004,...,0.001,0.001,0.001,0.001,0.003,0.001,0.001,0.002,0.001,0.0
A03,0.0,0.0,1.0,0.001,0.0,0.0,0.013,0.0,0.002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


bert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.859,0.666,0.859,0.823,0.739,0.83,0.883,0.873,0.82,...,0.805,0.771,0.72,0.701,0.752,0.78,0.764,0.739,0.709,0.749
A02,0.859,1.0,0.778,0.973,0.888,0.865,0.942,0.934,0.914,0.822,...,0.834,0.823,0.804,0.821,0.804,0.788,0.85,0.856,0.834,0.814
A03,0.666,0.778,1.0,0.791,0.791,0.919,0.806,0.739,0.729,0.669,...,0.776,0.733,0.76,0.809,0.733,0.657,0.758,0.781,0.835,0.778


medbert


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.617,0.633,0.7,0.744,0.543,0.702,0.436,0.67,0.73,...,0.602,0.59,0.636,0.619,0.601,0.626,0.583,0.622,0.609,0.618
A02,0.617,1.0,0.949,0.894,0.888,0.908,0.923,0.894,0.881,0.554,...,0.742,0.723,0.711,0.665,0.702,0.654,0.686,0.708,0.734,0.668
A03,0.633,0.949,1.0,0.911,0.895,0.91,0.927,0.895,0.913,0.557,...,0.783,0.758,0.736,0.731,0.748,0.694,0.716,0.748,0.773,0.702


deepseek


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.167,0.0,0.0,0.0,0.0,0.667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


mlm


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,-0.019,-0.043,-0.025,-0.021,0.005,-0.049,-0.065,0.024,-0.031,...,-0.1,-0.028,-0.013,0.059,0.012,-0.008,-0.025,-0.0,-0.048,0.0
A02,-0.019,1.0,0.04,0.098,0.076,0.065,-0.003,0.02,-0.158,-0.068,...,0.103,-0.07,0.108,-0.058,-0.052,-0.019,0.024,-0.024,0.034,0.032
A03,-0.043,0.04,1.0,0.101,0.052,-0.002,-0.042,-0.067,-0.053,-0.005,...,-0.102,-0.055,-0.003,0.078,0.1,0.003,0.039,-0.045,-0.021,0.088


yandex


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.7,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.7,1.0,0.7,0.6,0.0,0.0,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.7,1.0,0.7,0.6,0.7,0.0,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


doc


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,1.0,0.743,0.781,0.768,0.758,0.784,0.779,0.759,0.76,0.771,...,0.624,0.644,0.632,0.55,0.612,0.563,0.587,0.607,0.651,0.564
A02,0.743,1.0,0.781,0.899,0.861,0.733,0.856,0.872,0.841,0.76,...,0.552,0.554,0.53,0.486,0.574,0.511,0.52,0.498,0.499,0.475
A03,0.781,0.781,1.0,0.749,0.735,0.848,0.81,0.718,0.716,0.714,...,0.619,0.591,0.65,0.622,0.62,0.525,0.612,0.613,0.611,0.552


qwen


Unnamed: 0_level_0,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,...,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
def _scorer_name(scorer) -> str:
    """Return a printable name for ``scorer`` (unwrapping ``functools.partial``).

    Falls back to ``repr`` for callables that have no ``__name__`` so that building
    a log message can never raise.
    """
    target = scorer.func if isinstance(scorer, partial) else scorer
    return getattr(target, "__name__", repr(target))


def _extract_statistic(result):
    """Reduce a scorer's return value to the score itself.

    Supports result objects exposing ``.statistic``, plain ``(statistic, ...)``
    tuples, and scorers that already return a bare number.
    """
    if hasattr(result, "statistic"):
        return result.statistic
    if isinstance(result, tuple) and result:
        return result[0]
    return result


def compute_bootstrapped_score(
        y_test: np.ndarray,
        y_prob: np.ndarray,
        scorer: Callable[[np.ndarray, np.ndarray], Any],
        m_sample: int = None,
        stratum_vals: np.ndarray = None,
        rng: np.random.Generator = None
    ):
    """Score one bootstrap resample of the paired observations.

    Parameters
    ----------
    y_test, y_prob : array-like (not ``pd.Series``)
        Paired observations; the same resampled positions are taken from both.
    scorer : callable
        ``scorer(y_test_sample, y_prob_sample)``. May return an object with a
        ``.statistic`` attribute, a tuple whose first item is the score, or a number.
    m_sample : int, optional
        Resample size; defaults to ``len(y_test)``. Ignored when ``stratum_vals``
        is given (each stratum is resampled to its own size).
    stratum_vals : array-like, optional
        Stratum label per observation. When given, resampling is done with
        replacement inside each stratum, preserving stratum sizes.
    rng : numpy.random.Generator, optional
        Source of randomness. ``None`` keeps using NumPy's global legacy state
        (i.e. whatever ``np.random.seed`` configured).

    Returns
    -------
    The score of the resample, or ``np.nan`` if the scorer raised.
    """
    assert not isinstance(y_test, pd.Series), "y_test should be np.array"
    assert not isinstance(y_prob, pd.Series), "y_prob should be np.array"

    # Lists/tuples are accepted too; fancy indexing below needs real arrays.
    y_test = np.asarray(y_test)
    y_prob = np.asarray(y_prob)

    sampler = np.random if rng is None else rng
    n_obs = len(y_test)
    idx = np.arange(n_obs)
    if m_sample is None:
        m_sample = n_obs  # bootstrap sample size

    if stratum_vals is not None:
        # Coerce so that `== val` yields an element-wise mask even for plain lists.
        stratum_vals = np.asarray(stratum_vals)
        assert len(stratum_vals) == n_obs, "stratum_vals and y_test should have the same lengths"
        parts = []
        # np.unique gives a deterministic stratum order (set iteration order is not).
        for val in np.unique(stratum_vals):
            stratum_idx = idx[stratum_vals == val]
            parts.append(sampler.choice(stratum_idx, size=len(stratum_idx), replace=True))
        idx_bs = np.concatenate(parts) if parts else idx[:0]
    else:
        idx_bs = sampler.choice(idx, size=m_sample, replace=True)

    try:
        return _extract_statistic(scorer(y_test[idx_bs], y_prob[idx_bs]))
    except Exception as e:
        # Deliberately best-effort: one failed resample becomes NaN, the caller
        # decides whether too many of them failed.
        print("WARNING: Bootstrapping failed for", _scorer_name(scorer), "with error", e)
        return np.nan


def compute_ci(
    y_test: np.ndarray,
    y_prob: np.ndarray,
    stratum_vals: np.ndarray = None,
    n_bootstraps: int = 1000,
    m_sample: int = None,
    scorer: Callable[[np.ndarray, np.ndarray], Any] = sts.spearmanr,
    alpha: float = 0.05, #95% CI
    verbose: int = 1,
    return_se: bool = False,
    random_state=None,
    max_nan_share: float = 0.5
):
    """Bootstrap estimate of a score with a normal-approximation confidence interval.

    Parameters
    ----------
    y_test, y_prob : array-like (not ``pd.Series``)
        Paired observations of equal length.
    stratum_vals, m_sample, scorer
        Forwarded to ``compute_bootstrapped_score``.
    n_bootstraps : int
        Number of bootstrap resamples.
    alpha : float
        Two-sided significance level (0.05 -> 95% interval).
    verbose : int
        0 is silent, 1 shows a progress bar, 2 additionally plots the histogram of
        bootstrap scores.
    return_se : bool
        Also return the bootstrap standard error.
    random_state : None, int, SeedSequence or numpy.random.Generator
        Seed for reproducible resampling. ``None`` keeps using NumPy's global
        legacy state.
    max_nan_share : float
        If a larger share of resamples failed (NaN), the estimate is considered
        unreliable and NaNs are returned.

    Returns
    -------
    ``(estimate, half_width)`` or, with ``return_se=True``,
    ``(estimate, half_width, se)``, where ``estimate`` is the mean of the bootstrap
    scores, ``se`` their standard deviation and ``half_width = z_{1-alpha/2} * se``.
    """
    assert len(y_test) == len(y_prob), "y_test and y_prob should have the same lengths"

    rng = None if random_state is None else np.random.default_rng(random_state)
    sampler = np.random if rng is None else rng
    name = _scorer_name(scorer)

    iterations = range(n_bootstraps)
    if verbose > 0:
        print(f"Bootstrap scores computing for {name}...")
        iterations = tqdm(iterations)
    scores = np.array(
        [
            compute_bootstrapped_score(
                y_test, y_prob, scorer,
                stratum_vals=stratum_vals, m_sample=m_sample, rng=rng,
            )
            for _ in iterations
        ],
        dtype=float,
    )

    nans_share = np.sum(np.isnan(scores).astype(int))/len(scores)
    if nans_share > max_nan_share:
        print(f"WARNING: There is {nans_share*100:.0f}% NaNs in bootstrapped scores for {name}")
        # Never ask for more distinct positions than there are observations.
        n_show = min(20, len(y_test))
        random_idxs = sampler.choice(len(y_test), size=n_show, replace=False)
        print("       random 20 entries from y_test:", np.asarray(y_test)[random_idxs])
        print("corresponging 20 entries from y_prob:", np.asarray(y_prob)[random_idxs])
        if return_se:
            return np.nan, np.nan, np.nan
        else:
            return np.nan, np.nan

    estimation = np.nanmean(scores)
    se = np.nanstd(scores)
    perc = sts.norm.ppf(1 - alpha/2)  # z-quantile of the two-sided interval
    e_perc = se * perc

    if verbose > 1:
        plt.figure(figsize=(4, 2.5))
        # Histogramming NaNs raises, so plot only the resamples that succeeded.
        plt.hist(scores[~np.isnan(scores)], bins=50)
        plt.axvline(x = estimation, color = 'tab:orange', label = 'mean')
        plt.axvline(x = estimation - e_perc, color = 'tab:red', label = f'mean - e_{1-alpha:.2f}')
        plt.axvline(x = estimation + e_perc, color = 'tab:red', label = f'mean + e_{1-alpha:.2f}')
        plt.legend()  # without it the labels above are never shown
        plt.show()

    if return_se:
        return estimation, e_perc, se
    else:
        return estimation, e_perc

In [21]:
# Point-estimate table (LLM rows x baseline columns). Cells start as NaN, not -1:
# -1 is itself a valid Spearman correlation and would hide a cell that was never filled.
spearmanr = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)

In [23]:
# Spearman's rho between every LLM matrix and every baseline matrix.
# NOTE(review): the full square matrices are flattened, so the diagonal and (if the
# matrices are symmetric) both triangles enter the correlation; confirm this is intended.
flat_scores = {name: matrices[name].values.ravel() for name in LLMS + BASELINES}  # flatten once per matrix
pairs = list(itertools.product(LLMS, BASELINES))  # a list lets tqdm show the total
for llm, baseline in tqdm(pairs):
    print("Computing estimate for {} and {}".format(llm, baseline))
    spearmanr.loc[llm, baseline] = sts.spearmanr(flat_scores[llm], flat_scores[baseline]).statistic

# Persist once, after the table is complete, instead of rewriting the file per pair.
spearmanr.to_csv("spearman.csv")

0it [00:00, ?it/s]

Computing estimate for deepseek and statistic


1it [00:00,  2.28it/s]

Computing estimate for deepseek and medbert


2it [00:00,  2.37it/s]

Computing estimate for deepseek and bert


3it [00:01,  2.36it/s]

Computing estimate for deepseek and doc


4it [00:01,  2.35it/s]

Computing estimate for qwen and statistic


5it [00:02,  2.43it/s]

Computing estimate for qwen and medbert


6it [00:02,  2.39it/s]

Computing estimate for qwen and bert


7it [00:02,  2.43it/s]

Computing estimate for qwen and doc


8it [00:03,  2.44it/s]

Computing estimate for yandex and statistic


9it [00:03,  2.50it/s]

Computing estimate for yandex and medbert


10it [00:04,  2.50it/s]

Computing estimate for yandex and bert


11it [00:04,  2.51it/s]

Computing estimate for yandex and doc


12it [00:04,  2.52it/s]

Computing estimate for mlm and statistic


13it [00:05,  2.51it/s]

Computing estimate for mlm and medbert


14it [00:05,  2.48it/s]

Computing estimate for mlm and bert


15it [00:06,  2.45it/s]

Computing estimate for mlm and doc


16it [00:06,  2.44it/s]


In [15]:
# Bootstrap result tables (LLM rows x baseline columns): mean, standard error and
# CI half-width. Cells start as NaN so a never-filled cell cannot be mistaken for a
# real value (-1 is a valid correlation).
spearman_mean = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)
spearman_se = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)
spearman_e_perc = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)

In [18]:
from multiprocessing import Pool

# Base seed for the bootstrap; each (llm, baseline) task receives its own derived seed.
BOOTSTRAP_SEED = 42


def _worker_pair(args):
    """Compute the bootstrap CI for a single (llm, baseline) pair.

    Kept at top level so it can be pickled and sent to worker processes.

    ``args`` is ``(llm, baseline, x_flat, y_flat, kwargs)`` with an optional sixth
    item: an integer seed. When present, NumPy's global random state is re-seeded
    before bootstrapping. Without it, workers created by fork all start from
    identical copies of the parent's random state and draw the same resamples.

    Returns ``(llm, baseline, estimate, ci_half_width, standard_error)``.
    """
    llm, baseline, x_flat, y_flat, kwargs, *extra = args
    if extra and extra[0] is not None:
        np.random.seed(extra[0])
    # Single-process bootstrap inside; parallelism is across pairs.
    r, e_perc, se = compute_ci(
        x_flat, y_flat,
        return_se=True,
        **kwargs
    )
    return llm, baseline, r, e_perc, se

# NOTE(review): with the "spawn" start method (Windows, recent macOS) functions
# defined in a notebook may not be importable by workers; verify on your platform.
if __name__ == "__main__":  # important for Windows / notebooks
    # Output tables; NaN marks a cell that has not been computed.
    spearman_mean   = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)
    spearman_se     = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)
    spearman_e_perc = pd.DataFrame(np.nan, index=LLMS, columns=BASELINES)

    common_kwargs = dict(
        scorer=sts.spearmanr,
        n_bootstraps=1000,   # adjust as you like
        m_sample=None,
        stratum_vals=None,
        alpha=0.05,
        verbose=0            # silence per-worker prints
    )

    # Flatten each matrix once, and derive one independent seed per task so the
    # result does not depend on which worker happens to pick a task up.
    flat_scores = {name: matrices[name].values.ravel() for name in LLMS + BASELINES}
    pairs = list(itertools.product(LLMS, BASELINES))
    task_seeds = np.random.SeedSequence(BOOTSTRAP_SEED).generate_state(len(pairs))
    tasks = [
        (llm, baseline, flat_scores[llm], flat_scores[baseline], common_kwargs, int(seed))
        for (llm, baseline), seed in zip(pairs, task_seeds)
    ]

    # Run in parallel across pairs.
    n_jobs = None  # or an int like 8
    with Pool(processes=n_jobs) as pool:
        results = pool.imap_unordered(_worker_pair, tasks, chunksize=1)
        for llm, baseline, r, e_perc, se in tqdm(results, total=len(tasks)):
            spearman_mean.loc[llm, baseline]   = r
            spearman_se.loc[llm, baseline]     = se
            spearman_e_perc.loc[llm, baseline] = e_perc

    spearman_mean.to_csv("spearman_mean.csv")
    spearman_se.to_csv("spearman_se.csv")
    spearman_e_perc.to_csv("spearman_e_perc.csv")

16it [13:17, 49.85s/it]


In [24]:
# Point-estimate Spearman correlations (LLM rows x baseline columns).
spearmanr

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.006,0.024,-0.022,0.139
qwen,0.071,0.029,0.029,0.093
yandex,0.07,0.07,0.057,0.093
mlm,-0.003,0.003,0.002,0.007


In [19]:
# Mean of the bootstrap Spearman scores for each (LLM, baseline) pair.
spearman_mean

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.006,0.024,-0.022,0.139
qwen,0.071,0.029,0.029,0.093
yandex,0.07,0.07,0.056,0.093
mlm,-0.003,0.003,0.002,0.007


In [20]:
# Bootstrap standard error (std of the bootstrap scores) for each pair.
spearman_se

Unnamed: 0,statistic,medbert,bert,doc
deepseek,0.001,0.001,0.001,0.001
qwen,0.001,0.001,0.001,0.001
yandex,0.001,0.001,0.001,0.001
mlm,0.001,0.001,0.001,0.001
