In [None]:
import os

# HF_HOME is assigned before the torch import below.
hf_home_dir = "/home/yandex/APDL2425a/group_12/gorodissky/.cache/huggingface"
os.environ["HF_HOME"] = hf_home_dir
print(f"HF_HOME set to:\t\t {os.environ['HF_HOME']}")
import torch

# Report the torch build and what CUDA hardware (if any) is visible.
print(torch.__version__)
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
print(torch.cuda.device_count())
if cuda_ready:
    # Only the first device (index 0) is named.
    print(torch.cuda.get_device_name(0))


In [None]:
import json
import os
import warnings

from cisc.src.post_processing import aggregators, run_eval_lib

# Silence warnings whose message starts with 'Labels passed were'.
warnings.filterwarnings('ignore', message='Labels passed were')

# SC aggregation without any normalization.
sc_config = aggregators.AggregatorConfig(
    aggregator_type=aggregators.AggregatorType.SC,
    norm_type=aggregators.NormalizationType.NONE,
)
# CISC aggregation with softmax-normalized confidences.
cisc_config = aggregators.AggregatorConfig(
    aggregator_type=aggregators.AggregatorType.CISC,
    norm_type=aggregators.NormalizationType.SOFTMAX,
    temperature=0.2,  # This value is taken from the CISC paper.
    confidence_col_name="logit_confidence",  # P(True) in the paper.
)
aggregator_configs = [sc_config, cisc_config]

# Directory of the experiment run being evaluated; it also holds the run's config file.
PATH = "/home/yandex/APDL2425a/group_12/gorodissky/google-research/cisc/output/Qwen2.5-7B-Instruct/confidence/2026_01_11_14:43/MMLU"
experiment_conf_path = os.path.join(PATH, "experiment_conf.json")
with open(experiment_conf_path, "r") as conf_file:
    expr_config = json.load(conf_file)

# Passed as `traces_lens` below and reused as the x-axis by the plotting cell.
sample_sizes = range(1, 6)

eval_result = run_eval_lib.calculate_stats_for_model_and_dataset_path(
    expr_config["tag"],
    PATH,
    filter_answers=False,
    round_negative_conf_to_zero=False,
    re_compute_is_correct=False,
    aggregator_configs=aggregator_configs,
    traces_lens=sample_sizes,
    num_bootstrap=500,
    return_per_question_scores=False,
)
stats = eval_result.score_stats

# Print every metric with its scores, one per line.
for name, scores in stats.items():
    print(f"{name}: {scores}")


In [None]:
# Plot accuracy vs. sample size: CISC metrics on the left panel, SC metrics on the right.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Read (instead of pop) the x-values used for the DIIS curves. Popping removed the
# key from `stats`, so running this cell a second time raised KeyError.
mean_traces_key = "DIIS_mean_num_traces"
mean_number_of_traces = stats[mean_traces_key]

for metric_name, metric_scores in stats.items():
    # The mean-trace-count entry and any 'budget' entry are not drawn as curves.
    if metric_name == mean_traces_key or 'budget' in metric_name:
        continue
    is_cisc = "CISC" in metric_name
    is_diis = "DIIS" in metric_name
    # CISC -> left panel in orange; everything else (SC) -> right panel in blue.
    ax = axes[0] if is_cisc else axes[1]
    ax.plot(
        # DIIS curves are placed at their mean number of traces, the rest at the fixed sample sizes.
        mean_number_of_traces if is_diis else sample_sizes,
        metric_scores,
        linestyle='--' if is_diis else '-',
        marker='o',
        label=metric_name,
        color='orange' if is_cisc else 'blue',
    )

for ax, panel_title in zip(axes, ("CISC", "SC")):
    ax.set_title(panel_title)
    ax.set_xlabel('Sample Size')
    ax.set_ylabel('Accuracy')
    ax.legend()
    ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# Accuracy-per-budget tables: one for the CISC aggregator, one for SC.

# NOTE(review): budget charged per sample in the fixed-sample-size runs — confirm where 128 comes from.
BUDGET_PER_SAMPLE = 128


def _efficiency_row(scores_by_metric, diis_key, fixed_key, diis_budget, sizes):
    """Build the formatted "accuracy / budget" cells for one table row.

    Args:
        scores_by_metric: mapping from metric name to its sequence of scores.
        diis_key: metric name of the DIIS variant; its first score is divided by `diis_budget`.
        fixed_key: metric name of the fixed-sample-size variant; its i-th score is
            paired with the i-th entry of `sizes`.
        diis_budget: budget used as the divisor for the DIIS cell.
        sizes: sample sizes, in the same order as the scores under `fixed_key`.

    Returns:
        A list of strings formatted to four decimals: the DIIS cell first, then one
        cell per entry in `sizes`.
    """
    row = [f"{scores_by_metric[diis_key][0] / diis_budget:.4f}"]
    fixed_scores = scores_by_metric[fixed_key]
    # Index by position rather than `size - 1`, so the pairing stays correct if
    # `sizes` does not start at 1 or does not step by 1.
    for position, size in enumerate(sizes):
        budget = size * BUDGET_PER_SAMPLE
        row.append(f"{fixed_scores[position] / budget:.4f}")
    return row


def _draw_efficiency_table(ax, row, col_labels):
    """Render a single-row table on `ax` (with the axis frame hidden) and return it."""
    ax.axis('off')
    table = ax.table(cellText=[row], colLabels=col_labels, rowLabels=["Acc / Budget"], loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 1.5)
    return table


fig, axes = plt.subplots(2, figsize=(10, 3))
DIIS_budget = stats["DIIS_total_budget"]

# (label prefix, DIIS metric name, fixed-sample-size metric name) for each table.
table_specs = [
    ("CISC", "DIIS_CISC_logit_confidence_SOFTMAX_0.2", "CISC_logit_confidence_SOFTMAX_0.2"),
    ("SC", "DIIS_SC_NONE", "SC_NONE"),
]
for ax, (short_name, diis_key, fixed_key) in zip(axes, table_specs):
    cell_text = _efficiency_row(stats, diis_key, fixed_key, DIIS_budget, sample_sizes)
    # Column labels follow `sample_sizes` instead of being hard-coded for 1..5.
    col_labels = [f"DIIS_{short_name}"] + [f"{short_name} ({size})" for size in sample_sizes]
    table = _draw_efficiency_table(ax, cell_text, col_labels)

plt.tight_layout()
plt.show()

In [None]:
import pickle

# Load the pickled experiment output and expose its results as a DataFrame.
base_dir = "/home/yandex/APDL2425a/group_12/gorodissky/google-research/cisc/output/"
experiment_output_path = base_dir + "Qwen2.5-7B-Instruct/confidence/2026_01_11_14:43/MMLU/experiment_output.pkl"

# pickle.load can execute arbitrary code embedded in the file; only load trusted outputs.
with open(experiment_output_path, "rb") as pickle_file:
    results = pickle.load(pickle_file)

df = results.get_results_df()

In [None]:
# One difficulty value per question: the first one seen within each question_id group.
questions = df.groupby('question_id')
difficulties = questions.agg({'difficulty': 'first'})

# Print the summary statistics in a fixed order, each as a plain Python scalar.
for stat_name in ("mean", "std", "max", "min"):
    stat_value = getattr(difficulties, stat_name)().item()
    print(f"{stat_name}:", stat_value)
