In [40]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [41]:
# (run_id, version) pairs of submissions that are dropped from the tables.
# Every pair is listed exactly once; the original list repeated
# ("bm25+reranker", "2025-05-21-12-45-30").
EXCLUDE = [
    {"run_id": "test-system-maik-1234567", "version": "2025-01"},
    {"run_id": "bm25+reranker", "version": "2025-05-19-15-39-38"},
    {"run_id": "bm25+reranker", "version": "2025-05-21-12-45-30"},
    {"run_id": "bm25+reranker", "version": "2025-05-21-15-04-09"},
    {"run_id": "bm25+reranker", "version": "2025-05-22-15-40-14"},
    {"run_id": "bm25+reranker", "version": "2025-05-22-16-08-06"},
    {"run_id": "bm25+reranker", "version": "2025-05-22-16-11-21"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-08-15"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-08-52"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-00"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-08"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-26"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-34"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-43"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-09-57"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-10-09"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-10-21"},
    {"run_id": "bm25+reranker", "version": "2025-05-21-12-54-02"},
    {"run_id": "bm25+reranker", "version": "2025-05-22-15-51-48"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-08-27"},
    {"run_id": "bm25+reranker", "version": "2025-05-24-18-08-35"},
    {"run_id": "bm25+reranker+weighted", "version": "2025-05-26-14-37-40"},
    {"run_id": "bm25+reranker+weighted", "version": "2025-05-26-14-36-24"},
    {"run_id": "bm25+reranker+weighted", "version": "2025-05-26-14-37-56"},
    {"run_id": "bm25+reranker+weighted", "version": "2025-05-26-14-34-01"},
    {"run_id": "bm25+reranker+weighted", "version": "2025-05-26-14-33-04"},
    # web
    {"run_id": "clef25-seupd2425-rise", "version": "2025-05-20-15-47-16"},
    {"run_id": "clef25-seupd2425-rise", "version": "2025-05-20-16-20-56"},
    {"run_id": "clef25-seupd2425-rise", "version": "2025-05-20-15-55-39"},
    {"run_id": "clef25-seupd2425-rise", "version": "2025-05-20-15-38-06"},
    {"run_id": "query_expansion_time_dependence", "version": "2025-05-24-22-54-13"},
]

# Maps a team name (after the "clef25-" prefix has been stripped) to the LaTeX
# citation command that is appended to the team's run names in the results
# table. Teams without an entry keep their plain name.
# Raw strings are used on purpose: "\c" is not a valid escape sequence, so a
# plain "\cite{...}" literal emits a SyntaxWarning on Python 3.12+.
TEAMS = {
    "cir-cluster": r"\cite{CIRcluster}",
    "cir-jmft": r"\cite{CIR}",
    "cir-sauerkraut": r"\cite{CIR}",
    "cir-super-team-123": r"\cite{CIR}",
    "cir-fair-schaer": r"\cite{CIR}",
    "cir-schared-retrieval": r"\cite{CIR}",
    "seupd2425-datahunter": r"\cite{DataHunter}",
    "seupd2425-racoon": r"\cite{RACOON}",
    "seupd2425-basette": r"\cite{BASETTE}",
    "seupd2425-rise": r"\cite{RISE}",
    "ds-gt": r"\cite{DS@GT}",
    "seupd2425-rand": r"\cite{RAND}",
    "agh-cracow": r"\cite{EAIiIB}",
    "open-web-search": r"\cite{OWS}",
    "seupd2425-3ds2a": r"\cite{3DS2A}",
    "seupd2425-sard": r"\cite{SARD}",
}

In [42]:
# Root directory of the replicability evaluation results. The default is the
# path that was hard-coded so far; set the LONGEVAL_RESULTS_DIR environment
# variable to read the results from another location without editing the cell.
results = os.environ.get(
    "LONGEVAL_RESULTS_DIR",
    "/workspaces/longeval-code/clef25/evaluation-in-progress/evaluation-results-in-progress/replicability",
)
# Dataset sub-directory below `results`; swap the two lines to switch datasets.
datasets = "sci-20250430-test"
# datasets = "web-20250430-test"

In [43]:
# Collect the evaluation records from every
# <results>/<datasets>/<team>/<run_id>/<version>/results.json file.
tab = []
dataset_path = os.path.join(results, datasets)
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# `tab` (and with it any later "keep first" de-duplication) reproducible.
for team in sorted(os.listdir(dataset_path)):
    team_path = os.path.join(dataset_path, team)
    if not os.path.isdir(team_path):
        # stray files (e.g. .DS_Store) would make the nested listdir raise
        continue
    for run_id in sorted(os.listdir(team_path)):
        run_directory = os.path.join(team_path, run_id)
        if not os.path.isdir(run_directory):
            continue
        for version in sorted(os.listdir(run_directory)):
            run_directory_version = os.path.join(run_directory, version)
            if not os.path.isdir(run_directory_version):
                continue
            # A version directory without results.json still raises, so an
            # incomplete evaluation is noticed instead of silently skipped.
            with open(os.path.join(run_directory_version, "results.json"), "r", encoding="utf-8") as f:
                r = json.load(f)
            # extend() assumes each results.json holds a list of records
            tab.extend(r)

In [44]:
# Turn the collected records into a single DataFrame (one row per record).
table = pd.DataFrame(tab)

In [45]:
# Mark every row whose (run_id, version) pair is listed in EXCLUDE, then drop
# all marked rows with one selection.
is_excluded = pd.Series(False, index=table.index)
for entry in EXCLUDE:
    same_run = table["run_id"] == entry["run_id"]
    same_version = table["version"] == entry["version"]
    is_excluded = is_excluded | (same_run & same_version)
table = table[~is_excluded]

In [46]:
# Some teams only submit to the new task and therefore have no "er" value;
# keep only the rows where "er" is present.
table = table[table["er"].notna()]

In [47]:
# Keep only the rows whose "arp" dict reports an "ndcg" value above zero.
# The boolean mask is applied directly instead of being stored in a temporary
# "valid" column: `table` is already a filtered slice at this point, so
# assigning a new column to it triggers pandas' SettingWithCopyWarning, and the
# in-place drop afterwards mutated the frame between cells.
has_positive_ndcg = table["arp"].apply(lambda x: x["ndcg"] > 0)
table = table[has_positive_ndcg]

In [48]:
def make_table(table, measures):
    """Flatten the dict-valued result columns into one scalar column per measure.

    Every column of ``table`` that is not a metadata column (``team``,
    ``run_id``, ``version``, ``snapshot``) is treated as a "meta measure" whose
    cells are dicts keyed by measure name. For each requested measure and each
    meta measure a column named ``<measure>_<meta measure>`` is added. The
    ``ttest`` column is special: its value is looked up in the nested
    ``"advanced"`` dict.

    Args:
        table: DataFrame holding the metadata columns plus dict-valued columns.
        measures: iterable of measure names, e.g. ``["ndcg_cut_10"]``.

    Returns:
        A copy of ``table`` in which the dict-valued columns are replaced by
        the flattened columns. A cell whose source is not a dict (e.g. a
        missing value) or lacks the measure yields a missing value instead of
        raising. The input frame is not modified.
    """
    def _lookup(cell, measure, nested_key=None):
        # Descend into the nested dict first (used for ttest -> "advanced").
        if nested_key is not None and isinstance(cell, dict):
            cell = cell.get(nested_key)
        return cell.get(measure) if isinstance(cell, dict) else None

    df = table.copy()
    meta_columns = ["team", "run_id", "version", "snapshot"]
    # Computed once up front so the columns added below are not picked up.
    meta_measures = df.columns.difference(meta_columns)

    for measure in measures:
        for meta_measure in meta_measures:
            nested_key = "advanced" if meta_measure == "ttest" else None
            # Column-wise Series.apply (rather than DataFrame.apply(axis=1))
            # also returns a Series for an empty frame, so the assignment
            # below stays valid when there are no rows.
            df[measure + "_" + meta_measure] = df[meta_measure].apply(
                lambda cell: _lookup(cell, measure, nested_key)
            )

    return df.drop(columns=meta_measures)
        

In [49]:
# Measures to pull out of the dict-valued result columns.
measures = ["ndcg_cut_10"]

# Flattened frame: one "<measure>_<meta measure>" column per combination.
df = make_table(table, measures)

In [51]:
def get_text_color(bg_hex_color):
    """Pick 'white' or 'black' text so it stays readable on the given background.

    The background colour is converted to RGB, scaled to 0-255 and reduced to
    a weighted luminance; backgrounds with a luminance below 150 get white
    text, all others black text.
    """
    red, green, blue = (channel * 255 for channel in mcolors.to_rgb(bg_hex_color))
    luminance = 0.299 * red + 0.587 * green + 0.114 * blue
    if luminance < 150:
        return 'white'
    return 'black'

def color_by_center(s, center_val=1.0, cmap='berlin_r'):
    """Return one CSS style string per value, coloured around ``center_val``.

    Written for use with ``Styler.apply``: ``s`` is a Series of numeric values
    and the result is a list of the same length. Each style sets the
    background colour taken from ``cmap`` (normalised with a two-slope norm
    centred on ``center_val``) and a contrasting text colour chosen by
    ``get_text_color``.

    Args:
        s: Series of numeric values to colour.
        center_val: value that maps to the middle of the colormap.
        cmap: name of the matplotlib colormap to use.

    Returns:
        List of CSS strings; missing values get an empty string (no styling).
    """
    lower, upper = s.min(), s.max()
    # TwoSlopeNorm raises ValueError unless vmin < vcenter < vmax, which fails
    # when all values lie on one side of the centre (or the Series is all-NaN).
    # Pad the side that has no data; since no value falls on that side, the
    # size of the padding does not change any rendered colour.
    if not lower < center_val:
        lower = center_val - 1.0
    if not upper > center_val:
        upper = center_val + 1.0
    norm = mcolors.TwoSlopeNorm(vcenter=center_val, vmin=lower, vmax=upper)
    colormap = plt.get_cmap(cmap)

    styles = []
    for val in s.values:
        if pd.isna(val):
            # Leave missing cells unstyled instead of painting them with the
            # colormap's "bad value" colour.
            styles.append('')
            continue

        # Background colour for this specific value
        bg_color_hex = mcolors.to_hex(colormap(norm(val)))

        # Text colour with enough contrast against that background
        text_color = get_text_color(bg_color_hex)

        # Combine both into a single CSS style string
        styles.append(f'background-color: {bg_color_hex}; color: {text_color}')

    return styles

def results_table(df, measure, sort_by=(), output=None, snapshots=None):
    """Build the per-run results table and optionally export it as LaTeX.

    Args:
        df: flattened frame with the columns ``team``, ``run_id``, ``version``,
            ``snapshot`` and ``<measure>_arp``, ``<measure>_ap``,
            ``<measure>_rc``, ``<measure>_dri``, ``<measure>_er``,
            ``<measure>_ttest``.
        measure: measure name whose ``<measure>_`` prefix is stripped from the
            column names.
        sort_by: column key to sort by in descending order, e.g.
            ``("rc", "2025-01")``; an empty value skips sorting.
        output: target passed to ``Styler.to_latex`` as ``buf``; when falsy no
            LaTeX is written.
        snapshots: optional collection of snapshot labels to keep.

    Returns:
        The pivoted table (one row per run, one column per measure/snapshot
        pair), rounded to three decimals. The table is also printed.
    """
    value_columns = ["arp", "ap", "rc", "dri", "er"]

    # rename columns and strip measure
    df = df.rename(columns=lambda x: x.replace(measure + "_", ""))
    df = df[["team", "run_id", "version", "snapshot", *value_columns, "ttest"]]
    table = df.copy()

    # filter out excluded runs
    for exclusion in EXCLUDE:
        table = table[
            ~(
                (table["run_id"] == exclusion["run_id"])
                & (table["version"] == exclusion["version"])
            )
        ]
    # filter out snapshots
    if snapshots:
        table = table[table["snapshot"].isin(snapshots)]

    # Remove results where everything except the version is the same.
    # copy() detaches the result from the filtered slices so the column
    # assignments below do not trigger SettingWithCopyWarning.
    table = table.drop_duplicates(subset=df.columns.difference(['version'])).copy()

    # Run ids that still occur more than once per snapshot get their version
    # appended so that every (run_id, snapshot) pair is unique for the pivot.
    run_ids = table[table.duplicated(["run_id", "snapshot"])]["run_id"].unique()
    needs_version = table["run_id"].isin(run_ids)
    table.loc[needs_version, "run_id"] = (
        table.loc[needs_version, "run_id"]
        + " (v"
        + table.loc[needs_version, "version"]
        + ")"
    )
    table = table.drop(columns=["version"])

    # Fix team names: strip the prefix, map to the citation where one exists,
    # and append the result to the run name.
    table["team"] = table["team"].str.replace("clef25-", "", regex=False)
    table["team"] = table["team"].apply(lambda x: TEAMS.get(x, x))
    table["run_id"] = table["run_id"] + " " + table["team"]
    table = table.drop(columns=["team"])

    # Pivot the table to have snapshots as columns
    table = table.pivot(index=["run_id"], columns="snapshot", values=value_columns)
    table = table.reset_index()

    # Sort
    if sort_by:
        table = table.sort_values(by=sort_by, ascending=False)

    # round
    table = table.round(3)

    if output:
        styler = table.style

        # Higher is better: plain sequential gradient.
        cmap_higher = plt.get_cmap('Blues')
        styler = styler.background_gradient(subset="arp", cmap=cmap_higher)
        styler = styler.background_gradient(subset="ap", cmap=cmap_higher)

        # Diverging colouring around the neutral value of each measure.
        styler = styler.apply(color_by_center, subset='er', center_val=1.0)
        styler = styler.apply(color_by_center, subset='rc', center_val=0.0)
        styler = styler.apply(color_by_center, subset='dri', center_val=0.0)

        # Format only the numeric columns; a float format applied to the
        # string-valued run_id column would fail when the table is rendered.
        styler = styler.format("{:.3f}", subset=value_columns)

        styler.to_latex(
            buf=output,
            label="tab:xxx-results",
            # The rendered table has the index column plus every data column:
            # two left-aligned columns (index, run_id) and one centred column
            # per measure/snapshot pair.
            column_format="ll" + "c" * (len(table.columns) - 1),
            multicol_align="c",
            convert_css=True,
        )
    print(table)
    return table

# Earlier invocations for the 2023-03 / 2023-08 snapshots (presumably the web
# dataset, judging by the output file name), kept for reference:
# results_table(df, measure="ndcg_cut_10", sort_by=("rc", "2023-08"), snapshots=("2023-03", "2023-08"), output="results-table-change-web-long.tex")
# t = results_table(df, measure="ndcg_cut_10", sort_by=("rc", "2023-08"), snapshots=("2023-03", "2023-08"), output="test.tex")
# Current run: snapshots 2024-11 and 2025-01, sorted by "rc" on 2025-01, LaTeX written to test.tex.
t = results_table(df, measure="ndcg_cut_10", sort_by=("rc", "2025-01"), snapshots=("2024-11", "2025-01"), output="test.tex")

                                                     run_id     arp      ap  \
snapshot                                                    2025-01 2025-01   
16                              fusion-with-core \cite{OWS}   0.059   0.060   
2                                     A7-BM25-TestAbs sambs   0.000   0.000   
17                                monot5-in-core \cite{OWS}   0.031   0.029   
13                                              bm25 tf-idk   0.172   0.154   
18                                      ows-bm25 \cite{OWS}   0.218   0.191   
19                          ows-cluster-boosting \cite{OWS}   0.222   0.193   
22                             rm3-on-qrel-boost \cite{OWS}   0.253   0.201   
0                                    A3-TFIDF-TestAbs sambs   0.137   0.102   
1                                   A4-TFIDF-TestFull sambs   0.134   0.100   
12                   SciBERT_LongEVAL long-eval-sci-group-5   0.059   0.043   
15                            bm25+reranker+weighted