In [1]:
import pyterrier as pt
import pandas as pd
import os
import csv
import pandas as pd
import pyterrier as pt
from pyterrier.measures import *

In [6]:
def evaluate_all_runs(results_folder, qrels_file, metric):
    """
    Evaluates all TREC result files in a folder and returns per-query metric scores.

    Parameters:
    - results_folder (str): Path to folder containing .res files. Each file is read as
      six whitespace-separated columns: qid, iter, docno, rank, score, runid.
    - qrels_file: Qrels file, read with pt.io.read_qrels.
    - metric: A single pyterrier.measures metric, e.g. AP(rel=2). Required (no default).

    Returns:
    - dict: Nested dict {query_id: {model_name: score, ...}, ...}. query_id is always a
      str; model_name is the file name without its trailing ".res".
    """
    # Final nested dict: {qid: {model_name: score}}
    all_results = {}
    qrels = pt.io.read_qrels(qrels_file)

    for filename in os.listdir(results_folder):
        filepath = os.path.join(results_folder, filename)
        if not os.path.isfile(filepath) or not filename.endswith(".res"):
            continue  # skip directories or non-res files

        model_name = os.path.splitext(filename)[0]  # remove .res extension

        # Load run as DataFrame. qid/docno are read as strings: letting pandas infer
        # integers would drop leading zeros, and the resulting keys would then no
        # longer match ids that other loaders keep as raw strings.
        run_df = pd.read_csv(filepath,
                     sep='\\s+',
                     names=["qid", "iter", "docno", "rank", "score", "runid"],
                     dtype={"qid": str, "docno": str})

        # Evaluate per query
        perquery_results = pt.Evaluate(run_df, qrels, metrics=[metric], perquery=True)
        perquery_df = pd.DataFrame.from_dict(perquery_results, orient='index')
        if perquery_df.empty:
            continue  # nothing was scored for this run

        # Only one metric was requested, so the first column holds its scores.
        metric_name = perquery_df.columns[0]

        # Fill into nested dictionary
        for qid, score in perquery_df[metric_name].items():
            all_results.setdefault(str(qid), {})[model_name] = score

    return all_results

In [7]:
# Per-query AP (rel=2) for every run file of the 2019 collection.
results = evaluate_all_runs(
    results_folder="data/runs/2019",
    qrels_file="data/2019.qrels",
    metric=AP(rel=2),
)

# Tabular view: one row per query id, one column per run.
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df.head())

# Spot-check a single (query, run) score.
print(results["19335"]["prf_rerank_beta05.2019"])

        prf_rerank_beta05.2019  BM25.2019  prf_rank_beta05.2019  \
19335                 0.005410   0.417649              0.005223   
47923                 0.269306   0.208209              0.270124   
87181                 0.511408   0.224310              0.523762   
87452                 0.130716   0.127828              0.133332   
104861                0.544986   0.316859              0.546889   

        prf_rerank_beta1.2019  prf_rank_beta1.2019  
19335                0.005315             0.004859  
47923                0.250536             0.250679  
87181                0.480961             0.495223  
87452                0.146770             0.151613  
104861               0.621656             0.622021  
0.005409924460404536


In [12]:
def load_qpp_estimates(folder):
    """
    Loads QPP estimates for all queries and IR models from a folder of .qpp files.

    Each .qpp file is named after an IR model (e.g., 'bm25.qpp' or 'bm25.res.qpp')
    and contains one line per query:
    query_id \t qpp1 \t qpp2 \t ... \t qppN

    The model name is the file name without ".qpp" and, if present, without a
    trailing ".res". Any other dotted suffix (e.g. ".2019", ".v2") is part of the
    model name and is kept.

    Lines with fewer than two tab-separated fields, or with a non-numeric estimate,
    are skipped. Each model name is printed as its file is processed.

    Returns:
    - dict of {query_id: {model_name: [qpp1, qpp2, ...]}}
    """
    qpp_data = {}

    for filename in os.listdir(folder):
        filepath = os.path.join(folder, filename)
        if not os.path.isfile(filepath) or not filename.endswith(".qpp"):
            continue

        model_name = os.path.splitext(filename)[0] # remove ".qpp"
        # Strip a second extension only when it really is ".res"; an unconditional
        # splitext would also eat suffixes such as ".2019" that belong to the name.
        stem, inner_ext = os.path.splitext(model_name)
        if inner_ext == ".res":
            model_name = stem
        print (model_name)

        with open(filepath, "r") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) < 2:
                    continue  # blank line or no estimates

                qid = parts[0]
                try:
                    preds = [float(x) for x in parts[1:]]
                except ValueError:
                    continue  # skip malformed line

                qpp_data.setdefault(qid, {})[model_name] = preds

    return qpp_data


In [13]:
# Load the QPP estimates stored as .qpp files in the 2019 runs folder.
qpp_estimates = load_qpp_estimates(folder="data/runs/2019")

# Spot-check the estimate vector of one (query, run) pair.
print(qpp_estimates["19335"]["prf_rerank_beta05.2019"])


prf_rerank_beta1.2019
prf_rerank_beta05.2019
prf_rank_beta05.2019
prf_rank_beta1.2019
BM25.2019
[7.1786523, 89.22529, 0.27864638]


In [14]:
from scipy.stats import kendalltau

def compute_perquery_kendall_multiqpp(qpp_estimates, true_ap_scores):
    """
    Computes Kendall's tau correlation between each QPP model and true AP scores, per query.

    For one query and one QPP index i, the two correlated sequences are the i-th QPP
    estimate of every IR model and that model's AP score, taken over the IR models
    present in both inputs (visited in sorted name order).

    Parameters:
    - qpp_estimates: dict of {qid: {model: [qpp1, qpp2, ...]}}
    - true_ap_scores: dict of {qid: {model: ap_score}}

    Returns:
    - dict of {qid: {qpp_model_index: tau_value}}. Queries missing from
      true_ap_scores, or with fewer than two models in common, are omitted. An index
      is omitted when fewer than two common models provide it. tau_value is whatever
      scipy's kendalltau returns, which is NaN when the correlation is undefined
      (e.g. one of the sequences is constant).
    """
    perquery_tau = {}

    for qid, qpp_by_model in qpp_estimates.items():
        if qid not in true_ap_scores:
            continue
        ap_by_model = true_ap_scores[qid]

        models_in_common = sorted(set(qpp_by_model) & set(ap_by_model))
        if len(models_in_common) < 2:
            continue  # Not enough models to compute Kendall's tau

        # Size the index range from the models that are actually compared. Taking the
        # length of an arbitrary first model could pick one that is not in common and
        # silently drop (or pad) QPP indices.
        qpp_len = max(len(qpp_by_model[model]) for model in models_in_common)
        perquery_tau[qid] = {}

        for i in range(qpp_len):
            # Models whose vector is too short for index i simply do not contribute.
            contributing = [m for m in models_in_common if i < len(qpp_by_model[m])]
            if len(contributing) < 2:
                continue

            qpp_vals = [qpp_by_model[m][i] for m in contributing]
            ap_vals = [ap_by_model[m] for m in contributing]
            tau, _ = kendalltau(qpp_vals, ap_vals)
            perquery_tau[qid][i] = tau

    return perquery_tau

In [15]:
# Per-query evaluation: for each query, correlate QPP estimates with AP across IR models.
tau_scores = compute_perquery_kendall_multiqpp(
    qpp_estimates=qpp_estimates,
    true_ap_scores=results,
)

In [20]:
# Spot-check: per-query tau of the QPP estimate at index 0 for query '1037798'.
print (tau_scores['1037798'][0])

0.31622776601683794


In [26]:
import numpy as np
from collections import defaultdict

# Group the per-query taus by QPP index so each QPP estimator can be averaged
# over all queries.
qpp_model_avgs = defaultdict(list)
for qid in tau_scores:
    for i, tau in tau_scores[qid].items():
        # Touch the bucket first so index i is registered even when every tau for it
        # is undefined; this keeps mean_local_taus aligned by QPP index.
        taus_for_index = qpp_model_avgs[i]
        # kendalltau signals an undefined correlation with NaN, not None, so a
        # None-only check would let NaN through and turn the whole mean into NaN.
        if tau is not None and not np.isnan(tau):
            taus_for_index.append(tau)

mean_local_taus = []
for i in sorted(qpp_model_avgs):
    valid_taus = qpp_model_avgs[i]
    # No valid tau at all for this index -> report NaN instead of averaging nothing.
    mean_tau = np.mean(valid_taus) if valid_taus else float("nan")
    mean_local_taus.append(mean_tau)
    print(f"QPP model {i}: mean Kendall's tau = {mean_tau:.4f}")

QPP model 0: mean Kendall's tau = -0.0110
QPP model 1: mean Kendall's tau = 0.0107
QPP model 2: mean Kendall's tau = -0.1206


In [27]:
def compute_modelwise_kendall_multiqpp(qpp_estimates, true_ap_scores):
    """
    Correlates QPP estimates with AP across queries, separately for every
    IR model and every QPP index.

    Parameters:
    - qpp_estimates: dict of {qid: {model: [qpp1, qpp2, ...]}}
    - true_ap_scores: dict of {qid: {model: ap_score}}

    Returns:
    - dict of {model: {qpp_model_index: tau_value}}, with one entry per IR model
      named in either input (in sorted order). The inner dict is empty when no
      QPP index has at least two queries with both an estimate and an AP score.
    """
    # Every IR model name that occurs in either input.
    model_names = {
        name
        for per_model in (*qpp_estimates.values(), *true_ap_scores.values())
        for name in per_model
    }

    tau_by_model = {}
    for name in sorted(model_names):
        # Parallel per-index lists: QPP predictions and the matching AP values,
        # appended in the iteration order of qpp_estimates.
        preds_by_index = defaultdict(list)
        aps_by_index = defaultdict(list)

        for qid, per_model in qpp_estimates.items():
            ap_for_query = true_ap_scores.get(qid, {})
            if name in per_model and name in ap_for_query:
                ap_score = ap_for_query[name]
                for idx, pred in enumerate(per_model[name]):
                    preds_by_index[idx].append(pred)
                    aps_by_index[idx].append(ap_score)

        tau_by_index = {}
        for idx, preds in preds_by_index.items():
            if len(preds) > 1:
                tau, _pvalue = kendalltau(preds, aps_by_index[idx])
                tau_by_index[idx] = tau
        tau_by_model[name] = tau_by_index

    return tau_by_model

In [28]:
standard_tau = compute_modelwise_kendall_multiqpp(qpp_estimates, results)

# Average Kendall's tau per QPP model (averaged over IR models)
qpp_model_avgs = defaultdict(list)
for model in standard_tau:
    for i, tau in standard_tau[model].items():
        # Touch the bucket first so index i is registered even when every tau for it
        # is undefined; this keeps mean_taus aligned by QPP index.
        taus_for_index = qpp_model_avgs[i]
        # kendalltau signals an undefined correlation with NaN, not None, so a
        # None-only check would let NaN through and turn the whole mean into NaN.
        if tau is not None and not np.isnan(tau):
            taus_for_index.append(tau)

mean_taus = []
for i in sorted(qpp_model_avgs):
    valid_taus = qpp_model_avgs[i]
    # No valid tau at all for this index -> report NaN instead of averaging nothing.
    mean_tau = np.mean(valid_taus) if valid_taus else float("nan")
    mean_taus.append(mean_tau)
    print(f"[Standard QPP Eval] QPP model {i}: mean Kendall's tau = {mean_tau:.4f}")


[Standard QPP Eval] QPP model 0: mean Kendall's tau = 0.3059
[Standard QPP Eval] QPP model 1: mean Kendall's tau = 0.3054
[Standard QPP Eval] QPP model 2: mean Kendall's tau = 0.1070


In [30]:
# Agreement between the two evaluation protocols: Kendall's tau between the mean
# per-query taus (mean_local_taus) and the mean per-IR-model taus (mean_taus).
# Both lists are built by iterating the QPP indices in sorted order, so position k
# is meant to refer to the same QPP estimator in each.
# NOTE(review): the p-value is unpacked into `_`, which at notebook top level also
# rebinds IPython's last-output variable — confirm that is acceptable.
qpp_eval_agreement, _ = kendalltau(mean_local_taus, mean_taus)
print(qpp_eval_agreement)

0.33333333333333337
