In [1]:
import pandas as pd
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import sys

from dotenv import find_dotenv
sys.path.append(os.path.dirname(find_dotenv()))
    
from utils import ir_evaluation
from ir_evaluation import BASELINE_RUNS_DICT, DATASET_IDS_DICT

# --- Configuration -----------------------------------------------------------

# Root of the data directory, relative to this notebook. Run files are looked
# up below it (under "runs/...").
DATA_DIR = './../data/'

# Keys of the evaluation datasets. Each key is used both as part of the run
# file name and as the lookup key into DATASET_IDS_DICT / BASELINE_RUNS_DICT.
EVAL_DATASETS = [
    "dl19",
    "dl20",
    "dlhard",
    "devsmall",
]

# Names of the first-stage retrieval models whose runs are evaluated
# (despite the "_DATASETS" suffix, these are model names).
FIRST_STAGE_DATASETS = [
    "bm25",
    "colbert",
    "contriever",
]

In [4]:
def parse_first_stage_model(run_path: str) -> str:
    """Identify which first-stage retriever a run file path refers to.

    Every name in ``FIRST_STAGE_DATASETS`` is tested, in list order, as a plain
    substring of ``run_path``; the first name found wins.

    Args:
        run_path: Path (or file name) of a run file.

    Returns:
        The matching retriever name, or ``None`` when none of the known names
        occurs in ``run_path`` (note: ``None`` despite the ``str`` annotation).
    """
    matching_names = (
        candidate for candidate in FIRST_STAGE_DATASETS if candidate in run_path
    )
    return next(matching_names, None)
    
def print_table(results, run_path: str, metrics):
    """Build a one-row table of formatted metric values for a single run.

    The row is labelled ``"<first-stage model>-<dataset>"``; the model prefix
    is omitted when ``run_path`` names no known first-stage model. Despite its
    name, this function prints nothing: it returns the table.

    Args:
        results: Mapping from metric to its numeric value.
        run_path: Path of the evaluated run file. Both the first-stage model
            and the dataset are inferred from substrings of this path.
        metrics: Metrics to report, in column order. Each one must be a key of
            ``results``.

    Returns:
        A single-row ``pd.DataFrame`` with default integer column labels whose
        cells are the metric values formatted as strings with exactly four
        decimals (trailing zeroes kept).

    Raises:
        ValueError: If no dataset can be inferred from ``run_path``.
        KeyError: If a requested metric is missing from ``results``.
    """
    first_stage_model = parse_first_stage_model(run_path)
    first_stage_prefix = first_stage_model + "-" if first_stage_model is not None else ""

    # Prefer an evaluation-dataset key that appears literally in the path.
    dataset = next((name for name in EVAL_DATASETS if name in run_path), None)

    if dataset is None:
        # Otherwise fall back to recognising the dataset by the baseline run
        # file name registered for it in BASELINE_RUNS_DICT.
        dataset = next(
            (
                dataset_name
                for dataset_name, run_name in BASELINE_RUNS_DICT.items()
                if run_name in run_path
            ),
            None,
        )

    if dataset is None:
        # Previously this surfaced as an opaque `str + None` TypeError when the
        # row label was built; fail with an actionable message instead.
        raise ValueError(
            f"Cannot infer the evaluation dataset from run path {run_path!r}."
        )

    # Round and keep trailing zeroes, e.g. 0.5 -> '0.5000'.
    formatted_row = ['{:.4f}'.format(round(results[metric], 4)) for metric in metrics]

    return pd.DataFrame([formatted_row], index=[first_stage_prefix + dataset])
   

def perform_evaluation(run_path: str, mode: str) -> pd.DataFrame:
    """Evaluate one run file and return its metrics as a one-row table.

    Args:
        run_path: Path of the run file to evaluate.
        mode: Evaluation dataset key; used to look up the dataset id in
            ``DATASET_IDS_DICT`` and to select the metrics for that dataset.

    Returns:
        The table produced by ``print_table`` for this run.

    Raises:
        KeyError: If ``mode`` is not a key of ``DATASET_IDS_DICT``.
    """
    # Resolve the dataset id first so an unknown mode fails before any
    # evaluation work is started.
    ir_dataset_id = DATASET_IDS_DICT[mode]
    dataset_metrics = ir_evaluation.get_metrics_for_dataset(mode)

    scores = ir_evaluation.compute_metrics_from_ir_dataset(
        ir_dataset_id, run_path, dataset_metrics
    )
    return print_table(scores, run_path, dataset_metrics)

def perform_experiment(model_class: str, cross_encoder: str, ft_stage: str) -> pd.DataFrame:
    """Evaluate the re-ranked runs of one model on every first-stage run and dataset.

    For each first-stage model in ``FIRST_STAGE_DATASETS`` and each dataset in
    ``EVAL_DATASETS`` the run file
    ``<DATA_DIR>runs/evaluation/<model_class>/<cross_encoder>-ft<ft_stage>-<first stage>-<dataset>.run``
    is evaluated and the resulting one-row tables are stacked.

    Args:
        model_class: Name of the sub-directory of ``runs/evaluation/`` holding
            the run files.
        cross_encoder: Cross-encoder name used as the run file prefix.
        ft_stage: Fine-tuning stage used in the run file name; it is passed
            through ``str()``, so integers are accepted as well.

    Returns:
        One row per (first-stage model, dataset) pair, with columns labelled by
        the string form of ``ir_evaluation.DEV_EVAL_METRICS``.
        NOTE(review): the relabelling presumes every dataset reports as many
        metrics as ``DEV_EVAL_METRICS`` has entries - confirm.
    """
    # Loop-invariant: all runs of one model class live in the same directory.
    runs_dir = DATA_DIR + f"runs/evaluation/{model_class}/"

    per_run_tables = []
    for reranked_run in FIRST_STAGE_DATASETS:
        run_prefix = cross_encoder + "-ft" + str(ft_stage) + "-" + reranked_run + "-"
        for mode in EVAL_DATASETS:
            per_run_tables.append(
                perform_evaluation(run_path=runs_dir + run_prefix + mode + ".run", mode=mode)
            )

    # Concatenate once, after all rows are collected, instead of rebuilding the
    # full table on every loop iteration.
    results = pd.concat(per_run_tables)
    results.columns = [str(metric) for metric in ir_evaluation.DEV_EVAL_METRICS]
    return results

def perform_experiment_baseline(baseline: str) -> "pd.DataFrame | None":
    """Evaluate the baseline runs of one or all first-stage models.

    For every selected first-stage model and each dataset in ``EVAL_DATASETS``
    the run file ``<DATA_DIR>runs/baseline/<model>/<BASELINE_RUNS_DICT[dataset]>``
    is evaluated and the resulting one-row tables are stacked.

    Args:
        baseline: A name from ``FIRST_STAGE_DATASETS``, or ``"all"`` to
            evaluate every first-stage model.

    Returns:
        One row per (first-stage model, dataset) pair, with columns labelled by
        the string form of ``ir_evaluation.DEV_EVAL_METRICS``; ``None`` (after
        printing an error message) when ``baseline`` is not supported.
        NOTE(review): the relabelling presumes every dataset reports as many
        metrics as ``DEV_EVAL_METRICS`` has entries - confirm.
    """
    if baseline not in (FIRST_STAGE_DATASETS + ["all"]):
        print("Error: the baseline provided is not supported.")
        return None

    # A specific model selects only itself; "all" selects every model.
    to_evaluate = [baseline] if baseline in FIRST_STAGE_DATASETS else FIRST_STAGE_DATASETS

    per_run_tables = []
    for benchmark in to_evaluate:
        runs_dir = DATA_DIR + f"runs/baseline/{benchmark}/"  # path to runs directory
        for mode in EVAL_DATASETS:
            per_run_tables.append(
                perform_evaluation(run_path=runs_dir + BASELINE_RUNS_DICT[mode], mode=mode)
            )

    # Concatenate once, after all rows are collected, instead of rebuilding the
    # full table on every loop iteration.
    results = pd.concat(per_run_tables)
    results.columns = [str(metric) for metric in ir_evaluation.DEV_EVAL_METRICS]
    return results

In [12]:
# Baseline runs: "all" evaluates every model in FIRST_STAGE_DATASETS on every dataset in EVAL_DATASETS.
perform_experiment_baseline("all")

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3035,0.5581,0.534,0.5121,0.4988,0.7138
bm25-dl20,0.2811,0.5957,0.5067,0.4769,0.5623,0.6653
bm25-dlhard,0.1622,0.3667,0.2939,0.2886,0.4564,0.474
bm25-devsmall,0.1941,0.1063,0.1989,0.2301,0.6622,0.1855
colbert-dl19,0.5077,0.7829,0.7678,0.7369,0.661,0.8876
colbert-dl20,0.516,0.7747,0.7509,0.7328,0.7671,0.8282
colbert-dlhard,0.2641,0.47,0.413,0.4021,0.6298,0.5531
colbert-devsmall,0.3956,0.2593,0.4176,0.4569,0.9111,0.3907
contriever-dl19,0.4019,0.7132,0.681,0.6744,0.6132,0.814
contriever-dl20,0.4482,0.7253,0.6966,0.6716,0.717,0.7997


## MonoEncoder (ELECTRA)

#### CL(100)

In [14]:
# Evaluates the "electra-ft1-<first stage>-<dataset>.run" files under runs/evaluation/monoElectra-CL100/.
perform_experiment(model_class="monoElectra-CL100", cross_encoder="electra", ft_stage=1)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3651,0.7636,0.7534,0.7236,0.4988,0.8314
bm25-dl20,0.4012,0.7562,0.7252,0.6759,0.5623,0.8278
bm25-dlhard,0.2102,0.42,0.3998,0.3829,0.4564,0.5197
bm25-devsmall,0.3689,0.2605,0.3963,0.4203,0.6622,0.3709
colbert-dl19,0.4701,0.7829,0.7626,0.7537,0.6612,0.8663
colbert-dl20,0.5205,0.7778,0.7539,0.7337,0.7671,0.8536
colbert-dlhard,0.2541,0.4,0.4073,0.4022,0.6298,0.515
colbert-devsmall,0.4228,0.2861,0.4472,0.4844,0.9111,0.4191
contriever-dl19,0.4461,0.7829,0.761,0.7376,0.6132,0.8682
contriever-dl20,0.4984,0.7778,0.7559,0.7391,0.717,0.8536


#### KD (lr=1e-5)

In [15]:
# Evaluates the "electra-ft1-<first stage>-<dataset>.run" files under runs/evaluation/monoElectra-KD-lr-5/.
perform_experiment(model_class="monoElectra-KD-lr-5", cross_encoder="electra", ft_stage=1)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3345,0.7558,0.692,0.6691,0.4988,0.8876
bm25-dl20,0.3531,0.7006,0.6357,0.6147,0.5623,0.772
bm25-dlhard,0.204,0.4333,0.376,0.3557,0.4564,0.5671
bm25-devsmall,0.2692,0.1622,0.2866,0.321,0.6622,0.2656
colbert-dl19,0.4072,0.7403,0.7133,0.6916,0.6612,0.8915
colbert-dl20,0.4273,0.6944,0.6546,0.6407,0.7671,0.778
colbert-dlhard,0.2316,0.42,0.3965,0.3689,0.6298,0.5684
colbert-devsmall,0.2896,0.1669,0.2997,0.3421,0.9111,0.2794
contriever-dl19,0.384,0.7248,0.7033,0.6775,0.6132,0.8651
contriever-dl20,0.4132,0.6883,0.6489,0.6417,0.717,0.7752


#### CL -> KD

In [7]:
# Evaluates the "electra-ft2-<first stage>-<dataset>.run" files under runs/evaluation/monoElectra-CLKD/.
perform_experiment(model_class="monoElectra-CLKD", cross_encoder="electra", ft_stage=2)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3652,0.7636,0.7515,0.7234,0.4988,0.8391
bm25-dl20,0.4003,0.7469,0.7231,0.6775,0.5623,0.838
bm25-dlhard,0.2085,0.41,0.3964,0.3858,0.4564,0.5184
bm25-devsmall,0.3696,0.2616,0.3963,0.4209,0.6622,0.3708
colbert-dl19,0.4705,0.7829,0.7642,0.755,0.6612,0.874
colbert-dl20,0.5198,0.7685,0.7548,0.7334,0.7671,0.8638
colbert-dlhard,0.2525,0.39,0.4038,0.4015,0.6298,0.5137
colbert-devsmall,0.4227,0.2861,0.4467,0.4841,0.9111,0.4182
contriever-dl19,0.4461,0.7829,0.7626,0.741,0.6132,0.876
contriever-dl20,0.4974,0.7685,0.7576,0.7387,0.717,0.8638


#### KD -> CL

In [6]:
# Evaluates the "electra-ft2-<first stage>-<dataset>.run" files under runs/evaluation/monoElectra-KDCL/.
perform_experiment(model_class="monoElectra-KDCL", cross_encoder="electra", ft_stage=2)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3633,0.7403,0.7555,0.7304,0.4988,0.8262
bm25-dl20,0.4065,0.8086,0.728,0.6822,0.5623,0.8438
bm25-dlhard,0.205,0.4033,0.3874,0.3841,0.4564,0.5041
bm25-devsmall,0.3693,0.2603,0.396,0.4208,0.6622,0.3712
colbert-dl19,0.4732,0.7752,0.7694,0.7632,0.6612,0.8599
colbert-dl20,0.5265,0.8488,0.7788,0.7585,0.7671,0.8824
colbert-dlhard,0.2552,0.4367,0.4098,0.4104,0.6298,0.5254
colbert-devsmall,0.4234,0.284,0.4483,0.4855,0.9111,0.4193
contriever-dl19,0.4488,0.7752,0.7723,0.7482,0.6132,0.864
contriever-dl20,0.5037,0.8488,0.7828,0.7577,0.717,0.8833


## monoRoBERTa

#### CL(100)

In [16]:
# Evaluates the "roberta-ft1-<first stage>-<dataset>.run" files under runs/evaluation/monoRoberta-CL100/.
perform_experiment(model_class="monoRoberta-CL100", cross_encoder="roberta", ft_stage=1)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3687,0.7946,0.7585,0.7356,0.4988,0.8651
bm25-dl20,0.3997,0.7932,0.7146,0.672,0.5623,0.8438
bm25-dlhard,0.223,0.4733,0.4159,0.4058,0.4564,0.5657
bm25-devsmall,0.367,0.2576,0.3938,0.4182,0.6622,0.368
colbert-dl19,0.4633,0.7597,0.7339,0.7333,0.6612,0.8391
colbert-dl20,0.5136,0.8025,0.7569,0.737,0.7671,0.8617
colbert-dlhard,0.2638,0.48,0.4184,0.4211,0.6298,0.564
colbert-devsmall,0.4151,0.2778,0.4391,0.4773,0.9111,0.4105
contriever-dl19,0.4367,0.7597,0.7246,0.7221,0.6132,0.8411
contriever-dl20,0.4917,0.8025,0.7643,0.7383,0.717,0.8633


#### KD (lr=1e-5)

In [5]:
# Evaluates the "roberta-ft1-<first stage>-<dataset>.run" files under runs/evaluation/monoRoberta-KD-lr-5/.
perform_experiment(model_class="monoRoberta-KD-lr-5", cross_encoder="roberta", ft_stage=1)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3284,0.686,0.6758,0.6558,0.4988,0.8353
bm25-dl20,0.3505,0.6759,0.6153,0.6029,0.5623,0.7352
bm25-dlhard,0.1972,0.4333,0.3757,0.3652,0.4564,0.5645
bm25-devsmall,0.2744,0.1711,0.2911,0.3242,0.6622,0.2705
colbert-dl19,0.4105,0.7132,0.7077,0.6784,0.6612,0.8729
colbert-dl20,0.4238,0.6605,0.6363,0.6369,0.7671,0.7282
colbert-dlhard,0.2194,0.4067,0.3697,0.366,0.6298,0.5534
colbert-devsmall,0.2933,0.1739,0.3035,0.3441,0.9111,0.2829
contriever-dl19,0.3754,0.6822,0.6728,0.6544,0.6132,0.844
contriever-dl20,0.4033,0.6543,0.6354,0.6263,0.717,0.7139


#### CL -> KD

In [6]:
# Evaluates the "roberta-ft2-<first stage>-<dataset>.run" files under runs/evaluation/monoRoberta-CLKD/.
perform_experiment(model_class="monoRoberta-CLKD", cross_encoder="roberta", ft_stage=2)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3685,0.7946,0.7572,0.7348,0.4988,0.8651
bm25-dl20,0.4012,0.7932,0.7187,0.6752,0.5623,0.8438
bm25-dlhard,0.2228,0.4733,0.4141,0.4051,0.4564,0.5657
bm25-devsmall,0.3667,0.2569,0.3938,0.4182,0.6622,0.3679
colbert-dl19,0.4633,0.7597,0.7364,0.7341,0.6612,0.8411
colbert-dl20,0.515,0.8025,0.7602,0.7375,0.7671,0.8617
colbert-dlhard,0.2637,0.48,0.4215,0.4217,0.6298,0.5647
colbert-devsmall,0.4148,0.2772,0.4389,0.4771,0.9111,0.4106
contriever-dl19,0.4368,0.7597,0.727,0.7223,0.6132,0.843
contriever-dl20,0.4931,0.8025,0.7681,0.7394,0.717,0.8633


#### KD -> CL

In [5]:
# Evaluates the "roberta-ft2-<first stage>-<dataset>.run" files under runs/evaluation/monoRoberta-KDCL/.
perform_experiment(model_class="monoRoberta-KDCL", cross_encoder="roberta", ft_stage=2)

Unnamed: 0,AP,nDCG@1,nDCG@5,nDCG@10,R@100,RR@10
bm25-dl19,0.3628,0.7636,0.7484,0.7323,0.4988,0.8529
bm25-dl20,0.3998,0.8025,0.7208,0.6687,0.5623,0.8525
bm25-dlhard,0.216,0.4467,0.4014,0.3912,0.4564,0.5338
bm25-devsmall,0.3678,0.26,0.3939,0.4187,0.6622,0.3693
colbert-dl19,0.4646,0.7364,0.7357,0.7337,0.6612,0.851
colbert-dl20,0.5115,0.8117,0.7659,0.7322,0.7671,0.8675
colbert-dlhard,0.2569,0.4267,0.4095,0.4125,0.6298,0.5278
colbert-devsmall,0.4171,0.2807,0.4414,0.4788,0.9111,0.4132
contriever-dl19,0.4419,0.7364,0.7356,0.7236,0.6132,0.8496
contriever-dl20,0.4892,0.8117,0.7658,0.7306,0.717,0.869
