# Statistical Test

In [1]:
from scipy import stats
import pandas as pd
import numpy as np

In [315]:
def significance_test(before_df, after_df, metrics, thresh=0.05):
    for metric in metrics:
        before = before_df[metric].tolist()
        after = after_df[metric].tolist()
        pvalue = stats.ttest_rel(before, after).pvalue
        if pvalue <= thresh:
            print(f"{metric} score is statistically significant")
        else:
            print(f"{metric} score is statistically insignificant")

In [316]:
def compare_with_dpr(some_model_scores, dataset_name='tydi'):
    dpr_file = f'../Results/Scores/dpr_eval_{dataset_name}_scores.csv'
    dpr = pd.read_csv(dpr_file)
    metrics = dpr.columns
    significance_test(dpr, some_model_scores, metrics)

def compare_with_sparse(some_model_scores, sparse_type='bm25'):
    sparse_file = f'../Results/Scores/{sparse_type}_eval_scores.csv'
    sparse = pd.read_csv(sparse_file)
    metrics = sparse.columns
    significance_test(sparse, some_model_scores, metrics)

# A. Tydi

In [4]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_tydi_scores.csv')
compare_with_dpr(dpr_prf, 'tydi')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [5]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_tydi_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [6]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_tydi_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [7]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_tydi_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [8]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_tydi_scores.csv')
compare_with_dpr(bm25_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [9]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_tydi_scores.csv')
compare_with_dpr(lmd_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [10]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_tydi_scores.csv')
compare_with_dpr(tfidf_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [11]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_tydi_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [12]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_tydi_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [13]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_tydi_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [14]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_tydi_scores.csv')
compare_with_dpr(bm25_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [15]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_tydi_scores.csv')
compare_with_dpr(lmd_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs TFIDF-DPR

In [16]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_tydi_scores.csv')
compare_with_dpr(tfidf_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [17]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tydi_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs LMD

In [18]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tydi_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [19]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tydi_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [20]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tydi_scores.csv')
compare_with_dpr(bm25_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs DPR

In [21]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tydi_scores.csv')
compare_with_dpr(lmd_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs DPR

In [22]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tydi_scores.csv')
compare_with_dpr(tfidf_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [23]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tydi_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs LMD

In [24]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tydi_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [25]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tydi_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [26]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tydi_scores.csv')
compare_with_dpr(bm25_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [27]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tydi_scores.csv')
compare_with_dpr(lmd_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs DPR

In [28]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tydi_scores.csv')
compare_with_dpr(tfidf_dpr, 'tydi')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [29]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tydi_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tydi_scores.csv')

In [30]:
metrics = bm25_dpr1.columns

In [31]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [32]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tydi_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tydi_scores.csv')

In [33]:
metrics = lmd_dpr1.columns

In [34]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [35]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tydi_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tydi_scores.csv')

In [36]:
metrics = tfidf_dpr1.columns

In [37]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


# Part 4: BERT Reranker 

### DPR Retriever - BERT Reranker vs DPR

In [38]:
monobert_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-monobert_eval_tydi_tydi_scores.csv')
compare_with_dpr(monobert_dpr, 'tydi')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


## BERT Reranker vs Sparse Reranker

### DPR Retriever - BERT Reranker vs DPR Retriever - BM25 Reranker

In [39]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tydi_scores.csv')

In [40]:
metrics = bm25_dpr.columns

In [41]:
significance_test(bm25_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever - BERT Reranker vs DPR Retriever - LMD Reranker

In [42]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tydi_scores.csv')

In [43]:
metrics = lmd_dpr.columns

In [44]:
significance_test(lmd_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever - BERT Reranker vs DPR Retriever - TFIDF Reranker

In [45]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tydi_scores.csv')

In [46]:
metrics = tfidf_dpr.columns

In [47]:
significance_test(tfidf_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


## BERT Reranker vs DPR Reranker

### BM25

In [48]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tydi_scores.csv')
bm25_monobert = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-monobert_eval_tydi_scores.csv')

In [49]:
metrics = bm25_monobert.columns

In [50]:
significance_test(bm25_dpr, bm25_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD

In [51]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tydi_scores.csv')
lmd_monobert = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-monobert_eval_tydi_scores.csv')

In [52]:
metrics = lmd_monobert.columns

In [53]:
significance_test(lmd_dpr, lmd_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF

In [54]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tydi_scores.csv')
tfidf_monobert = pd.read_csv('../Results/Scores/retrieve-classic_rerank-monobert_eval_tydi_scores.csv')

In [55]:
metrics = tfidf_monobert.columns

In [56]:
significance_test(tfidf_dpr, tfidf_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


# B. MFAQ

## Part 1: Hybrid using Alpha Linear Combo

### Sparse Model vs Hybrid Sparse-Dense Model

#### BM25 vs BM25-DPR


In [57]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_mfaq_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


#### LMD vs LMD-DPR

In [58]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_mfaq_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


#### TFIDF vs TFIDF-DPR

In [59]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_mfaq_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### Dense Model vs Hybrid Sparse-Dense Model

#### DPR vs BM25-DPR

In [60]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_mfaq_scores.csv')
compare_with_dpr(bm25_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR vs LMD-DPR

In [61]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_mfaq_scores.csv')
compare_with_dpr(lmd_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR vs TFIDF-DPR

In [62]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_mfaq_scores.csv')
compare_with_dpr(tfidf_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 2: Hybrid using Alpha Beta Linear Combo

#### BM25 vs BM25-DPR

In [63]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_mfaq_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


#### LMD vs LMD-DPR

In [64]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_mfaq_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


#### TFIDF vs TFIDF-DPR

In [65]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_mfaq_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### Dense Model vs Hybrid Sparse-Dense Model

#### DPR vs BM25-DPR

In [66]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_mfaq_scores.csv')
compare_with_dpr(bm25_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR vs LMD-DPR

In [67]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_mfaq_scores.csv')
compare_with_dpr(lmd_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR vs TFIDF-DPR

In [68]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_mfaq_scores.csv')
compare_with_dpr(tfidf_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

### Dense Retriever - Sparse Reranker vs Sparse Retriever

#### DPR Retriever-BM25 Reranker vs BM25

In [69]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_mfaq_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR Retriever-LMD Reranker vs LMD

In [70]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_mfaq_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


#### DPR Retriever-TFIDF Reranker vs TFIDF

In [71]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_mfaq_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### Dense Retriever - Sparse Reranker vs Dense Retriever

#### DPR Retriever-BM25 Reranker vs DPR

In [72]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_mfaq_scores.csv')
compare_with_dpr(bm25_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR Retriever-LMD Reranker vs DPR

In [73]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_mfaq_scores.csv')
compare_with_dpr(lmd_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### DPR Retriever-TFIDF Reranker vs DPR

In [74]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_mfaq_scores.csv')
compare_with_dpr(tfidf_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### Sparse Retriever - Dense Reranker vs Sparse Retriever

#### BM25 Retriever - DPR Reranker vs BM25

In [75]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_mfaq_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### LMD Retriever - DPR Reranker vs LMD

In [76]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_mfaq_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


#### TFIDF Retriever - DPR Reranker vs TFIDF

In [77]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_mfaq_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### Sparse Retriever - Dense Reranker vs Dense Retriever

#### BM25 Retriever - DPR Reranker vs DPR

In [78]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_mfaq_scores.csv')
compare_with_dpr(bm25_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


#### LMD Retriever - DPR Reranker vs DPR

In [79]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_mfaq_scores.csv')
compare_with_dpr(lmd_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


#### TFIDF Retriever - DPR Reranker vs DPR

In [80]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_mfaq_scores.csv')
compare_with_dpr(tfidf_dpr, 'mfaq')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

#### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [81]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_mfaq_scores.csv')

bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_mfaq_scores.csv')

metrics = bm25_dpr1.columns

significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


#### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [82]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_mfaq_scores.csv')

lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_mfaq_scores.csv')

metrics = lmd_dpr1.columns

significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


#### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [83]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_mfaq_scores.csv')

tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_mfaq_scores.csv')

metrics = tfidf_dpr1.columns

significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


# C. TTHealth

## DPR vs DPR-PRF

In [84]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_tthealth_scores.csv')
compare_with_dpr(dpr_prf, 'tthealth')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [85]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_tthealth_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [86]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_tthealth_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [87]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_tthealth_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [88]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_tthealth_scores.csv')
compare_with_dpr(bm25_dpr,'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [89]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_tthealth_scores.csv')
compare_with_dpr(lmd_dpr,'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs TFIDF-DPR

In [90]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_tthealth_scores.csv')
compare_with_dpr(tfidf_dpr,'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 2: Hybrid using Alpha Beta Linear Combo

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [91]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_tthealth_scores.csv')
compare_with_sparse(bm25_dpr,'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [92]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_tthealth_scores.csv')
compare_with_sparse(lmd_dpr,'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [93]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_tthealth_scores.csv')
compare_with_sparse(tfidf_dpr,'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [94]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_tthealth_scores.csv')
compare_with_dpr(bm25_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [95]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_tthealth_scores.csv')
compare_with_dpr(lmd_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs TFIDF-DPR

In [96]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_tthealth_scores.csv')
compare_with_dpr(tfidf_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [97]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tthealth_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs LMD

In [98]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tthealth_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [99]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tthealth_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [100]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tthealth_scores.csv')
compare_with_dpr(bm25_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs DPR

In [101]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tthealth_scores.csv')
compare_with_dpr(lmd_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs DPR

In [102]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tthealth_scores.csv')
compare_with_dpr(tfidf_dpr, 'tthealth')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [103]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tthealth_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs LMD

In [104]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tthealth_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [105]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tthealth_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [106]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tthealth_scores.csv')
compare_with_dpr(bm25_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [107]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tthealth_scores.csv')
compare_with_dpr(lmd_dpr, 'tthealth')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### TFIDF Retriever - DPR Reranker vs DPR

In [108]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tthealth_scores.csv')
compare_with_dpr(tfidf_dpr, 'tthealth')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [109]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tthealth_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tthealth_scores.csv')

In [110]:
metrics = bm25_dpr1.columns

In [111]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically insignificant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [112]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tthealth_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tthealth_scores.csv')

In [113]:
metrics = lmd_dpr1.columns

In [114]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically insignificant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [115]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tthealth_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tthealth_scores.csv')

In [116]:
metrics = tfidf_dpr1.columns

In [117]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


# Part 4: BERT Reranker 

## DPR Retriever - BERT Reranker vs Baselines

### DPR Retriever - BERT Reranker vs DPR

In [118]:
monobert_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-monobert_eval_tthealth_mediqa_scores.csv')
compare_with_dpr(monobert_dpr, 'tthealth')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## BERT Reranker vs Sparse Reranker

### DPR Retriever - BERT Reranker vs DPR Retriever - BM25 Reranker

In [119]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_tthealth_scores.csv')

In [120]:
metrics = bm25_dpr.columns

In [121]:
significance_test(bm25_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever - BERT Reranker vs DPR Retriever - LMD Reranker

In [122]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_tthealth_scores.csv')

In [123]:
metrics = lmd_dpr.columns

In [124]:
significance_test(lmd_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever - BERT Reranker vs DPR Retriever - TFIDF Reranker

In [125]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_tthealth_scores.csv')

In [126]:
metrics = tfidf_dpr.columns

In [127]:
significance_test(tfidf_dpr, monobert_dpr, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## BERT Reranker vs DPR Reranker

### BM25

In [128]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_tthealth_scores.csv')
bm25_monobert = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-monobert_eval_mediqa_scores.csv')

In [129]:
metrics = bm25_monobert.columns

In [130]:
significance_test(bm25_dpr, bm25_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD

In [131]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_tthealth_scores.csv')
lmd_monobert = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-monobert_eval_mediqa_scores.csv')

In [132]:
metrics = lmd_monobert.columns

In [133]:
significance_test(lmd_dpr, lmd_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF

In [134]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_tthealth_scores.csv')
tfidf_monobert = pd.read_csv('../Results/Scores/retrieve-classic_rerank-monobert_eval_mediqa_scores.csv')

In [135]:
metrics = tfidf_monobert.columns

In [136]:
significance_test(tfidf_dpr, tfidf_monobert, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


# D. IndoSum

## DPR vs DPR-PRF

In [137]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_indosum_scores.csv')
compare_with_dpr(dpr_prf, 'indosum')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [138]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_indosum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [139]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_indosum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [140]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_indosum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [141]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_indosum_scores.csv')
compare_with_dpr(bm25_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [142]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_indosum_scores.csv')
compare_with_dpr(lmd_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [143]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_indosum_scores.csv')
compare_with_dpr(tfidf_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [144]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_indosum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [145]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_indosum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [146]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_indosum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [147]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_indosum_scores.csv')
compare_with_dpr(bm25_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [148]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_indosum_scores.csv')
compare_with_dpr(lmd_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [149]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_indosum_scores.csv')
compare_with_dpr(tfidf_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [150]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indosum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs LMD

In [151]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indosum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [152]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indosum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [153]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indosum_scores.csv')
compare_with_dpr(bm25_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs DPR

In [154]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indosum_scores.csv')
compare_with_dpr(lmd_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs DPR

In [155]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indosum_scores.csv')
compare_with_dpr(tfidf_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [156]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indosum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs LMD

In [157]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indosum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [158]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indosum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [159]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indosum_scores.csv')
compare_with_dpr(bm25_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [160]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indosum_scores.csv')
compare_with_dpr(lmd_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs DPR

In [161]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indosum_scores.csv')
compare_with_dpr(tfidf_dpr, 'indosum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [162]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indosum_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indosum_scores.csv')

In [163]:
metrics = bm25_dpr1.columns

In [164]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [165]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indosum_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indosum_scores.csv')

In [166]:
metrics = lmd_dpr1.columns

In [167]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [168]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indosum_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indosum_scores.csv')

In [169]:
metrics = tfidf_dpr1.columns

In [170]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


# E. TTMeqSum

## DPR vs DPR-PRF

In [171]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_ttmeqsum_scores.csv')
compare_with_dpr(dpr_prf, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [172]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_ttmeqsum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [173]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_ttmeqsum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [174]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_ttmeqsum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [175]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_ttmeqsum_scores.csv')
compare_with_dpr(bm25_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [176]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_ttmeqsum_scores.csv')
compare_with_dpr(lmd_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [177]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_ttmeqsum_scores.csv')
compare_with_dpr(tfidf_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [178]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_ttmeqsum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [179]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_ttmeqsum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [180]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_ttmeqsum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [181]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_ttmeqsum_scores.csv')
compare_with_dpr(bm25_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [182]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_ttmeqsum_scores.csv')
compare_with_dpr(lmd_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs TFIDF-DPR

In [183]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_ttmeqsum_scores.csv')
compare_with_dpr(tfidf_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [184]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_ttmeqsum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs LMD

In [185]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_ttmeqsum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [186]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_ttmeqsum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [187]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_ttmeqsum_scores.csv')
compare_with_dpr(bm25_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs DPR

In [188]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_ttmeqsum_scores.csv')
compare_with_dpr(lmd_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs DPR

In [189]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_ttmeqsum_scores.csv')
compare_with_dpr(tfidf_dpr, 'ttmeqsum')

Precision score is statistically insignificant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [190]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs LMD

In [191]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [192]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [193]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_dpr(bm25_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [194]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_dpr(lmd_dpr, 'ttmeqsum')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### TFIDF Retriever - DPR Reranker vs DPR

In [195]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_ttmeqsum_scores.csv')
compare_with_dpr(tfidf_dpr, 'ttmeqsum')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [196]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indosum_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indosum_scores.csv')

In [197]:
metrics = bm25_dpr1.columns

In [198]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [199]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indosum_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indosum_scores.csv')

In [200]:
metrics = lmd_dpr1.columns

In [201]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [202]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indosum_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indosum_scores.csv')

In [203]:
metrics = tfidf_dpr1.columns

In [204]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


# F. ICT-Syifa

## DPR vs DPR-PRF

In [317]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_ict_scores.csv')
compare_with_dpr(dpr_prf, 'ict')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [206]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_ict_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [207]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_ict_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [208]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_ict_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [209]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_ict_scores.csv')
compare_with_dpr(bm25_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [210]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_ict_scores.csv')
compare_with_dpr(lmd_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [211]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_ict_scores.csv')
compare_with_dpr(tfidf_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [212]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_ict_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [213]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_ict_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [214]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_ict_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [215]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_ict_scores.csv')
compare_with_dpr(bm25_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [216]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_ict_scores.csv')
compare_with_dpr(lmd_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [217]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_ict_scores.csv')
compare_with_dpr(tfidf_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [218]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_ict_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs LMD

In [219]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_ict_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [220]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_ict_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [221]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_ict_scores.csv')
compare_with_dpr(bm25_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs DPR

In [222]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_ict_scores.csv')
compare_with_dpr(lmd_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs DPR

In [223]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_ict_scores.csv')
compare_with_dpr(tfidf_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [224]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_ict_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs LMD

In [225]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_ict_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [226]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_ict_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [227]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_ict_scores.csv')
compare_with_dpr(bm25_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [228]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_ict_scores.csv')
compare_with_dpr(lmd_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs DPR

In [229]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_ict_scores.csv')
compare_with_dpr(tfidf_dpr, 'ict')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [230]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_ict_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_ict_scores.csv')

In [231]:
metrics = bm25_dpr1.columns

In [232]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [233]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_ict_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_ict_scores.csv')

In [234]:
metrics = lmd_dpr1.columns

In [235]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [236]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_ict_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_ict_scores.csv')

In [237]:
metrics = tfidf_dpr1.columns

In [238]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


# G. Indowiki

## DPR vs DPR-PRF

In [239]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_indowiki_scores.csv')
compare_with_dpr(dpr_prf, 'indowiki')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [240]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_indowiki_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [241]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_indowiki_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [242]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_indowiki_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [243]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_indowiki_scores.csv')
compare_with_dpr(bm25_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [244]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_indowiki_scores.csv')
compare_with_dpr(lmd_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [245]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_indowiki_scores.csv')
compare_with_dpr(tfidf_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [246]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_indowiki_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [247]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_indowiki_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [248]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_indowiki_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [249]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_indowiki_scores.csv')
compare_with_dpr(bm25_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs LMD-DPR

In [250]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_indowiki_scores.csv')
compare_with_dpr(lmd_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [251]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_indowiki_scores.csv')
compare_with_dpr(tfidf_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [252]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indowiki_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs LMD

In [253]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indowiki_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [254]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indowiki_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [255]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indowiki_scores.csv')
compare_with_dpr(bm25_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-LMD Reranker vs DPR

In [256]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indowiki_scores.csv')
compare_with_dpr(lmd_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR Retriever-TFIDF Reranker vs DPR

In [257]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indowiki_scores.csv')
compare_with_dpr(tfidf_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [258]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indowiki_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs LMD

In [259]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indowiki_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [260]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indowiki_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [261]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indowiki_scores.csv')
compare_with_dpr(bm25_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### LMD Retriever - DPR Reranker vs DPR

In [262]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indowiki_scores.csv')
compare_with_dpr(lmd_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs DPR

In [263]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indowiki_scores.csv')
compare_with_dpr(tfidf_dpr, 'indowiki')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [264]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_indowiki_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_indowiki_scores.csv')

In [265]:
metrics = bm25_dpr1.columns

In [266]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [267]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_indowiki_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_indowiki_scores.csv')

In [268]:
metrics = lmd_dpr1.columns

In [269]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [270]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_indowiki_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_indowiki_scores.csv')

In [271]:
metrics = tfidf_dpr1.columns

In [272]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


# H. Syifa-QA

In [273]:
dpr_prf = pd.read_csv('../Results/Scores/dpr_prf_eval_other_scores.csv')
compare_with_dpr(dpr_prf, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


## Part 1: Hybrid using Alpha Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{DPR}(Q,D) + (1-\alpha)*sim_{sparse}(Q,D)
\end{gather*}

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [274]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_other_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [275]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_other_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically significant


### TFIDF vs TFIDF-DPR

In [276]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_other_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [277]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25_eval_other_scores.csv')
compare_with_dpr(bm25_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [278]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd_eval_other_scores.csv')
compare_with_dpr(lmd_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### DPR vs TFIDF-DPR

In [279]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic_eval_other_scores.csv')
compare_with_dpr(tfidf_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 2: Hybrid using Alpha Beta Linear Combo

The linear combination can be formulated as the following
\begin{gather*} 
Score(Q,D) =  \alpha*sim_{sparse}(Q,D) + \beta*sim_{DPR}(Q,D)
\end{gather*}
Those "optimized" hyperparameters can be found by using logistic regression.

## Sparse Model vs Hybrid Sparse-Dense Model

### BM25 vs BM25-DPR

In [280]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_other_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD vs LMD-DPR

In [281]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_other_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF vs TFIDF-DPR

In [282]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_other_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Model vs Hybrid Sparse-Dense Model

### DPR vs BM25-DPR

In [283]:
bm25_dpr = pd.read_csv('../Results/Scores/dpr-bm25-v2_eval_other_scores.csv')
compare_with_dpr(bm25_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs LMD-DPR

In [284]:
lmd_dpr = pd.read_csv('../Results/Scores/dpr-lmd-v2_eval_other_scores.csv')
compare_with_dpr(lmd_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR vs TFIDF-DPR

In [285]:
tfidf_dpr = pd.read_csv('../Results/Scores/dpr-classic-v2_eval_other_scores.csv')
compare_with_dpr(tfidf_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Part 3: Retriever - Reranker Approach

## Dense Retriever - Sparse Reranker vs Sparse Retriever

### DPR Retriever-BM25 Reranker vs BM25

In [286]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_other_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs LMD

In [287]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_other_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs TFIDF

In [288]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_other_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Dense Retriever - Sparse Reranker vs Dense Retriever

### DPR Retriever-BM25 Reranker vs DPR

In [289]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_other_scores.csv')
compare_with_dpr(bm25_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-LMD Reranker vs DPR

In [290]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_other_scores.csv')
compare_with_dpr(lmd_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### DPR Retriever-TFIDF Reranker vs DPR

In [291]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_other_scores.csv')
compare_with_dpr(tfidf_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Sparse Retriever

### BM25 Retriever - DPR Reranker vs BM25

In [292]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_other_scores.csv')
compare_with_sparse(bm25_dpr, 'bm25')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs LMD

In [293]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_other_scores.csv')
compare_with_sparse(lmd_dpr, 'lmd')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs TFIDF

In [294]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_other_scores.csv')
compare_with_sparse(tfidf_dpr, 'classic')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Retriever - Dense Reranker vs Dense Retriever

### BM25 Retriever - DPR Reranker vs DPR

In [295]:
bm25_dpr = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_other_scores.csv')
compare_with_dpr(bm25_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### LMD Retriever - DPR Reranker vs DPR

In [296]:
lmd_dpr = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_other_scores.csv')
compare_with_dpr(lmd_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


### TFIDF Retriever - DPR Reranker vs DPR

In [297]:
tfidf_dpr = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_other_scores.csv')
compare_with_dpr(tfidf_dpr, 'other')

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically insignificant


## Sparse Reranker - Dense Retriever vs Sparse Retriever - Dense Reranker

### BM25 Reranker - DPR Retriever vs BM25 Retriever - DPR Reranker

In [298]:
bm25_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-bm25_eval_other_scores.csv')
bm25_dpr2 = pd.read_csv('../Results/Scores/retrieve-bm25_rerank-dpr_eval_other_scores.csv')

In [299]:
metrics = bm25_dpr1.columns

In [300]:
significance_test(bm25_dpr1, bm25_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically insignificant
BPref score is statistically insignificant


### LMD Reranker - DPR Retriever vs LMD Retriever - DPR Reranker

In [301]:
lmd_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-lmd_eval_other_scores.csv')
lmd_dpr2 = pd.read_csv('../Results/Scores/retrieve-lmd_rerank-dpr_eval_other_scores.csv')

In [302]:
metrics = lmd_dpr1.columns

In [303]:
significance_test(lmd_dpr1, lmd_dpr2, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TFIDF Reranker - DPR Retriever vs TFIDF Retriever - DPR Reranker

In [304]:
tfidf_dpr1 = pd.read_csv('../Results/Scores/retrieve-dpr_rerank-classic_eval_other_scores.csv')
tfidf_dpr2 = pd.read_csv('../Results/Scores/retrieve-classic_rerank-dpr_eval_other_scores.csv')

In [305]:
metrics = tfidf_dpr1.columns

In [306]:
significance_test(tfidf_dpr1, tfidf_dpr2, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


# I. Dataset Comparison

In [318]:
baseline = pd.read_csv('../Results/Scores/dpr_eval_indobert_scores.csv')
metrics = baseline.columns

### Tydi

In [319]:
tydi = pd.read_csv('../Results/Scores/dpr_eval_tydi_scores.csv')
significance_test(tydi, baseline, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### MFAQ

In [320]:
mfaq = pd.read_csv('../Results/Scores/dpr_eval_mfaq_scores.csv')
significance_test(mfaq, baseline, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### TTHealth

In [321]:
tthealth = pd.read_csv('../Results/Scores/dpr_eval_tthealth_scores.csv')
significance_test(tthealth, baseline, metrics)

Precision score is statistically significant
MRR score is statistically significant
MAP score is statistically significant
BPref score is statistically significant


### TTMeqSum

In [322]:
ttmeqsum = pd.read_csv('../Results/Scores/dpr_eval_ttmeqsum_scores.csv')
significance_test(ttmeqsum, baseline, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant


### IndoSum

In [323]:
indosum = pd.read_csv('../Results/Scores/dpr_eval_indosum_scores.csv')
significance_test(indosum, baseline, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### ICT

In [324]:
ict = pd.read_csv('../Results/Scores/dpr_eval_ict_scores.csv')
significance_test(ict, baseline, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically significant
BPref score is statistically insignificant


### Indowiki

In [325]:
indowiki = pd.read_csv('../Results/Scores/dpr_eval_indowiki_scores.csv')
significance_test(indowiki, baseline, metrics)

Precision score is statistically insignificant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically insignificant


### Other/Syifa-QA

In [328]:
other = pd.read_csv('../Results/Scores/dpr_eval_other_scores.csv')
significance_test(other, baseline, metrics)

Precision score is statistically significant
MRR score is statistically insignificant
MAP score is statistically insignificant
BPref score is statistically significant
