In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so later cells
# can read files stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import wilcoxon
from mlxtend.evaluate import permutation_test
from statsmodels.stats.multitest import fdrcorrection

In [None]:
def read_file(file_name, folder_path="/content/drive/MyDrive/Statistical_Significance_RAG"):
  """Load a metrics CSV and normalise its column names.

  Args:
    file_name: name of the CSV file inside `folder_path`.
    folder_path: directory that contains the file. Defaults to the shared
      Drive folder used by this notebook, so existing one-argument calls
      behave as before.

  Returns:
    A DataFrame whose column names are lowercased and have every
    occurrence of 'rerank_' removed (e.g. 'Rerank_MAP@3' -> 'map@3').
  """
  path = folder_path + "/" + file_name
  df = pd.read_csv(path)
  print("Dataset size:", len(df))

  # Lowercase first so that the 'rerank_' prefix is stripped regardless of
  # how it was capitalised in the file header.
  df.columns = (
    df.columns
    .str.lower()
    .str.replace('rerank_', '', regex=False)
  )

  return df

In [None]:
def wilcoxon_test(list1, list2, alpha):
  """Run scipy's Wilcoxon test on two samples and report the outcome.

  Prints the p-value (4 decimals) followed by a verdict line obtained by
  comparing the p-value against `alpha`. No multiple-comparison correction
  is applied here.

  Args:
    list1: first sample, passed to `wilcoxon` as-is.
    list2: second sample, passed to `wilcoxon` as-is.
    alpha: significance threshold used for the printed verdict.

  Returns:
    The p-value reported by `wilcoxon`.
  """
  p = wilcoxon(list1, list2).pvalue

  verdict = (
    "=> Significant difference (reject H0)"
    if p < alpha
    else "=> No significant difference (fail to reject H0)"
  )
  print(f"P-value: {p:.4f}")
  print(verdict)

  return p

In [None]:
def fdr_correction(pvals, alpha):
  """Apply `fdrcorrection` (method='indep') to a set of p-values and print the result.

  Args:
    pvals: p-values to correct together.
    alpha: level forwarded to `fdrcorrection`.

  Prints the reject decisions and the adjusted p-values; returns nothing.
  """
  outcome = fdrcorrection(pvals, alpha=alpha, method='indep')
  rejected, adjusted = outcome

  print("Reject null hypotheses:", rejected)
  print("FDR-adjusted p-values:", adjusted)

In [None]:
def apply_statistical_significance_test(df, df2, alpha):
  """Run pairwise Wilcoxon tests between metric columns, then FDR-correct the p-values.

  The i-th metric of the first list is read from `df` and tested against the
  i-th metric of the second list read from `df2` (map@3 vs map@5, map@3 vs
  map@10, ...). All resulting p-values are then passed together to
  `fdr_correction`.

  Args:
    df: table providing the left-hand metric columns.
    df2: table providing the right-hand metric columns.
    alpha: significance level used for each test and for the FDR correction.
  """
  metrics_tocompare = ["map@3", "map@3", "map@5", "ndcg@3", "ndcg@3", "ndcg@5"]
  metrics_tocompare2 = ["map@5", "map@10", "map@10", "ndcg@5", "ndcg@10", "ndcg@10"]

  pvals = []
  # Walk both lists in lockstep. Two separate loops would leave the left-hand
  # sample stuck on the last entry of the first list for every comparison.
  for metric_a, metric_b in zip(metrics_tocompare, metrics_tocompare2):
    # Name both sides: several right-hand metrics occur more than once.
    print(f"{metric_a} vs {metric_b}")
    p = wilcoxon_test(df[metric_a], df2[metric_b], alpha)
    print("*****")

    pvals.append(p)

  print("Apply fdr_correction")
  fdr_correction(pvals, alpha)

In [None]:
# Both names currently point to the same metrics CSV, so df and df2 are loaded
# from identical data.
# NOTE(review): presumably intentional (comparing cut-offs within one run) —
# confirm it is not a copy-paste slip.
filename = "hotpotqa_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85_final_metrics.csv"
filename2 = "hotpotqa_hybrid_linear_with_rerank_minilm_alpha_0.3_beta_0.85_final_metrics.csv"

df = read_file(filename)
df2 = read_file(filename2)

# Significance level handed to the Wilcoxon tests and to the FDR correction.
alpha = 0.05
apply_statistical_significance_test(df, df2, alpha)

In [None]:
def check_permutation_test(list1, list2, alpha, num_rounds=10000, seed=None):
  """Cross-check a comparison with an approximate permutation test.

  Args:
    list1: first sample.
    list2: second sample.
    alpha: significance threshold used for the printed verdict.
    num_rounds: number of random permutations to draw (default 10000).
    seed: optional random seed forwarded to `permutation_test`; pass an
      integer to make the approximate p-value reproducible across runs.

  Prints the p-value (4 decimals) and a verdict line; returns nothing.
  """
  p_perm = permutation_test(
    list1,
    list2,
    method='approximate',
    num_rounds=num_rounds,
    seed=seed,
  )

  print(f"P-value: {p_perm:.4f}")
  # Compare against the caller's alpha rather than a fixed 0.05.
  if p_perm < alpha:
    print("=> Significant difference (reject H0)")
  else:
    print("=> No significant difference (fail to reject H0)")

In [None]:
def t_test(data1, data2, alpha=0.05):
  """Run scipy's independent two-sample t-test and print the outcome.

  Args:
    data1: first sample.
    data2: second sample.
    alpha: significance threshold for the printed verdict (default 0.05,
      the value previously fixed inside the function).

  Prints the t-statistic, the p-value and a verdict line; returns nothing.
  """
  # Perform two-tailed independent t-test
  t_stat, p_value = stats.ttest_ind(data1, data2)

  print("t-statistic:", t_stat)
  print("p-value:", p_value)

  # Check significance
  if p_value < alpha:
    print("Reject the null hypothesis: groups are significantly different")
  else:
    print("Fail to reject the null hypothesis: no significant difference")

In [None]:
def check_normality(data, alpha=0.05):
  """Run the Shapiro-Wilk test on a sample and print the outcome.

  Args:
    data: sample passed to `stats.shapiro`.
    alpha: significance threshold for the printed verdict (default 0.05,
      the value previously fixed inside the function).

  Prints the test statistic, the p-value and a verdict line; returns nothing.
  """
  stat, p = stats.shapiro(data)
  print("Shapiro-Wilk test statistic:", stat)
  print("p-value:", p)

  # A p-value above alpha means normality is not rejected.
  if p > alpha:
    print("Fail to reject H₀: data looks normally distributed")
  else:
    print("Reject H₀: data is not normally distributed")