In [1]:
import pandas as pd

def read_csv_to_dataframe(file_path):
  """Load a CSV file into a Pandas DataFrame, reporting failures on stdout.

  Read errors are not propagated to the caller: they are printed and the
  function returns None instead, so callers should check the result before
  indexing into it.

  Args:
    file_path: Location of the CSV file to load.

  Returns:
    The DataFrame parsed from the file on success, or None when the file is
    missing, empty, or any other exception occurs while reading it.
  """
  try:
    frame = pd.read_csv(file_path)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
  except pd.errors.EmptyDataError:
    print(f"Error: The CSV file at {file_path} is empty.")
  except Exception as e:
    # Catch-all so a bad file is reported as a message rather than a traceback.
    print(f"Error: An unexpected error occurred while reading the CSV file: {e}")
  else:
    return frame
  # Every error branch above falls through to here.
  return None

# Cohere query translation

In [22]:
from from_root import from_root
import os

# CSV with the baseline run's scores, resolved against the project root.
baseline_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_baseline.csv",
)

# CSV with the scores from the run that uses Cohere query translation.
cohere_query_tran_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_with_cohere_query_translate.csv",
)

In [3]:
# Load both result sets (baseline first). read_csv_to_dataframe prints an
# error and returns None when a file cannot be read.
df_baseline, df_cohere_query_tran = (
    read_csv_to_dataframe(baseline_path),
    read_csv_to_dataframe(cohere_query_tran_path),
)

## Answer relevancy

In [18]:
import scipy.stats as stats

# Pull the 'answer_relevancy' column of each run out as a plain list.
baseline_score = df_baseline['answer_relevancy'].tolist()  # Baseline model scores
cohere_query_tran_scores = df_cohere_query_tran['answer_relevancy'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
# NOTE(review): both CSVs look like runs over the same test dataset; if the rows
# are paired, a paired test (e.g. Wilcoxon signed-rank) may fit better - confirm.
statistic, p_value = stats.mannwhitneyu(cohere_query_tran_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in answer relevanacy between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models."
)

p-value: 0.723383916545185
Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models.


## Faithfulness

In [19]:
import scipy.stats as stats

# Pull the 'faithfulness' column of each run out as a plain list.
baseline_score = df_baseline['faithfulness'].tolist()  # Baseline model scores
cohere_query_tran_scores = df_cohere_query_tran['faithfulness'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(cohere_query_tran_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in faithfulness between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models."
)

p-value: 0.5649865965151069
Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models.


## Context Recall

In [20]:
import scipy.stats as stats

# Pull the 'context_recall' column of each run out as a plain list.
baseline_score = df_baseline['context_recall'].tolist()  # Baseline model scores
cohere_query_tran_scores = df_cohere_query_tran['context_recall'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(cohere_query_tran_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_recall between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models."
)

p-value: 1.0
Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models.


## Context Precision

In [21]:
import scipy.stats as stats

# Pull the 'context_precision' column of each run out as a plain list.
baseline_score = df_baseline['context_precision'].tolist()  # Baseline model scores
cohere_query_tran_scores = df_cohere_query_tran['context_precision'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(cohere_query_tran_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_precision between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_precision between the two models."
)

p-value: 0.37406279749088245
Fail to reject the null hypothesis. There is no significant difference in context_precision between the two models.


# HyDE

In [37]:
from from_root import from_root
import os

# CSV with the baseline run's scores, resolved against the project root.
baseline_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_baseline.csv",
)

# CSV with the scores from the run that uses HyDE.
hyde_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_with_hyde.csv",
)

In [30]:
# Load both result sets (baseline first). read_csv_to_dataframe prints an
# error and returns None when a file cannot be read.
df_baseline, df_hyde = (
    read_csv_to_dataframe(baseline_path),
    read_csv_to_dataframe(hyde_path),
)

## Answer relevancy

In [31]:
import scipy.stats as stats

# Pull the 'answer_relevancy' column of each run out as a plain list.
baseline_score = df_baseline['answer_relevancy'].tolist()  # Baseline model scores
hyde_scores = df_hyde['answer_relevancy'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
# NOTE(review): both CSVs look like runs over the same test dataset; if the rows
# are paired, a paired test (e.g. Wilcoxon signed-rank) may fit better - confirm.
statistic, p_value = stats.mannwhitneyu(hyde_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in answer relevanacy between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models."
)

p-value: 0.627029093446337
Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models.


## Faithfulness

In [32]:
import scipy.stats as stats

# Pull the 'faithfulness' column of each run out as a plain list.
baseline_score = df_baseline['faithfulness'].tolist()  # Baseline model scores
hyde_scores = df_hyde['faithfulness'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(hyde_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in faithfulness between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models."
)

p-value: 1.0
Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models.


## Context Recall

In [33]:
import scipy.stats as stats

# Pull the 'context_recall' column of each run out as a plain list.
baseline_score = df_baseline['context_recall'].tolist()  # Baseline model scores
hyde_scores = df_hyde['context_recall'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(hyde_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_recall between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models."
)

p-value: 1.0
Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models.


## Context Precision

In [34]:
import scipy.stats as stats

# Pull the 'context_precision' column of each run out as a plain list.
baseline_score = df_baseline['context_precision'].tolist()  # Baseline model scores
hyde_scores = df_hyde['context_precision'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(hyde_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_precision between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_precision between the two models."
)

p-value: 4.657054587626058e-05
Reject the null hypothesis. There is a significant difference in context_precision between the two models.


# Multi Query Retriever

In [38]:
from from_root import from_root
import os

# CSV with the baseline run's scores, resolved against the project root.
baseline_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_baseline.csv",
)

# CSV with the scores from the multi-query run.
multi_query_path = os.path.join(
    from_root(),
    "data-test/test_dataset/test_dataset_it_multi_query.csv",
)

In [39]:
# Load both result sets (baseline first). read_csv_to_dataframe prints an
# error and returns None when a file cannot be read.
df_baseline, df_multi_query = (
    read_csv_to_dataframe(baseline_path),
    read_csv_to_dataframe(multi_query_path),
)

## Answer relevancy

In [40]:
import scipy.stats as stats

# Pull the 'answer_relevancy' column of each run out as a plain list.
baseline_score = df_baseline['answer_relevancy'].tolist()  # Baseline model scores
multi_query_scores = df_multi_query['answer_relevancy'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
# NOTE(review): both CSVs look like runs over the same test dataset; if the rows
# are paired, a paired test (e.g. Wilcoxon signed-rank) may fit better - confirm.
statistic, p_value = stats.mannwhitneyu(multi_query_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in answer relevanacy between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models."
)

p-value: 1.0
Fail to reject the null hypothesis. There is no significant difference in answer relevanacy between the two models.


## Faithfulness

In [41]:
import scipy.stats as stats

# Pull the 'faithfulness' column of each run out as a plain list.
baseline_score = df_baseline['faithfulness'].tolist()  # Baseline model scores
multi_query_scores = df_multi_query['faithfulness'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(multi_query_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in faithfulness between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models."
)

p-value: 0.4794748796063637
Fail to reject the null hypothesis. There is no significant difference in faithfulness between the two models.


## Context Recall

In [42]:
import scipy.stats as stats

# Pull the 'context_recall' column of each run out as a plain list.
baseline_score = df_baseline['context_recall'].tolist()  # Baseline model scores
multi_query_scores = df_multi_query['context_recall'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(multi_query_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_recall between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models."
)

p-value: 1.0
Fail to reject the null hypothesis. There is no significant difference in context_recall between the two models.


## Context Precision

In [43]:
import scipy.stats as stats

# Pull the 'context_precision' column of each run out as a plain list.
baseline_score = df_baseline['context_precision'].tolist()  # Baseline model scores
multi_query_scores = df_multi_query['context_precision'].tolist()  # New model scores

# Mann-Whitney U test on the two score lists, using SciPy's default settings.
statistic, p_value = stats.mannwhitneyu(multi_query_scores, baseline_score)

# Significance level the p-value is compared against.
alpha = 0.05

# The p-value is reported either way; only the verdict depends on the comparison.
print(f"p-value: {p_value}")
print(
    "Reject the null hypothesis. There is a significant difference in context_precision between the two models."
    if p_value < alpha
    else "Fail to reject the null hypothesis. There is no significant difference in context_precision between the two models."
)

p-value: 0.37406279749088245
Fail to reject the null hypothesis. There is no significant difference in context_precision between the two models.
