#### Notebook to compute the correlation between the `factual_consistency` metric outputs and human annotated consistency scores on benchmark datasets


### Note

The texts of the consistency benchmark datasets have been translated to German using the Helsinki-NLP/opus-mt-en-de model (https://huggingface.co/Helsinki-NLP/opus-mt-en-de).


In [None]:
# Load the benchmark datasets
import json

# These files were copied from the UniEval repo
# (https://github.com/maszhongming/UniEval/tree/main/reproduce/data/fact), which
# is a modified version of the dataset from https://github.com/W4ngatang/qags, then translated to German

# Locations of the German-translated QAGS benchmark files.
qags_xsum_path = 'data/qags_xsum-de.json'
qags_cnndm_path = 'data/qags_cnndm-de.json'

# Pass the encoding explicitly: without it `open` falls back to the platform's
# locale encoding, which can mis-decode the non-ASCII characters of the German
# text on systems whose default is not UTF-8.
with open(qags_xsum_path, encoding='utf-8') as f:
    qags_xsum_data = json.load(f)
with open(qags_cnndm_path, encoding='utf-8') as f:
    qags_cnndm_data = json.load(f)

print(f'QAGS-XSUM has {len(qags_xsum_data)} data points')
print(f'QAGS-CNN has {len(qags_cnndm_data)} data points')

In [None]:
# Split every benchmark record into three parallel lists: the generated
# output, the source text it is checked against, and the human annotated
# consistency score.

# QAGS-XSUM
qags_xsum_generated_outputs = [
    record['system_output'] for record in qags_xsum_data
]
qags_xsum_sources = [record['source'] for record in qags_xsum_data]
qags_xsum_scores = [
    record['scores']['consistency'] for record in qags_xsum_data
]

# QAGS-CNN
qags_cnndm_generated_outputs = [
    record['system_output'] for record in qags_cnndm_data
]
qags_cnndm_sources = [record['source'] for record in qags_cnndm_data]
qags_cnndm_scores = [
    record['scores']['consistency'] for record in qags_cnndm_data
]

In [None]:
from scipy.stats import spearmanr, pearsonr, kendalltau


def compute_correlation_values(result, annotated_scores):
    '''Print the correlations between metric scores and human annotations.

    Data points whose `metric_value` is missing in `result.to_df()` are
    dropped from both series before the Pearson, Spearman and Kendall-Tau
    correlations are computed. A missing value may occur if, for example, the
    prompt triggers Azure OpenAI's content filter.

    Args:
        result: Metric result object exposing `to_df()` (a DataFrame with a
            `metric_value` column) and `metric_values` (one score per data
            point).
        annotated_scores: Human annotated scores, aligned by position with
            `result.metric_values`.

    Returns:
        None. The number of data points used and the three correlation
        coefficients are printed to stdout.
    '''
    # Select by position (not by index label) so the filter stays aligned with
    # the plain lists even if the DataFrame index is not 0..n-1.
    is_valid = result.to_df()['metric_value'].notna().tolist()
    indices = [i for i, keep in enumerate(is_valid) if keep]
    valid_metric_values = [result.metric_values[i] for i in indices]
    valid_annotated_scores = [annotated_scores[i] for i in indices]

    pearson_corr = pearsonr(valid_metric_values, valid_annotated_scores)[0]
    spearman_corr = spearmanr(valid_metric_values, valid_annotated_scores)[0]
    kendalltau_corr = kendalltau(valid_metric_values, valid_annotated_scores)[0]

    # Report how many data points survived the filter so that correlation
    # values from different runs can be compared on equal footing.
    print(f'Computed on {len(indices)} examples')
    print(f'Pearson correlation = {pearson_corr}')
    print(f'Spearman correlation = {spearman_corr}')
    print(f'Kendall-Tau correlation = {kendalltau_corr}')

In [None]:
# QAGS-XSUM, local (UniEval) model option: no `eval_model` is passed to
# `factual_consistency`. Score every generated output against its source and
# print the correlations with the human annotated scores.
from langcheck.metrics.de import factual_consistency

result = factual_consistency(
    qags_xsum_generated_outputs,
    qags_xsum_sources,
)
compute_correlation_values(result, qags_xsum_scores)

# RUN-DATE: 2024-1-17
# Resulting correlation values:
# Pearson correlation = 0.40358016311552586
# Spearman correlation = 0.37558373934197853
# Kendall-Tau correlation = 0.3097142857142857

In [None]:
# QAGS-XSUM, OpenAI (gpt-3.5-turbo) model option: the scores come from an
# Azure OpenAI deployment passed to `factual_consistency` as `eval_model`.
# Print the correlations with the human annotated scores.
import os

from langcheck.metrics.de import factual_consistency
from langcheck.metrics.eval_clients import AzureOpenAIEvalClient

# Placeholders: fill in your own Azure OpenAI settings before running.
os.environ.update({
    'AZURE_OPENAI_KEY': 'YOUR_AZURE_OPENAI_KEY',
    'OPENAI_API_VERSION': 'YOUR_OPENAI_API_VERSION',
    'AZURE_OPENAI_ENDPOINT': 'YOUR_AZURE_OPENAI_ENDPOINT',
})
client = AzureOpenAIEvalClient(text_model_name='YOUR_DEPLOYMENT_NAME')

result = factual_consistency(
    qags_xsum_generated_outputs,
    qags_xsum_sources,
    eval_model=client,
)
compute_correlation_values(result, qags_xsum_scores)

# RUN-DATE: 2024-1-30
# OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# Resulting correlation values:
#   Computed on 230 examples
#   Pearson correlation = 0.4101880067233937
#   Spearman correlation = 0.4099039059599361
#   Kendall-Tau correlation = 0.3868346648563296

# RUN-DATE: 2024-1-17
# OpenAI deployment details:
# - Model name: gpt-35-turbo
# - Model version: 0613
# Resulting correlation values:
#   Computed on 230 examples
#   Pearson correlation = 0.1632062194597614
#   Spearman correlation = 0.15952417117218096
#   Kendall-Tau correlation = 0.15103303151237832

In [None]:
# QAGS-CNN, local (UniEval) model option: no `eval_model` is passed to
# `factual_consistency`. Score every generated output against its source and
# print the correlations with the human annotated scores.
from langcheck.metrics.de import factual_consistency

result = factual_consistency(
    qags_cnndm_generated_outputs,
    qags_cnndm_sources,
)
compute_correlation_values(result, qags_cnndm_scores)

# RUN-DATE: 2024-1-18
# Resulting correlation values:
# Pearson correlation = 0.5126921817479836
# Spearman correlation = 0.4940799552395499
# Kendall-Tau correlation = 0.3910688466232861

In [None]:
# QAGS-CNN, OpenAI (gpt-3.5-turbo) model option: the scores come from an
# Azure OpenAI deployment passed to `factual_consistency` as `eval_model`.
# Print the correlations with the human annotated scores.
import os

from langcheck.metrics.de import factual_consistency
from langcheck.metrics.eval_clients import AzureOpenAIEvalClient

# Placeholders: fill in your own Azure OpenAI settings before running.
os.environ.update({
    'AZURE_OPENAI_KEY': 'YOUR_AZURE_OPENAI_KEY',
    'OPENAI_API_VERSION': 'YOUR_OPENAI_API_VERSION',
    'AZURE_OPENAI_ENDPOINT': 'YOUR_AZURE_OPENAI_ENDPOINT',
})
client = AzureOpenAIEvalClient(text_model_name='YOUR_DEPLOYMENT_NAME')

result = factual_consistency(
    qags_cnndm_generated_outputs,
    qags_cnndm_sources,
    eval_model=client,
)
compute_correlation_values(result, qags_cnndm_scores)

# RUN-DATE: 2024-1-30
# Resulting correlation values:
#   Pearson correlation = 0.35438564244887516
#   Spearman correlation = 0.28485743239220623
#   Kendall-Tau correlation = 0.2558269140150481

# RUN-DATE: 2024-1-18
# Resulting correlation values:
#   Pearson correlation = 0.2562263899971331
#   Spearman correlation = 0.21022360246996274
#   Kendall-Tau correlation = 0.19670459803185497