In [None]:
%load_ext autoreload
%autoreload 2

In [17]:
import random
from pathlib import Path
from typing import Optional

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from eval_dataset_loader import get_uss_dataset, get_fed_dial_dataset
from models.benchmark_dialogues import EvaluatedDialogue
from models.configs import RatingEvalConfig
from models.datasets import RatingBenchmarkDataset
from rating_evaluation.rating_evaluator import evaluate_ratings
from rating_evaluation.dataset_analyzer import plot_human_rating_bar_plots, compute_dataset_statistics
from chat_checker.models.chatbot import ChatbotInfo, ChatbotType
from chat_checker.dialogue_rating.rating_dimensions import DEFAULT_TASK_ORIENTED_DIMENSIONS, DSTC_CONVERSATIONAL_DIMENSIONS

# Base look: the "seaborn-v0_8" style sheet.
plt.style.use("seaborn-v0_8")
# Layer the project style sheet on top. The relative path is resolved against the current
# working directory, so the notebook has to be run from the folder that contains `styles/`.
plt.style.use(Path("./styles/subfigure.mplstyle").resolve().as_posix())


In [18]:
# Seed Python's `random` module so the `random.sample` draw of new samples further down is reproducible.
random.seed(42)

# Load Datasets

## Task-Oriented Datasets
Data source: Downloaded data from https://chateval.org/sgd and stored under `../datasets/task_oriented_dialogue_systems/uss/`

### SGD

In [19]:
# Load the samples registered under the "SGD" key of the USS loader.
sgd_samples = get_uss_dataset("SGD")

In [None]:
# Sanity check: show the first and the last loaded SGD sample, one per line.
print(sgd_samples[0], sgd_samples[-1], sep="\n")

In [21]:
# Metadata describing the SGD chatbot; passed to the SGD benchmark dataset as `chatbot_info`.
# The `task` text repeats the capability sentence of the description.
sgd_chatbot_description = """A multi-domain chatbot designed for interacting with users across a variety of services and APIs. The chatbot can provide answers and services for requests spanning 20 domains, such as banks, events, media, calendar, travel, and weather."""
sgd_chatbot = ChatbotInfo(
    name="Service Chatbot",
    description=sgd_chatbot_description,
    type=ChatbotType.TASK_ORIENTED,
    interaction_method="text-based chat interface",
    task="Provide answers and services for requests spanning 20 domains, such as banks, events, media, calendar, travel, and weather.",
    available_languages=["English"],
)

In [22]:
# Benchmark wrapper for SGD: bundles all loaded samples, the file holding the rated samples,
# the chatbot metadata, and the task-oriented rating dimensions.
# NOTE(review): "mean" presumably averages the labels of multiple annotators — confirm in RatingBenchmarkDataset.
sgd_dataset = RatingBenchmarkDataset(
    name="sgd",
    all_samples=sgd_samples,
    rated_samples_path=Path("data/sgd/rated_samples.json"),
    chatbot_info=sgd_chatbot,
    rating_dimensions=DEFAULT_TASK_ORIENTED_DIMENSIONS,
    label_aggregation_method="mean"
)

### MWOZ

In [23]:
# Load the samples registered under the "MWOZ" key of the USS loader.
mwoz_samples = get_uss_dataset("MWOZ")

In [None]:
# Sanity check: show the first and the last loaded MWOZ sample, one per line.
print(mwoz_samples[0], mwoz_samples[-1], sep="\n")

In [25]:
# Metadata describing the MWOZ chatbot; passed to the MWOZ benchmark dataset as `chatbot_info`.
# Description and task cover the same capabilities: the former describes the bot,
# the latter states what it should do for the user.
mwoz_chatbot_description = "A bot that helps tourists find information about restaurants, hotels, attractions, trains and taxis in Cambridge, UK. The user can also book restaurants, hotels and trains through the bot."
mwoz_chatbot_task = "The chatbot should help the user find information about restaurants, hotels, attractions, trains and taxis in Cambridge, UK. The chatbot should also be able to book restaurants, hotels and trains for the user."
mwoz_chatbot_info = ChatbotInfo(
    name="Cambridge Tourist Bot",
    description=mwoz_chatbot_description,
    type=ChatbotType.TASK_ORIENTED,
    interaction_method="text-based chat interface",
    task=mwoz_chatbot_task,
    available_languages=["English"],
)

In [26]:
# Benchmark wrapper for MWOZ: bundles all loaded samples, the file holding the rated samples,
# the chatbot metadata, and the task-oriented rating dimensions.
# NOTE(review): "mean" presumably averages the labels of multiple annotators — confirm in RatingBenchmarkDataset.
mwoz_dataset = RatingBenchmarkDataset(
    name="mwoz",
    all_samples=mwoz_samples,
    rated_samples_path=Path("data/mwoz/rated_samples.json"),
    chatbot_info=mwoz_chatbot_info,
    rating_dimensions=DEFAULT_TASK_ORIENTED_DIMENSIONS,
    label_aggregation_method="mean",
)

### JDDC

In [None]:
# Load the samples registered under the "JDDC" key of the USS loader and display how many there are.
jddc_samples = get_uss_dataset("JDDC")
len(jddc_samples)

In [None]:
# Sanity check: show the first and the last loaded JDDC sample, one per line.
print(jddc_samples[0], jddc_samples[-1], sep="\n")

In [29]:
# Metadata describing the JDDC chatbot; passed to the JDDC benchmark dataset as `chatbot_info`.
# This is the only chatbot in the notebook whose available language is Chinese.
# NOTE(review): "Jing Don" looks like a typo for "Jing Dong" (JD.com). The text is left unchanged
# because it is runtime data; confirm whether existing ratings depend on the exact wording before fixing it.
jddc_chatbot_description = "A chatbot for customer service requests for the Chinese E-commerce website Jing Don."
jddc_chatbot_task = "The chatbot should help the user with their customer service requests. This includes answering questions about warranty, returns, deliveries, order status, invoices, etc. and managing orders (changing order information, cancelling orders, etc.)."
jddc_chatbot_info = ChatbotInfo(
    name="E-Commerce Bot",
    description=jddc_chatbot_description,
    type=ChatbotType.TASK_ORIENTED,
    interaction_method="text-based chat interface",
    task=jddc_chatbot_task,
    available_languages=["Chinese"],
)

In [30]:
# Benchmark wrapper for JDDC: bundles all loaded samples, the file holding the rated samples,
# the chatbot metadata, and the task-oriented rating dimensions.
# NOTE(review): "mean" presumably averages the labels of multiple annotators — confirm in RatingBenchmarkDataset.
jddc_dataset = RatingBenchmarkDataset(
    name="jddc",
    all_samples=jddc_samples,
    rated_samples_path=Path("data/jddc/rated_samples.json"),
    chatbot_info=jddc_chatbot_info,
    rating_dimensions=DEFAULT_TASK_ORIENTED_DIMENSIONS,
    label_aggregation_method="mean",
)

## Conversational Datasets
Data source: downloaded data from https://chateval.org/dstc10 and extracted to `../datasets/conversational_dialogue_systems/dstc_10_track_5/`

### FED

In [None]:
# Load the FED dialogue-level samples through the project loader and display how many there are.
fed_samples = get_fed_dial_dataset()
len(fed_samples)

In [None]:
# Sanity check: display the first loaded FED sample.
fed_samples[0]

In [33]:
# Metadata describing the FED chatbot; passed to the FED benchmark dataset as `chatbot_info`.
# Unlike the task-oriented chatbots, this one is of type CONVERSATIONAL and no `task` is given.
fed_chatbot_description = """An open-domain chatbot designed for chit-chat and general conversation. The chatbot can engage in free-form conversation on a wide variety of topics."""
fed_chatbot_info = ChatbotInfo(
    name="Open-Domain Chatbot",
    description=fed_chatbot_description,
    type=ChatbotType.CONVERSATIONAL,
    interaction_method="text-based chat interface",
    available_languages=["English"],
)

In [34]:
# Benchmark wrapper for FED-Dial: bundles all loaded samples, the file holding the rated samples,
# the chatbot metadata, and the DSTC conversational rating dimensions.
# NOTE(review): "mean" presumably averages the labels of multiple annotators — confirm in RatingBenchmarkDataset.
fed_dataset = RatingBenchmarkDataset(
    name="fed_dial",
    all_samples=fed_samples,
    rated_samples_path=Path("data/fed_dial/rated_samples.json"),
    chatbot_info=fed_chatbot_info,
    rating_dimensions=DSTC_CONVERSATIONAL_DIMENSIONS,
    label_aggregation_method="mean",
)

# Set Dataset for Evaluation

In [None]:
# Specify the dataset to evaluate the dialogue rating on
dataset = fed_dataset
# Specify whether to load existing samples and recompute existing annotations
load_existing_samples = True
recompute_existing_ratings = False
# Specify the number of new samples to sample and the maximum number of samples for evaluation
new_samples = 0
# Upper bound on evaluated samples; set to 125 to avoid exceeding the maximum number of samples of the FED-Dial dataset
MAX_SAMPLES_CAP = 125
# Samples held back from the evaluation pool.
# NOTE(review): presumably 3 representative + 3 random few-shot examples — confirm against the dataset class.
FEW_SHOT_RESERVE = 3 + 3
# Clamp at 0: a negative value would make the later `[:max_samples]` slice drop items
# from the end of the subset instead of limiting its size.
max_samples = max(0, min(MAX_SAMPLES_CAP, len(dataset.all_samples) - FEW_SHOT_RESERVE))
print(f"Max samples: {max_samples}")

print(f"Using {dataset.name} dataset ({len(dataset.all_samples)} total samples)")

In [None]:
# Start from the previously rated samples when requested, otherwise from an empty list,
# and display how many were loaded.
rated_samples = dataset.load_rated_samples() if load_existing_samples else []
len(rated_samples)

In [None]:
# Dialogue IDs that must not be drawn as new samples: everything that is already rated
# plus both few-shot pools. Collected once into a set so each membership test is O(1);
# the previous form rebuilt a list of IDs for every candidate sample.
excluded_dialogue_ids = {
    sample.dialogue_id
    for pool in (
        rated_samples,
        dataset.representative_few_shot_samples,
        dataset.random_few_shot_samples,
    )
    for sample in pool
}
# Keep the original dataset order so the seeded draw below stays reproducible.
remaining_dataset = [
    sample for sample in dataset.all_samples
    if sample.dialogue_id not in excluded_dialogue_ids
]
# Raises ValueError if `new_samples` exceeds the number of remaining candidates.
fresh_subset = random.sample(remaining_dataset, new_samples)
print(f"Selected {len(fresh_subset)} new samples for rating.")

In [None]:
# Newly drawn samples come first, followed by the already-rated ones;
# the combined list is then capped at `max_samples` and its size displayed.
subset_for_rating = (fresh_subset + rated_samples)[:max_samples]
len(subset_for_rating)

In [None]:
# Sanity check: show the first sample of the evaluation subset (fails with IndexError if the subset is empty).
print(subset_for_rating[0])

In [None]:
# Display the representative few-shot samples that are excluded from the evaluation pool.
dataset.representative_few_shot_samples

## Show dataset stats

In [None]:
# Summarise the complete dataset, store the summary as text next to the rated-samples file, and echo it.
stats = compute_dataset_statistics(dataset.all_samples)
(dataset.rated_samples_path.parent / "full_dataset_stats.txt").write_text(str(stats))
print(stats)

In [None]:
# Summarise only the evaluation subset, store the summary as text next to the rated-samples file, and echo it.
stats = compute_dataset_statistics(subset_for_rating)
(dataset.rated_samples_path.parent / "subset_dataset_stats.txt").write_text(str(stats))
print(stats)

In [None]:
# Human-rating bar plots for the complete dataset (reference for the subset plots).
plot_human_rating_bar_plots(dataset.all_samples)

In [None]:
# Same plots for the evaluation subset, to compare its rating distribution with the full dataset.
plot_human_rating_bar_plots(subset_for_rating)

# Build Evaluation Variants

In [None]:
from chat_checker.dialogue_rating.rating_dimensions import OVERALL_DIMENSION
from chat_checker.models.rating import RatingDimension


# Configure the models, rating dimensions, few-shot variants, and chatbot info to evaluate on.
# Every dict maps the short key that becomes part of the config key to the value used for that variant.
# The variant keys must not contain "-", because the config key joins its parts with "-".
models: dict[str, str] = {
    "gpt-4o": "gpt-4o-2024-08-06",
    # "o3-mini": "o3-mini-2025-01-31",
    # "gpt-4-turbo": "gpt-4-turbo-2024-04-09"
}

rating_dimensions: dict[str, list[RatingDimension]] = {
    "only_overall_rd": [OVERALL_DIMENSION],
    "all_rd": dataset.rating_dimensions,
}
# Disabled variant: rate only the dimensions matching the chatbot type (plus the overall dimension).
# type_specific_dimensions = [d for d in dataset.rating_dimensions if str(d.type) == str(dataset.chatbot_info.type)]
# if len(type_specific_dimensions) == 0:
#     print("WARNING:No type-specific dimensions found")

# if len(type_specific_dimensions) != 0 and len(type_specific_dimensions + [OVERALL_DIMENSION]) != len(dataset.rating_dimensions):
#     print(f"Adding variant with only type-specific dimensions: {type_specific_dimensions + [OVERALL_DIMENSION]}")
#     rating_dimensions["only_type_specific_rd"] = type_specific_dimensions + [OVERALL_DIMENSION]

few_shot_variants: dict[str, list[EvaluatedDialogue]] = {
    "zero_s": [],
    "random_s": dataset.random_few_shot_samples,
    "representative_s": dataset.representative_few_shot_samples,
}
# The "no_ci" variant deliberately passes no chatbot info, hence the Optional value type.
chatbot_infos: dict[str, Optional[ChatbotInfo]] = {
    "no_ci": None,
    "w_ci": dataset.chatbot_info,
}

# One config per combination (full cartesian product), generated in the same order as
# nested loops over models -> rating dimensions -> few-shot variants -> chatbot info.
eval_configs: list[RatingEvalConfig] = [
    RatingEvalConfig(
        key=f"{model_name}-{rating_key}-{few_shot_key}-{chatbot_key}",
        model=model_version,
        rating_dimensions=dimensions,
        # Few-shot samples are converted per config, so no config shares a list with another.
        few_shot_samples=[d.to_chat_checker_dialogue() for d in few_shot_samples],
        chatbot_info=chatbot_info,
    )
    for model_name, model_version in models.items()
    for rating_key, dimensions in rating_dimensions.items()
    for few_shot_key, few_shot_samples in few_shot_variants.items()
    for chatbot_key, chatbot_info in chatbot_infos.items()
]

print(f"Total number of eval configs: {len(eval_configs)}")

# Run Evaluation

In [None]:
# Rate the evaluation subset once per config and keep the returned correlation
# metrics under the config's key for the analysis cells below.
correlation_results = {}
for eval_config in tqdm(eval_configs, desc="Computing LLM Ratings"):
    print(f"Evaluating {eval_config.key}")
    correlation_results[eval_config.key] = evaluate_ratings(
        subset_for_rating,
        benchmark_dataset=dataset,
        config=eval_config,
        recompute_existing_ratings=recompute_existing_ratings,
    )

In [47]:
# Set the directory to save the plots: the folder that holds the dataset's rated-samples file,
# so plots and rankings are stored per dataset.
plot_dir = dataset.rated_samples_path.parent

## Analyze whether Spearman correlation is better for the overall rating dimension or for the ensembled rating
- Verdict for FED-Dial: no big difference
- Verdict for SGD: no big difference

→ Use the overall dimension for simplicity.

In [None]:
def _print_summary(title: str, values: list, include_median: bool = True) -> None:
    """Print mean, (optionally) median, std, min and max of `values` under `title`.

    A blank line is printed afterwards. An empty list is reported explicitly,
    because `np.min` / `np.max` raise `ValueError` on empty input.
    """
    print(title)
    if not values:
        print("No values available.")
        print()
        return
    print(f"Mean: {np.mean(values):.3f}")
    if include_median:
        print(f"Median: {np.median(values):.3f}")
    print(f"Std: {np.std(values):.3f}")
    print(f"Min: {np.min(values):.3f}")
    print(f"Max: {np.max(values):.3f}")
    print()


# Create lists to store correlations for comparison
overall_correlations = []
ensemble_correlations = []
differences = []

for config_key, correlations in correlation_results.items():
    overall_corr = correlations["spearman_correlation_overall"]
    ensemble_corr = correlations["spearman_correlation_dimension_ensemble"]

    # Configs that rate only the overall dimension have no multi-dimension ensemble to compare with.
    has_ensemble = ensemble_corr is not None and "only_overall_rd" not in config_key

    if overall_corr is not None:
        overall_correlations.append(overall_corr)
    if has_ensemble:
        ensemble_correlations.append(ensemble_corr)
        # Paired difference only for configs that contribute to the ensemble list; previously the
        # only-overall configs were included here although they were excluded from the ensemble
        # statistics. NOTE(review): for those configs the ensemble presumably equals the overall
        # rating, which would bias the mean difference towards zero — confirm.
        if overall_corr is not None:
            differences.append(overall_corr - ensemble_corr)

_print_summary("Overall Rating Correlations:", overall_correlations)
_print_summary("Ensemble Rating Correlations:", ensemble_correlations)
_print_summary("Differences:", differences, include_median=False)

# Create box plot to visualize distribution of correlations
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): newer matplotlib releases rename `labels` to `tick_labels`; kept as-is for
# compatibility with the installed version — switch once the minimum version is known.
ax.boxplot([overall_correlations, ensemble_correlations], labels=['Overall', 'Ensemble'])
ax.set_ylabel('Spearman Correlation Coefficient')
ax.grid(True)

# Save the plot
plot_name = "overall_vs_ensemble_correlations"
fig.savefig(plot_dir / f"{plot_name}.png", dpi=300, bbox_inches='tight')
fig.savefig(plot_dir / f"{plot_name}.pdf", bbox_inches='tight')

plt.show()


## Analyze which configuration performed the best
Verdict for FED-Dial: use few-shot samples (no big difference between representative and random selection); the other parameters make no big difference.

In [None]:
def _ranking_sort_key(item):
    """Sort key for `(config_key, results)` pairs: configs without a correlation rank last.

    With `reverse=True`, `(True, value)` sorts before `(False, 0.0)`, and the values
    themselves are ordered descending.
    """
    correlation = item[1]["spearman_correlation_overall"]
    return (correlation is not None, correlation if correlation is not None else 0.0)


def _format_ranking_line(config_key, results):
    """Format one ranking entry; a missing correlation is shown as 'n/a' instead of raising."""
    correlation = results["spearman_correlation_overall"]
    if correlation is None:
        return f"{config_key}: n/a"
    return f"{config_key}: {correlation:.3f}"


# The overall correlation can be None (the overall-vs-ensemble analysis guards against it too),
# so neither sorting nor formatting may assume a float.
sorted_config_results = sorted(correlation_results.items(), key=_ranking_sort_key, reverse=True)
ranking_lines = [_format_ranking_line(config_key, results) for config_key, results in sorted_config_results]

print("\nBest performing configurations by Spearman correlation:")
for line in ranking_lines:
    print(line)

# Save rankings to text file
with open(plot_dir / "config_rankings.txt", "w") as f:
    f.write("Configuration Rankings by Spearman Correlation:\n")
    f.write("-" * 50 + "\n\n")
    f.writelines(f"{line}\n" for line in ranking_lines)



In [50]:
# Human-readable labels for the variant keys that make up each config key; used for
# the DataFrame columns and plot tick labels in the analysis below.
rating_dimension_names = {
    "only_overall_rd": "only overall RD",
    # Label for the type-specific variant, which is currently commented out where the configs are built.
    "only_type_specific_rd": "only type-specific RDs",
    "all_rd": "all RDs",
}
few_shot_names = {
    "zero_s": "zero shot",
    "random_s": "random 3-shot",
    # The "representative" few-shot variant is labelled "spectrum" in the plots.
    "representative_s": "spectrum 3-shot",
}
chatbot_info_names = {
    "no_ci": "no CI",
    "w_ci": "with CI",
}


In [51]:
import pandas as pd

def create_and_save_correlation_plot(df: pd.DataFrame, x_axis: str, plot_name: str):
    """Plot `spearman_overall` per category of `x_axis`, save the figure, and show it.

    Args:
        df: Results table with a `spearman_overall` column and the column named by `x_axis`.
        x_axis: Name of the categorical column that defines the bars.
        plot_name: File name stem; the figure is written to the module-level
            `plot_dir` as `<plot_name>.png` (300 dpi) and `<plot_name>.pdf`.
    """
    # seaborn aggregates the rows of each category into one bar with an error bar.
    sns.barplot(data=df, x=x_axis, y='spearman_overall', color="dodgerblue", width=0.4, capsize=0.05)
    plt.xlabel(None)
    plt.ylabel('Average Spearman Correlation')
    plt.xticks(rotation=0, ha='center')
    plt.tight_layout()

    # Save plot: raster copy at 300 dpi first, then the vector copy.
    for suffix, extra_kwargs in (("png", {"dpi": 300}), ("pdf", {})):
        plt.savefig(plot_dir / f"{plot_name}.{suffix}", bbox_inches='tight', **extra_kwargs)
    plt.show()

In [None]:

# 1. Load the results into a pandas DataFrame for easier analysis
data = []
for config_key, results in correlation_results.items():
    # Config keys have the form "<model>-<rating dims>-<few shot>-<chatbot info>". Only the model
    # name may itself contain "-" (e.g. "gpt-4o", "gpt-4-turbo"), so split off the last three
    # parts from the right instead of assuming a fixed number of hyphens.
    model, rd, fs, ci = config_key.rsplit('-', 3)
    data.append({
        'config_key': config_key,
        'model': model,
        'rating_dimensions': rating_dimension_names.get(rd, rd),
        'few_shot_samples': few_shot_names.get(fs, fs),
        'chatbot_info': chatbot_info_names.get(ci, ci),
        'spearman_overall': results['spearman_correlation_overall']
    })
df = pd.DataFrame(data)

print("\n--- Analysis ---")
print("\nDataFrame of Correlation Results:")
print(df.head())

# 2. Calculate Baseline Performance
# Baseline = only the overall dimension, zero-shot, no chatbot info. If several models were
# evaluated, the first matching row is reported.
baseline_config = df[(df['rating_dimensions'] == rating_dimension_names.get('only_overall_rd')) &
                    (df['few_shot_samples'] == few_shot_names.get('zero_s')) &
                    (df['chatbot_info'] == chatbot_info_names.get('no_ci'))]
baseline_correlation = baseline_config['spearman_overall'].iloc[0] if not baseline_config.empty else None
if baseline_correlation is not None:
    print(f"\nBaseline Configuration (gpt-4o-only_overall_rd-zero_s-no_ci) Spearman Correlation: {baseline_correlation:.3f}")
else:
    print("\nBaseline Configuration not found.")

# 3. Analyze the Impact of Each Factor Individually
print("\n--- Impact of Rating Dimensions ---")
print(df.groupby('rating_dimensions')['spearman_overall'].mean().sort_values(ascending=False))

print("\n--- Impact of Few-Shot Samples ---")
print(df.groupby('few_shot_samples')['spearman_overall'].mean().sort_values(ascending=False))

print("\n--- Impact of Chatbot Info ---")
print(df.groupby('chatbot_info')['spearman_overall'].mean().sort_values(ascending=False))

# 4. Visualization

print("\n--- Visualizations ---")

# Bar plot of average Spearman correlation by Rating Dimensions
create_and_save_correlation_plot(df, 'rating_dimensions', 'average_spearman_correlation_by_rating_dimensions')

# Bar plot of average Spearman correlation by Few-Shot Samples
create_and_save_correlation_plot(df, 'few_shot_samples', "average_spearman_correlation_by_few_shot_samples")

# Bar plot of average Spearman correlation by Chatbot Info
create_and_save_correlation_plot(df, "chatbot_info", "average_spearman_correlation_by_chatbot_info")