# Benchmark Meta Inspection
This notebook prints some statistics about the benchmark. By default it reads `50000006_german.jsonl`, which contains the instantiated questions; change the file path to `50000006_german_template.jsonl` if you want statistics about the parameterized question templates instead. The category counts at the end always read the template file.

In [None]:
import json
import os
from collections import Counter

# Resolve the benchmark file relative to the current working directory
# (os.path.abspath("") expands to it).
script_dir = os.path.abspath("")
file_path = os.path.join(script_dir, "../data/evaluation", "50000006_german.jsonl")

# Keep every raw line of the file in memory; parsing happens afterwards.
with open(file_path, mode="r", encoding="utf-8") as handle:
    lines = list(handle)

# Per-record fields collected for the statistics printed further down.
answer_types = []    # value of "answer_type" for each parsed record
dataset_lists = []   # value of outputs["relevant_datasets"] for each parsed record
step_lists = []      # value of "required_operations" for records that have it

for line_number, line in enumerate(lines, start=1):
    if not line.strip():
        # Blank lines (e.g. trailing whitespace at the end of the file) are not records.
        continue
    try:
        data = json.loads(line)
        # Read both required fields before appending anything, so a record that
        # lacks one of them never ends up in one list but not in the other.
        answer_type = data["answer_type"]
        relevant_datasets = data["outputs"]["relevant_datasets"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover records
        # with missing fields or an unexpected shape. Report and keep going.
        print(f"An unexpected error occurred on line {line_number}: {e!r}")
        continue

    answer_types.append(answer_type)
    dataset_lists.append(relevant_datasets)
    # "required_operations" is optional, so step_lists may be shorter than the others.
    if "required_operations" in data:
        step_lists.append(data["required_operations"])

# Histogram: number of relevant datasets -> number of questions with that many.
dataset_counts = {}
for datasets in dataset_lists:
    num_datasets = len(datasets)
    dataset_counts[num_datasets] = dataset_counts.get(num_datasets, 0) + 1

# Negative questions have no relevant dataset; every other question is positive.
negative_count = dataset_counts.get(0, 0)
positive_count = sum(dataset_counts.values()) - negative_count
print(f"{positive_count} positive questions and {negative_count} negative questions.")

print("\nNumber of datasets counts:")
for num_datasets in sorted(dataset_counts):
    count = dataset_counts[num_datasets]
    print(f"{num_datasets} datasets: {count}")

# Collect every dataset referenced by at least one question, without duplicates.
unique_datasets = {dataset for datasets in dataset_lists for dataset in datasets}
print(f"\nNumber of unique datasets: {len(unique_datasets)}")

# Print how often each answer type appears, most frequent first.
# Counter tallies in a single pass and remembers first-seen order, so answer
# types with equal counts are printed in the same order on every run.
answer_type_counts = Counter(answer_types)
print("\nAnswer type counts:")
for answer_type, count in answer_type_counts.most_common():
    print(f"{answer_type}: {count}")

# Histogram: number of required operations -> number of questions needing that many.
operation_counts = {}
for steps in step_lists:
    num_operations = len(steps)
    operation_counts[num_operations] = operation_counts.get(num_operations, 0) + 1

print("\nNumber of operations counts:")
for num_operations in sorted(operation_counts):
    count = operation_counts[num_operations]
    print(f"{num_operations} operations: {count}")

# Flatten the per-question step lists and print how often each step appears,
# most frequent first.
all_steps = [step for steps in step_lists for step in steps]
# Counter tallies in a single pass and remembers first-seen order, so steps
# with equal counts are printed in the same order on every run.
step_counts = Counter(all_steps)
print("\nStep counts:")
for step, count in step_counts.most_common():
    print(f"{step}: {count}")


# Count how often each category appears. The category field is only present in
# the template file, not in the actual benchmark, so the template is read here.
file_path_template = os.path.join(script_dir, "../data/evaluation", "50000006_german_template.jsonl")

with open(file_path_template, "r", encoding="utf-8") as file:
    template_lines = file.readlines()

# Upper bound on how much a single template adds to each of its categories.
# NOTE(review): presumably the maximum number of instantiations generated per
# template -- confirm against the benchmark generation code.
MAX_COUNT_PER_TEMPLATE = 4

category_counts = {}
for line_number, line in enumerate(template_lines, start=1):
    if not line.strip():
        # Blank lines (e.g. trailing whitespace at the end of the file) are not records.
        continue
    try:
        data = json.loads(line)
    except json.JSONDecodeError as e:
        # Report the malformed line and continue instead of aborting the whole cell.
        print(f"An unexpected error occurred on line {line_number}: {e}")
        continue
    if "dataset_categories" in data:
        # A template without "param_options" counts once (the default has one
        # element); otherwise it counts len(param_options), capped at the bound.
        weight = min(MAX_COUNT_PER_TEMPLATE, len(data.get("param_options", [""])))
        for category in data["dataset_categories"]:
            category_counts[category] = category_counts.get(category, 0) + weight

print("\nCategory counts:")
for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{category}: {count}")