In [15]:
import json
from collections import defaultdict
from pathlib import Path
import pandas as pd

In [4]:
# Load the validation set from disk
with open("../data/validationset.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [5]:
# Number of samples per category, for both dialogue types
for turn_type in ("singleturn", "multiturn"):
    categories = data[turn_type]
    for category in categories:
        num_samples = len(categories[category])
        print(f"{turn_type}/{category}: {num_samples} samples")

singleturn/specific: 92 samples
singleturn/seminar-search: 55 samples
singleturn/handoff: 57 samples
singleturn/out-of-scope: 30 samples
singleturn/bad-intentions: 45 samples
singleturn/abstract: 30 samples
multiturn/specific: 51 samples
multiturn/seminar-search: 33 samples
multiturn/handoff: 30 samples
multiturn/out-of-scope: 24 samples
multiturn/bad-intentions: 11 samples
multiturn/abstract: 10 samples


In [6]:
# Evaluation criteria, indexed as criteria[turn_type][category]
criteria = data["evaluation_criteria"]

# Sample sets per dialogue type (singleturn, multiturn)
samples = {turn_type: data[turn_type] for turn_type in ("singleturn", "multiturn")}

# One evaluation is needed per (sample, criterion) pair, so each category
# contributes (number of samples) * (number of criteria) evaluations.
evaluations_by_category = defaultdict(int)
for turn_type, categories in samples.items():
    for category, sample_list in categories.items():
        criteria_for_category = criteria[turn_type][category]
        evaluations_by_category[f"{turn_type}/{category}"] = (
            len(sample_list) * len(criteria_for_category)
        )

# Grand total over all categories
total_evaluations = sum(evaluations_by_category.values())

total_evaluations, dict(evaluations_by_category)

(1581,
 {'singleturn/specific': 460,
  'singleturn/seminar-search': 220,
  'singleturn/handoff': 114,
  'singleturn/out-of-scope': 60,
  'singleturn/bad-intentions': 90,
  'singleturn/abstract': 90,
  'multiturn/specific': 255,
  'multiturn/seminar-search': 132,
  'multiturn/handoff': 60,
  'multiturn/out-of-scope': 48,
  'multiturn/bad-intentions': 22,
  'multiturn/abstract': 30})

In [11]:
# Read the validation set again, this time through a pathlib handle
file_path = Path("../data/validationset.json")
with file_path.open(mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

def count_tokens(text, tokens_per_word: float = 1.3) -> int:
    """Approximate the number of tokens in ``text``.

    No real tokenizer is used: the whitespace-separated word count is
    multiplied by ``tokens_per_word`` and truncated to an integer.

    Args:
        text: String to measure. Falsy values (``None``, ``""``) count as 0.
        tokens_per_word: Multiplier applied to the word count. Defaults to
            1.3, the factor previously hard-coded in this function.

    Returns:
        The approximate token count.
    """
    if not text:
        return 0
    return int(len(text.split()) * tokens_per_word)

# Evaluation criteria taken from the loaded validation set
# (looked up per mode and category when counting tokens).
evaluation_criteria = data["evaluation_criteria"]

In [17]:
# Estimate the input token count for every (sample, metric) pair of the validation set.


def _field_tokens(sample, key):
    """Approximate token count of the text stored under ``key`` (0 if the key is absent)."""
    return count_tokens(sample.get(key, ""))


def _context_tokens(sample):
    """Approximate token count summed over the ``retrieved_contexts_full`` entries."""
    return sum(count_tokens(ctx) for ctx in sample.get("retrieved_contexts_full", []))


def _history_tokens(sample):
    """Approximate token count summed over the ``content`` of every ``history`` turn."""
    return sum(count_tokens(turn["content"]) for turn in sample.get("history", []))


# Each rule is (substring looked for in the metric name, base estimator,
# whether the dialogue history may be added on top).
# Order matters: the first rule whose substring occurs in the metric name wins.
_METRIC_TOKEN_RULES = [
    ("purpose", lambda s: _field_tokens(s, "query") + _field_tokens(s, "answer"), True),
    ("faithfulness", lambda s: _context_tokens(s) + _field_tokens(s, "answer"), True),
    ("context_relevance", lambda s: _context_tokens(s) + _field_tokens(s, "query"), True),
    ("answer_correctness", lambda s: _field_tokens(s, "answer") + _field_tokens(s, "reference_answer"), True),
    ("answer_relevance", lambda s: _field_tokens(s, "query") + _field_tokens(s, "answer"), True),
    # The handoff estimate only uses the answer; history is never added.
    ("handoff", lambda s: _field_tokens(s, "answer"), False),
    # The answer is counted twice for the pairwise comparison.
    ("quality_pairwise", lambda s: 2 * _field_tokens(s, "answer") + _field_tokens(s, "query"), True),
]


def _estimate_metric_tokens(sample, metric):
    """Return the approximate token count for evaluating ``sample`` on ``metric``.

    Args:
        sample: One validation-set sample (a dict).
        metric: Metric name; matched against the rule table by substring.

    Returns:
        The estimated token count, or 0 when no rule matches the metric name.
    """
    for needle, estimate_base, may_add_history in _METRIC_TOKEN_RULES:
        if needle in metric:
            token_count = estimate_base(sample)
            # NOTE(review): this tests the metric *name* for "multiturn", not the
            # dialogue mode. If multiturn metric names do not contain that
            # substring, the history is never counted — confirm this is intended.
            if may_add_history and "multiturn" in metric:
                token_count += _history_tokens(sample)
            return token_count
    # Unknown metrics are kept in the result with a count of 0.
    return 0


results = []

for mode in ["singleturn", "multiturn"]:
    for category, metrics in evaluation_criteria[mode].items():
        for sample in data[mode][category]:
            sample_id = sample["id"]
            for metric in metrics:
                results.append({
                    "id": sample_id,
                    "mode": mode,
                    "category": category,
                    "metric": metric,
                    "token_count": _estimate_metric_tokens(sample, metric),
                })

# Build the DataFrame once from the collected records and preview it
df = pd.DataFrame(results)

df.head()


Unnamed: 0,id,mode,category,metric,token_count
0,1,singleturn,specific,faithfulness,910
1,1,singleturn,specific,answer_relevance,80
2,1,singleturn,specific,answer_correctness,112
3,1,singleturn,specific,context_relevance,848
4,1,singleturn,specific,quality_pairwise,151


In [20]:
# Add up the estimated input tokens over all (sample, metric) rows
token_counts = df.loc[:, "token_count"]
total_tokens = token_counts.sum()
print(f"Total token count for all samples: {total_tokens}")

Total token count for all samples: 818645
