In [1]:
import os
import sys
import pandas as pd
import tiktoken

# The project root is taken to be the parent of the current working directory.
working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, '..'))
print(f"Project root: {project_root}")

# Put the root at the front of sys.path (only once) so project-local packages
# can be imported from this notebook.
root_already_on_path = project_root in sys.path
if not root_already_on_path:
    sys.path.insert(0, project_root)
    
from src.column import Column

Project root: /home/dc/Nextcloud/BachelorsThesis/AQuA_Thesis/Code/echr-qa-benchmark


In [2]:
# Input CSV locations, given relative to project_root (they are joined with it
# when the files are read).
# QA dataset whose question columns are token-counted below.
qa_dataset_path = "data/e_ron_rag_qwen3_doc_echr_topic_query_echr_retrieve_gptoss_notranslation.csv"
# ECHR case paragraphs (one CSV with the paragraph texts).
judgement_paragraphs_path = "data/echr_case_paragraphs.csv"

In [None]:
def _read_project_csv(relative_path):
    """Load the CSV stored at ``relative_path`` below ``project_root``."""
    return pd.read_csv(os.path.join(project_root, relative_path))


# Read both inputs into DataFrames and report their (rows, columns) shapes.
qa_df = _read_project_csv(qa_dataset_path)
judgement_df = _read_project_csv(judgement_paragraphs_path)

print(f"QA dataset shape: {qa_df.shape}")
print(f"Judgement paragraphs shape: {judgement_df.shape}")

QA dataset shape: (552, 13)
Judgement paragraphs shape: (1070605, 5)


In [None]:
# Shared tokenizer for every token count in this notebook: tiktoken's
# "cl100k_base" encoding, created once and reused by count_tokens().
encoder = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count the tokens of ``text`` under the module-level ``encoder``.

    Args:
        text: Value to tokenize. Anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        int: Number of tokens, or 0 when ``text`` is missing (``None``/NaN).
    """
    if pd.isna(text):
        return 0
    # With its default arguments, encode() raises ValueError as soon as the
    # text contains a special-token marker such as "<|endoftext|>". For pure
    # counting, such markers should simply be tokenized as ordinary text, so
    # the special-token check is disabled.
    return len(encoder.encode(str(text), disallowed_special=()))

# Add one token-count column per text column: (frame, source column, new column).
for frame, source_column, token_column in (
    (qa_df, Column.QUESTION, 'question_tokens'),
    (qa_df, Column.QUESTION_TRANSLATION, 'question_translation_tokens'),
    (judgement_df, Column.ECHR_CASE_PARAGRAPH_TEXT, 'paragraph_text_tokens'),
):
    # The new column is written onto the same frame the text was read from.
    frame[token_column] = frame[source_column].apply(count_tokens)

print("Token counting complete!")

Token counting complete!


In [None]:
def summarize_token_counts(token_counts):
    """Return the min, max, mean and median of a Series of token counts.

    Args:
        token_counts: pandas Series holding one token count per row.

    Returns:
        dict: The four statistics under the keys 'min', 'max', 'mean', 'median'.
    """
    return {
        'min': token_counts.min(),
        'max': token_counts.max(),
        'mean': token_counts.mean(),
        'median': token_counts.median(),
    }


def print_token_stats(label, stats):
    """Print the statistics in ``stats`` under a heading built from ``label``.

    Args:
        label: Column name shown in the heading line.
        stats: Dict as returned by ``summarize_token_counts``.
    """
    print(f"{label} token statistics:")
    print(f"  Min: {stats['min']}")
    print(f"  Max: {stats['max']}")
    print(f"  Mean: {stats['mean']:.2f}")
    print(f"  Median: {stats['median']:.1f}")


# Each *_stats dict stays a top-level variable so it remains available after
# this cell, exactly as before; only the duplicated code was factored out.

# statistics for QUESTION column
question_stats = summarize_token_counts(qa_df['question_tokens'])
print_token_stats("QUESTION", question_stats)
print()

# statistics for QUESTION_TRANSLATION column
question_trans_stats = summarize_token_counts(qa_df['question_translation_tokens'])
print_token_stats("QUESTION_TRANSLATION", question_trans_stats)
print()

# statistics for ECHR_CASE_PARAGRAPH_TEXT column
paragraph_stats = summarize_token_counts(judgement_df['paragraph_text_tokens'])
print_token_stats("ECHR_CASE_PARAGRAPH_TEXT", paragraph_stats)

QUESTION token statistics:
  Min: 55
  Max: 379
  Mean: 165.76
  Median: 159.0

QUESTION_TRANSLATION token statistics:
  Min: 29
  Max: 225
  Mean: 89.76
  Median: 86.0

ECHR_CASE_PARAGRAPH_TEXT token statistics:
  Min: 0
  Max: 31272
  Mean: 97.99
  Median: 71.0
