In [None]:
# corpus_comparison.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Steps 1-3: Locate, load, and label each corpus summary ---
# Paths to the preprocessed per-corpus CSV summaries; adjust if the files
# live somewhere other than the working directory.
CDLK_CSV = "xmi_basic_info_cdlk.csv"
KLP1_CSV = "xmi_basic_info_klp1.csv"

# Each frame is tagged with a "corpus_name" column as it is loaded so that
# rows remain attributable to their source after concatenation.
df_cdlk = pd.read_csv(CDLK_CSV).assign(corpus_name="CDLK")
df_klp1 = pd.read_csv(KLP1_CSV).assign(corpus_name="KLP1")

# --- Step 4: Stack both corpora into a single frame for comparison ---
# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df_combined = pd.concat([df_cdlk, df_klp1], ignore_index=True)

# --- Step 5: Descriptive statistics per corpus ---
# The three size measures all receive the same set of summary statistics.
spread_stats = ["mean", "median", "std", "min", "max"]
agg_spec = {
    measure: list(spread_stats)
    for measure in ("doc_text_length", "token_count", "sentence_count")
}
agg_spec["filename"] = "count"  # non-null filenames, i.e. files per corpus

grouped = df_combined.groupby("corpus_name").agg(agg_spec)

# The aggregation yields two-level (column, statistic) headers; flatten them
# into single "column_statistic" names for easier reading.
grouped.columns = [f"{field}_{stat}" for field, stat in grouped.columns]

# Move corpus_name from the index back into an ordinary column.
summary_stats = grouped.reset_index()

print("Summary statistics by corpus:", summary_stats, sep="\n")

# --- Step 6: Side-by-side boxplots, one panel per size measure ---
sns.set(style="whitegrid")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column to plot, panel title, y-axis label) for each of the three panels,
# listed left to right.
panels = [
    ("doc_text_length", "Document Text Length by Corpus", "Length"),
    ("token_count", "Token Count by Corpus", "Tokens"),
    ("sentence_count", "Sentence Count by Corpus", "Sentences"),
]

for ax, (column, title, ylabel) in zip(axes, panels):
    sns.boxplot(data=df_combined, x="corpus_name", y=column, ax=ax)
    ax.set_title(title)
    # The corpus names on the tick labels are self-explanatory, so the
    # x-axis label is blanked.
    ax.set_xlabel("")
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()
