In [None]:
# Load the paragraph-level dataset (semicolon-separated CSV).
# Forward slashes resolve on both Windows and POSIX. A non-raw literal with
# backslashes would rely on the invalid escape sequences "\d" and "\p",
# which emit a SyntaxWarning on Python 3.12+.
full_PACT = pd.read_csv("../data/paragraphs.csv", sep=';')

# Columns that are never considered as category candidates.
exclude_columns = ["report_namePKO", "paragraphNumber", "paragraph_ID"]
potential_categories = [col for col in full_PACT.columns if col not in exclude_columns]

# Keep every candidate column whose non-missing values look like category
# markers: only booleans, only strings, or at most two distinct values.
all_categories = []
for col in potential_categories:
    unique_values = full_PACT[col].dropna().unique()
    # NOTE: for a column that is entirely NaN, `unique_values` is empty, so the
    # `all(...)` checks are vacuously True and the column is kept.
    only_bools = all(isinstance(x, bool) for x in unique_values)
    # NOTE(review): any all-string column (e.g. free text) passes this check
    # too -- confirm that every such column really is a category marker.
    only_strings = all(isinstance(x, str) for x in unique_values)
    at_most_binary = len(unique_values) <= 2  # Assuming binary categories
    if only_bools or only_strings or at_most_binary:
        all_categories.append(col)

print(f"Detected {len(all_categories)} category columns")

# Convert every detected category column to plain True/False flags.
def _to_flag(value):
    """Return the boolean flag encoded by a single cell value.

    Missing values (NaN/None) count as False and any string counts as True.
    Every other value (real booleans, numeric 0/1 markers) is mapped by its
    truthiness, so an existing ``True`` stays ``True`` instead of being
    turned into ``False`` merely because it is not a string.
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return True
    return bool(value)


for category in all_categories:
    full_PACT[category] = full_PACT[category].map(_to_flag)

# Per-category number of positive rows and their share of all rows.
class_counts = full_PACT[all_categories].sum()
total_samples = len(full_PACT)
class_distribution = class_counts / total_samples

# Most frequent categories first, for better readability.
sorted_categories = class_counts.sort_values(ascending=False).index.tolist()

# Emit the distribution as a markdown table.
print("## Class Distribution in paragraphs.csv (All Categories)\n")
print("| Category | Count | Proportion | Percentage |")
print("|----------|-------|------------|------------|")
for category in sorted_categories:
    count = class_counts[category]
    proportion = class_distribution[category]
    print(f"| {category} | {count} | {proportion:.4f} | {proportion*100:.2f}% |")

# Summary row with the total number of rows in the dataset.
print(f"| **Total Documents** | {total_samples} | - | - |")

# Imbalance ratio = share of the most frequent category divided by the share
# of the least frequent one.
most_frequent = class_distribution.max()
least_frequent = class_distribution.min()
if least_frequent > 0:
    imbalance_ratio = most_frequent / least_frequent
    print(f"\n**Imbalance ratio (most frequent / least frequent)**: {imbalance_ratio:.2f}")
else:
    # The comparison above is False both for a zero share and for NaN (no
    # category columns, or an empty dataset). Dividing anyway would raise a
    # RuntimeWarning and print "inf"/"nan", so report the ratio as undefined
    # while still leaving `imbalance_ratio` defined.
    imbalance_ratio = float("inf") if most_frequent > 0 else float("nan")
    print("\n**Imbalance ratio (most frequent / least frequent)**: "
          "undefined (no category columns, or a category with zero positive samples)")