## 0. Setup

Install required packages.

In [None]:
pip install pandas numpy torch transformers scikit-learn matplotlib seaborn tqdm umap-learn

## 1. Import Required Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import warnings
from collections import Counter, defaultdict
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Widen pandas' display limits: show every column and up to 200 characters per cell.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 200),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

## 2. Configure Chinese Font Support

In [None]:
from matplotlib import font_manager
import matplotlib

# Check if running on Google Colab
try:
    import google.colab  # noqa: F401 -- imported only to detect the Colab runtime
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Install fonts if on Google Colab
if IN_COLAB:
    print("Google Colab detected - installing Chinese fonts...")
    os.system('apt-get update -qq')
    install_status = os.system('apt-get install -y fonts-noto-cjk fonts-wqy-zenhei -qq')

    # Drop matplotlib's on-disk cache so a later kernel restart rescans the
    # system fonts instead of reusing a font list built before the install.
    import shutil
    cache_dir = matplotlib.get_cachedir()
    try:
        shutil.rmtree(cache_dir)
    except FileNotFoundError:
        pass  # nothing cached yet
    except OSError as exc:
        print(f"⚠️  Could not clear matplotlib cache at {cache_dir}: {exc}")
    os.makedirs(cache_dir, exist_ok=True)

    # Register the freshly installed fonts with the *live* font manager.
    # The global manager was built when matplotlib was first imported, i.e.
    # before the fonts existed, so without this step they would not appear in
    # `fontManager.ttflist` below.
    known_font_files = {entry.fname for entry in font_manager.fontManager.ttflist}
    for font_path in font_manager.findSystemFonts():
        if font_path in known_font_files:
            continue
        try:
            font_manager.fontManager.addfont(font_path)
        except (OSError, RuntimeError) as exc:
            # One unreadable font file must not abort the whole setup.
            print(f"⚠️  Skipping unreadable font {font_path}: {exc}")

    if install_status == 0:
        print("✅ Chinese fonts installed")
    else:
        print(f"⚠️  Font installation exited with status {install_status}")

# List of Chinese fonts to try, in order of preference
chinese_fonts = [
    'Noto Sans CJK SC',
    'Noto Sans CJK JP',
    'WenQuanYi Zen Hei',
    'PingFang SC',
    'Microsoft YaHei',
    'SimHei',
    'Heiti SC',
    'STHeiti',
    'Droid Sans Fallback'
]

# Get list of available fonts
available_fonts = [f.name for f in font_manager.fontManager.ttflist]
available_font_names = set(available_fonts)

# Find first available Chinese font (None when no candidate is installed)
selected_font = next(
    (font for font in chinese_fonts if font in available_font_names),
    None,
)

# Render the minus sign with an ASCII hyphen in either case.
matplotlib.rcParams['axes.unicode_minus'] = False

if selected_font:
    matplotlib.rcParams['font.sans-serif'] = [selected_font]
    print(f"✅ Chinese font configured: {selected_font}")
else:
    matplotlib.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS']
    print("⚠️  Using fallback font configuration")

## 3. Load SIKU-BERT Model

In [None]:
# Model configuration - using local copy
model_path = "./sikubert"

print(f"Loading SIKU-BERT tokenizer and model from local directory: {model_path}")
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

print("Loading SIKU-BERT model...")

# Configure device for optimal performance (query CUDA once and reuse the answer)
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
if use_cuda:
    print(f"🚀 GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("⚠️  GPU not available, using CPU (this will be slower)")

# Half precision on GPU, full precision on CPU.
model = AutoModel.from_pretrained(
    model_path,
    dtype=torch.float16 if use_cuda else torch.float32,
    trust_remote_code=True
)

# Move model to the selected device and switch to inference mode
model = model.to(device)
model.eval()

print(f"✅ SIKU-BERT model loaded successfully on {device}!")

## 4. Define Helper Functions

In [None]:
def get_embeddings_batch(texts, batch_size=64):
    """
    Generate SIKU-BERT embeddings for multiple texts in batches.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Each text
    is truncated to 512 tokens and represented by the attention-masked mean
    of the model's last hidden state.

    Args:
        texts: List of text strings
        batch_size: Number of texts to process at once

    Returns:
        List of embeddings (one float32 numpy array per input text, in
        input order)
    """
    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            token_states = model(**inputs).last_hidden_state

        # Pool in float32: the model may run in float16, and summing up to
        # 512 half-precision vectors loses precision that matters for the
        # tight cosine-distance threshold used when clustering.
        mask = inputs['attention_mask'].unsqueeze(-1).to(torch.float32)
        summed = (token_states.to(torch.float32) * mask).sum(dim=1)
        # Guard the divisor so a fully masked row cannot produce NaN/inf.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / token_counts

        all_embeddings.extend(pooled.cpu().numpy())

    return all_embeddings


def segment_into_passages(text, min_length=10):
    """
    Segment text into meaningful passages (sentences).

    The text is split after each full stop (。) or semicolon (；); the
    delimiter stays attached to the sentence it ends. Text after the last
    delimiter is treated as one more passage. Each passage is stripped of
    surrounding whitespace, and passages shorter than ``min_length`` (or
    empty) are dropped.

    Args:
        text: Punctuated text string
        min_length: Minimum passage length in characters (after stripping)

    Returns:
        List of (passage_text, start_pos, end_pos) tuples, where
        ``text[start_pos:end_pos] == passage_text``
    """
    if not text:
        return []

    # With one capturing group, re.split yields
    # [body, delimiter, body, delimiter, ..., tail] -- always an odd length.
    pieces = re.split(r'([。；])', text)

    passages = []
    cursor = 0  # offset in `text` where the current piece begins

    for i in range(0, len(pieces), 2):
        delimiter = pieces[i + 1] if i + 1 < len(pieces) else ''
        raw = pieces[i] + delimiter
        raw_start = cursor
        cursor += len(raw)

        passage_text = raw.strip()
        if not passage_text or len(passage_text) < min_length:
            continue

        # Report the span of the stripped text, so the positions always
        # index exactly the passage that is returned.
        start_pos = raw_start + (len(raw) - len(raw.lstrip()))
        end_pos = start_pos + len(passage_text)
        passages.append((passage_text, start_pos, end_pos))

    return passages


def calculate_cluster_statistics(cluster_df):
    """
    Summarize one cluster so its validity can be judged.

    Args:
        cluster_df: DataFrame holding the passages of a single cluster; the
            columns 'edict_title', 'passage_length' and 'embedding' are read

    Returns:
        Dictionary with the keys 'n_passages', 'n_edicts', 'avg_length',
        'avg_similarity' and 'min_similarity'
    """
    # How many distinct edicts contribute to the cluster
    n_edicts = cluster_df['edict_title'].nunique()

    # Mean passage length in characters
    avg_length = cluster_df['passage_length'].mean()

    # Pairwise cosine similarity between the cluster's passages
    vectors = np.array(cluster_df['embedding'].tolist())
    if len(vectors) <= 1:
        # A lone passage is trivially identical to itself
        avg_similarity = 1.0
        min_similarity = 1.0
    else:
        pairwise = cosine_similarity(vectors)
        # Strict upper triangle: every unordered pair once, no self-pairs
        rows, cols = np.triu_indices_from(pairwise, k=1)
        pair_scores = pairwise[rows, cols]
        if len(pair_scores) > 0:
            avg_similarity = pair_scores.mean()
            min_similarity = pair_scores.min()
        else:
            avg_similarity = 0
            min_similarity = 0

    statistics = {}
    statistics['n_passages'] = len(cluster_df)
    statistics['n_edicts'] = n_edicts
    statistics['avg_length'] = avg_length
    statistics['avg_similarity'] = avg_similarity
    statistics['min_similarity'] = min_similarity
    return statistics


def find_representative_passage(cluster_df):
    """
    Pick the passage whose embedding is closest to the cluster centroid.

    Args:
        cluster_df: DataFrame holding the passages of a single cluster; the
            'embedding' column is read

    Returns:
        Index label (from ``cluster_df.index``) of the passage with the
        highest cosine similarity to the mean embedding
    """
    vectors = np.array(cluster_df['embedding'].tolist())

    # Centroid as a single-row matrix, the shape cosine_similarity expects
    center = vectors.mean(axis=0)[np.newaxis, :]

    scores = cosine_similarity(vectors, center).ravel()
    best_position = np.argmax(scores)

    return cluster_df.index[best_position]


def generate_cluster_label(cluster_df, max_words=8):
    """
    Generate a short label for a cluster from its most central passage.

    The representative passage (see ``find_representative_passage``) has its
    full stops (。), semicolons (；) and commas (，) removed and is then
    truncated to ``max_words * 2`` characters, with '...' appended when
    truncation happens.

    Args:
        cluster_df: DataFrame containing passages in a cluster; the
            'passage_text' column is read
        max_words: Label budget; the label keeps at most ``max_words * 2``
            characters before the ellipsis

    Returns:
        String label for the cluster
    """
    # Get the representative (most central) passage
    rep_idx = find_representative_passage(cluster_df)
    rep_text = cluster_df.loc[rep_idx, 'passage_text']

    # Drop sentence punctuation in one pass
    punctuation_table = str.maketrans('', '', '。；，')
    label_text = rep_text.translate(punctuation_table).strip()

    # Truncate if too long (rough estimate for Chinese characters)
    max_chars = max_words * 2
    if len(label_text) > max_chars:
        label_text = label_text[:max_chars] + '...'

    return label_text

print("Helper functions defined successfully!")

## 5. Load and Prepare Data

In [None]:
# Configuration
MIN_PASSAGE_LENGTH = 10  # Minimum characters per passage
MIN_EDICTS_PER_CLUSTER = 2  # Cluster must appear in at least 2 edicts (true cross-document patterns)
EDICT_TYPE = '即位赦'  # Specify the edict type to analyze (change this to analyze different types)

print("Loading edicts from extracted_edicts_punc.csv...")
df_all = pd.read_csv('extracted_edicts_punc.csv', encoding='utf-8-sig')
print(f"Total rows in dataset: {len(df_all)}")

# Build each filter once; they are reused for the diagnostics and the selection.
is_selected_type = df_all['document_type'] == EDICT_TYPE
has_punctuated_text = df_all['text_contents_punctuated'].notna()

# Debug: Check for potential filtering issues
print("\n🔍 Data Quality Check:")
print(f"   Rows with document_type == '{EDICT_TYPE}': {int(is_selected_type.sum())}")
print(f"   Rows with non-null text_contents_punctuated: {int(has_punctuated_text.sum())}")
print(f"   Rows matching both conditions: {int((is_selected_type & has_punctuated_text).sum())}")

# Show available document types
print("\nAvailable document types:")
doc_types = df_all['document_type'].value_counts()
for doc_type, count in doc_types.items():
    marker = " ← SELECTED" if doc_type == EDICT_TYPE else ""
    print(f"  {doc_type}: {count} edicts{marker}")

# Filter for specified edict type with punctuated text
df_edicts = df_all[is_selected_type & has_punctuated_text].copy()
df_edicts.reset_index(drop=True, inplace=True)

# Fail here with a clear message; later cells divide by len(df_edicts).
if df_edicts.empty:
    raise ValueError(
        f"No '{EDICT_TYPE}' edicts with punctuated text were found; "
        "set EDICT_TYPE to one of the document types listed above."
    )

print(f"\n✅ Loaded {len(df_edicts)} '{EDICT_TYPE}' edicts with punctuated text")
print("\nEdicts to analyze:")
for position, title in enumerate(df_edicts['text_title'], 1):
    print(f"  {position}. {title}")

## 6. Segment All Edicts into Passages

Split each edict into sentence-level passages. Embeddings for these passages are generated in the next section.

In [None]:
print(f"Segmenting {len(df_edicts)} '{EDICT_TYPE}' edicts into passages...")
print(f"Minimum passage length: {MIN_PASSAGE_LENGTH} characters\n")

all_passages = []

# Collect all passages from all edicts
for _, row in tqdm(df_edicts.iterrows(), total=len(df_edicts), desc="Segmenting edicts"):
    edict_title = row['text_title']
    edict_type = row['document_type']

    # Segment into passages
    passages = segment_into_passages(row['text_contents_punctuated'],
                                     min_length=MIN_PASSAGE_LENGTH)

    all_passages.extend(
        {
            'edict_title': edict_title,
            'edict_type': edict_type,
            'passage_text': passage_text,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'passage_length': len(passage_text),
            'embedding': None  # Will be filled in batch
        }
        for passage_text, start_pos, end_pos in passages
    )

# Without passages the statistics below (and every later cell) cannot run.
if not all_passages:
    raise ValueError(
        f"No passages of at least {MIN_PASSAGE_LENGTH} characters were extracted "
        f"from {len(df_edicts)} edict(s); lower MIN_PASSAGE_LENGTH or check the input text."
    )

print(f"\n✅ Extracted {len(all_passages)} passages")
print(f"   Average passages per edict: {len(all_passages)/len(df_edicts):.1f}")

# Convert to DataFrame
df_passages = pd.DataFrame(all_passages)

passage_lengths = df_passages['passage_length']
print("\nPassage length statistics:")
print(f"  Mean: {passage_lengths.mean():.1f} characters")
print(f"  Median: {passage_lengths.median():.0f} characters")
print(f"  Min: {passage_lengths.min()} characters")
print(f"  Max: {passage_lengths.max()} characters")

## 7. Generate Embeddings

Create SIKU-BERT embeddings for all passages using batch processing.

In [None]:
print(f"Generating SIKU-BERT embeddings for {len(df_passages)} passages...")

# Determine batch size and processing mode based on device
if torch.cuda.is_available():
    batch_size = 64
    print("Using GPU-accelerated batch processing...")
else:
    batch_size = 16
    print("Using CPU batch processing (this will be slower)...")

# Generate embeddings in batches with progress bar
passage_texts = df_passages['passage_text'].tolist()
n_batches = (len(passage_texts) + batch_size - 1) // batch_size
print(f"Processing {len(passage_texts)} passages in {n_batches} batches...\n")

try:
    all_embeddings = []

    # One helper call per progress-bar step: the tokenizing, forward pass and
    # mean pooling live in get_embeddings_batch instead of being copied here.
    for start in tqdm(range(0, len(passage_texts), batch_size), desc="Generating embeddings"):
        batch_texts = passage_texts[start:start + batch_size]
        all_embeddings.extend(get_embeddings_batch(batch_texts, batch_size=batch_size))

except Exception as e:
    print(f"⚠️  Error during batch processing: {e}")
    if torch.cuda.is_available():
        print("Please check your GPU memory and reduce batch_size if needed.")
    else:
        print("Running on CPU. Consider reducing batch_size if you encounter memory issues.")
    raise

embeddings = all_embeddings

# Assign embeddings back to DataFrame
df_passages['embedding'] = embeddings

print(f"✅ Generated embeddings for {len(embeddings)} passages")
if embeddings:
    print(f"   Embedding dimension: {len(embeddings[0])}")

print("\nEmbedding generation complete!")

## 8. Perform Clustering

Apply DBSCAN clustering to identify semantically similar passage groups.

In [None]:
print("Performing density-based clustering (DBSCAN)...")
print("This will identify groups of semantically similar passages.\n")

# Configuration
EPS = 0.05  # Maximum distance between passages in a cluster (STRICT - only very similar passages)
MIN_SAMPLES = 2  # Minimum passages to form a cluster (2 = at least a pair of similar passages)

print("Clustering parameters:")
print(f"  eps (max distance): {EPS}")
print(f"  min_samples: {MIN_SAMPLES}")
print(f"  similarity_threshold: {1 - EPS:.3f} (STRICT - formulaic expressions only)\n")

# Stack the per-passage vectors into one 2-D array for scikit-learn
embeddings = np.array(df_passages['embedding'].tolist())

# Apply DBSCAN clustering
# Using cosine distance (1 - cosine_similarity)
print("Running DBSCAN clustering...")

# DBSCAN runs on the CPU (n_jobs=-1 uses all cores) even when a GPU was
# used for the embeddings.
if torch.cuda.is_available():
    print("   (Clustering on CPU with parallel processing)")
clustering = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric='cosine', n_jobs=-1)
cluster_labels = clustering.fit_predict(embeddings)

# Add cluster labels to DataFrame
df_passages['cluster'] = cluster_labels

# Count clusters; DBSCAN marks noise points with the label -1
is_noise = cluster_labels == -1
n_noise = int(is_noise.sum())
n_clusters = int(np.unique(cluster_labels[~is_noise]).size)
n_clustered = len(df_passages) - n_noise

print("\n✅ Clustering complete!")
print(f"   Identified clusters: {n_clusters}")
print(f"   Cluster rate: {n_clustered / len(df_passages):.1%}")
print(f"   Passages in clusters: {n_clustered}")
print(f"   Noise/unique passages: {n_noise}")

## 9. Filter Clusters by Cross-Document Frequency

Keep only clusters that appear in multiple edicts (high-frequency patterns).

In [None]:
print("Filtering clusters by cross-document frequency...")
print(f"Minimum edicts per cluster: {MIN_EDICTS_PER_CLUSTER}\n")

# Analyze each cluster (label -1 is DBSCAN noise and is skipped)
cluster_stats = []

clustered_passages = df_passages[df_passages['cluster'] != -1]
for cluster_id, cluster_df in clustered_passages.groupby('cluster'):
    stats = calculate_cluster_statistics(cluster_df)
    stats['cluster_id'] = cluster_id
    cluster_stats.append(stats)

# Name the columns explicitly so the frame has them even when DBSCAN found
# no clusters at all (otherwise the filter below raises KeyError).
df_cluster_stats = pd.DataFrame(
    cluster_stats,
    columns=['n_passages', 'n_edicts', 'avg_length',
             'avg_similarity', 'min_similarity', 'cluster_id'],
)

# Filter clusters by minimum edict count
valid_clusters = df_cluster_stats[df_cluster_stats['n_edicts'] >= MIN_EDICTS_PER_CLUSTER]

print("Cluster filtering results:")
print(f"  Initial clusters: {len(df_cluster_stats)}")
print(f"  Valid clusters (≥{MIN_EDICTS_PER_CLUSTER} edicts): {len(valid_clusters)}")
print(f"  Filtered out: {len(df_cluster_stats) - len(valid_clusters)}")

# Mark invalid clusters as noise
valid_cluster_ids = set(valid_clusters['cluster_id'].tolist())
df_passages.loc[~df_passages['cluster'].isin(valid_cluster_ids), 'cluster'] = -1

# Recount
n_valid_clusters = len(valid_cluster_ids)
n_passages_in_clusters = len(df_passages[df_passages['cluster'] != -1])

print("\n✅ Final results:")
print(f"   Valid clusters: {n_valid_clusters}")
print(f"   Passages in valid clusters: {n_passages_in_clusters}")
if n_valid_clusters > 0:
    print(f"   Average passages per cluster: {n_passages_in_clusters / n_valid_clusters:.1f}")
else:
    print("⚠️  No cluster spans enough edicts; consider raising EPS or lowering MIN_EDICTS_PER_CLUSTER.")

# Display top clusters by size
if len(valid_clusters) > 0:
    print("\nTop 10 clusters by passage count:")
    top_clusters = valid_clusters.nlargest(10, 'n_passages')[
        ['cluster_id', 'n_passages', 'n_edicts', 'avg_similarity', 'avg_length']
    ]
    for _, row in top_clusters.iterrows():
        print(f"  Cluster {int(row['cluster_id']):3d}: {int(row['n_passages']):3d} passages, "
              f"{int(row['n_edicts']):2d} edicts, "
              f"sim={row['avg_similarity']:.3f}, "
              f"len={row['avg_length']:.0f} chars")

## 10. Analyze and Display Cluster Examples

Summarize each valid cluster (label, statistics, representative passage) and list every passage instance it contains.

In [None]:
print("Analyzing formulaic expressions and listing ALL instances...\n")

cluster_analysis = []

for cluster_id in sorted(valid_cluster_ids):
    cluster_df = df_passages[df_passages['cluster'] == cluster_id]

    # Get statistics
    stats = calculate_cluster_statistics(cluster_df)

    # Generate meaningful label
    label = generate_cluster_label(cluster_df)

    # Find representative passage
    rep_idx = find_representative_passage(cluster_df)
    rep_passage = df_passages.loc[rep_idx]

    # Get edict type distribution (most frequent first)
    type_counts = cluster_df['edict_type'].value_counts()

    cluster_analysis.append({
        'cluster_id': cluster_id,
        'formula_label': label,
        'n_passages': stats['n_passages'],
        'n_edicts': stats['n_edicts'],
        'avg_similarity': stats['avg_similarity'],
        'min_similarity': stats['min_similarity'],
        'representative_text': rep_passage['passage_text'],
        'representative_edict': rep_passage['edict_title'],
        'representative_type': rep_passage['edict_type'],
        'avg_length': stats['avg_length'],
        'primary_edict_types': ', '.join(type_counts.head(3).index.tolist())
    })

# Explicit columns keep the frame usable (sortable, exportable) even when
# there are no valid clusters.
df_cluster_analysis = pd.DataFrame(
    cluster_analysis,
    columns=['cluster_id', 'formula_label', 'n_passages', 'n_edicts',
             'avg_similarity', 'min_similarity', 'representative_text',
             'representative_edict', 'representative_type', 'avg_length',
             'primary_edict_types'],
)

# Sort by passage count (descending); renumber the rows so that the index
# reflects the rank order.
df_cluster_analysis = (
    df_cluster_analysis
    .sort_values('n_passages', ascending=False)
    .reset_index(drop=True)
)

banner = "=" * 100

print(f"✅ Analysis complete for {len(df_cluster_analysis)} formulaic expression clusters\n")
print(banner)
print(f"FORMULAIC EXPRESSIONS IN '{EDICT_TYPE}' EDICTS")
print(banner)
print(f"\nFound {len(df_cluster_analysis)} distinct formulaic patterns\n")

for rank, (_, row) in enumerate(df_cluster_analysis.iterrows(), 1):
    print(f"\n{banner}")
    print(f"Formula #{rank}: [{row['formula_label']}]")
    print(banner)
    print(f"Occurrences: {row['n_passages']} instances across {row['n_edicts']} edict(s)")
    print(f"Similarity: avg={row['avg_similarity']:.3f}, min={row['min_similarity']:.3f}")
    print(f"Avg length: {row['avg_length']:.0f} characters")

    # Show ALL instances of this formula
    cluster_df = df_passages[df_passages['cluster'] == row['cluster_id']].sort_values('edict_title')

    print(f"\n📝 ALL {len(cluster_df)} INSTANCES:")
    print("-" * 100)

    for i, (_, instance) in enumerate(cluster_df.iterrows(), 1):
        print(f"\n  Instance {i}/{len(cluster_df)} - From: {instance['edict_title']}")
        print(f"  Position: chars {instance['start_pos']}-{instance['end_pos']}")
        print(f"  Text: {instance['passage_text']}")

print(f"\n{banner}")
print(f"\nTotal: {len(df_cluster_analysis)} formulaic expressions identified")

print(f"Total instances: {int(df_cluster_analysis['n_passages'].sum())} occurrences across all formulas")
print(f"{banner}\n")

## 11. Visualize Cluster Distribution

In [None]:
print("Creating visualizations...\n")
print(f"Device used for analysis: {device}")
print()

if df_cluster_analysis.empty:
    # Histogram bins and bar positions are undefined without clusters.
    print("⚠️  No valid clusters to visualize - skipping plots.")
else:
    # 1. Cluster size distribution
    print("Generating cluster size distribution...")

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # One integer-wide bin per possible edict count
    edict_bins = range(MIN_EDICTS_PER_CLUSTER,
                       int(df_cluster_analysis['n_edicts'].max()) + 2)

    # (axis, column, bins, color, title, x-label) for each histogram
    histogram_specs = [
        # Cluster sizes (passage count)
        (axes[0, 0], 'n_passages', 30, 'steelblue',
         '分布：每个聚类的段落数量', '段落数量'),
        # Cross-document frequency
        (axes[0, 1], 'n_edicts', edict_bins, 'green',
         '分布：每个聚类的文档数量', '出现的文档数量'),
        # Intra-cluster similarity
        (axes[1, 0], 'avg_similarity', 30, 'purple',
         '分布：聚类内平均相似度', '平均余弦相似度'),
        # Passage length distribution
        (axes[1, 1], 'avg_length', 30, 'orange',
         '分布：聚类内平均段落长度', '平均字符数'),
    ]

    for hist_ax, column, bins, color, title, xlabel in histogram_specs:
        hist_ax.hist(df_cluster_analysis[column], bins=bins, color=color,
                     alpha=0.7, edgecolor='black')
        hist_ax.set_title(title, fontsize=12)
        hist_ax.set_xlabel(xlabel, fontsize=10)
        hist_ax.set_ylabel('聚类数量', fontsize=10)
        hist_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig(f'cluster_distributions_{EDICT_TYPE}.png', dpi=300, bbox_inches='tight')
    print(f"✅ Distribution plots saved: cluster_distributions_{EDICT_TYPE}.png")
    plt.show()

    # 2. Top clusters by frequency
    print("\nGenerating top clusters visualization...")

    fig, ax = plt.subplots(figsize=(12, 8))

    top_n = min(20, len(df_cluster_analysis))
    top_clusters = df_cluster_analysis.head(top_n)

    ax.barh(range(top_n), top_clusters['n_passages'], color='steelblue', alpha=0.7)
    ax.set_yticks(range(top_n))
    ax.set_yticklabels([f"聚类 {cid}" for cid in top_clusters['cluster_id']], fontsize=9)
    ax.set_xlabel('段落数量', fontsize=12)
    ax.set_title(f'前{top_n}个最常见的语义聚类', fontsize=14, pad=20)
    ax.invert_yaxis()  # largest cluster at the top

    # Add edict count as text just past the end of each bar
    for i, (_, row) in enumerate(top_clusters.iterrows()):
        ax.text(row['n_passages'] + 0.5, i, f"{row['n_edicts']}个文档",
                va='center', fontsize=8, color='darkgreen')

    plt.tight_layout()
    plt.savefig(f'top_clusters_{EDICT_TYPE}.png', dpi=300, bbox_inches='tight')
    print(f"✅ Top clusters visualization saved: top_clusters_{EDICT_TYPE}.png")
    plt.show()

## 12. Export Results

In [None]:
print("Exporting results to CSV files...\n")

# 1. Export cluster analysis summary
summary_file = f'cluster_summary_{EDICT_TYPE}.csv'
df_cluster_analysis.to_csv(summary_file, index=False, encoding='utf-8-sig')
print(f"✅ Cluster summary saved: {summary_file}")

# 2. Export all passages with cluster assignments (embeddings are left out)
passages_file = f'passages_with_clusters_{EDICT_TYPE}.csv'
df_passages_export = df_passages.drop(columns=['embedding'])
df_passages_export.to_csv(passages_file, index=False, encoding='utf-8-sig')
print(f"✅ All passages with clusters saved: {passages_file}")

# 3. Export detailed cluster contents, one CSV per valid cluster
print("\nExporting detailed cluster contents...")

export_columns = ['edict_title', 'edict_type', 'passage_text',
                  'passage_length', 'start_pos', 'end_pos']
for cluster_id in sorted(valid_cluster_ids):
    cluster_file = f'cluster_{cluster_id}_passages_{EDICT_TYPE}.csv'
    cluster_export = df_passages.loc[df_passages['cluster'] == cluster_id, export_columns]
    cluster_export.to_csv(cluster_file, index=False, encoding='utf-8-sig')

print(f"✅ Exported {len(valid_cluster_ids)} individual cluster files: cluster_X_passages_{EDICT_TYPE}.csv")

# 4. Create a comprehensive report
report_file = f'clustering_report_{EDICT_TYPE}.txt'
rule = "=" * 100
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(rule + "\n")
    f.write(f"SEMANTIC CLUSTERING REPORT - {EDICT_TYPE} Edicts\n")
    f.write(rule + "\n\n")

    f.write("Dataset Overview:\n")
    f.write(f"  Total edicts analyzed: {len(df_edicts)}\n")
    f.write(f"  Total passages extracted: {len(df_passages)}\n")
    f.write(f"  Average passages per edict: {len(df_passages)/len(df_edicts):.1f}\n\n")

    f.write("Clustering Parameters:\n")
    f.write("  Algorithm: DBSCAN\n")
    f.write("  Distance metric: Cosine\n")
    f.write(f"  eps (max distance): {EPS}\n")
    f.write(f"  min_samples: {MIN_SAMPLES}\n")
    f.write(f"  Min edicts per cluster: {MIN_EDICTS_PER_CLUSTER}\n\n")

    f.write("Results:\n")
    f.write(f"  Valid clusters identified: {len(valid_cluster_ids)}\n")
    f.write(f"  Passages in clusters: {n_passages_in_clusters} ({n_passages_in_clusters/len(df_passages):.1%})\n")
    f.write(f"  Unique passages (noise): {len(df_passages) - n_passages_in_clusters}\n\n")

    f.write(rule + "\n")
    f.write("TOP 20 RECURRING PASSAGE CLUSTERS\n")
    f.write(rule + "\n\n")

    # Rank comes from the position in the sorted frame, not from the row's
    # index label (which need not match the sort order).
    for rank, (_, row) in enumerate(df_cluster_analysis.head(20).iterrows(), 1):
        f.write(f"\nCluster {row['cluster_id']} - Rank #{rank}\n")
        f.write(f"{'-'*100}\n")
        f.write(f"Frequency: {row['n_passages']} passages across {row['n_edicts']} edicts\n")
        f.write(f"Similarity: avg={row['avg_similarity']:.3f}, min={row['min_similarity']:.3f}\n")
        f.write(f"Avg length: {row['avg_length']:.0f} characters\n")
        f.write(f"Primary edict types: {row['primary_edict_types']}\n")
        f.write("\nRepresentative passage:\n")
        f.write(f"  {row['representative_text']}\n")
        f.write(f"  (from: {row['representative_edict']})\n\n")

print(f"✅ Comprehensive report saved: {report_file}")

print(f"\n{'='*80}")
print("Summary of Output Files:")
print(f"1. {summary_file} - Cluster statistics and analysis")
print(f"2. {passages_file} - All passages with cluster assignments")
print(f"3. cluster_X_passages_{EDICT_TYPE}.csv - Detailed contents of each cluster ({len(valid_cluster_ids)} files)")
print(f"4. {report_file} - Human-readable comprehensive report")
print(f"5. cluster_distributions_{EDICT_TYPE}.png - Statistical visualizations")
print(f"6. top_clusters_{EDICT_TYPE}.png - Top clusters by frequency")
print("="*80)

## 13. Statistical Summary

Final overview of the clustering analysis.

In [None]:
print("SEMANTIC CLUSTERING ANALYSIS - FINAL SUMMARY")
print(f"Edict Type: {EDICT_TYPE}")
print("="*100)

print("\n📊 Dataset:")
print(f"   Total '{EDICT_TYPE}' edicts: {len(df_edicts)}")
print(f"   Total passages: {len(df_passages)}")
print(f"   Average passages per edict: {len(df_passages)/len(df_edicts):.1f}")

print("\n🔍 Clustering Results:")
print(f"   Valid clusters: {len(valid_cluster_ids)}")
print(f"   Passages in clusters: {n_passages_in_clusters} ({n_passages_in_clusters/len(df_passages):.1%})")
print(f"   Unique passages: {len(df_passages) - n_passages_in_clusters}")
if valid_cluster_ids:
    print(f"   Average passages per cluster: {n_passages_in_clusters / len(valid_cluster_ids):.1f}")

print("\n📈 Cluster Quality Metrics:")
if df_cluster_analysis.empty:
    # Means and maxima are undefined without clusters.
    print("   No valid clusters - quality metrics unavailable")
else:
    print(f"   Mean intra-cluster similarity: {df_cluster_analysis['avg_similarity'].mean():.3f}")
    print(f"   Mean cross-document frequency: {df_cluster_analysis['n_edicts'].mean():.1f} edicts")
    print(f"   Largest cluster: {df_cluster_analysis['n_passages'].max()} passages")
    print(f"   Widest cluster: {df_cluster_analysis['n_edicts'].max()} edicts")

print("\n📝 Top 5 Most Frequent Recurring Passages:")
# Number the entries by position in the sorted frame rather than by index label.
for rank, (_, row) in enumerate(df_cluster_analysis.head(5).iterrows(), 1):
    full_text = row['representative_text']
    # Only mark the text as truncated when something was actually cut off.
    preview = full_text if len(full_text) <= 100 else full_text[:100] + '...'
    print(f"\n{rank}. Cluster {row['cluster_id']}:")
    print(f"   {row['n_passages']} passages across {row['n_edicts']} edicts")
    print(f"   Text: {preview}")

print("\n💡 Interpretation:")
print(f"   The analysis identified {len(valid_cluster_ids)} semantically similar passage groups")
print(f"   that recur across multiple '{EDICT_TYPE}' edict documents. These represent formulaic")
print("   expressions, standard bureaucratic language, and recurring thematic content")
print(f"   specific to {EDICT_TYPE} edicts in Tang Dynasty administrative documents.")

print("\n⚙️  Quality Control:")
print(f"   Min edicts per cluster: {MIN_EDICTS_PER_CLUSTER} (ensures cross-document recurrence)")
print(f"   Min samples for cluster: {MIN_SAMPLES} (reduces false positives)")
print(f"   Similarity threshold: {1-EPS:.3f} (strict semantic matching)")

print("\n" + "="*100)
print("✅ Analysis complete! All results exported.")
print("="*100)

## Summary

This notebook successfully identifies clusters of semantically similar passages across Tang Dynasty edict documents of a specified type using SIKU-BERT embeddings.

**Key Features:**
1. **Semantic Similarity**: Uses SIKU-BERT embeddings trained on classical Chinese texts
2. **Density-Based Clustering**: DBSCAN algorithm identifies natural groupings without requiring preset cluster count
3. **Cross-Document Frequency**: Filters clusters to include only passages appearing in multiple edicts
4. **Type-Specific Analysis**: Focuses on a single edict type to find patterns specific to that category
5. **False Positive Reduction**:
   - Minimum samples requirement prevents spurious clusters
   - Strict distance threshold ensures high semantic similarity
   - Cross-document validation confirms true recurring patterns
6. **Quality Metrics**: Intra-cluster similarity and cross-document frequency for validation

**Parameters to Adjust:**
- `EDICT_TYPE`: Specify which edict type to analyze (e.g., '即位赦', '大赦', '改元赦')
- `MIN_PASSAGE_LENGTH`: Minimum passage size (default: 10 characters)
- `MIN_EDICTS_PER_CLUSTER`: Cross-document threshold (default: 2)
- `EPS`: DBSCAN cosine-distance threshold (default: 0.05, lower = stricter)
- `MIN_SAMPLES`: Minimum passages per cluster (default: 2, higher = fewer false positives)

**Output:**
- Cluster summaries with statistics
- Representative examples from each cluster
- Visualizations of cluster distributions
- Detailed CSV files for further analysis
- Comprehensive text report

**Use Cases:**
- Identifying formulaic language in bureaucratic texts
- Finding recurring themes across documents
- Analyzing standardization in administrative writing
- Discovering common rhetorical patterns