## 1. Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from analysis.temporal_evolution import TemporalAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Shared look for every figure in this notebook: seaborn's white-grid theme
# first, then a wide default canvas applied through matplotlib's rcParams.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports loaded successfully")

## 2. Initialize Timeline Analyzer

In [None]:
# Paths are relative to the notebook's working directory. Both names are
# kept as module-level variables because later cells reuse `output_dir`.
token_coords_path = '../data/processed/token_coords.jsonl'
output_dir = '../reports/figures/timeline'

# Build the analyzer from an explicit keyword mapping so the configuration
# is visible in one place.
analyzer_config = {
    'token_coords_path': token_coords_path,
    'output_dir': output_dir,
}
analyzer = TemporalAnalyzer(**analyzer_config)

print("✓ Temporal analyzer initialized")

## 3. Load and Explore Token Data

In [None]:
# Ask the analyzer for the token table, then report its basic dimensions
tokens_df = analyzer.load_data()

total_tokens = len(tokens_df)
folio_count = tokens_df['folio'].nunique()
distinct_tokens = tokens_df['token'].nunique()

print("\nDataset Overview:")
print(f"  Total tokens: {total_tokens}")
print(f"  Unique folios: {folio_count}")
print(f"  Unique tokens: {distinct_tokens}")
print("\nFirst few rows:")
# Must stay the last expression of the cell so the notebook renders it
tokens_df.head(10)

## 4. Compute Folio-Level Statistics

In [None]:
# One row of summary statistics per folio, as produced by the analyzer
folio_stats = analyzer.compute_folio_statistics()

# Columns shown in the preview; the frame itself is reused by later cells
preview_columns = [
    'folio',
    'token_count',
    'unique_tokens',
    'vocabulary_diversity',
    'most_common_token',
]

print("\nFolio Statistics:")
folio_stats[preview_columns].head(10)

## 5. Token Frequency Evolution

Analyze how the most common tokens evolve across the manuscript.

In [None]:
# Rank tokens by corpus-wide frequency and list the five most common
from collections import Counter

# Flatten the per-folio token sequences into one corpus-wide list
all_tokens = [tok for folio_tokens in folio_stats['tokens'] for tok in folio_tokens]
top_tokens = Counter(all_tokens).most_common(5)

print("Top 5 Most Frequent Tokens:")
corpus_size = len(all_tokens)
for rank, (token, count) in enumerate(top_tokens, start=1):
    share = count / corpus_size * 100
    print(f"{rank}. '{token}': {count} occurrences ({share:.2f}%)")

In [None]:
# Trace the single most frequent token across the folio sequence
most_common_token = top_tokens[0][0]
evolution = analyzer.analyze_token_evolution(most_common_token)

# Emit the textual summary in one write; the line content is unchanged
print("\n".join([
    f"\nEvolution of '{most_common_token}':",
    f"  First appearance: {evolution['first_appearance']}",
    f"  Last appearance: {evolution['last_appearance']}",
    f"  Total occurrences: {evolution['total_occurrences']}",
    f"  Appears in {evolution['appears_in_folios']}/{len(folio_stats)} folios",
]))

# Line chart of per-folio frequency, drawn through the Axes object
trajectory = evolution['evolution']
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(
    trajectory['order'],
    trajectory['frequency'],
    marker='o',
    linewidth=2,
    markersize=8,
    color='steelblue',
)
ax.set_xlabel('Folio Order', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title(
    f"Evolution of Token '{most_common_token}' Across Manuscript",
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Vocabulary Diversity Analysis

Examine how vocabulary richness (type-token ratio) changes across folios.

In [None]:
# Descriptive statistics for the per-folio vocabulary diversity column
diversity = folio_stats['vocabulary_diversity']

# idxmin/idxmax give index labels, so the folio names are looked up with .loc
least_diverse_folio = folio_stats.loc[diversity.idxmin(), 'folio']
most_diverse_folio = folio_stats.loc[diversity.idxmax(), 'folio']

print("Vocabulary Diversity Statistics:")
print(f"  Mean: {diversity.mean():.3f}")
print(f"  Std Dev: {diversity.std():.3f}")
print(f"  Min: {diversity.min():.3f} (Folio: {least_diverse_folio})")
print(f"  Max: {diversity.max():.3f} (Folio: {most_diverse_folio})")

# Vertical box plot of the same column
plt.figure(figsize=(10, 6))
plt.boxplot(diversity, vert=True)
plt.ylabel('Vocabulary Diversity (Type-Token Ratio)', fontsize=12)
plt.title(
    'Distribution of Vocabulary Diversity Across Folios',
    fontsize=14,
    fontweight='bold',
)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 7. Vocabulary Shift Detection

Identify significant changes in vocabulary between adjacent folio windows.

In [None]:
# Detect vocabulary shifts between adjacent windows (window_size=1) and
# report the transition with the largest Jensen-Shannon divergence.
shifts = analyzer.detect_vocabulary_shifts(window_size=1)

if not shifts.empty:
    print("Vocabulary Shift Analysis:")
    print(f"  Total transitions analyzed: {len(shifts)}")
    print(f"  Mean Jaccard Similarity: {shifts['jaccard_similarity'].mean():.3f}")
    print(f"  Mean Jensen-Shannon Divergence: {shifts['jsd_distance'].mean():.3f}")

    # Find most significant shift.
    # idxmax() returns an index *label*, so the row must be fetched with the
    # label-based .loc; positional .iloc only agrees when the index happens
    # to be the default 0..n-1 range and picks the wrong row otherwise.
    max_shift_idx = shifts['jsd_distance'].idxmax()
    max_shift = shifts.loc[max_shift_idx]

    print(f"\nMost Significant Vocabulary Shift:")
    print(f"  Between: {max_shift['window_start']} → {max_shift['window_end']}")
    print(f"  JS Divergence: {max_shift['jsd_distance']:.3f}")
    print(f"  Jaccard Similarity: {max_shift['jaccard_similarity']:.3f}")
    print(f"  New tokens introduced: {max_shift['new_tokens']}")
    print(f"  Tokens disappeared: {max_shift['disappeared_tokens']}")

    # A bare expression nested inside an `if` body is not rendered by the
    # notebook (only the cell's last top-level expression is), so the
    # preview has to be printed explicitly to be visible.
    print(shifts.head(10))
else:
    print("No vocabulary shifts detected (insufficient data)")

## 8. Generate Complete Visualizations

Create comprehensive timeline analysis visualizations.

In [None]:
# Generate the token frequency heatmap through the analyzer, with top_n=10.
# NOTE(review): presumably this covers the ten most frequent tokens and writes
# the figure under `output_dir` — confirm in src/analysis/temporal_evolution.py.
analyzer.visualize_token_frequency_evolution(top_n=10)

# Reached only if the call above returned without raising
print("✓ Token frequency heatmap generated")

In [None]:
# Generate the vocabulary diversity plots through the analyzer.
# NOTE(review): the call takes no arguments here, so it presumably works from
# state the analyzer already holds and saves under `output_dir` — confirm in
# src/analysis/temporal_evolution.py.
analyzer.visualize_vocabulary_diversity()

# Reached only if the call above returned without raising
print("✓ Vocabulary diversity plots generated")

In [None]:
# Generate the vocabulary shift visualizations through the analyzer.
# NOTE(review): no window size is passed, so the analyzer's own default is
# used; it may differ from the window_size=1 used in the detection cell —
# confirm in src/analysis/temporal_evolution.py.
analyzer.visualize_vocabulary_shifts()

# Reached only if the call above returned without raising
print("✓ Vocabulary shift plots generated")

## 9. Generate Comprehensive Report

In [None]:
# Generate the full timeline report through the analyzer and show a preview.
# The result is treated as text: it is sliced and printed below.
report = analyzer.generate_timeline_report()

# Number of leading characters shown in the notebook
preview_chars = 2000
rule = "=" * 60

print("Timeline Analysis Report Preview:")
print(rule)
print(report[:preview_chars])
# Only claim there is more when the preview actually truncated the report
if len(report) > preview_chars:
    print("\n[... report continues ...]")
print(rule)
# NOTE(review): this path is rebuilt here from `output_dir`; it is not
# returned by the analyzer — confirm it matches where the report is written.
print(f"\n✓ Full report saved to: {Path(output_dir).parent / 'timeline_analysis.md'}")

## 10. Key Findings Summary

In [None]:
# Summarize key findings from the statistics computed in earlier cells
# (`folio_stats`, `all_tokens`, `top_tokens`, `shifts`).
#
# The cut-offs below are rules of thumb used only to choose the wording of
# each finding; they are not calibrated against any reference corpus.
DIVERSITY_STD_THRESHOLD = 0.1     # std of vocabulary diversity across folios
TOP5_SHARE_THRESHOLD = 40         # percent of corpus covered by top 5 tokens
MEAN_JSD_THRESHOLD = 0.3          # mean Jensen-Shannon divergence
TOKEN_COUNT_CV_THRESHOLD = 0.3    # std / mean of per-folio token counts

print("KEY FINDINGS FROM TIMELINE ANALYSIS")
print("=" * 60)

print(f"\n1. VOCABULARY DIVERSITY:")
diversity_std = folio_stats['vocabulary_diversity'].std()
if pd.isna(diversity_std):
    # The sample std is NaN with fewer than two folios. NaN fails every
    # comparison, so without this guard the "significant variation" branch
    # would be reported with no supporting data.
    print(f"   → Insufficient data to measure variation")
elif diversity_std < DIVERSITY_STD_THRESHOLD:
    print(f"   → Highly consistent (σ={diversity_std:.3f})")
    print(f"   → Suggests single author/scribe")
else:
    print(f"   → Significant variation (σ={diversity_std:.3f})")
    print(f"   → May indicate multiple authors or topic shifts")

print(f"\n2. TOKEN DISTRIBUTION:")
if all_tokens:
    top_5_percentage = sum(c for _, c in top_tokens[:5]) / len(all_tokens) * 100
    print(f"   → Top 5 tokens: {top_5_percentage:.1f}% of all tokens")
    if top_5_percentage > TOP5_SHARE_THRESHOLD:
        print(f"   → High repetitiveness (cipher-like behavior)")
    else:
        print(f"   → Moderate diversity (natural language-like)")
else:
    # Avoid dividing by zero when no tokens were loaded
    print(f"   → No tokens available")

# Always print section 3 so the numbering does not jump from 2 to 4
print(f"\n3. VOCABULARY SHIFTS:")
if not shifts.empty:
    mean_jsd = shifts['jsd_distance'].mean()
    print(f"   → Mean JS Divergence: {mean_jsd:.3f}")
    if mean_jsd < MEAN_JSD_THRESHOLD:
        print(f"   → Vocabulary remains stable throughout")
        print(f"   → Consistent encoding/cipher system")
    else:
        print(f"   → Significant vocabulary shifts detected")
        print(f"   → Multiple topics or encoding schemes possible")
else:
    print(f"   → No vocabulary shifts detected (insufficient data)")

print(f"\n4. TEMPORAL STRUCTURE:")
# Despite its name this is the coefficient of variation (std / mean) of the
# per-folio token counts, not a variance.
token_variance = folio_stats['token_count'].std() / folio_stats['token_count'].mean()
if pd.isna(token_variance):
    # Same NaN pitfall as in section 1 (e.g. a single folio)
    print(f"   → Insufficient data to measure token count variation")
else:
    print(f"   → Token count variation: {token_variance:.3f}")
    if token_variance < TOKEN_COUNT_CV_THRESHOLD:
        print(f"   → Uniform folio structure")
    else:
        print(f"   → Variable folio lengths (may indicate sections)")

print("\n" + "=" * 60)
print("Timeline analysis complete! ✓")

## 11. Conclusions

This timeline analysis provides insights into:

1. **Temporal consistency** - Whether the manuscript maintains consistent linguistic patterns
2. **Structural boundaries** - Potential section divisions or topic changes
3. **Authorship indicators** - Evidence for single vs. multiple scribes
4. **Encoding patterns** - Characteristics that inform decipherment strategy

These findings should be:
- Cross-referenced with manuscript illustrations and physical structure
- Compared with known historical cipher systems
- Validated with expanded corpus analysis
- Integrated with other linguistic analyses (embeddings, language comparison)

---

*For detailed methodology and implementation, see `src/analysis/temporal_evolution.py`*