In [None]:
# Setup and imports
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from datetime import datetime

# Set working directory to project root.
# A bare os.chdir('..') climbs one directory higher every time this cell is
# re-run in the same kernel, so the root is resolved once (on the first run)
# and every run changes to that absolute path, keeping the cell idempotent.
import os
try:
    PROJECT_ROOT
except NameError:
    PROJECT_ROOT = Path('..').resolve()
os.chdir(PROJECT_ROOT)
print(f"üìÅ Working directory: {Path('.').resolve()}")

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['figure.dpi'] = 100
sns.set_palette("husl")

print("‚úÖ Setup complete")

## 1. Load All Results Data

Load hypotheses, language comparisons, and experiment metrics.

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Returns an empty list when `path` does not exist. Blank lines (for example
    a trailing empty line at the end of the file) are skipped rather than
    handed to json.loads, which would raise JSONDecodeError on them.
    """
    path = Path(path)
    if not path.exists():
        return []
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def load_json(path):
    """Read a single JSON document; returns {} when `path` does not exist."""
    path = Path(path)
    if not path.exists():
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load hypotheses
hypotheses_file = Path('reports/hypotheses/hypotheses_aggregated.jsonl')
hypotheses = load_jsonl(hypotheses_file)

print(f"üìä Loaded {len(hypotheses)} hypotheses")

# Load run hypotheses
run_hyp_file = Path('reports/hypotheses/run_hypotheses.jsonl')
run_hypotheses = load_jsonl(run_hyp_file)

print(f"üìä Loaded {len(run_hypotheses)} run hypotheses")

# Load experiment metrics
metrics_file = Path('reports/experiment_metrics.json')
metrics = load_json(metrics_file)

print("üìä Loaded experiment metrics")

# Load language comparison files, keyed by the file stem without '_details'.
# The glob result is sorted so the dict order does not depend on the
# filesystem's listing order.
comparison_dir = Path('reports/comparison')
comparisons = {}
if comparison_dir.exists():
    for comp_file in sorted(comparison_dir.glob('*_details.json')):
        lang = comp_file.stem.replace('_details', '')
        comparisons[lang] = load_json(comp_file)

print(f"üìä Loaded {len(comparisons)} language comparisons: {list(comparisons.keys())}")

# Load comparison metadata
metadata_file = comparison_dir / 'comparison_metadata.json'
comparison_metadata = load_json(metadata_file)

print("\n‚úÖ All data loaded successfully")

## 2. Hypothesis Quality Assessment

Evaluate the quality, diversity, and actionability of generated hypotheses.

In [None]:
# Analyze hypothesis diversity
all_hyps = hypotheses + run_hypotheses
total_hyps = len(all_hyps)


def pct(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is zero."""
    return count / total * 100 if total else 0.0


# Extract tokens mentioned in prompts.
# `or ''` also covers records whose field is present but null, which would
# otherwise fail on .split() / .lower().
tokens_in_hypotheses = []
for h in all_hyps:
    tokens_in_hypotheses.extend((h.get('prompt') or '').split())

token_coverage = Counter(tokens_in_hypotheses)

# Count unique response patterns
responses = [h.get('response') or '' for h in all_hyps]
unique_responses = len(set(responses))

# Model distribution
models_used = Counter([h.get('model', 'unknown') for h in all_hyps])

print("=" * 60)
print("HYPOTHESIS QUALITY ASSESSMENT")
print("=" * 60)
print(f"\nüìä Total hypotheses: {total_hyps}")
print(f"üìä Unique response patterns: {unique_responses} ({pct(unique_responses, total_hyps):.1f}%)")
print(f"üìä Tokens covered: {len(token_coverage)} unique tokens")
print(f"\nü§ñ Models used:")
for model, count in models_used.most_common():
    print(f"   - {model}: {count} hypotheses")

# Assess hypothesis specificity: a response is "generic" when it contains any
# of the stock phrases below (case-insensitive); everything else is "specific".
generic_phrases = ['may represent', 'common noun', 'determiner', 'frequency hint']
generic_hypotheses = sum(
    any(phrase in (h.get('response') or '').lower() for phrase in generic_phrases)
    for h in all_hyps
)
specific_hypotheses = total_hyps - generic_hypotheses

# pct() keeps these lines from raising ZeroDivisionError when no hypothesis
# files were found and all_hyps is empty.
print(f"\nüéØ Hypothesis specificity:")
print(f"   - Generic: {generic_hypotheses} ({pct(generic_hypotheses, total_hyps):.1f}%)")
print(f"   - Specific: {specific_hypotheses} ({pct(specific_hypotheses, total_hyps):.1f}%)")

# Visualize token coverage
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Token coverage: the 15 most frequent prompt tokens, most frequent on top.
ax1 = axes[0]
top_tokens = token_coverage.most_common(15)
if top_tokens:
    tokens_labels = [t[0] for t in top_tokens]
    tokens_counts = [t[1] for t in top_tokens]
    bars = ax1.barh(tokens_labels, tokens_counts, color='steelblue', edgecolor='black')
    ax1.set_xlabel('Hypothesis Count', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Token', fontsize=12, fontweight='bold')
    ax1.invert_yaxis()
    ax1.grid(axis='x', alpha=0.3)
else:
    # Same placeholder convention as the bigram panel in the language section.
    ax1.text(0.5, 0.5, 'No prompt tokens available', ha='center', va='center', fontsize=12)
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
ax1.set_title('Top Tokens in Generated Hypotheses', fontsize=14, fontweight='bold')

# Model distribution
ax2 = axes[1]
model_names = list(models_used.keys())
model_counts = list(models_used.values())
if model_counts:
    colors = sns.color_palette('husl', len(model_names))
    ax2.pie(model_counts, labels=model_names, autopct='%1.1f%%', colors=colors, startangle=90)
else:
    ax2.text(0.5, 0.5, 'No hypotheses available', ha='center', va='center', fontsize=12)
    ax2.set_xlim(0, 1)
    ax2.set_ylim(0, 1)
ax2.set_title('Hypothesis Generation by Model', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Only report the finding when the data supports it: more than half of the
# loaded hypotheses matched a generic phrase. Previously this was printed
# unconditionally, even for an empty or mostly-specific hypothesis set.
if all_hyps and generic_hypotheses / len(all_hyps) > 0.5:
    print("\n‚ö†Ô∏è CRITICAL FINDING: Most hypotheses are generic and rule-based!")
    print("   Recommendation: Use more sophisticated LLMs for specific semantic hypotheses.")

## 3. Language Comparison Analysis

Interpret Jensen-Shannon Divergence scores and determine which languages are most similar to Voynich.

In [None]:
# Extract JSD scores: one entry per comparison file that carries a
# 'jsd_unigram' value; the other fields are optional.
jsd_scores = {
    lang: {
        'unigram': data['jsd_unigram'],
        'bigram': data.get('jsd_bigram', None),
        'vocab_overlap': data.get('vocab_overlap', 0),
        'common_tokens': data.get('common_tokens_count', 0)
    }
    for lang, data in comparisons.items()
    if 'jsd_unigram' in data
}

print("=" * 60)
print("LANGUAGE SIMILARITY ANALYSIS")
print("=" * 60)
print("\nüìä Jensen-Shannon Divergence (JSD) Interpretation:")
print("   - JSD = 0.0: Identical distributions")
print("   - JSD < 0.3: Very similar")
print("   - JSD 0.3-0.5: Moderately similar")
print("   - JSD > 0.5: Very different")
print("   - JSD = 1.0: Completely different\n")

# Create comparison dataframe
comparison_columns = ['Language', 'JSD Unigram', 'JSD Bigram', 'Vocab Overlap', 'Common Tokens']
comp_data = []
for lang, scores in jsd_scores.items():
    comp_data.append({
        'Language': lang.replace('_', ' ').title(),
        'JSD Unigram': scores['unigram'],
        # Test against None, not truthiness: a bigram JSD of exactly 0.0 is a
        # real value (identical distributions) and must not become NaN.
        'JSD Bigram': scores['bigram'] if scores['bigram'] is not None else np.nan,
        'Vocab Overlap': scores['vocab_overlap'],
        'Common Tokens': scores['common_tokens']
    })

# Passing the column list keeps the frame well-formed when comp_data is empty;
# without it sort_values('JSD Unigram') raises KeyError on a column-less frame.
df_comp = pd.DataFrame(comp_data, columns=comparison_columns).sort_values('JSD Unigram')

print("\nüìä Language Similarity Ranking (lower JSD = more similar):\n")
if df_comp.empty:
    print("   (no language comparison data found)")
else:
    print(df_comp.to_string(index=False))

# Visualize comparisons on a 2x2 grid: unigram JSD, bigram JSD, vocabulary
# overlap and common-token count, one bar per language.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

axis_label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')
value_font = dict(fontsize=9, fontweight='bold')

# 1. JSD Unigram comparison (rows arrive sorted by ascending unigram JSD)
unigram_values = df_comp['JSD Unigram']
bars = ax1.barh(df_comp['Language'], unigram_values, color='coral', edgecolor='black')
ax1.set_xlabel('JSD Score', **axis_label_font)
ax1.set_title('Unigram Distribution Similarity (Lower = More Similar)', **title_font)
ax1.invert_yaxis()
ax1.axvline(x=0.3, color='green', linestyle='--', label='Very Similar Threshold', alpha=0.7)
ax1.axvline(x=0.5, color='orange', linestyle='--', label='Moderate Threshold', alpha=0.7)
ax1.legend()
ax1.grid(axis='x', alpha=0.3)

# Value label just right of each bar, vertically centred on it
for bar, val in zip(bars, unigram_values):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax1.text(val + 0.01, bar_mid_y, f'{val:.3f}', va='center', **value_font)

# 2. JSD Bigram comparison (only languages that have a bigram score)
df_bigram = df_comp.dropna(subset=['JSD Bigram']).sort_values('JSD Bigram')
if df_bigram.empty:
    ax2.text(0.5, 0.5, 'No bigram data available', ha='center', va='center', fontsize=12)
    ax2.set_xlim(0, 1)
    ax2.set_ylim(0, 1)
else:
    bigram_values = df_bigram['JSD Bigram']
    bars2 = ax2.barh(df_bigram['Language'], bigram_values, color='skyblue', edgecolor='black')
    ax2.set_xlabel('JSD Score', **axis_label_font)
    ax2.set_title('Bigram Distribution Similarity', **title_font)
    ax2.invert_yaxis()
    ax2.axvline(x=0.3, color='green', linestyle='--', alpha=0.7)
    ax2.axvline(x=0.5, color='orange', linestyle='--', alpha=0.7)
    ax2.grid(axis='x', alpha=0.3)
    for bar, val in zip(bars2, bigram_values):
        bar_mid_y = bar.get_y() + bar.get_height()/2
        ax2.text(val + 0.01, bar_mid_y, f'{val:.3f}', va='center', **value_font)

# 3. Vocabulary overlap, shown as a percentage, largest first
by_overlap = df_comp.sort_values('Vocab Overlap', ascending=False)
overlap_pct = by_overlap['Vocab Overlap'] * 100
bars3 = ax3.bar(by_overlap['Language'], overlap_pct,
                color='lightgreen', edgecolor='black')
ax3.set_ylabel('Overlap %', **axis_label_font)
ax3.set_title('Vocabulary Overlap with Voynich', **title_font)
ax3.tick_params(axis='x', rotation=45)
ax3.grid(axis='y', alpha=0.3)
for bar, val in zip(bars3, overlap_pct):
    bar_mid_x = bar.get_x() + bar.get_width()/2
    ax3.text(bar_mid_x, bar.get_height() + 0.5, f'{val:.1f}%', ha='center', **value_font)

# 4. Common tokens count, largest first
by_common = df_comp.sort_values('Common Tokens', ascending=False)
common_counts = by_common['Common Tokens']
bars4 = ax4.bar(by_common['Language'], common_counts,
                color='plum', edgecolor='black')
ax4.set_ylabel('Token Count', **axis_label_font)
ax4.set_title('Number of Common Tokens', **title_font)
ax4.tick_params(axis='x', rotation=45)
ax4.grid(axis='y', alpha=0.3)
for bar, val in zip(bars4, common_counts):
    bar_mid_x = bar.get_x() + bar.get_width()/2
    ax4.text(bar_mid_x, bar.get_height() + 0.2, f'{int(val)}', ha='center', **value_font)

plt.tight_layout()
plt.show()

# Generate conclusions
print("\n" + "=" * 60)
print("CONCLUSIONS")
print("=" * 60)

# df_comp is sorted by ascending unigram JSD, so its first row is the closest
# language. With no comparison data df_comp.iloc[0] raises IndexError, so that
# case is reported explicitly instead.
if len(df_comp) == 0:
    print("\n‚ö†Ô∏è No language comparison data available")
else:
    most_similar = df_comp.iloc[0]
    print(f"\n‚úÖ Most similar language: {most_similar['Language']}")
    print(f"   - JSD Unigram: {most_similar['JSD Unigram']:.3f}")
    print(f"   - Vocab overlap: {most_similar['Vocab Overlap']*100:.2f}%")
    print(f"   - Common tokens: {int(most_similar['Common Tokens'])}")

    # Same 0.3 / 0.5 thresholds as the interpretation legend printed above.
    if most_similar['JSD Unigram'] < 0.3:
        print("\nüéØ STRONG SIMILARITY: Voynich shows very similar statistical patterns!")
    elif most_similar['JSD Unigram'] < 0.5:
        print("\n‚ö†Ô∏è MODERATE SIMILARITY: Some structural similarities exist.")
    else:
        print("\n‚ùå LOW SIMILARITY: Voynich appears statistically distinct from tested languages.")

    print(f"\nüìå Recommendation: Focus analysis on {most_similar['Language']} linguistic structures.")

## 4. Statistical Pattern Analysis

Analyze the statistical patterns found in the Voynich text.

In [None]:
print("=" * 60)
print("STATISTICAL PATTERN ANALYSIS")
print("=" * 60)

if metrics:
    print(f"\nüìä Corpus Statistics:")
    print(f"   - Total lines: {metrics.get('lines', 'N/A')}")
    print(f"   - Total tokens: {metrics.get('tokens', 'N/A')}")
    print(f"   - Vocabulary size: {metrics.get('vocab_size', 'N/A')}")
    print(f"   - Hapax legomena: {metrics.get('hapax_count', 'N/A')} ({metrics.get('hapax_ratio', 0)*100:.1f}%)")

    # A metric that is absent from the file is reported as N/A and not
    # interpreted. Defaulting it to 0 would print "LOW ENTROPY" / "DEVIATES
    # FROM ZIPF'S LAW" for a value that was never measured.
    print(f"\nüìä Entropy Analysis:")
    entropy = metrics.get('unigram_entropy')
    if entropy is None:
        print("   - Unigram entropy: N/A")
    else:
        print(f"   - Unigram entropy: {entropy:.4f} bits")

        if entropy < 3.0:
            print("   ‚ö†Ô∏è LOW ENTROPY: Text shows high predictability (possible repetitive structure)")
        elif entropy < 4.5:
            print("   ‚úÖ MODERATE ENTROPY: Similar to natural languages")
        else:
            print("   ‚ö†Ô∏è HIGH ENTROPY: Unusually unpredictable (possible random or artificial)")

    print(f"\nüìä Zipf's Law Analysis:")
    zipf_slope = metrics.get('zipf_slope')
    if zipf_slope is None:
        print("   - Log-log slope: N/A")
    else:
        print(f"   - Log-log slope: {zipf_slope:.4f}")

        if -1.2 < zipf_slope < -0.8:
            print("   ‚úÖ FOLLOWS ZIPF'S LAW: Consistent with natural language")
        else:
            print(f"   ‚ö†Ô∏è DEVIATES FROM ZIPF'S LAW: Slope should be near -1.0")
            # >= so that a slope of exactly -0.8 (shallower than -1) is
            # classed as "more uniform", not "more skewed".
            if zipf_slope >= -0.8:
                print("      ‚Üí More uniform distribution than typical language")
            else:
                print("      ‚Üí More skewed distribution than typical language")

    # Top tokens analysis
    print(f"\nüìä Most Frequent Tokens:")
    top_unigrams = metrics.get('top_unigrams', [])
    # Percentages need a real, non-zero token total. When it is missing or 0
    # the counts are still listed but the share figures are skipped.
    total_tokens = metrics.get('tokens')
    if top_unigrams:
        for i, (token, count) in enumerate(top_unigrams[:10], 1):
            if total_tokens:
                freq_pct = count / total_tokens * 100
                print(f"   {i:2d}. '{token}': {count} occurrences ({freq_pct:.2f}%)")
            else:
                print(f"   {i:2d}. '{token}': {count} occurrences")

        if total_tokens:
            # Calculate concentration
            top_5_freq = sum(c for t, c in top_unigrams[:5]) / total_tokens
            print(f"\n   Top 5 tokens represent {top_5_freq*100:.1f}% of all text")

            if top_5_freq > 0.4:
                print("   ‚úÖ HIGH CONCENTRATION: Similar to function words in natural language")
            else:
                print("   ‚ö†Ô∏è LOW CONCENTRATION: Unusual for natural language")
else:
    print("\n‚ö†Ô∏è No statistical metrics available")

# Visualize if data available: token-frequency bars on the left, a log-log
# rank/frequency curve on the right.
if metrics and 'top_unigrams' in metrics:
    unigram_pairs = metrics['top_unigrams']
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ax1, ax2 = axes[0], axes[1]

    # Token frequency: first 15 (token, count) pairs, first pair at the top
    leading_pairs = unigram_pairs[:15]
    bars = ax1.barh(
        [pair[0] for pair in leading_pairs],
        [pair[1] for pair in leading_pairs],
        color='steelblue',
        edgecolor='black',
    )
    ax1.set_xlabel('Frequency', fontsize=12, fontweight='bold')
    ax1.set_title('Top 15 Voynich Tokens', fontsize=14, fontweight='bold')
    ax1.invert_yaxis()
    ax1.grid(axis='x', alpha=0.3)

    # Zipf's law visualization: rank 1..N against the listed counts
    frequencies = [pair[1] for pair in unigram_pairs]
    ranks = np.arange(1, len(unigram_pairs) + 1)
    ax2.loglog(ranks, frequencies, 'o-', color='coral', markersize=8, linewidth=2)
    ax2.set_xlabel('Rank (log scale)', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Frequency (log scale)', fontsize=12, fontweight='bold')
    ax2.set_title("Zipf's Law: Rank vs Frequency", fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3, which='both')

    # Reference line: the first count divided by rank (slope -1 on log-log axes)
    if ranks.size > 0:
        ideal_zipf = frequencies[0] / ranks
        ax2.loglog(ranks, ideal_zipf, '--', color='gray', alpha=0.5,
                   linewidth=2, label='Ideal Zipf (slope=-1)')
        ax2.legend()

    plt.tight_layout()
    plt.show()

## 5. Actionable Insights & Recommendations

Synthesize findings into concrete next steps.

In [None]:
print("=" * 60)
print("ACTIONABLE INSIGHTS & RECOMMENDATIONS")
print("=" * 60)

# Each insight is a dict with the keys 'category', 'finding', 'action' and
# 'priority'; the report cell reads the same keys.
insights = []

# Insight 1: Hypothesis quality
# Guarded share: an empty hypothesis set would otherwise divide by zero.
generic_share = generic_hypotheses / len(all_hyps) if all_hyps else 0.0
if generic_share > 0.8:
    insights.append({
        'category': 'Hypothesis Generation',
        'finding': 'Over 80% of hypotheses are generic rule-based outputs',
        'action': 'Replace local-rule model with advanced LLMs (GPT-4, Claude, or fine-tuned models)',
        'priority': 'HIGH'
    })

# Insight 2: Language similarity
if jsd_scores:
    best_lang = min(jsd_scores.items(), key=lambda x: x[1]['unigram'])
    best_jsd = best_lang[1]['unigram']
    if best_jsd < 0.5:
        # Word the finding with the same 0.3 / 0.5 bands used in the language
        # comparison section, instead of always saying "moderate".
        similarity_level = 'strong' if best_jsd < 0.3 else 'moderate'
        insights.append({
            'category': 'Language Analysis',
            'finding': f"{best_lang[0].replace('_', ' ').title()} shows {similarity_level} structural similarity",
            'action': f"Deep-dive analysis: align Voynich tokens with {best_lang[0]} morphological patterns",
            'priority': 'MEDIUM'
        })

# Insight 3: Statistical patterns
if metrics:
    hapax_ratio = metrics.get('hapax_ratio', 0)
    if hapax_ratio > 0.5:
        insights.append({
            'category': 'Data Quality',
            'finding': f'High hapax legomena ratio ({hapax_ratio*100:.0f}%) indicates small dataset or noise',
            'action': 'Expand corpus to full manuscript transcription (all 240 folios)',
            'priority': 'HIGH'
        })

    # No default here: a missing entropy value must not be read as 0 and
    # reported as a "low entropy" finding.
    entropy = metrics.get('unigram_entropy')
    if entropy is not None and entropy < 3.5:
        insights.append({
            'category': 'Linguistic Structure',
            'finding': 'Low entropy suggests high repetition or limited vocabulary',
            'action': 'Investigate token repetition patterns and morphological analysis',
            'priority': 'MEDIUM'
        })

# Insight 4: Missing analyses (always listed, independent of the loaded data)
insights.append({
    'category': 'Missing Analysis',
    'finding': 'No temporal/positional analysis of token evolution',
    'action': 'Implement timeline analysis showing token usage across manuscript sections',
    'priority': 'MEDIUM'
})

insights.append({
    'category': 'Missing Analysis',
    'finding': 'No clustering visualization or semantic grouping',
    'action': 'Generate and visualize embedding clusters with interpretive labels',
    'priority': 'LOW'
})

# Display insights
print("\nüìã Priority Insights:\n")
for i, insight in enumerate(insights, 1):
    print(f"{i}. [{insight['priority']}] {insight['category']}")
    print(f"   Finding: {insight['finding']}")
    print(f"   Action: {insight['action']}")
    print()

# Create summary dataframe
df_insights = pd.DataFrame(insights)
print("\nüìä Insights Summary:\n")
print(df_insights[['priority', 'category', 'finding']].to_string(index=False))

print("\n" + "=" * 60)
print("NEXT STEPS (IN ORDER)")
print("=" * 60)
print("\n1. üî• Expand dataset to full manuscript (240 folios)")
print("2. üî• Implement advanced LLM hypothesis generation with scoring")
print("3. üìä Deep-dive language alignment with best-match corpus")
print("4. üìà Create temporal evolution analysis")
print("5. üéØ Generate and validate specific token mappings")
print("\n‚úÖ Analysis complete! Ready for final report generation.")

## 6. Export Analysis Report

Generate a comprehensive markdown report with all findings.

In [None]:
# Generate report content

# Percentages are computed up front with a zero guard so an empty hypothesis
# set produces 0% instead of a ZeroDivisionError inside the template.
n_hyps = len(all_hyps)
generic_pct = generic_hypotheses / n_hyps * 100 if n_hyps else 0.0
specific_pct = specific_hypotheses / n_hyps * 100 if n_hyps else 0.0
unique_pct = unique_responses / n_hyps * 100 if n_hyps else 0.0

# The closest language is the one with the lowest unigram JSD. Taking the
# first key of `comparisons` only reflects file-loading order, not similarity.
if jsd_scores:
    closest_key = min(jsd_scores, key=lambda lang: jsd_scores[lang]['unigram'])
    closest_language = closest_key.replace('_', ' ').title()
else:
    closest_language = 'N/A'

# A missing entropy value is reported as unavailable rather than as "Low".
summary_entropy = metrics.get('unigram_entropy')
if summary_entropy is None:
    entropy_summary = 'Entropy metric not available'
elif summary_entropy < 3.5:
    entropy_summary = 'Low entropy indicates repetitive structure'
else:
    entropy_summary = 'Moderate entropy indicates natural structure'

report_content = f"""# Voynich Manuscript Decoder - Results Analysis Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

---

## Executive Summary

This report provides a comprehensive analysis of all results generated by the Voynich Manuscript decoder pipeline, including hypothesis evaluation, language comparisons, and statistical pattern analysis.

### Key Findings

1. **Hypothesis Quality**: {n_hyps} hypotheses generated, with {generic_pct:.0f}% being generic rule-based outputs
2. **Language Similarity**: {closest_language} shows closest statistical match
3. **Statistical Patterns**: {entropy_summary}
4. **Dataset Size**: Current analysis based on {metrics.get('tokens', 'N/A')} tokens from {metrics.get('lines', 'N/A')} lines

---

## 1. Hypothesis Generation Assessment

### Overview
- Total hypotheses: {n_hyps}
- Unique response patterns: {unique_responses} ({unique_pct:.1f}%)
- Tokens covered: {len(token_coverage)}
- Models used: {', '.join(map(str, models_used.keys()))}

### Quality Metrics
- **Generic hypotheses**: {generic_hypotheses} ({generic_pct:.1f}%)
- **Specific hypotheses**: {specific_hypotheses} ({specific_pct:.1f}%)

### Critical Finding
‚ö†Ô∏è **Most hypotheses are generic and rule-based**, lacking specific semantic interpretations. This indicates the need for more sophisticated LLM models.

### Recommendation
Replace or augment the `local-rule` model with advanced language models (GPT-4, Claude-3, or fine-tuned models) to generate more specific, testable hypotheses.

---

## 2. Language Comparison Results

### Jensen-Shannon Divergence Analysis

"""

# Append the language ranking table (ascending unigram JSD) and a summary of
# the best match. Skipped entirely when no JSD scores were extracted.
if jsd_scores:
    sorted_langs = sorted(jsd_scores.items(), key=lambda x: x[1]['unigram'])
    report_content += "\n| Rank | Language | JSD Unigram | JSD Bigram | Vocab Overlap | Common Tokens |\n"
    report_content += "|------|----------|-------------|------------|---------------|---------------|\n"
    for i, (lang, scores) in enumerate(sorted_langs, 1):
        lang_name = lang.replace('_', ' ').title()
        # Compare with None, not truthiness: a bigram JSD of exactly 0.0 is a
        # valid score and must be printed as 0.000, not "N/A".
        bigram = f"{scores['bigram']:.3f}" if scores['bigram'] is not None else "N/A"
        report_content += f"| {i} | {lang_name} | {scores['unigram']:.3f} | {bigram} | {scores['vocab_overlap']*100:.1f}% | {scores['common_tokens']} |\n"

    best_match = sorted_langs[0]
    report_content += f"\n### Best Match: {best_match[0].replace('_', ' ').title()}\n\n"
    report_content += f"- **JSD Score**: {best_match[1]['unigram']:.3f} "

    if best_match[1]['unigram'] < 0.3:
        report_content += "(Very Similar)\n"
    elif best_match[1]['unigram'] < 0.5:
        report_content += "(Moderately Similar)\n"
    else:
        report_content += "(Low Similarity)\n"

    report_content += f"- **Vocabulary Overlap**: {best_match[1]['vocab_overlap']*100:.2f}%\n"
    report_content += f"- **Common Tokens**: {best_match[1]['common_tokens']}\n\n"

# Section 3 of the report. The heading is always written; the statistics,
# entropy and Zipf paragraphs are added only when metrics were loaded.
section_parts = ["---\n\n## 3. Statistical Pattern Analysis\n\n"]

if metrics:
    section_parts.append(
        "### Corpus Statistics\n"
        f"- Lines: {metrics.get('lines', 'N/A')}\n"
        f"- Tokens: {metrics.get('tokens', 'N/A')}\n"
        f"- Vocabulary size: {metrics.get('vocab_size', 'N/A')}\n"
        f"- Hapax legomena: {metrics.get('hapax_count', 'N/A')} ({metrics.get('hapax_ratio', 0)*100:.1f}%)\n"
        "\n"
        "### Entropy Analysis\n"
        f"- **Unigram entropy**: {metrics.get('unigram_entropy', 0):.4f} bits\n"
    )

    # Entropy bands: below 3.0 low, below 4.5 moderate, otherwise high.
    entropy = metrics.get('unigram_entropy', 0)
    if entropy < 3.0:
        entropy_note = "- **Interpretation**: LOW - High predictability (repetitive structure)\n"
    elif entropy < 4.5:
        entropy_note = "- **Interpretation**: MODERATE - Similar to natural languages\n"
    else:
        entropy_note = "- **Interpretation**: HIGH - Unusually unpredictable\n"
    section_parts.append(entropy_note)

    section_parts.append(
        "\n### Zipf's Law Compliance\n"
        f"- **Log-log slope**: {metrics.get('zipf_slope', 0):.4f}\n"
    )

    # A slope strictly between -1.2 and -0.8 counts as following Zipf's law.
    zipf_slope = metrics.get('zipf_slope', 0)
    follows_zipf = -1.2 < zipf_slope < -0.8
    section_parts.append(
        "- **Interpretation**: ‚úÖ Follows Zipf's Law (consistent with natural language)\n"
        if follows_zipf
        else "- **Interpretation**: ‚ö†Ô∏è Deviates from Zipf's Law (expected ~-1.0)\n"
    )

report_content += ''.join(section_parts)

# Section 4 of the report: one sub-heading per collected insight, numbered
# from 1 in list order, followed by the fixed "Next Steps" and "Conclusion"
# sections and the footer.
insights_heading = "\n---\n\n## 4. Actionable Insights\n\n"

insight_template = (
    "### {rank}. [{priority}] {category}\n\n"
    "**Finding**: {finding}\n\n"
    "**Recommended Action**: {action}\n\n"
)
insight_sections = ''.join(
    insight_template.format(
        rank=rank,
        priority=entry['priority'],
        category=entry['category'],
        finding=entry['finding'],
        action=entry['action'],
    )
    for rank, entry in enumerate(insights, 1)
)

closing_sections = """---

## 5. Next Steps

1. **Expand Dataset** - Process full manuscript (240 folios) to improve statistical reliability
2. **Upgrade Hypothesis Generation** - Implement advanced LLM models with confidence scoring
3. **Deep Language Analysis** - Perform morphological alignment with best-match language
4. **Temporal Analysis** - Track token evolution across manuscript sections
5. **Hypothesis Validation** - Test generated hypotheses against corpus patterns

---

## Conclusion

The current analysis provides a foundation for understanding the Voynich Manuscript's statistical structure. While preliminary results show interesting patterns, the analysis is limited by:

1. Small dataset size (sample transcription)
2. Generic hypothesis generation
3. Lack of temporal/positional analysis

Implementing the recommended next steps will significantly improve the depth and actionability of the decoder's findings.

---

*Report generated by Voynich Manuscript Decoder Analysis Pipeline*
"""

report_content += insights_heading + insight_sections + closing_sections

# Save report
output_file = Path('reports/analysis_report.md')
# The loading cell tolerates a missing reports/ tree, so create the target
# directory here rather than failing with FileNotFoundError.
output_file.parent.mkdir(parents=True, exist_ok=True)
# The report text contains non-ASCII characters; write it as UTF-8 explicitly
# because the platform default encoding may be unable to encode them.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(report_content)

print(f"‚úÖ Analysis report saved to: {output_file}")
print(f"üìÑ Report length: {len(report_content)} characters")
print(f"\nüéâ Complete results analysis finished!")