# Example Analysis: Semantic Classification Results

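This notebook demonstrates how to explore the results from the semantic classification pipeline. It reads `clusters.json`, `gap_analysis.json`, and `hypotheses.json` from `../output`, so those files must exist before the cells below are run.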

In [None]:
import sys
sys.path.append('../src')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Results

In [None]:
# Load the JSON outputs that the rest of the notebook reads.
output_dir = Path('../output')


def load_json(path):
    """Read and parse a single JSON file.

    The encoding is pinned to UTF-8 so parsing does not depend on the
    platform's default locale encoding.

    Raises FileNotFoundError if the file is missing and
    json.JSONDecodeError if its content is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


clusters = load_json(output_dir / 'clusters.json')
gaps = load_json(output_dir / 'gap_analysis.json')
hypotheses = load_json(output_dir / 'hypotheses.json')

print("✅ Results loaded")

## Cluster Analysis

In [None]:
# Collect the size of every cluster in the summary, skipping the 'noise' entry
cluster_stats = clusters['summary']
size_by_cluster = {
    name: stats['size']
    for name, stats in cluster_stats.items()
    if name != 'noise'
}
cluster_names = list(size_by_cluster)
cluster_sizes = list(size_by_cluster.values())

# Bar chart: one bar per cluster, labelled with the cluster's key
positions = range(len(cluster_names))
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(positions, cluster_sizes, color='steelblue')
ax.set_xlabel('Cluster ID')
ax.set_ylabel('Number of Papers')
ax.set_title('Distribution of Papers Across Clusters')
ax.set_xticks(positions)
ax.set_xticklabels(cluster_names, rotation=45)
fig.tight_layout()
plt.show()

# The noise count falls back to 0 when the summary has no 'noise' entry
noise_size = cluster_stats.get('noise', {}).get('size', 0)
print(f"Total clusters: {len(cluster_names)}")
print(f"Noise points: {noise_size}")

## Gap Analysis Summary

In [None]:
# Text report: headline numbers, the count of each gap category, then insights
divider = "=" * 50
gap_summary = gaps['summary']

print("Gap Analysis Summary")
print(divider)
print(f"Total papers: {gap_summary['total_papers']}")
print(f"Clusters found: {gap_summary['n_clusters']}")
print("\nGaps identified:")

# (display label, key in the gap analysis dict), printed in this order
gap_sections = (
    ('Temporal gaps', 'temporal_gaps'),
    ('Methodological gaps', 'methodological_gaps'),
    ('Contradictions', 'contradictions'),
    ('Cross-cluster opportunities', 'cross_cluster_opportunities'),
)
for label, key in gap_sections:
    print(f"  - {label}: {len(gaps[key])}")

print("\n" + divider)
print("Key Insights:")
# 'actionable_insights' is optional; nothing is printed when it is absent
for insight in gaps.get('actionable_insights', []):
    print(f"  {insight}")

## Temporal Gap Analysis

In [None]:
# Horizontal bar chart of 'years_since_peak' per cluster, colour-coded by age
temporal_data = gaps['temporal_gaps']

if temporal_data:
    clusters_list = list(temporal_data)
    years_since_peak = [entry['years_since_peak'] for entry in temporal_data.values()]

    def gap_color(years):
        """Map a gap in years to the bar colour: >5 red, >3 orange, else green."""
        if years > 5:
            return 'red'
        if years > 3:
            return 'orange'
        return 'green'

    colors = [gap_color(years) for years in years_since_peak]
    rows = range(len(clusters_list))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(rows, years_since_peak, color=colors)
    # NOTE(review): the label says "most recent paper" but the plotted field is
    # 'years_since_peak' — confirm which of the two is intended.
    ax.set_xlabel('Years Since Most Recent Paper')
    ax.set_ylabel('Cluster')
    ax.set_title('Temporal Gaps by Cluster (Red=Outdated, Orange=Needs Update, Green=Recent)')
    ax.set_yticks(rows)
    ax.set_yticklabels(clusters_list)
    # Dashed guides at the same thresholds used for the bar colours
    ax.axvline(x=5, color='red', linestyle='--', alpha=0.5, label='Outdated (>5 years)')
    ax.axvline(x=3, color='orange', linestyle='--', alpha=0.5, label='Needs update (>3 years)')
    ax.legend()
    fig.tight_layout()
    plt.show()

## Hypothesis Summary

In [None]:
# Text report: run metadata, hypothesis counts per type, and the three best entries
rule = "=" * 50

print("Hypothesis Generation Summary")
print(rule)
run_meta = hypotheses['metadata']
print(f"Total hypotheses: {run_meta['total_hypotheses']}")
print(f"Generated at: {run_meta['generated_at']}")
print("\nBy type:")
counts_by_type = hypotheses['summary']['by_type']
for htype in counts_by_type:
    print(f"  - {htype}: {counts_by_type[htype]}")

print("\n" + rule)
print("Top 3 Hypotheses:\n")

# (display label, key in the hypothesis dict); missing keys are shown as 'N/A'
rating_fields = (
    ('Novelty', 'novelty'),
    ('Feasibility', 'feasibility'),
    ('Impact', 'impact_potential'),
)
top_three = hypotheses['summary']['top_5'][:3]
for rank, hyp in enumerate(top_three, start=1):
    print(f"{rank}. [{hyp['type'].upper()}] Score: {hyp.get('score', 0):.2f}")
    print(f"   {hyp['hypothesis']}")
    ratings = ", ".join(
        f"{label}: {hyp.get(key, 'N/A')}" for label, key in rating_fields
    )
    print(f"   {ratings}")
    print()

## Hypothesis Score Distribution

In [None]:
# Histogram of hypothesis scores with the median marked by a dashed line.
# A hypothesis without a 'score' key is counted as 0.
all_hyp = hypotheses['all_hypotheses']
scores = [h.get('score', 0) for h in all_hyp]

if scores:
    # Compute the median once; it is used for both the line position and its label
    median_score = np.median(scores)

    plt.figure(figsize=(12, 6))
    plt.hist(scores, bins=20, color='steelblue', edgecolor='black')
    plt.xlabel('Hypothesis Score')
    plt.ylabel('Count')
    plt.title('Distribution of Hypothesis Scores')
    plt.axvline(x=median_score, color='red', linestyle='--', label=f'Median: {median_score:.2f}')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    # np.median of an empty list is NaN (with a RuntimeWarning), so skip the
    # plot instead of drawing an empty histogram with a "Median: nan" legend
    print("No hypotheses available - skipping score distribution plot")

## Export High-Priority Hypotheses

In [None]:
# Build a table of the top hypotheses, show it, and save it as CSV.
top_hypotheses = hypotheses['summary']['top_5']


def _shorten(text, limit=100):
    """Return `text` unchanged if it fits in `limit` characters.

    Longer text is cut to `limit` characters and '...' is appended, so the
    ellipsis only appears when something was actually removed.
    """
    return text if len(text) <= limit else text[:limit] + '...'


df_hyp = pd.DataFrame([
    {
        'type': h['type'],
        'score': h.get('score', 0),
        'hypothesis': _shorten(h['hypothesis']),
        'novelty': h.get('novelty', 'N/A'),
        'feasibility': h.get('feasibility', 'N/A'),
        'impact': h.get('impact_potential', 'N/A')
    }
    for h in top_hypotheses
])

print("Top 5 Hypotheses:")
display(df_hyp)

# Save next to the files that were loaded, reusing output_dir instead of
# repeating the directory as a second hard-coded string
df_hyp.to_csv(output_dir / 'top_hypotheses.csv', index=False)
print("\n✅ Saved to output/top_hypotheses.csv")

## Next Steps

1. **Explore individual clusters** in `clusters.json`
2. **Review specific contradictions** in `gap_analysis.json`
3. **Deep dive into hypotheses** in `hypotheses.json`
4. **Use Claude agents** for interactive Q&A
5. **Customize this notebook** for your specific research questions