# Corall Recommendation Engine Exploration

This notebook helps explore how the recommendation engine works by:
- Loading papers from Zotero
- Computing similarity matrices
- Analyzing citation networks
- Visualizing recommendation scores

Data is saved to the `Test_Data/` directory (excluded from git).

## Setup

In [None]:
import sys
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add parent directory to path to import Corall modules
sys.path.insert(0, os.path.abspath('..'))

from src.zotero_client import ZoteroClient
from src.openalex_client import OpenAlexClient
from src.similarity_engine import SimilarityEngine
from src.citation_scorer import CitationScorer
from src.recommender import PaperRecommender

# Output directory for everything this notebook writes. The path is relative,
# so it resolves against the current working directory; an existing
# directory is left as it is.
TEST_DATA_DIR = Path('Test_Data')
TEST_DATA_DIR.mkdir(exist_ok=True)

# Both status lines go out through a single print call.
print(
    "✓ Imports successful",
    f"✓ Test data directory: {TEST_DATA_DIR.absolute()}",
    sep="\n",
)

## 1. Load Papers from Zotero

In [None]:
# Create the Zotero client (constructed without arguments here)
zotero = ZoteroClient()

# Retrieve the library contents
print("Fetching papers from Zotero...")
library_papers = zotero.fetch_library()

print(f"\n✓ Loaded {len(library_papers)} papers from Zotero")

# An entry counts as "having" a field when .get() returns a truthy value
doi_count = sum(bool(entry.get('doi')) for entry in library_papers)
print(f"  Papers with DOI: {doi_count}")
abstract_count = sum(bool(entry.get('abstract')) for entry in library_papers)
print(f"  Papers with abstract: {abstract_count}")

# Pickle the fetched library into Test_Data
library_pickle_path = TEST_DATA_DIR / 'library_papers.pkl'
with library_pickle_path.open('wb') as sink:
    pickle.dump(library_papers, sink)

print(f"\n✓ Saved to {library_pickle_path}")

In [None]:
# Preview table built from the first ten library entries
def _preview_title(text, limit=60):
    """Return *text* cut to *limit* characters, with '...' appended only if it was longer."""
    return text[:limit] + '...' if len(text) > limit else text


preview_rows = []
for entry in library_papers[:10]:
    preview_rows.append({
        'title': _preview_title(entry.get('title', '')),
        'year': entry.get('year', ''),
        'has_doi': bool(entry.get('doi')),
        'has_abstract': bool(entry.get('abstract')),
        'num_authors': len(entry.get('authors', [])),
    })

df_papers = pd.DataFrame(preview_rows)

print("Sample papers:")
# A bare expression on the last line lets the notebook render the table
df_papers

## 2. Compute Similarity Matrix

In [None]:
# Create the similarity engine (constructed without arguments here)
similarity_engine = SimilarityEngine()

print(
    "Building library profile (computing embeddings)...",
    "This may take a few minutes for large libraries...\n",
    sep="\n",
)

# Build the profile from the Zotero papers; the embeddings are read back
# from similarity_engine.library_embeddings right below.
similarity_engine.build_library_profile(library_papers)

embedding_count = len(similarity_engine.library_embeddings)
print(f"\n✓ Generated embeddings for {embedding_count} papers")

# Reports the shape of the first embedding as the "dimension"
first_embedding = similarity_engine.library_embeddings[0]
print(f"  Embedding dimension: {first_embedding.shape}")

In [None]:
# Compute pairwise similarity matrix
# `cosine` stays imported so any other cell that relies on it keeps working;
# pdist/squareform do the pairwise work in compiled code instead of a
# Python-level double loop with one scipy call per pair.
from scipy.spatial.distance import cosine, pdist, squareform

n_papers = len(similarity_engine.library_embeddings)

print(f"Computing {n_papers}x{n_papers} similarity matrix...")

if n_papers > 1:
    # Stack the per-paper vectors into one 2-D array (one row per paper).
    # NOTE(review): assumes every embedding is a 1-D vector of the same
    # length, which the previous per-pair cosine() call also required.
    embedding_matrix = np.asarray(similarity_engine.library_embeddings, dtype=float)

    # pdist returns the condensed cosine *distances* for every pair i < j;
    # squareform expands them to a symmetric matrix with a zero diagonal,
    # and 1 - distance turns that into similarity.
    similarity_matrix = 1.0 - squareform(pdist(embedding_matrix, metric='cosine'))

    # Self-similarity is defined as exactly 1.0, as before.
    np.fill_diagonal(similarity_matrix, 1.0)
else:
    # Zero or one paper: there are no pairs to compare. np.eye gives the
    # same result the loop produced (an empty 0x0 matrix, or [[1.0]]).
    similarity_matrix = np.eye(n_papers)

print(f"\n✓ Similarity matrix computed: {similarity_matrix.shape}")

# Save to Test_Data
np.save(TEST_DATA_DIR / 'similarity_matrix.npy', similarity_matrix)
print(f"✓ Saved to {TEST_DATA_DIR / 'similarity_matrix.npy'}")

In [None]:
# Heatmap of the full similarity matrix, colour scale pinned to 0..1
heatmap_path = TEST_DATA_DIR / 'similarity_matrix_heatmap.png'

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    similarity_matrix,
    ax=heatmap_ax,
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Cosine Similarity'},
)
heatmap_ax.set_title(f'Paper Similarity Matrix ({n_papers} papers)', fontsize=14, pad=20)
heatmap_ax.set_xlabel('Paper Index')
heatmap_ax.set_ylabel('Paper Index')
heatmap_fig.tight_layout()

# Write the PNG before showing the figure
heatmap_fig.savefig(heatmap_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved visualization to {heatmap_path}")

In [None]:
# Similarity statistics
# Only entries strictly above the diagonal are used (k=1), so each pair is
# counted once and the self-similarity diagonal is left out.
upper_triangle = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]

# Mean and median are needed for both the printout and the plot markers
sim_mean = upper_triangle.mean()
sim_median = np.median(upper_triangle)

print("Similarity Statistics:")
for stat_label, stat_value in (
    ("Mean similarity", sim_mean),
    ("Median similarity", sim_median),
    ("Std deviation", upper_triangle.std()),
    ("Min similarity", upper_triangle.min()),
    ("Max similarity", upper_triangle.max()),
    ("25th percentile", np.percentile(upper_triangle, 25)),
    ("75th percentile", np.percentile(upper_triangle, 75)),
):
    print(f"  {stat_label}: {stat_value:.3f}")

# Histogram of the pairwise similarities with mean/median marker lines
distribution_path = TEST_DATA_DIR / 'similarity_distribution.png'

dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
dist_ax.hist(upper_triangle, bins=50, edgecolor='black', alpha=0.7)
dist_ax.axvline(sim_mean, color='red', linestyle='--', label=f'Mean: {sim_mean:.3f}')
dist_ax.axvline(sim_median, color='green', linestyle='--', label=f'Median: {sim_median:.3f}')
dist_ax.set_xlabel('Cosine Similarity', fontsize=12)
dist_ax.set_ylabel('Frequency', fontsize=12)
dist_ax.set_title('Distribution of Paper Similarities', fontsize=14)
dist_ax.legend()
dist_ax.grid(alpha=0.3)
dist_fig.tight_layout()
dist_fig.savefig(distribution_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved distribution plot to {distribution_path}")

## 3. Build Citation Network

In [None]:
# Create the OpenAlex client and the citation scorer (both without arguments)
openalex = OpenAlexClient()
citation_scorer = CitationScorer()

print(
    "Building citation network from OpenAlex...",
    "This will query OpenAlex for each paper in your library.",
    "May take several minutes...\n",
    sep="\n",
)

# The network is read back from citation_scorer.library_network below.
# NOTE(review): the 'openalex_id' count below presumably relies on this call
# annotating the library papers in place; confirm against CitationScorer.
citation_scorer.build_library_network(openalex, library_papers)

print("\n✓ Citation network built")
print(f"  Total works in network: {len(citation_scorer.library_network)}")
mapped_total = sum(1 for entry in library_papers if entry.get('openalex_id'))
print(f"  Library papers mapped: {mapped_total}")

# Pickle the network into Test_Data
network_pickle_path = TEST_DATA_DIR / 'citation_network.pkl'
with network_pickle_path.open('wb') as sink:
    pickle.dump(citation_scorer.library_network, sink)

print(f"\n✓ Saved to {network_pickle_path}")

In [None]:
# Citation network statistics
network_size = len(citation_scorer.library_network)
library_size = len(library_papers)
# A paper counts as mapped when it carries a truthy 'openalex_id'
mapped_papers = sum(1 for p in library_papers if p.get('openalex_id'))

# Both ratios clamp their denominator to at least 1, so an empty library
# (or a library with no mapped papers) reports 0 instead of raising
# ZeroDivisionError. For non-empty inputs the values are unchanged.
mapped_percent = mapped_papers / max(library_size, 1) * 100
expansion_factor = network_size / max(mapped_papers, 1)

print("Citation Network Statistics:")
print(f"  Library papers: {library_size}")
print(f"  Papers mapped to OpenAlex: {mapped_papers} ({mapped_percent:.1f}%)")
print(f"  Total works in network: {network_size}")
print(f"  Network expansion factor: {expansion_factor:.1f}x")

# Save statistics
stats = {
    'library_size': library_size,
    'mapped_papers': mapped_papers,
    'network_size': network_size,
    'expansion_factor': expansion_factor,
}

with open(TEST_DATA_DIR / 'citation_stats.json', 'w') as f:
    json.dump(stats, f, indent=2)

print(f"\n✓ Saved statistics to {TEST_DATA_DIR / 'citation_stats.json'}")

## 4. Test Recommendation Scoring

In [None]:
# Fetch a small batch of recent papers to score
from datetime import datetime, timedelta

# Lower bound of the search window: seven days before now, as YYYY-MM-DD
week_ago = datetime.now() - timedelta(days=7)
from_date = f"{week_ago:%Y-%m-%d}"

print(f"Searching for recent papers (from {from_date})...")
print("Note: This uses default journal filtering\n")

# The search may be slow depending on the journal filters, so the number of
# results is capped to keep test runs fast.
search_options = {
    'from_date': from_date,
    'limit': 50,
}
test_candidates = openalex.search_recent_papers(**search_options)

print(f"\n✓ Found {len(test_candidates)} candidate papers")

In [None]:
# Score the candidates: similarity first, then citations
print("Computing similarity scores...")
test_candidates = similarity_engine.compute_similarity(test_candidates)

print("Computing citation scores...")
test_candidates = citation_scorer.compute_citation_scores(test_candidates)

# Weights of the two components in the blended score
citation_weight = 0.3
similarity_weight = 0.7


def _blend_scores(candidate):
    """Weighted sum of a candidate's citation and similarity scores (a missing score counts as 0)."""
    return (
        citation_weight * candidate.get('citation_score', 0) +
        similarity_weight * candidate.get('similarity_score', 0)
    )


for candidate in test_candidates:
    candidate['combined_score'] = _blend_scores(candidate)

# Highest combined score first (in-place sort)
test_candidates.sort(key=lambda scored: scored['combined_score'], reverse=True)

print(f"\n✓ Scored {len(test_candidates)} papers")

# Pickle the scored candidates into Test_Data
recommendations_path = TEST_DATA_DIR / 'test_recommendations.pkl'
with recommendations_path.open('wb') as sink:
    pickle.dump(test_candidates, sink)

print(f"✓ Saved to {recommendations_path}")

In [None]:
# Print the ten best-scoring candidates with their score breakdown
def _ellipsize(text, limit):
    """Return *text* cut to *limit* characters, with '...' appended only if it was longer."""
    return text[:limit] + '...' if len(text) > limit else text


print("\nTop 10 Recommendations:\n" + "="*60)

for rank, candidate in enumerate(test_candidates[:10], start=1):
    title = _ellipsize(candidate.get('title', 'Unknown'), 60)
    combined = candidate.get('combined_score', 0)
    citation = candidate.get('citation_score', 0)
    similarity = candidate.get('similarity_score', 0)

    print(f"\n{rank}. {title}")
    print(f"   Combined: {combined:.3f} | Citation: {citation:.3f} | Similarity: {similarity:.3f}")

    # The closest library paper is optional; skip the extra line without it
    closest = candidate.get('most_similar_paper')
    if not closest:
        continue
    similar_title = _ellipsize(closest.get('title', 'Unknown'), 50)
    print(f"   Most similar to: \"{similar_title}\"")

In [None]:
# 2x2 figure: three score histograms plus a citation-vs-similarity scatter
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Score series, one value per candidate (a missing score counts as 0)
combined_scores = [c.get('combined_score', 0) for c in test_candidates]
citation_scores = [c.get('citation_score', 0) for c in test_candidates]
similarity_scores = [c.get('similarity_score', 0) for c in test_candidates]

# (axes cell, values, axis label, bar colour, mean-line colour)
histogram_panels = (
    (axes[0, 0], combined_scores, 'Combined Score', 'purple', 'red'),
    (axes[0, 1], citation_scores, 'Citation Score', 'red', 'darkred'),
    (axes[1, 0], similarity_scores, 'Similarity Score', 'blue', 'darkblue'),
)
for panel, values, score_label, bar_color, mean_color in histogram_panels:
    panel.hist(values, bins=30, edgecolor='black', alpha=0.7, color=bar_color)
    panel.set_xlabel(score_label)
    panel.set_ylabel('Frequency')
    panel.set_title(f'{score_label} Distribution')
    # Dashed vertical line at the mean, labelled in the legend
    panel_mean = np.mean(values)
    panel.axvline(panel_mean, color=mean_color, linestyle='--', label=f'Mean: {panel_mean:.3f}')
    panel.legend()
    panel.grid(alpha=0.3)

# Bottom-right panel: citation score against similarity score
scatter_ax = axes[1, 1]
scatter_ax.scatter(citation_scores, similarity_scores, alpha=0.6, s=50)
scatter_ax.set_xlabel('Citation Score')
scatter_ax.set_ylabel('Similarity Score')
scatter_ax.set_title('Citation vs Similarity Scores')
scatter_ax.grid(alpha=0.3)

# Correlation coefficient between the two series, shown in the corner
correlation = np.corrcoef(citation_scores, similarity_scores)[0, 1]
scatter_ax.text(
    0.05, 0.95, f'Correlation: {correlation:.3f}',
    transform=scatter_ax.transAxes, verticalalignment='top',
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
)

scores_figure_path = TEST_DATA_DIR / 'score_distributions.png'
plt.tight_layout()
plt.savefig(scores_figure_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved score visualizations to {scores_figure_path}")

## 5. Analyze Most Similar Papers

In [None]:
# Tally how often each library paper is a candidate's closest match
from collections import Counter

# Candidates without a 'most_similar_paper' entry are skipped
most_similar_titles = [
    match.get('title', 'Unknown')
    for match in (candidate.get('most_similar_paper') for candidate in test_candidates)
    if match
]

title_counts = Counter(most_similar_titles)

print("Top 10 Most Frequently Matched Library Papers:\n" + "="*60)
for rank, (title, count) in enumerate(title_counts.most_common(10), start=1):
    display_title = title if len(title) <= 70 else title[:70] + '...'
    # Share is relative to all candidates, not only those that had a match
    share = count / len(test_candidates) * 100
    print(f"{rank}. {display_title}")
    print(f"   Matched {count} times ({share:.1f}% of recommendations)\n")

## 6. Summary Report

In [None]:
# Assemble the summary report from values computed in the earlier cells
library_summary = {
    'total_papers': len(library_papers),
    'papers_with_doi': sum(1 for entry in library_papers if entry.get('doi')),
    'papers_with_abstract': sum(1 for entry in library_papers if entry.get('abstract')),
}

# float() converts the NumPy scalars to plain Python floats before JSON dumping
similarity_summary = {
    'mean_similarity': float(upper_triangle.mean()),
    'median_similarity': float(np.median(upper_triangle)),
    'std_similarity': float(upper_triangle.std()),
}

network_summary = {
    'network_size': len(citation_scorer.library_network),
    'mapped_papers': sum(1 for entry in library_papers if entry.get('openalex_id')),
}

recommendation_summary = {
    'num_candidates': len(test_candidates),
    'mean_combined_score': float(np.mean(combined_scores)),
    'mean_citation_score': float(np.mean(citation_scores)),
    'mean_similarity_score': float(np.mean(similarity_scores)),
    'citation_similarity_correlation': float(correlation),
}

summary = {
    'library': library_summary,
    'similarity': similarity_summary,
    'citation_network': network_summary,
    'recommendations': recommendation_summary,
}

# Write the report as JSON into Test_Data
summary_path = TEST_DATA_DIR / 'analysis_summary.json'
with summary_path.open('w') as sink:
    json.dump(summary, sink, indent=2)

# Console version of the same report
banner = "=" * 60
print("\n" + banner)
print("ANALYSIS SUMMARY")
print(banner)

print("\nLibrary:")
print(f"  Total papers: {library_summary['total_papers']}")
print(f"  Papers with DOI: {library_summary['papers_with_doi']}")
print(f"  Papers with abstract: {library_summary['papers_with_abstract']}")

print("\nSimilarity Matrix:")
print(f"  Mean similarity: {similarity_summary['mean_similarity']:.3f}")
print(f"  Median similarity: {similarity_summary['median_similarity']:.3f}")
print(f"  Std deviation: {similarity_summary['std_similarity']:.3f}")

print("\nCitation Network:")
print(f"  Network size: {network_summary['network_size']} works")
print(f"  Mapped papers: {network_summary['mapped_papers']}")

print("\nRecommendations:")
print(f"  Candidates tested: {recommendation_summary['num_candidates']}")
print(f"  Mean combined score: {recommendation_summary['mean_combined_score']:.3f}")
print(f"  Mean citation score: {recommendation_summary['mean_citation_score']:.3f}")
print(f"  Mean similarity score: {recommendation_summary['mean_similarity_score']:.3f}")
print(f"  Citation-Similarity correlation: {recommendation_summary['citation_similarity_correlation']:.3f}")

print(f"\n✓ Saved summary to {summary_path}")
print("\n" + banner)

## Generated Files

All data is saved to the `Test_Data/` directory:

- `library_papers.pkl` - Your Zotero library papers
- `similarity_matrix.npy` - Full similarity matrix (NxN)
- `similarity_matrix_heatmap.png` - Visualization of similarity matrix
- `similarity_distribution.png` - Distribution of similarity scores
- `citation_network.pkl` - Citation network from OpenAlex
- `citation_stats.json` - Citation network statistics
- `test_recommendations.pkl` - Test candidate papers with scores
- `score_distributions.png` - Score distribution visualizations
- `analysis_summary.json` - Complete analysis summary

**Note:** `Test_Data/` is excluded from git to save space.