In [1]:
# =============================================================================
# CELL 1: Setup and Directory Structure
# =============================================================================

import sys
from pathlib import Path

# Setup paths
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Configuration helper
def create_config(overrides=None):
    from config import PipelineConfig
    config = PipelineConfig(overrides)
    config.project_root = project_root
    config.documents_root = project_root / "documents"
    config.literature_folder = config.documents_root / "literature"
    config.your_work_folder = config.documents_root / "your_work"
    config.biblio_folder = config.documents_root / "biblio"
    config.current_drafts_folder = config.documents_root / "current_drafts"
    config.cache_file = project_root / "physics_knowledge_base.pkl"
    config._create_directories()
    return config

# Create configuration and directories
config = create_config()

print("üìÅ YOUR PHYSICS PIPELINE DIRECTORY STRUCTURE")
print("=" * 60)
print(f"Project Root: {project_root}")
print()

# Show where to put files
folders = {
    "biblio": "Put your Zotero .bib files here",
    "literature": "Downloaded papers go here (auto-created)",
    "your_work": "Your own published papers",
    "current_drafts": "Your current drafts"
}

for folder_name, description in folders.items():
    folder_path = config.documents_root / folder_name
    exists = "‚úÖ" if folder_path.exists() else "‚ùå"
    file_count = len(list(folder_path.iterdir())) if folder_path.exists() else 0
    
    print(f"{exists} {folder_name}/ - {description}")
    print(f"   üìç Location: {folder_path}")
    print(f"   üìä Current files: {file_count}")
    print()

print("üéØ TO GET STARTED:")
print("1. Export .bib files from Zotero ‚Üí Save to documents/biblio/")
print("2. Copy your papers ‚Üí Save to documents/your_work/")
print("3. Run the commands below!")

üìÅ YOUR PHYSICS PIPELINE DIRECTORY STRUCTURE
Project Root: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular

‚úÖ biblio/ - Put your Zotero .bib files here
   üìç Location: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/biblio
   üìä Current files: 1

‚úÖ literature/ - Downloaded papers go here (auto-created)
   üìç Location: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature
   üìä Current files: 18

‚úÖ your_work/ - Your own published papers
   üìç Location: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/your_work
   üìä Current files: 10

‚úÖ current_drafts/ - Your current drafts
   üìç Location: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/current_drafts
   üìä Current files: 2

üéØ TO GET STARTED:
1

In [2]:
# =============================================================================
# CELL 2: Check for Existing .bib Files
# =============================================================================

# Check what .bib files you already have
biblio_folder = config.biblio_folder
bib_files = list(biblio_folder.glob("*.bib"))

print("üìö CHECKING FOR EXISTING .BIB FILES")
print("=" * 40)

if bib_files:
    print(f"‚úÖ Found {len(bib_files)} .bib file(s):")
    for bib_file in bib_files:
        file_size = bib_file.stat().st_size / 1024  # KB
        print(f"   üìÑ {bib_file.name} ({file_size:.1f} KB)")
    
    # Quick preview of first .bib file
    first_bib = bib_files[0]
    try:
        content = first_bib.read_text()
        lines = content.split('\n')[:10]
        print(f"\nüìñ Preview of {first_bib.name}:")
        for line in lines:
            if line.strip():
                print(f"   {line[:60]}...")
                break
    except Exception as e:
        print(f"   Could not preview file: {e}")
        
    print(f"\nüöÄ Ready to download literature!")
    
else:
    print("üì≠ No .bib files found")
    print("\nüí° TO ADD .BIB FILES:")
    print("1. Open Zotero")
    print("2. Select your collection")  
    print("3. Right-click ‚Üí Export Collection")
    print("4. Choose 'BibTeX' format")
    print(f"5. Save to: {biblio_folder}")

üìö CHECKING FOR EXISTING .BIB FILES
‚úÖ Found 1 .bib file(s):
   üìÑ my_papers.bib (25.5 KB)

üìñ Preview of my_papers.bib:
   @article{A.S.Holevo1998CapacityQuantumChannel,...

üöÄ Ready to download literature!


In [3]:
# =============================================================================
# CELL 3: Download Literature from .bib Files (IF YOU HAVE THEM)
# =============================================================================

# Only run this if you have .bib files
bib_files = list(config.biblio_folder.glob("*.bib"))

if bib_files:
    print("üöÄ DOWNLOADING LITERATURE FROM ARXIV")
    print("=" * 45)
    
    from src.downloaders import LiteratureDownloader
    
    # Use the first .bib file (or modify to use a specific one)
    bib_file = bib_files[0]
    print(f"üìñ Using: {bib_file.name}")
    
    # Initialize downloader
    downloader = LiteratureDownloader(
        output_directory=config.literature_folder,
        delay_between_downloads=config.download_delay,
        arxiv_config=config.get_arxiv_config()
    )
    
    print("‚è≥ Starting download... (this may take several minutes)")
    print("üí° The system will:")
    print("   1. Parse your .bib file")
    print("   2. Search for papers on arXiv")
    print("   3. Download PDFs and TEX files")
    print("   4. Generate a detailed report")
    print()
    
    # Download papers
    results = downloader.download_from_bibtex(bib_file)
    
    print(f"\n‚úÖ DOWNLOAD COMPLETE!")
    print(f"   üì• Successfully downloaded: {len(results['successful'])}")
    print(f"   ‚ùå Not found: {len(results['failed'])}")
    
    if results['successful']:
        print(f"\nüìÑ Sample downloads:")
        for i, result in enumerate(results['successful'][:3]):
            paper = result.paper_metadata
            print(f"   {i+1}. {paper.title[:50]}...")
            print(f"      arXiv: {result.search_result.arxiv_id}")
            
else:
    print("‚è≠Ô∏è SKIPPING DOWNLOAD - No .bib files found")
    print("Add .bib files to documents/biblio/ first!")

üöÄ DOWNLOADING LITERATURE FROM ARXIV
üìñ Using: my_papers.bib
2025-05-26 17:20:34 - physics_pipeline.src.downloaders.bibtex_parser - INFO - BibTeX parser initialized
2025-05-26 17:20:34 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - ArXiv searcher initialized
2025-05-26 17:20:34 - physics_pipeline.src.downloaders.literature_downloader - INFO - Literature downloader initialized with output: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature
‚è≥ Starting download... (this may take several minutes)
üí° The system will:
   1. Parse your .bib file
   2. Search for papers on arXiv
   3. Download PDFs and TEX files
   4. Generate a detailed report

2025-05-26 17:20:34 - physics_pipeline.src.downloaders.literature_downloader - INFO - Starting download process from /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/biblio/my_papers.bib
2025-05-26 17:20:34 - physics_

2025-05-26 17:20:55 - physics_pipeline.src.downloaders.literature_downloader - INFO - Processing paper 7/17: Fluctuations of Work from Quantum Subensembles: Th...
2025-05-26 17:20:55 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - Searching for paper: Fluctuations of Work from Quantum Subensembles: Th...
2025-05-26 17:20:59 - physics_pipeline.src.downloaders.literature_downloader - INFO - Processing paper 8/17: Unitary-Projective Entanglement Dynamics...
2025-05-26 17:20:59 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - Searching for paper: Unitary-Projective Entanglement Dynamics...
2025-05-26 17:21:00 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - Downloading paper: 1808.05949
2025-05-26 17:21:00 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - PDF downloaded: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature/1808.05949.pdf
2025-05-26 17:21:01 - physics_pipeline.src.utils.file

2025-05-26 17:21:45 - physics_pipeline.src.utils.file_utils - INFO - Extracted main tex file: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature/1107.4497.tex
2025-05-26 17:21:45 - physics_pipeline.src.downloaders.arxiv_searcher - INFO - TEX downloaded and extracted: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature/1107.4497.tex
2025-05-26 17:21:46 - physics_pipeline.src.downloaders.literature_downloader - INFO - ‚úì Successfully downloaded: 1107.4497
2025-05-26 17:21:46 - physics_pipeline.src.downloaders.literature_downloader - INFO - Download report saved to /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/literature/download_report.md

üìö LITERATURE DOWNLOAD SUMMARY

üìä OVERALL STATISTICS:
   Total papers processed: 17
   Successfully downloaded: 10
   Failed downloads: 7
   Success rate: 58.8%
   To

In [4]:
# =============================================================================
# CELL 4: Build Knowledge Base (WITH PYTORCH FIX)
# =============================================================================

# PYTORCH COMPATIBILITY FIX - Add this first!
import torch

if not hasattr(torch, 'get_default_device'):
    def get_default_device():
        """Fallback implementation for older PyTorch versions."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        else:
            return torch.device('cpu')
    
    torch.get_default_device = get_default_device
    print("‚úÖ Applied PyTorch compatibility fix")

# Now the imports should work
print("\nüèóÔ∏è BUILDING KNOWLEDGE BASE")
print("=" * 35)

from src.core import KnowledgeBase

# Initialize knowledge base
kb = KnowledgeBase(
    embedding_model=config.embedding_model,
    chunk_size=config.chunk_size,
    chunk_overlap=config.chunk_overlap
)

print("‚è≥ Building knowledge base from all documents...")
print("üí° This processes:")
print("   üìö Downloaded literature")
print("   üìù Your own papers") 
print("   ‚úèÔ∏è  Current drafts")
print("   üß† Creates semantic embeddings")
print()

# Build knowledge base
stats = kb.build_from_directories(
    literature_folder=config.literature_folder,
    your_work_folder=config.your_work_folder,
    current_drafts_folder=config.current_drafts_folder
)

print("‚úÖ KNOWLEDGE BASE BUILT!")
print(f"   üìö Total documents: {stats.get('total_documents', 0)}")
print(f"   ‚úÖ Successfully processed: {stats.get('successful_documents', 0)}")
print(f"   üß© Total text chunks: {stats.get('total_chunks', 0)}")
print(f"   üìä Total words: {stats.get('total_words', 0):,}")

# Show source breakdown
source_breakdown = stats.get('source_breakdown', {})
if source_breakdown:
    print(f"\nüìÅ Source breakdown:")
    for source_type, info in source_breakdown.items():
        count = info.get('successful', 0) if isinstance(info, dict) else info
        print(f"   {source_type}: {count} documents")

# Save knowledge base
kb.save_to_file(config.cache_file)
print(f"\nüíæ Knowledge base saved to: {config.cache_file}")

# Store for next cell
globals()['knowledge_base'] = kb

‚úÖ Applied PyTorch compatibility fix

üèóÔ∏è BUILDING KNOWLEDGE BASE
ERROR! Session/line number was not unique in database. History logging moved to new session 2327
2025-05-26 17:22:07 - physics_pipeline.src.core.document_processor - INFO - Document processor initialized with extensions: {'.tex', '.txt', '.pdf'}
2025-05-26 17:22:07 - physics_pipeline.src.core.embeddings - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-05-26 17:22:10 - physics_pipeline.src.core.embeddings - INFO - Embeddings manager initialized with all-MiniLM-L6-v2
2025-05-26 17:22:10 - physics_pipeline.src.core.knowledge_base - INFO - Knowledge base initialized
‚è≥ Building knowledge base from all documents...
üí° This processes:
   üìö Downloaded literature
   üìù Your own papers
   ‚úèÔ∏è  Current drafts
   üß† Creates semantic embeddings

2025-05-26 17:22:10 - physics_pipeline.src.core.knowledge_base - INFO - Building knowledge base from directories
2025-05-26 17:22:10 - physics_pipeline.src.core.know

2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Processing file: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/your_work/notes.tex
2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Successfully processed notes.tex: 36228 characters
2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Processing file: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/your_work/qgse.tex
2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Successfully processed qgse.tex: 58865 characters
2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Processing file: /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/documents/your_work/PRXQuantum.3.020355 (1).pdf
2025-05-26 17:22:11 - physics_pipeline.src.core.document_processor - INFO - Succes

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2025-05-26 17:22:22 - physics_pipeline.src.core.embeddings - INFO - Total chunks in database: 345
2025-05-26 17:22:22 - physics_pipeline.src.core.knowledge_base - INFO - Knowledge base built successfully:
2025-05-26 17:22:22 - physics_pipeline.src.core.knowledge_base - INFO -   Total documents: 29
2025-05-26 17:22:22 - physics_pipeline.src.core.knowledge_base - INFO -   Successful processing: 29
2025-05-26 17:22:22 - physics_pipeline.src.core.knowledge_base - INFO -   Total chunks: 345
‚úÖ KNOWLEDGE BASE BUILT!
   üìö Total documents: 29
   ‚úÖ Successfully processed: 29
   üß© Total text chunks: 345
   üìä Total words: 272,008

üìÅ Source breakdown:
   literature: 17 documents
   your_work: 10 documents
   current_drafts: 2 documents
2025-05-26 17:22:22 - physics_pipeline.src.core.knowledge_base - INFO - Saving knowledge base to /Users/fanza/Desktop/Projects/AcademicAssistantExperiments/physics_synthesis_project_modular/physics_knowledge_base.pkl
2025-05-26 17:22:22 - physics_pipe

In [5]:
# =============================================================================
# CELL 5: Test Literature-Aware Chat (IF YOU HAVE API KEY)
# =============================================================================

print("\nü§ñ TESTING LITERATURE-AWARE CHAT")
print("=" * 40)

try:
    config.validate_api_keys()
    
    from src.chat import LiteratureAssistant
    
    # Create literature assistant
    assistant = LiteratureAssistant(
        knowledge_base=kb,
        anthropic_api_key=config.anthropic_api_key,
        chat_config=config.get_chat_config()
    )
    
    print("‚úÖ Literature assistant created!")
    print(f"üìä Knowledge base: {kb.get_statistics().get('total_documents', 0)} documents")
    
    # Simple chat function
    def chat(question, show_sources=True):
        """Simple chat with the literature assistant."""
        print(f"\nüßë‚Äçüî¨ Question: {question}")
        print("ü§ñ Assistant: ", end="", flush=True)
        
        try:
            response = assistant.ask(question)
            print(response.content)
            
            if show_sources and response.sources_used:
                print(f"\nüìö Sources: {', '.join(response.sources_used)}")
            
        except Exception as e:
            print(f"Error: {e}")
        
        print("\n" + "-" * 50)
    
    print("\nüí¨ READY FOR CHAT!")
    print("üí° Try these commands:")
    print("   chat('What papers do I have in my knowledge base?')")
    print("   chat('What are the main research topics?')")
    print("   chat('Explain quantum entanglement')")
    
    # Store chat function for easy use
    globals()['chat'] = chat
    globals()['assistant'] = assistant
    
except ValueError as e:
    print("‚ö†Ô∏è  Cannot test chat - API key not set")
    print("üí° Set your ANTHROPIC_API_KEY environment variable to test chat")

# =============================================================================
# CELL 6: Summary and Next Steps
# =============================================================================

print("\nüéâ PHYSICS PIPELINE SETUP COMPLETE!")
print("=" * 45)

# Final statistics
bib_count = len(list(config.biblio_folder.glob("*.bib")))
lit_count = len(list(config.literature_folder.iterdir())) if config.literature_folder.exists() else 0
work_count = len(list(config.your_work_folder.iterdir())) if config.your_work_folder.exists() else 0

print(f"üìä FINAL STATUS:")
print(f"   üìö .bib files: {bib_count}")
print(f"   üìÑ Downloaded papers: {lit_count}")
print(f"   üìù Your papers: {work_count}")
print(f"   üß† Knowledge base: {'‚úÖ Built' if 'knowledge_base' in globals() else '‚ùå Not built'}")
print(f"   ü§ñ Chat ready: {'‚úÖ Yes' if 'chat' in globals() else '‚ùå Need API key'}")

print(f"\nüöÄ WHAT YOU CAN DO NOW:")
if 'chat' in globals():
    print("   ‚Ä¢ chat('your question here') - Ask the AI assistant")
    print("   ‚Ä¢ assistant.synthesize_literature('topic') - Get research synthesis")
    print("   ‚Ä¢ assistant.help_with_writing('task') - Get writing help")
else:
    print("   ‚Ä¢ Add .bib files and run cells above")
    print("   ‚Ä¢ Set ANTHROPIC_API_KEY for chat functionality")

print(f"\nüìÅ KEY LOCATIONS:")
print(f"   üìö Add .bib files: {config.biblio_folder}")
print(f"   üìù Add your papers: {config.your_work_folder}")
print(f"   üìÑ Downloaded papers: {config.literature_folder}")
print(f"   üíæ Knowledge base cache: {config.cache_file}")

print("\nüéØ Your physics literature synthesis pipeline is ready! üî¨üìö")


ü§ñ TESTING LITERATURE-AWARE CHAT
2025-05-26 17:23:32 - physics_pipeline.src.chat.chat_interface - INFO - Chat interface initialized with model: claude-3-5-sonnet-20241022
2025-05-26 17:23:32 - physics_pipeline.src.chat.literature_assistant - INFO - Literature assistant initialized
‚úÖ Literature assistant created!
üìä Knowledge base: 29 documents

üí¨ READY FOR CHAT!
üí° Try these commands:
   chat('What papers do I have in my knowledge base?')
   chat('What are the main research topics?')
   chat('Explain quantum entanglement')

üéâ PHYSICS PIPELINE SETUP COMPLETE!
üìä FINAL STATUS:
   üìö .bib files: 1
   üìÑ Downloaded papers: 18
   üìù Your papers: 10
   üß† Knowledge base: ‚úÖ Built
   ü§ñ Chat ready: ‚úÖ Yes

üöÄ WHAT YOU CAN DO NOW:
   ‚Ä¢ chat('your question here') - Ask the AI assistant
   ‚Ä¢ assistant.synthesize_literature('topic') - Get research synthesis
   ‚Ä¢ assistant.help_with_writing('task') - Get writing help

üìÅ KEY LOCATIONS:
   üìö Add .bib files:

In [6]:
# Test 1: Overview of your collection
chat("What papers do I have in my knowledge base?")


üßë‚Äçüî¨ Question: What papers do I have in my knowledge base?
ü§ñ Assistant: 2025-05-26 17:24:46 - physics_pipeline.src.chat.literature_assistant - INFO - Processing question: What papers do I have in my knowledge base?...
2025-05-26 17:24:52 - physics_pipeline.src.chat.literature_assistant - INFO - Question answered in 6.62s with 7 sources
Based on the provided context, I can see several papers in your knowledge base across different categories:

Literature Papers:
1. [Literature: 2112.00716.tex]
2. [Literature: 1911.00008.tex] and its PDF version
3. [Literature: 1911.00008.pdf]

Your Previous Work:
1. [Your work: notes.tex]
2. [Your work: qidge.tex]
3. [Your work: PRXQuantum.3.020355 (1).pdf]
4. [Your work: qgse.tex]

However, I should note that this is likely only a partial list, as the context provided shows only the most relevant documents to your query. From the initial system description, you mentioned there are:
- Total papers: 29
- Total content chunks: 345

So there are

In [7]:
chat("Do you know what measurement-induced entanglement phase transitions are?")


üßë‚Äçüî¨ Question: Do you know what measurement-induced entanglement phase transitions are?
ü§ñ Assistant: 2025-05-26 17:26:12 - physics_pipeline.src.chat.literature_assistant - INFO - Processing question: Do you know what measurement-induced entanglement ...
2025-05-26 17:26:20 - physics_pipeline.src.chat.literature_assistant - INFO - Question answered in 8.04s with 6 sources
Yes, based on the provided literature, measurement-induced entanglement phase transitions (MEPTs) are a relatively recent discovery in quantum physics. Here's what we know from the sources:

Key Points:

1. Basic Definition:
These are phase transitions that occur in monitored quantum circuits where there's an interplay between two competing processes:
- Unitary evolution (which typically increases entanglement)
- Projective measurements (which typically reduce entanglement)
[Literature: 1808.05949.tex]

2. Physical Realizations:
These transitions can be studied in:
- Systems with superconducting qubits
- Pot