In [None]:
# ============================================================================
# SETUP & CONFIGURATION CELL
# Enhanced Multi-Source Literature Search - Dependencies and Config
# ============================================================================

#@title Setup & Configuration - Multi-Source Literature Search

# Install required packages
!pip install -q langchain-community duckduckgo-search langgraph langchain-google-genai

import requests
import xml.etree.ElementTree as ET
import time
import logging
import json
from typing import List, Dict, Any, Optional
from datetime import datetime
from urllib.parse import quote
from dataclasses import dataclass

from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

# LangGraph imports
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.prompts import ChatPromptTemplate

# LangChain and Gemini imports
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# API Configuration - UPDATE WITH YOUR CREDENTIALS
# NOTE: these constants are assigned again in the literature-search cell
# further down (there RATE_LIMIT_DELAY = 1.0 and TOTAL_MAX_RESULTS = 12);
# whichever cell ran last determines the values the search functions read.
PUBMED_EMAIL = "dgonchar@stanford.edu"  # REQUIRED for PubMed: sent as the `email` request parameter
CROSSREF_EMAIL = "dgonchar@stanford.edu"  # Optional: embedded as mailto: in the CrossRef User-Agent header

# Search Configuration
RATE_LIMIT_DELAY = 0.5  # seconds slept between consecutive API calls
MAX_RESULTS_PER_SOURCE = 3  # results requested from each source for each query
TOTAL_MAX_RESULTS = 10  # cap on the number of de-duplicated results returned

# ============================================================================
# STATE DEFINITION
# ============================================================================

@dataclass
class GenerationState:
    """State shared across all nodes in the generation workflow.

    The list-valued fields accept ``None`` (the default) and are replaced by
    a fresh empty list in ``__post_init__``, so instances never share a list
    and callers may pass ``None`` explicitly to mean "empty".
    """
    research_goal: str = ""
    constraints: Optional[List[str]] = None
    search_queries: Optional[List[str]] = None
    literature_findings: Optional[List[Dict[str, Any]]] = None
    synthesized_knowledge: str = ""
    generated_proposals: Optional[List[Dict[str, Any]]] = None
    iteration: int = 0
    max_iterations: int = 3
    status: str = "initialized"

    def __post_init__(self) -> None:
        """Replace every list field left as ``None`` with its own empty list."""
        list_fields = (
            "constraints",
            "search_queries",
            "literature_findings",
            "generated_proposals",
        )
        for field_name in list_fields:
            if getattr(self, field_name) is None:
                setattr(self, field_name, [])

# ============================================================================
# GEMINI MODEL SETUP
# ============================================================================

# Set up your Gemini API key using Colab's secure userdata
# NOTE: the google.colab import means this cell only runs inside Google Colab.
from google.colab import userdata
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
import os
# The key is exported through the environment; the constructor below is not
# passed a key explicitly, so it presumably reads GOOGLE_API_KEY from there
# (TODO confirm against langchain_google_genai).
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

# Initialize Gemini model
# NOTE(review): `llm` is not referenced by EnhancedGenerationAgent in this
# file; its __init__ instantiates MockLLM instead.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=0.7,
    max_tokens=4000
)

# These messages are printed unconditionally; nothing above issues a request
# to the Gemini API, so they do not prove the key is valid.
print("✅ Setup & Configuration Complete!")
print("📊 Configured Sources: PubMed, ArXiv, CrossRef, Web Academic")
print("🔑 Gemini API configured successfully")
print("⚙️ Ready to run main literature search agent")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.4/152.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ============================================================================
# LITERATURE SEARCH AGENT - INTERFACE CONTRACT
# ============================================================================
"""
📋 AGENT: Enhanced Multi-Source Literature Search Agent

🔌 INPUTS:
- research_goal: str (e.g., "Develop AI-driven personalized medicine for neurodegeneration")
- constraints: List[str] (e.g., ["Must integrate multiple data modalities", "Focus on translational applications"])

📤 OUTPUTS:
- GenerationState object containing:
  - search_queries: List[str] (optimized search terms)
  - literature_findings: List[Dict] (papers from multiple sources)
  - synthesized_knowledge: str (LLM-generated synthesis)
  - generated_proposals: List[Dict] (novel hypotheses)
  - status: str (workflow progress indicator)

📊 DATA STRUCTURES:
- Literature Finding:
  {
    'title': str,
    'abstract': str,
    'authors': List[str],
    'journal': str,
    'year': str,
    'source': str,  # 'PubMed' | 'ArXiv' | 'CrossRef' | 'Web Search'
    'url': str,
    'key_findings': List[str],
    'relevance_score': float,
    'citation_count': int
  }

- Generated Proposal:
  {
    'id': str,
    'content': str,
    'timestamp': str,
    'based_on_sources': int,
    'source_diversity': int
  }

🔗 INTER-AGENT COMMUNICATION:
- Accepts: research_goal, constraints from Supervisor
- Provides: GenerationState to Proximity → Reflection → Ranking → Evolution agents
- Status updates for Supervisor monitoring

⚠️ ERROR HANDLING:
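- API timeouts: every request uses a fixed timeout; a source that fails is logged and skipped (no retry or exponential backoff is implemented yet)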
- Rate limiting with delays
- Fallback to available sources when others fail
- Empty result handling with meaningful messages

🧪 TESTING INTERFACE:
- Input: test_research_goal, test_constraints
- Expected: GenerationState with ≥1 proposal, successful literature search
"""

# ============================================================================
# ENHANCED MULTI-SOURCE LITERATURE SEARCH AGENT
# ============================================================================
#@title MULTI-SOURCE LITERATURE SEARCH AGENT
# Required imports
import requests
import xml.etree.ElementTree as ET
import time
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import MemorySaver

# ============================================================================
# CONFIGURATION
# ============================================================================

# Configuration variables
# These assignments replace the values set in the setup cell once this cell runs.
# multi_source_literature_search skips PubMed when PUBMED_EMAIL is empty or
# still the "your_email@example.com" placeholder.
PUBMED_EMAIL = "dgonchar@stanford.edu"  # Replace with your email
CROSSREF_EMAIL = "dgonchar@stanford.edu"  # Replace with your email
RATE_LIMIT_DELAY = 1.0  # Increased from 0.5 to 1.0 seconds between API calls
MAX_RESULTS_PER_SOURCE = 3  # results requested from each source for each query
TOTAL_MAX_RESULTS = 12  # Slightly increased to accommodate more sources
WEB_SEARCH_DELAY = 3.0  # Separate delay for web search to avoid rate limits

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ============================================================================
# STATE MANAGEMENT
# ============================================================================

@dataclass
class GenerationState:
    """State passed between the nodes of the generation workflow.

    Every list field gets its own empty list per instance via
    ``default_factory``. ``max_iterations`` is kept (as the last field, so
    positional construction is unaffected) to stay compatible with the
    earlier definition of this class in the setup cell, which this
    definition replaces once the cell runs.
    """
    research_goal: str = ""
    constraints: List[str] = field(default_factory=list)
    search_queries: List[str] = field(default_factory=list)
    literature_findings: List[Dict[str, Any]] = field(default_factory=list)
    synthesized_knowledge: str = ""
    generated_proposals: List[Dict[str, Any]] = field(default_factory=list)
    status: str = "initialized"  # progress marker updated by each workflow step
    iteration: int = 0  # incremented each time hypotheses are generated
    max_iterations: int = 3  # same default as the setup-cell definition

# ============================================================================
# LITERATURE SEARCH FUNCTIONS
# ============================================================================

def search_pubmed(query: str, max_results: int = 3) -> List[Dict[str, Any]]:
    """Search PubMed for biomedical literature.

    Runs the two-step NCBI E-utilities flow: ``esearch`` resolves the query
    to PMIDs, then ``efetch`` downloads the matching records as XML.

    Args:
        query: Free-text PubMed search term.
        max_results: Maximum number of PMIDs to request (``retmax``).

    Returns:
        One dict per ``PubmedArticle`` with the keys ``title``, ``abstract``,
        ``authors`` (at most three), ``journal``, ``year``, ``pmid``,
        ``source``, ``url``, ``key_findings``, ``relevance_score`` and
        ``citation_count``. An empty list is returned when nothing matches or
        when any request/parsing step fails (the error is logged, not raised).
    """

    def _node_text(elem) -> str:
        """Return all text inside *elem*, including text nested in inline tags."""
        if elem is None:
            return ""
        # ``elem.text`` stops at the first child tag (<i>, <sup>, ...) and is
        # None when the element starts with one; itertext() walks everything.
        return " ".join("".join(elem.itertext()).split())

    try:
        logger.info(f"Searching PubMed for: {query}")

        # Step 1: Search for PMIDs
        search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        search_params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'retmode': 'json',
            'email': PUBMED_EMAIL,
            'tool': 'ai_co_scientist_enhanced'
        }

        search_response = requests.get(search_url, params=search_params, timeout=10)
        search_response.raise_for_status()

        search_data = search_response.json()
        pmids = search_data.get('esearchresult', {}).get('idlist', [])

        if not pmids:
            logger.warning(f"No PMIDs found for query: {query}")
            return []

        # Pause between the two E-utilities calls to respect rate limits.
        time.sleep(RATE_LIMIT_DELAY)

        # Step 2: Fetch article details
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        fetch_params = {
            'db': 'pubmed',
            'id': ','.join(pmids),
            'retmode': 'xml',
            'email': PUBMED_EMAIL,
            'tool': 'ai_co_scientist_enhanced'
        }

        fetch_response = requests.get(fetch_url, params=fetch_params, timeout=15)
        fetch_response.raise_for_status()

        # Parse XML
        root = ET.fromstring(fetch_response.text)
        articles = []
        keywords = ['significant', 'demonstrated', 'showed', 'found', 'revealed']

        for article in root.findall('.//PubmedArticle'):
            # Always a non-empty string, so downstream title handling is safe.
            title = _node_text(article.find('.//ArticleTitle')) or "No title"

            # Structured abstracts are split into several AbstractText
            # sections; join them all instead of keeping only the first.
            abstract_nodes = (article.findall('.//Abstract/AbstractText')
                              or article.findall('.//AbstractText'))
            abstract = " ".join(
                text for text in (_node_text(node) for node in abstract_nodes) if text
            ) or "No abstract available"

            pmid = _node_text(article.find('.//PMID')) or "Unknown"
            journal = _node_text(article.find('.//Journal/Title')) or "Unknown journal"

            # Some records carry a free-form MedlineDate (e.g. "2021 Jan-Feb")
            # instead of a Year element; use its leading year when present.
            year = _node_text(article.find('.//PubDate/Year'))
            if not year:
                medline_date = _node_text(article.find('.//PubDate/MedlineDate'))
                year = medline_date[:4] if medline_date[:4].isdigit() else "Unknown year"

            # Extract authors (first three that have both name parts)
            authors = []
            for author in article.findall('.//Author')[:3]:
                fore_name = _node_text(author.find('.//ForeName'))
                last_name = _node_text(author.find('.//LastName'))
                if fore_name and last_name:
                    authors.append(f"{fore_name} {last_name}")

            # Extract key findings: up to two mid-length sentences that
            # contain a result-like keyword.
            key_findings = []
            if abstract != "No abstract available":
                for sentence in abstract.split('.'):
                    lowered = sentence.lower()
                    if any(keyword in lowered for keyword in keywords) and 20 < len(sentence) < 150:
                        key_findings.append(sentence.strip())
                        if len(key_findings) >= 2:
                            break

            articles.append({
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'year': year,
                'pmid': pmid,
                'source': 'PubMed',
                'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}",
                'key_findings': key_findings,
                'relevance_score': 0.9,
                'citation_count': 0  # not extracted from the efetch record
            })

        logger.info(f"Retrieved {len(articles)} articles from PubMed")
        return articles

    except Exception as e:
        # Best effort: a failing source must not abort the multi-source search.
        logger.error(f"PubMed search failed for '{query}': {str(e)}")
        return []

def search_arxiv(query: str, max_results: int = 3) -> List[Dict[str, Any]]:
    """Search ArXiv for preprints.

    Queries the arXiv Atom API across all fields, sorted by relevance.

    Args:
        query: Free-text search terms.
        max_results: Maximum number of entries to request.

    Returns:
        One dict per usable Atom entry with the keys ``title``, ``abstract``,
        ``authors`` (at most three), ``journal``, ``year``, ``source``,
        ``url``, ``key_findings``, ``relevance_score`` and ``citation_count``.
        Entries without a title are skipped. An empty list is returned when
        nothing matches or when the request/parsing fails (the error is
        logged, not raised).
    """
    ns = {'atom': 'http://www.w3.org/2005/Atom'}

    def _child_text(parent, tag: str) -> str:
        """Return the whitespace-normalised text of child *tag*, or ''."""
        node = parent.find(tag, ns)
        if node is None or node.text is None:
            return ""
        # The feed hard-wraps long titles/summaries; collapse the line breaks
        # so titles compare equal during de-duplication.
        return " ".join(node.text.split())

    try:
        logger.info(f"Searching ArXiv for: {query}")

        base_url = "http://export.arxiv.org/api/query"
        params = {
            'search_query': f'all:{query}',
            'start': 0,
            'max_results': max_results,
            'sortBy': 'relevance',
            'sortOrder': 'descending'
        }

        response = requests.get(base_url, params=params, timeout=15)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)
        entries = root.findall('atom:entry', ns)

        if not entries:
            logger.warning(f"No papers found in ArXiv for: {query}")
            return []

        keywords = ['significant', 'demonstrate', 'show', 'find', 'reveal', 'propose']
        articles = []
        for entry in entries:
            title = _child_text(entry, 'atom:title')
            if not title:
                # One malformed entry should not discard the whole result set.
                continue
            summary = _child_text(entry, 'atom:summary')

            # Extract authors
            authors = []
            for author in entry.findall('atom:author', ns):
                name = _child_text(author, 'atom:name')
                if name:
                    authors.append(name)

            # Extract publication date (ISO timestamp; the year is its first 4 chars)
            published = _child_text(entry, 'atom:published')
            year = published[:4] if published else "Unknown"

            # Extract ArXiv URL
            arxiv_url = _child_text(entry, 'atom:id')

            # Extract key findings: up to two mid-length sentences that
            # contain a result-like keyword.
            key_findings = []
            for sentence in summary.split('.'):
                lowered = sentence.lower()
                if any(keyword in lowered for keyword in keywords) and 20 < len(sentence) < 150:
                    key_findings.append(sentence.strip())
                    if len(key_findings) >= 2:
                        break

            articles.append({
                'title': title,
                'abstract': summary,
                'authors': authors[:3],  # First 3 authors
                'journal': 'ArXiv Preprint',
                'year': year,
                'source': 'ArXiv',
                'url': arxiv_url,
                'key_findings': key_findings,
                'relevance_score': 0.75,  # Lower since preprints
                'citation_count': 0  # not present in the Atom entry fields read here
            })

        logger.info(f"Retrieved {len(articles)} preprints from ArXiv")
        return articles

    except Exception as e:
        # Best effort: a failing source must not abort the multi-source search.
        logger.error(f"ArXiv search failed for '{query}': {str(e)}")
        return []

def search_crossref(query: str, max_results: int = 3) -> List[Dict[str, Any]]:
    """Search CrossRef for academic papers.

    Args:
        query: Free-text bibliographic query.
        max_results: Maximum number of works to request (``rows``).

    Returns:
        One dict per returned work with the keys ``title``, ``abstract``,
        ``authors`` (at most three), ``journal``, ``year``, ``source``,
        ``url``, ``key_findings``, ``relevance_score`` and ``citation_count``.
        An empty list is returned when nothing matches or when the
        request/parsing fails (the error is logged, not raised).
    """
    # Local imports: the file's import block does not provide these modules.
    import html
    import re

    def _publication_year(item: Dict[str, Any]) -> str:
        """Return the first available year among the date fields, else 'Unknown'."""
        # Online-only works have no 'published-print'; fall back progressively.
        for key in ('published-print', 'published-online', 'issued'):
            date_parts = (item.get(key) or {}).get('date-parts') or []
            if date_parts and date_parts[0] and date_parts[0][0]:
                return str(date_parts[0][0])
        return 'Unknown'

    def _plain_abstract(item: Dict[str, Any]) -> str:
        """Return the abstract with markup tags removed, or a placeholder."""
        raw = item.get('abstract')
        if not raw:
            return 'Abstract not available from CrossRef'
        # Abstracts arrive as tagged markup; keep only the text.
        text = html.unescape(re.sub(r'<[^>]+>', ' ', raw))
        return " ".join(text.split()) or 'Abstract not available from CrossRef'

    try:
        logger.info(f"Searching CrossRef for: {query}")

        base_url = "https://api.crossref.org/works"
        headers = {
            # Identifying contact address in the User-Agent (mailto:).
            'User-Agent': f'AI-Co-Scientist/1.0 (mailto:{CROSSREF_EMAIL})'
        }

        params = {
            'query': query,
            'rows': max_results,
            'sort': 'relevance',
            'select': ('title,author,published-print,published-online,issued,'
                       'abstract,DOI,URL,container-title,is-referenced-by-count')
        }

        response = requests.get(base_url, headers=headers, params=params, timeout=10)
        response.raise_for_status()

        data = response.json()
        items = data.get('message', {}).get('items', [])

        if not items:
            logger.warning(f"No papers found in CrossRef for: {query}")
            return []

        articles = []
        for item in items:
            # Extract title
            title_list = item.get('title') or []
            title = title_list[0] if title_list else 'No title'

            # Extract authors
            authors = []
            for author in (item.get('author') or [])[:3]:
                given = author.get('given', '')
                family = author.get('family', '')
                if given or family:
                    authors.append(f"{given} {family}".strip())

            # Extract journal
            container = item.get('container-title') or []
            journal = container[0] if container else 'Unknown journal'

            articles.append({
                'title': title,
                'abstract': _plain_abstract(item),
                'authors': authors,
                'journal': journal,
                'year': _publication_year(item),
                'source': 'CrossRef',
                'url': item.get('URL', ''),
                'key_findings': [],
                'relevance_score': 0.7,
                # Number of works citing this one, as reported by CrossRef.
                'citation_count': int(item.get('is-referenced-by-count') or 0)
            })

        logger.info(f"Retrieved {len(articles)} papers from CrossRef")
        return articles

    except Exception as e:
        # Best effort: a failing source must not abort the multi-source search.
        logger.error(f"CrossRef search failed for '{query}': {str(e)}")
        return []

def search_web_academic(query: str, max_results: int = 2) -> List[Dict[str, Any]]:
    """Look for academic material on the open web through DuckDuckGo.

    The query is restricted to a handful of scholarly sites. The raw text
    returned by the search tool is cut into blank-line separated sections
    and each sufficiently long section becomes one result dict. Returns an
    empty list when nothing useful comes back or when the search raises
    (rate-limit style errors are logged as warnings, others as errors).
    """
    try:
        logger.info(f"Searching web for academic sources: {query}")

        # Wait before hitting DuckDuckGo to reduce the chance of rate limiting
        time.sleep(WEB_SEARCH_DELAY)

        ddg_tool = DuckDuckGoSearchRun(
            api_wrapper=DuckDuckGoSearchAPIWrapper(
                region="us-en",
                time="y",  # restrict to the past year
                max_results=max_results
            )
        )

        # Restrict the search to scholarly sites
        academic_query = f"{query} site:pubmed.ncbi.nlm.nih.gov OR site:scholar.google.com OR site:arxiv.org OR site:researchgate.net"
        raw_text = ddg_tool.run(academic_query)

        if not raw_text or len(raw_text.strip()) < 50:
            logger.warning(f"No meaningful web results for: {query}")
            return []

        # Simplified parsing: keep blank-line separated chunks longer than 50 chars
        chunks = []
        for piece in raw_text.split('\n\n'):
            trimmed = piece.strip()
            if len(trimmed) > 50:
                chunks.append(trimmed)

        hits = [
            {
                'title': f"Web Academic Result {position}: {query}",
                'abstract': (chunk[:500] + "...") if len(chunk) > 500 else chunk,
                'authors': ['Web Search'],
                'journal': 'Web Academic Sources',
                'year': str(datetime.now().year),
                'source': 'Web Search',
                'url': '',
                'key_findings': [],
                'relevance_score': 0.6,  # web hits rank below the API sources
                'citation_count': 0
            }
            for position, chunk in enumerate(chunks[:max_results], start=1)
        ]

        logger.info(f"Retrieved {len(hits)} web academic results")
        return hits

    except Exception as e:
        # Treat rate-limit style failures as expected and log them more quietly
        message = str(e)
        looks_rate_limited = "202" in message or "rate" in message.lower()
        if looks_rate_limited:
            logger.warning(f"Rate limited for web search '{query}' - this is normal, continuing with other sources")
        else:
            logger.error(f"Web academic search failed for '{query}': {str(e)}")
        return []

def multi_source_literature_search(queries: List[str]) -> List[Dict[str, Any]]:
    """Search multiple literature sources with intelligent fallbacks.

    Every query is sent to PubMed, ArXiv, CrossRef and the web search in
    that order. A source that fails or returns nothing is skipped. Results
    are de-duplicated by normalised title, sorted by
    ``(relevance_score, citation_count)`` descending and truncated to
    ``TOTAL_MAX_RESULTS``.

    Args:
        queries: Search strings to run against every source.

    Returns:
        The ranked, de-duplicated result dicts (at most ``TOTAL_MAX_RESULTS``).

    Raises:
        ValueError: If ``queries`` is empty.
        RuntimeError: If no source produced any usable result.
    """
    if not queries:
        raise ValueError("No search queries provided")

    # Define search sources with their functions
    search_sources = [
        ("PubMed", search_pubmed),
        ("ArXiv", search_arxiv),
        ("CrossRef", search_crossref),
        ("Web Academic", search_web_academic)
    ]

    all_results = []

    for query in queries:
        logger.info(f"Processing query: {query}")
        query_results = []

        for source_name, search_func in search_sources:
            try:
                # Skip PubMed if email not configured properly
                if source_name == "PubMed" and (not PUBMED_EMAIL or PUBMED_EMAIL == "your_email@example.com"):
                    logger.warning("Skipping PubMed - email not configured")
                    continue

                results = search_func(query, max_results=MAX_RESULTS_PER_SOURCE)
                if results:  # Only add if we got results
                    query_results.extend(results)

                # Stop if we have enough results
                if len(query_results) >= TOTAL_MAX_RESULTS:
                    break

                # Use different delays for different sources
                if source_name == "Web Academic":
                    time.sleep(WEB_SEARCH_DELAY)  # Longer delay for web search
                else:
                    time.sleep(RATE_LIMIT_DELAY)  # Standard delay for APIs

            except Exception as e:
                logger.warning(f"{source_name} failed for query '{query}': {str(e)}")
                continue

        all_results.extend(query_results)

    # Remove duplicates based on normalised title. The key is None-safe and
    # collapses whitespace/case so the same paper from two sources matches
    # even when one source wraps or re-cases the title.
    unique_results = []
    seen_titles = set()

    for result in all_results:
        title_key = " ".join(str(result.get('title') or "").lower().split())
        if not title_key or title_key == "no title" or title_key in seen_titles:
            continue
        seen_titles.add(title_key)
        unique_results.append(result)

    # Sort by relevance score and citation count.
    # NOTE(review): relevance_score is a fixed per-source constant, so the
    # highest-scored source always ranks first and can fill the whole
    # truncated list when several queries each return results from it.
    unique_results.sort(
        key=lambda item: (item.get('relevance_score', 0.0), item.get('citation_count', 0)),
        reverse=True
    )

    if not unique_results:
        raise RuntimeError("Multi-source search returned no results. Try broader search terms.")

    logger.info(f"Multi-source search completed: {len(unique_results)} unique results")
    return unique_results[:TOTAL_MAX_RESULTS]

# ============================================================================
# ENHANCED GENERATION AGENT (Simplified for demo)
# ============================================================================

class MockLLM:
    """Stand-in LLM returning canned text, used for demonstrations.

    ``invoke`` looks at the first message only and answers with one of three
    fixed replies: search queries, a literature synthesis, or hypotheses.
    """

    class MockResponse:
        """Minimal response object exposing the reply text as ``content``."""

        def __init__(self, content):
            self.content = content

    # Reply used when the prompt asks to generate search queries
    _QUERIES_TEXT = """1. AI personalized medicine neurodegenerative diseases
2. machine learning precision medicine Alzheimer
3. computational biomarkers neurodegeneration
4. artificial intelligence clinical decision support"""

    # Reply used when the prompt mentions "synthesize" (any letter case)
    _SYNTHESIS_TEXT = """Based on the multi-source literature review, several key themes emerge:

1. **Current AI Applications**: Machine learning models are showing promise in early detection of neurodegenerative diseases through biomarker analysis and neuroimaging.

2. **Data Integration Challenges**: Current approaches struggle with integrating diverse data types (genomic, proteomic, imaging, clinical) into unified predictive models.

3. **Personalization Gaps**: Most existing models are population-based rather than truly personalized, limiting their clinical utility.

4. **Translational Barriers**: Despite promising research results, few AI-driven approaches have successfully transitioned to clinical practice.

5. **Ethical Considerations**: Data privacy, algorithmic bias, and informed consent remain significant challenges in implementation."""

    # Reply used for every other prompt (hypothesis generation)
    _HYPOTHESES_TEXT = """**Novel Hypotheses for AI-Driven Personalized Medicine in Neurodegeneration:**

**Hypothesis 1: Multi-Modal Temporal Fusion**
A federated learning approach combining longitudinal biomarker data, neuroimaging, and wearable sensor data can predict disease progression 2-3 years earlier than current methods while preserving patient privacy.

**Hypothesis 2: Personalized Intervention Optimization**
AI-driven digital twins of individual patients can simulate treatment responses across multiple therapeutic modalities, enabling personalized combination therapy selection with improved efficacy.

**Hypothesis 3: Explainable AI for Clinical Trust**
Integration of causal inference methods with deep learning can create interpretable models that not only predict outcomes but explain the biological mechanisms driving predictions, increasing physician adoption.

**Key Experiments:**
- Prospective cohort validation across diverse populations
- Randomized controlled trials comparing AI-guided vs. standard care
- Regulatory pathway development for AI-driven diagnostic tools"""

    def invoke(self, messages):
        """Return a canned ``MockResponse`` chosen from the first message's text."""
        first_message = messages[0]
        # (role, text) tuples carry the prompt in slot 1; anything else is stringified
        if isinstance(first_message, tuple):
            prompt = first_message[1]
        else:
            prompt = str(first_message)

        wants_queries = "Generate" in prompt and "search queries" in prompt
        if wants_queries:
            reply = self._QUERIES_TEXT
        elif "synthesize" in prompt.lower():
            reply = self._SYNTHESIS_TEXT
        else:  # hypothesis generation
            reply = self._HYPOTHESES_TEXT

        return self.MockResponse(reply)

# Create the agent class
class EnhancedGenerationAgent:
    """Enhanced Generation Agent with multi-source literature search.

    The agent drives four steps over a shared state object: generate search
    queries, search the literature, synthesize the findings, and generate
    hypotheses. Each step mutates the state it receives and returns it.
    """

    def __init__(self, llm=None):
        """Create the agent.

        Args:
            llm: Object exposing ``invoke(messages)`` whose result has a
                ``content`` string. Defaults to ``MockLLM()`` so existing
                no-argument construction keeps working; pass a real chat
                model to use it instead of the canned replies.
        """
        self.llm = llm if llm is not None else MockLLM()

    @staticmethod
    def _format_findings(findings: List[Dict[str, Any]], max_abstract_chars: int = 500) -> str:
        """Render the findings as a numbered plain-text list for an LLM prompt."""
        blocks = []
        for index, paper in enumerate(findings, 1):
            authors = ", ".join(paper.get('authors') or []) or "Unknown authors"
            abstract = paper.get('abstract') or ""
            if len(abstract) > max_abstract_chars:
                # Keep the prompt bounded; long abstracts are truncated.
                abstract = abstract[:max_abstract_chars] + "..."

            lines = [
                f"[{index}] {paper.get('title', 'No title')} "
                f"({paper.get('source', 'Unknown')}, {paper.get('year', 'Unknown')})",
                f"    Authors: {authors}",
                f"    Journal: {paper.get('journal', 'Unknown journal')}",
                f"    Abstract: {abstract}",
            ]
            key_findings = paper.get('key_findings') or []
            if key_findings:
                lines.append("    Key findings: " + "; ".join(key_findings))
            blocks.append("\n".join(lines))
        return "\n\n".join(blocks)

    def generate_search_queries(self, state: "GenerationState") -> "GenerationState":
        """Generate optimized search queries for multiple sources.

        Asks the LLM for a numbered list, keeps the text after each ``N.``
        marker, and falls back to queries derived from the research goal
        when nothing could be parsed. Stores at most four queries in
        ``state.search_queries`` and sets ``state.status``.
        """
        print(f"🔍 Generating multi-source search queries...")

        prompt = f"""Generate 3-4 effective search queries for academic literature about: {state.research_goal}

Consider these constraints: {', '.join(state.constraints)}

Create simple, effective queries for different academic databases"""

        response = self.llm.invoke([("human", prompt)])

        # Parse queries with better cleaning
        queries = []
        numbered_prefixes = tuple(f"{i}." for i in range(1, 10))

        for line in response.content.split('\n'):
            # Remove markdown formatting before looking for the list marker
            line = line.strip().replace('*', '').replace('_', '')
            if not line.startswith(numbered_prefixes):
                continue

            # Extract everything after the number and period
            query = line.split('.', 1)[1].strip()
            query = query.replace('(', '').replace(')', '').strip()

            if len(query) > 3:
                queries.append(query)

        # Fallback queries if parsing failed
        if not queries:
            base_terms = state.research_goal.lower().split()[:3]
            fallback = [
                ' '.join(base_terms),
                f"{base_terms[0]} treatment" if base_terms else "treatment",
                f"{base_terms[0]} therapy" if base_terms else "therapy",
                "clinical trial"
            ]
            # An empty research goal would otherwise yield an empty query.
            queries = [q for q in fallback if q]

        state.search_queries = queries[:4]
        state.status = "queries_generated"

        print(f"✅ Generated {len(state.search_queries)} optimized queries")
        for i, q in enumerate(state.search_queries, 1):
            print(f"   {i}. {q}")

        return state

    def literature_exploration(self, state: "GenerationState") -> "GenerationState":
        """Search literature using multiple sources.

        Stores the results in ``state.literature_findings`` and prints a
        per-source breakdown.

        Raises:
            RuntimeError: If the search fails; the original exception is
                chained as ``__cause__``.
        """
        print(f"📚 Searching multiple literature sources...")

        try:
            findings = multi_source_literature_search(state.search_queries)
            state.literature_findings = findings
            state.status = "literature_explored"

            # Print source breakdown
            source_counts = {}
            for finding in findings:
                source = finding['source']
                source_counts[source] = source_counts.get(source, 0) + 1

            print(f"✅ Found {len(findings)} papers from {len(source_counts)} sources:")
            for source, count in source_counts.items():
                print(f"   • {source}: {count} papers")

            return state

        except Exception as e:
            print(f"❌ Literature search failed: {str(e)}")
            raise RuntimeError(f"Literature search failed: {str(e)}") from e

    def synthesize_knowledge(self, state: "GenerationState") -> "GenerationState":
        """Enhanced knowledge synthesis with source diversity.

        Sends the formatted findings to the LLM and stores the reply in
        ``state.synthesized_knowledge``. With no findings, or when the LLM
        call fails, a short explanatory message is stored instead.
        """
        print("🧠 Synthesizing knowledge from multiple sources...")

        if not state.literature_findings:
            print("⚠️ No literature findings to synthesize")
            state.synthesized_knowledge = "No literature findings available for synthesis."
            state.status = "knowledge_synthesized"
            return state

        # Create structured findings text from the papers actually retrieved,
        # so the synthesis is grounded in them rather than a placeholder.
        findings_text = self._format_findings(state.literature_findings)

        try:
            response = self.llm.invoke([
                ("human", f"""Synthesize this diverse research for: {state.research_goal}

Literature from Multiple Sources:
{findings_text}

Provide a comprehensive synthesis...""")
            ])

            state.synthesized_knowledge = response.content

        except Exception as e:
            logger.error(f"Knowledge synthesis failed: {str(e)}")
            state.synthesized_knowledge = f"Knowledge synthesis encountered an error. Based on {len(state.literature_findings)} papers from multiple sources."

        state.status = "knowledge_synthesized"
        print("✅ Multi-source knowledge synthesis complete")
        return state

    def generate_hypotheses(self, state: "GenerationState") -> "GenerationState":
        """Generate hypotheses based on multi-source synthesis.

        Stores a single proposal dict in ``state.generated_proposals`` and
        increments ``state.iteration``. If the LLM call fails, the proposal
        content is a short error message.
        """
        print("💡 Generating hypotheses from multi-source research...")

        try:
            response = self.llm.invoke([
                ("human", f"""Generate novel hypotheses for: {state.research_goal}

Based on: {state.synthesized_knowledge}""")
            ])

            hypothesis_content = response.content

        except Exception as e:
            logger.error(f"Hypothesis generation failed: {str(e)}")
            hypothesis_content = f"Hypothesis generation encountered an error."

        findings = state.literature_findings or []
        source_count = len(findings)
        source_diversity = len({f['source'] for f in findings})

        state.generated_proposals = [{
            "id": "multi_source_hypothesis",
            "content": hypothesis_content,
            "timestamp": datetime.now().isoformat(),
            "based_on_sources": source_count,
            "source_diversity": source_diversity
        }]

        state.status = "hypotheses_generated"
        state.iteration += 1

        print(f"✅ Generated hypotheses based on {source_count} papers from {source_diversity} sources")
        return state

    def run_complete_workflow(self, research_goal: str, constraints: Optional[List[str]] = None) -> "GenerationState":
        """
        Run the complete literature search workflow - Interface Contract Method

        This method provides a unified interface for running the entire workflow
        as specified in the agent interface contract.

        Args:
            research_goal: Goal the hypotheses should address.
            constraints: Optional constraints; ``None`` means no constraints.

        Returns:
            The final state. A failure in any step is not raised: the state
            comes back with ``status`` set to ``failed_at_<last status>`` and
            a single ``workflow_error`` proposal describing the error.
        """
        # Initialize state
        state = GenerationState(
            research_goal=research_goal,
            constraints=constraints or []
        )

        try:
            # Run each step of the workflow in sequence
            print("🔍 Starting complete literature search workflow...")

            # Step 1: Generate search queries
            state = self.generate_search_queries(state)

            # Step 2: Search literature
            state = self.literature_exploration(state)

            # Step 3: Synthesize knowledge
            state = self.synthesize_knowledge(state)

            # Step 4: Generate hypotheses
            state = self.generate_hypotheses(state)

            print("✅ Complete workflow finished successfully")
            return state

        except Exception as e:
            # Capture the status before overwriting it, so the proposal text
            # does not end up reading "during failed_at_...". It is the
            # status of the last step that completed.
            last_status = state.status
            print(f"❌ Workflow failed at step: {last_status}")
            print(f"Error: {str(e)}")

            findings = state.literature_findings or []

            # Return state with error information
            state.status = f"failed_at_{last_status}"
            state.generated_proposals = [{
                "id": "workflow_error",
                "content": f"Workflow failed during {last_status}: {str(e)}",
                "timestamp": datetime.now().isoformat(),
                "based_on_sources": len(findings),
                "source_diversity": len({f['source'] for f in findings})
            }]

            return state

# ============================================================================
# WORKFLOW CREATION AND EXECUTION
# ============================================================================

def create_enhanced_workflow():
    """Build and compile the linear four-step generation graph.

    The graph runs query generation, literature exploration, knowledge
    synthesis and hypothesis generation in that order, each step backed by a
    method of a single EnhancedGenerationAgent instance.

    Returns:
        The compiled graph, configured with an in-memory checkpointer.
    """
    agent = EnhancedGenerationAgent()
    graph = StateGraph(GenerationState)

    # Node name -> handler, listed in execution order.
    pipeline = [
        ("generate_queries", agent.generate_search_queries),
        ("explore_literature", agent.literature_exploration),
        ("synthesize_knowledge", agent.synthesize_knowledge),
        ("generate_hypotheses", agent.generate_hypotheses),
    ]
    for node_name, handler in pipeline:
        graph.add_node(node_name, handler)

    # Chain START -> each node in order -> END.
    route = [START] + [node_name for node_name, _ in pipeline] + [END]
    for upstream, downstream in zip(route, route[1:]):
        graph.add_edge(upstream, downstream)

    return graph.compile(checkpointer=MemorySaver())

def run_enhanced_agent(research_goal: str, constraints: List[str] = None):
    """Run the compiled workflow once and print a human-readable report.

    Args:
        research_goal: Research goal placed on the initial state.
        constraints: Optional list of constraints; None becomes an empty list.

    Returns:
        The final state mapping produced by the graph, or None when invoking
        the graph or printing the report raised an exception (the error is
        printed, not re-raised).
    """
    rule = "=" * 60

    print("🚀 Enhanced AI Co-Scientist with Multi-Source Literature Search")
    print(rule)
    print(f"📊 Configured Sources: PubMed, ArXiv, CrossRef, Web Academic")
    print(rule)

    app = create_enhanced_workflow()
    initial_state = GenerationState(
        research_goal=research_goal,
        constraints=constraints or []
    )
    run_config = {"configurable": {"thread_id": "enhanced_1"}}

    try:
        final_state = app.invoke(initial_state, run_config)

        print("\n" + rule)
        print("🎉 ENHANCED SEARCH SUCCESS!")
        print(rule)

        print(f"\n📋 Research Goal: {final_state.get('research_goal', 'Unknown')}")

        # Tally papers per source; a missing or empty findings list yields no tally.
        papers = final_state.get('literature_findings', [])
        per_source = {}
        for entry in papers or ():
            origin = entry.get('source', 'Unknown')
            per_source[origin] = 1 + per_source.get(origin, 0)

        print(f"\n📚 Literature Found ({len(papers)} papers):")
        if not per_source:
            print("   • No papers found")
        else:
            for origin, total in per_source.items():
                print(f"   • {origin}: {total} papers")

        # Show the first three findings in the order the workflow returned them.
        if papers:
            print(f"\n🔝 Top Papers by Relevance:")
            for rank, entry in enumerate(papers[:3], 1):
                print(f"   {rank}. {entry.get('title', 'No title')}")
                print(
                    f"      Source: {entry.get('source', 'Unknown')}"
                    f" | Citations: {entry.get('citation_count', 0)}"
                )

        # Preview only the first proposal, capped at 300 characters.
        proposals = final_state.get('generated_proposals', [])
        if not proposals:
            print(f"\n💡 No hypotheses generated")
        else:
            print(f"\n💡 Generated Hypotheses:")
            body = proposals[0].get('content', 'No content available')
            if len(body) > 300:
                body = body[:300] + "..."
            print(f"   {body}")

        return final_state

    except Exception as exc:
        print(f"\n❌ Enhanced search failed: {str(exc)}")
        return None

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Demo run of the LangGraph pipeline via run_enhanced_agent().
    # The names below are bound at module level, so they stay available to
    # later cells after this one runs.

    # Test with a complex research goal
    research_goal = "Develop AI-driven personalized medicine approaches for neurodegenerative diseases"
    constraints = [
        "Must integrate multiple data modalities",
        "Consider ethical implications",
        "Focus on translational applications"
    ]

    print("🧪 Testing Enhanced Multi-Source Agent...")
    # `result` is the final state mapping, or None if the run failed.
    result = run_enhanced_agent(research_goal, constraints)

🧪 Testing Enhanced Multi-Source Agent...
🚀 Enhanced AI Co-Scientist with Multi-Source Literature Search
📊 Configured Sources: PubMed, ArXiv, CrossRef, Web Academic
🔍 Generating multi-source search queries...
✅ Generated 4 optimized queries
   1. AI personalized medicine neurodegenerative diseases
   2. machine learning precision medicine Alzheimer
   3. computational biomarkers neurodegeneration
   4. artificial intelligence clinical decision support
📚 Searching multiple literature sources...




✅ Found 12 papers from 1 sources:
   • PubMed: 12 papers
🧠 Synthesizing knowledge from multiple sources...
✅ Multi-source knowledge synthesis complete
💡 Generating hypotheses from multi-source research...
✅ Generated hypotheses based on 12 papers from 1 sources

🎉 ENHANCED SEARCH SUCCESS!

📋 Research Goal: Develop AI-driven personalized medicine approaches for neurodegenerative diseases

📚 Literature Found (12 papers):
   • PubMed: 12 papers

🔝 Top Papers by Relevance:
   1. Targeted Gut Microbiota Modulation Enhances Levodopa Bioavailability and Motor Recovery in MPTP Parkinson's Disease Models.
      Source: PubMed | Citations: 0
   2. Macrophages: sentinels, warriors, and healers.
      Source: PubMed | Citations: 0
   3. Machine learning-enhanced SERS diagnostics: Accelerating the AI-powered transition from laboratory discoveries to clinical practice.
      Source: PubMed | Citations: 0

💡 Generated Hypotheses:
   **Novel Hypotheses for AI-Driven Personalized Medicine in Neurodegen

In [None]:
#@title 🔬 Literature Search Agent - Interface Validation Test

# ============================================================================
# MINIMAL TEST SHIM FOR LITERATURE SEARCH AGENT
# Validates agent works according to declared interface contract
# ============================================================================

import time
from dataclasses import dataclass, field
from typing import List, Dict, Any

# Quick setup (assumes main agent code is available)
try:
    # Try to use the real agent from the notebook
    agent = EnhancedGenerationAgent()
    print("✅ Using real EnhancedGenerationAgent")
except NameError:
    # Fallback to mock agent that follows the interface
    print("⚠️ Real agent not found, using mock agent for interface validation")

    @dataclass
    class MockGenerationState:
        """Stand-in state object exposing the fields the interface test reads."""
        research_goal: str = ""
        constraints: List[str] = field(default_factory=list)
        search_queries: List[str] = field(default_factory=list)
        literature_findings: List[Dict[str, Any]] = field(default_factory=list)
        synthesized_knowledge: str = ""
        generated_proposals: List[Dict[str, Any]] = field(default_factory=list)
        status: str = "initialized"
        iteration: int = 0

    class MockEnhancedGenerationAgent:
        """Offline stand-in for the real agent; returns canned, goal-derived data."""

        def run_complete_workflow(self, research_goal: str, constraints: List[str] = None):
            """Mock implementation following the interface contract

            Args:
                research_goal: Goal text used to flavour the canned output.
                    An empty or whitespace-only goal is accepted.
                constraints: Optional list of constraints; None becomes [].

            Returns:
                A MockGenerationState with four search queries, two literature
                findings, a synthesis paragraph, two proposals, status
                "hypotheses_generated" and iteration 1.
            """
            state = MockGenerationState(research_goal=research_goal, constraints=constraints or [])

            # Split and lowercase once. An empty goal has no words, so fall
            # back to fixed terms instead of indexing into an empty list.
            words = research_goal.split()
            first_word = words[0] if words else "research"
            last_word = words[-1] if len(words) > 1 else "therapy"
            goal_lower = research_goal.lower()

            # Mock search queries
            state.search_queries = [
                f"{first_word} mechanisms",
                f"{last_word} approaches",
                "clinical applications",
                "novel targets"
            ]

            # Mock literature findings (follows exact interface spec)
            state.literature_findings = [
                {
                    'title': f'Novel approaches to {goal_lower}',
                    'abstract': f'This study investigates {goal_lower} using innovative methodologies...',
                    'authors': ['Smith, J.', 'Johnson, A.', 'Williams, B.'],
                    'journal': 'Nature Medicine',
                    'year': '2024',
                    'source': 'PubMed',
                    'url': 'https://pubmed.ncbi.nlm.nih.gov/mock123',
                    'key_findings': ['Significant improvement observed', 'Novel mechanism identified'],
                    'relevance_score': 0.92,
                    'citation_count': 45
                },
                {
                    'title': f'Machine learning applications in {goal_lower}',
                    'abstract': f'We present a computational approach to {goal_lower}...',
                    'authors': ['Chen, L.', 'Davis, M.'],
                    'journal': 'ArXiv Preprint',
                    'year': '2024',
                    'source': 'ArXiv',
                    'url': 'https://arxiv.org/abs/2024.mock',
                    'key_findings': ['AI model achieved 95% accuracy', 'Outperformed existing methods'],
                    'relevance_score': 0.87,
                    'citation_count': 12
                }
            ]

            # Mock knowledge synthesis
            state.synthesized_knowledge = f"""
            Based on analysis of {len(state.literature_findings)} papers from multiple sources,
            current research in {goal_lower} shows promising developments in both
            experimental and computational approaches. Key themes include novel therapeutic
            targets, AI-driven personalization, and translational applications.
            """

            # Values shared by both proposals; read the clock once so the two
            # entries carry the same id suffix and timestamp.
            id_suffix = int(time.time())
            timestamp = time.strftime('%Y-%m-%dT%H:%M:%S')
            source_total = len(state.literature_findings)
            source_diversity = len({f['source'] for f in state.literature_findings})

            # Mock generated proposals (follows exact interface spec)
            state.generated_proposals = [
                {
                    'id': f'lit_search_hyp_1_{id_suffix}',
                    'content': f'Novel hypothesis 1: Targeting specific molecular pathways in {goal_lower} using personalized biomarker-guided approaches.',
                    'timestamp': timestamp,
                    'based_on_sources': source_total,
                    'source_diversity': source_diversity
                },
                {
                    'id': f'lit_search_hyp_2_{id_suffix}',
                    'content': f'Novel hypothesis 2: Combining AI-driven analysis with experimental validation to accelerate {goal_lower} research.',
                    'timestamp': timestamp,
                    'based_on_sources': source_total,
                    'source_diversity': source_diversity
                }
            ]

            state.status = "hypotheses_generated"
            state.iteration = 1
            return state

    agent = MockEnhancedGenerationAgent()

# ============================================================================
# INTERFACE CONTRACT VALIDATION TEST
# ============================================================================

def test_literature_search_interface():
    """
    Tests the Literature Search Agent against its declared interface contract

    Runs the module-level ``agent`` once through ``run_complete_workflow`` with
    a fixed goal and constraint list, then checks the returned state: required
    attributes, non-empty list fields, the key set of the first finding and
    first proposal, and the status value. Progress is printed as it goes.

    Returns:
        True only when the workflow ran without raising and every check
        passed. False when a required attribute is missing, any structural
        check failed, the status reports a failed workflow, or an exception
        was raised.
    """
    print("🔬 TESTING LITERATURE SEARCH AGENT INTERFACE")
    print("=" * 55)

    # Test inputs (as specified in interface)
    test_research_goal = "Develop AI-driven personalized medicine for neurodegeneration"
    test_constraints = [
        "Must integrate multiple data modalities",
        "Focus on translational applications",
        "Consider ethical implications"
    ]

    print(f"📥 INPUT TEST:")
    print(f"   Research Goal: {test_research_goal}")
    print(f"   Constraints: {test_constraints}")

    # Run the agent
    try:
        start_time = time.time()
        result = agent.run_complete_workflow(test_research_goal, test_constraints)
        duration = time.time() - start_time

        print(f"\n⏱️ EXECUTION: Completed in {duration:.2f} seconds")

        # Validate output structure
        print(f"\n📤 OUTPUT VALIDATION:")

        # Check GenerationState structure
        required_fields = ['search_queries', 'literature_findings', 'synthesized_knowledge',
                          'generated_proposals', 'status']
        missing_fields = [name for name in required_fields if not hasattr(result, name)]

        if missing_fields:
            print(f"   ❌ Missing required fields: {missing_fields}")
            return False
        print(f"   ✅ All required GenerationState fields present")

        # Validate data structures
        print(f"\n📊 DATA STRUCTURE VALIDATION:")

        # Every ❌ below clears this flag so the final verdict matches the
        # individual check results.
        all_checks_passed = True

        # Check search_queries
        if isinstance(result.search_queries, list) and result.search_queries:
            print(f"   ✅ Search queries: {len(result.search_queries)} generated")
        else:
            print(f"   ❌ Search queries: Invalid structure or empty")
            all_checks_passed = False

        # Check literature_findings structure (only the first entry is inspected)
        if isinstance(result.literature_findings, list) and result.literature_findings:
            first_finding = result.literature_findings[0]
            required_finding_fields = ['title', 'abstract', 'authors', 'journal', 'year',
                                     'source', 'url', 'key_findings', 'relevance_score', 'citation_count']

            missing_finding_fields = [f for f in required_finding_fields if f not in first_finding]

            if missing_finding_fields:
                print(f"   ❌ Literature findings: Missing fields {missing_finding_fields}")
                all_checks_passed = False
            else:
                print(f"   ✅ Literature findings: {len(result.literature_findings)} papers with correct structure")
                print(f"       Sources: {set(f['source'] for f in result.literature_findings)}")
        else:
            print(f"   ❌ Literature findings: Invalid structure or empty")
            all_checks_passed = False

        # Check generated_proposals structure (only the first entry is inspected)
        proposals_valid = False
        if isinstance(result.generated_proposals, list) and result.generated_proposals:
            first_proposal = result.generated_proposals[0]
            required_proposal_fields = ['id', 'content', 'timestamp', 'based_on_sources', 'source_diversity']

            missing_proposal_fields = [f for f in required_proposal_fields if f not in first_proposal]

            if missing_proposal_fields:
                print(f"   ❌ Generated proposals: Missing fields {missing_proposal_fields}")
                all_checks_passed = False
            else:
                print(f"   ✅ Generated proposals: {len(result.generated_proposals)} hypotheses with correct structure")
                proposals_valid = True
        else:
            print(f"   ❌ Generated proposals: Invalid structure or empty")
            all_checks_passed = False

        # Check status. run_complete_workflow reports a caught failure by
        # prefixing the status with "failed_at_", so that counts as a failure.
        status = result.status
        if not status:
            print(f"   ❌ Status: Missing or empty")
            all_checks_passed = False
        elif isinstance(status, str) and status.startswith("failed_at_"):
            print(f"   ❌ Status: {status}")
            all_checks_passed = False
        else:
            print(f"   ✅ Status: {status}")

        # Sample output display (skipped when the proposal keys are not all
        # present, so a missing key cannot surface as an execution error)
        print(f"\n📋 SAMPLE OUTPUT:")
        if proposals_valid:
            sample_proposal = result.generated_proposals[0]
            print(f"   Proposal ID: {sample_proposal['id']}")
            print(f"   Content: {sample_proposal['content'][:100]}...")
            print(f"   Based on {sample_proposal['based_on_sources']} sources from {sample_proposal['source_diversity']} different databases")

        if not all_checks_passed:
            print(f"\n❌ INTERFACE CONTRACT VALIDATION: FAILED")
            return False

        print(f"\n✅ INTERFACE CONTRACT VALIDATION: PASSED")
        return True

    except Exception as e:
        print(f"\n❌ EXECUTION FAILED: {str(e)}")
        return False

# ============================================================================
# RUN THE INTERFACE TEST
# ============================================================================

if __name__ == "__main__":
    # Run the interface validation once; `success` is bound at module level
    # and holds the boolean verdict returned by the test function.
    success = test_literature_search_interface()

    # Print a closing banner that reflects the verdict.
    print(f"\n{'='*55}")
    if success:
        print("🎉 LITERATURE SEARCH AGENT INTERFACE: VALIDATED")
        print("🔗 Ready for integration with other agents")
    else:
        print("🚨 LITERATURE SEARCH AGENT INTERFACE: VALIDATION FAILED")
        print("🔧 Requires fixes before integration")
    print("="*55)

✅ Using real EnhancedGenerationAgent
🔬 TESTING LITERATURE SEARCH AGENT INTERFACE
📥 INPUT TEST:
   Research Goal: Develop AI-driven personalized medicine for neurodegeneration
   Constraints: ['Must integrate multiple data modalities', 'Focus on translational applications', 'Consider ethical implications']
🔍 Starting complete literature search workflow...
🔍 Generating multi-source search queries...
✅ Generated 4 optimized queries
   1. AI personalized medicine neurodegenerative diseases
   2. machine learning precision medicine Alzheimer
   3. computational biomarkers neurodegeneration
   4. artificial intelligence clinical decision support
📚 Searching multiple literature sources...




✅ Found 12 papers from 1 sources:
   • PubMed: 12 papers
🧠 Synthesizing knowledge from multiple sources...
✅ Multi-source knowledge synthesis complete
💡 Generating hypotheses from multi-source research...
✅ Generated hypotheses based on 12 papers from 1 sources
✅ Complete workflow finished successfully

⏱️ EXECUTION: Completed in 59.77 seconds

📤 OUTPUT VALIDATION:
   ✅ All required GenerationState fields present

📊 DATA STRUCTURE VALIDATION:
   ✅ Search queries: 4 generated
   ✅ Literature findings: 12 papers with correct structure
       Sources: {'PubMed'}
   ✅ Generated proposals: 1 hypotheses with correct structure
   ✅ Status: hypotheses_generated

📋 SAMPLE OUTPUT:
   Proposal ID: multi_source_hypothesis
   Content: **Novel Hypotheses for AI-Driven Personalized Medicine in Neurodegeneration:**

**Hypothesis 1: Mult...
   Based on 12 sources from 1 different databases

✅ INTERFACE CONTRACT VALIDATION: PASSED

🎉 LITERATURE SEARCH AGENT INTERFACE: VALIDATED
🔗 Ready for integration w

In [None]:
#@title Robust Reflection Agent - Flexible LLM Output Parsing
# ===========================================================================
# ROBUST REFLECTION AGENT IMPLEMENTATION
# Flexible Parsing to Handle Real LLM Output Variations
# ===========================================================================

# ===========================================================================
# DATA MODELS
# ===========================================================================

from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
import datetime # FIX: Changed import to be explicit
import json
import statistics
import re
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class ReviewCriteria:
    """Per-criterion scores and reasoning text for one hypothesis review."""
    # Numeric score for each of the five review criteria. The reviewer
    # prompts in this file ask the LLM for values on a 1-10 scale.
    novelty_score: float
    feasibility_score: float
    scientific_rigor_score: float
    impact_potential_score: float
    testability_score: float
    # Text explaining each score above, one string per criterion.
    novelty_reasoning: str
    feasibility_reasoning: str
    scientific_rigor_reasoning: str
    impact_potential_reasoning: str
    testability_reasoning: str

@dataclass
class HypothesisReview:
    """Complete review record for a single hypothesis."""
    # Identifier of the reviewed hypothesis, as supplied by the caller.
    hypothesis_id: str
    # Hypothesis text. RobustReflectionAgent._parse_flexible_review stores at
    # most the first 500 characters followed by "...".
    hypothesis_text: str
    # Per-criterion scores and reasoning.
    criteria: ReviewCriteria
    # Single aggregate score; _parse_flexible_review computes it as a
    # weighted sum of the five criterion scores.
    overall_score: float
    # Free-text summary of the review.
    overall_assessment: str
    # Lists of short text items extracted from the review.
    strengths: List[str]
    weaknesses: List[str]
    recommendations: List[str]
    # Reviewer's confidence in the review; the prompts ask for a 1-10 value.
    confidence_level: float
    # Creation time; _parse_flexible_review stores an ISO-8601 string.
    review_timestamp: str
    # Review mode label; quick_review passes "quick", detailed_review "detailed".
    reviewer_type: str

@dataclass
class ReflectionState:
    """Container for a batch of hypothesis reviews plus optional aggregate data."""
    # Reviews collected so far; each instance starts with its own empty list.
    hypothesis_reviews: List[HypothesisReview] = field(default_factory=list)
    # The three fields below default to None and are not populated by any code
    # visible in this part of the file.
    # NOTE(review): presumably filled in by a batch/summary step - confirm.
    review_statistics: Optional[Dict[str, float]] = None
    quality_flags: Optional[List[str]] = None
    batch_summary: Optional[str] = None


class RobustReflectionAgent:
    """
    Robust Reflection Agent that can handle various LLM output formats
    """

    def __init__(self, llm):
        self.llm = llm
        self.criteria_weights = {
            'novelty_score': 0.25,
            'feasibility_score': 0.20,
            'scientific_rigor_score': 0.25,
            'impact_potential_score': 0.20,
            'testability_score': 0.10
        }

    def quick_review(self, hypothesis_text: str, hypothesis_id: str) -> HypothesisReview:
        """Simplified quick review with more flexible parsing"""

        logger.info(f"Performing quick review of hypothesis: {hypothesis_id}")

        quick_prompt = f"""You are a scientific peer reviewer. Please evaluate this hypothesis:

HYPOTHESIS: {hypothesis_text}

Please provide your evaluation in this format:

**SCORES (1-10 scale):**
Novelty: [score] - [brief reason]
Feasibility: [score] - [brief reason]
Scientific Rigor: [score] - [brief reason]
Impact Potential: [score] - [brief reason]
Testability: [score] - [brief reason]

**STRENGTHS:**
• [strength 1]
• [strength 2]
• [strength 3]

**WEAKNESSES:**
• [weakness 1]
• [weakness 2]
• [weakness 3]

**RECOMMENDATIONS:**
• [recommendation 1]
• [recommendation 2]
• [recommendation 3]

**OVERALL:** [2-3 sentence summary assessment]

**CONFIDENCE:** [1-10] in your review accuracy"""

        try:
            response = self.llm.invoke([("human", quick_prompt)])
            return self._parse_flexible_review(response.content, hypothesis_text, hypothesis_id, "quick")

        except Exception as e:
            logger.error(f"Quick review failed for {hypothesis_id}: {str(e)}")
            return self._create_enhanced_fallback_review(hypothesis_text, hypothesis_id, "quick")

    def detailed_review(self, hypothesis_text: str, hypothesis_id: str, research_goal: str = "") -> HypothesisReview:
        """Detailed review with flexible parsing"""

        logger.info(f"Performing detailed review of hypothesis: {hypothesis_id}")

        detailed_prompt = f"""You are a senior scientific peer reviewer. Conduct a comprehensive evaluation of this hypothesis.

RESEARCH CONTEXT: {research_goal}

HYPOTHESIS: {hypothesis_text}

Please structure your review as follows:

**DETAILED EVALUATION:**

1. **Novelty (1-10):** [score]
   [Detailed assessment of originality and innovation]

2. **Feasibility (1-10):** [score]
   [Analysis of technical achievability and resource requirements]

3. **Scientific Rigor (1-10):** [score]
   [Evaluation of methodological soundness and evidence base]

4. **Impact Potential (1-10):** [score]
   [Assessment of significance and broader implications]

5. **Testability (1-10):** [score]
   [Analysis of experimental validation approaches]

**COMPREHENSIVE ASSESSMENT:**

**Major Strengths:**
• [detailed strength 1]
• [detailed strength 2]
• [detailed strength 3]
• [detailed strength 4]

**Key Concerns:**
• [detailed concern 1]
• [detailed concern 2]
• [detailed concern 3]
• [detailed concern 4]

**Improvement Recommendations:**
• [specific recommendation 1]
• [specific recommendation 2]
• [specific recommendation 3]
• [specific recommendation 4]

**Summary:** [3-4 sentence overall evaluation]

**Review Confidence:** [1-10]"""

        try:
            response = self.llm.invoke([("human", detailed_prompt)])
            return self._parse_flexible_review(response.content, hypothesis_text, hypothesis_id, "detailed")

        except Exception as e:
            logger.error(f"Detailed review failed for {hypothesis_id}: {str(e)}")
            return self._create_enhanced_fallback_review(hypothesis_text, hypothesis_id, "detailed")

    def _parse_flexible_review(self, response_text: str, hypothesis_text: str,
                              hypothesis_id: str, review_type: str) -> HypothesisReview:
        """Flexible parsing that handles various LLM output formats

        Strips markdown decoration from the reply, then delegates to the
        ``_extract_*`` helpers for scores, confidence, the three bullet lists
        and the overall assessment.

        Args:
            response_text: Raw text of the LLM reply.
            hypothesis_text: Hypothesis under review; stored truncated to
                500 characters plus "...".
            hypothesis_id: Identifier copied onto the result and used in logs.
            review_type: Label stored as ``reviewer_type``.

        Returns:
            A HypothesisReview built from the extracted values. If anything in
            the parsing path raises, the error is logged and the heuristic
            fallback review is returned instead.
        """

        logger.info(f"Parsing review response for {hypothesis_id}")

        try:
            # Turn markdown "* item" bullets into "• item" first. The blanket
            # '*' removal below would otherwise erase the marker that the list
            # extractor uses to recognise bullet lines.
            bulleted_text = re.sub(r'(?m)^([ \t]*)\*[ \t]+', r'\1• ', response_text)

            # Clean the response text
            cleaned_text = bulleted_text.replace('*', '').replace('#', '').replace('`', '')

            # Extract scores using multiple pattern attempts
            scores = self._extract_scores_flexible(cleaned_text)

            # Extract confidence
            confidence = self._extract_confidence_flexible(cleaned_text)

            # Extract lists using flexible patterns
            strengths = self._extract_lists_flexible(cleaned_text, ["strength", "strengths", "positive", "advantages"])
            weaknesses = self._extract_lists_flexible(cleaned_text, ["weakness", "weaknesses", "concern", "concerns", "limitation", "limitations"])
            recommendations = self._extract_lists_flexible(cleaned_text, ["recommendation", "recommendations", "suggest", "improve"])

            # Extract overall assessment
            overall_assessment = self._extract_overall_flexible(cleaned_text)

            # Look each score up once; 6.0 stands in for a missing criterion.
            default_score = 6.0
            novelty = scores.get('novelty', default_score)
            feasibility = scores.get('feasibility', default_score)
            rigor = scores.get('rigor', default_score)
            impact = scores.get('impact', default_score)
            testability = scores.get('testability', default_score)

            # The reasoning fields are generated from the scores; the
            # reviewer's own justification text is not extracted here.
            criteria = ReviewCriteria(
                novelty_score=novelty,
                feasibility_score=feasibility,
                scientific_rigor_score=rigor,
                impact_potential_score=impact,
                testability_score=testability,
                novelty_reasoning=f"Novelty assessment: {novelty}/10",
                feasibility_reasoning=f"Feasibility assessment: {feasibility}/10",
                scientific_rigor_reasoning=f"Scientific rigor assessment: {rigor}/10",
                impact_potential_reasoning=f"Impact potential assessment: {impact}/10",
                testability_reasoning=f"Testability assessment: {testability}/10"
            )

            # Calculate weighted overall score from the weights configured in
            # __init__ (keyed by ReviewCriteria field name), so the table is
            # the single source of truth rather than a second hard-coded copy.
            overall_score = 0.0
            for field_name, weight in self.criteria_weights.items():
                overall_score += getattr(criteria, field_name) * weight

            return HypothesisReview(
                hypothesis_id=hypothesis_id,
                hypothesis_text=hypothesis_text[:500] + "..." if len(hypothesis_text) > 500 else hypothesis_text,
                criteria=criteria,
                overall_score=overall_score,
                overall_assessment=overall_assessment,
                strengths=strengths,
                weaknesses=weaknesses,
                recommendations=recommendations,
                confidence_level=confidence,
                review_timestamp=datetime.datetime.now().isoformat(),
                reviewer_type=review_type
            )

        except Exception as e:
            logger.error(f"Flexible parsing failed for {hypothesis_id}: {str(e)}")
            # If all parsing fails, create enhanced fallback
            return self._create_enhanced_fallback_review(hypothesis_text, hypothesis_id, review_type)

    def _extract_scores_flexible(self, text: str) -> Dict[str, float]:
        """Extract scores using multiple flexible patterns"""

        scores = {}

        # Define multiple patterns for each criterion
        pattern_sets = {
            'novelty': [
                r'novelty.*?(\\d+(?:\\.\\d+)?)',
                r'original.*?(\\d+(?:\\.\\d+)?)',
                r'innovation.*?(\\d+(?:\\.\\d+)?)',
                r'novel.*?(\\d+(?:\\.\\d+)?)'
            ],
            'feasibility': [
                r'feasibility.*?(\\d+(?:\\.\\d+)?)',
                r'feasible.*?(\\d+(?:\\.\\d+)?)',
                r'achievable.*?(\\d+(?:\\.\\d+)?)',
                r'practical.*?(\\d+(?:\\.\\d+)?)'
            ],
            'rigor': [
                r'rigor.*?(\\d+(?:\\.\\d+)?)',
                r'rigorous.*?(\\d+(?:\\.\\d+)?)',
                r'scientific.*?rigor.*?(\\d+(?:\\.\\d+)?)',
                r'methodological.*?(\\d+(?:\\.\\d+)?)',
                r'evidence.*?(\\d+(?:\\.\\d+)?)'
            ],
            'impact': [
                r'impact.*?(\\d+(?:\\.\\d+)?)',
                r'significance.*?(\\d+(?:\\.\\d+)?)',
                r'important.*?(\\d+(?:\\.\\d+)?)',
                r'potential.*?(\\d+(?:\\.\\d+)?)'
            ],
            'testability': [
                r'testability.*?(\\d+(?:\\.\\d+)?)',
                r'testable.*?(\\d+(?:\\.\\d+)?)',
                r'validation.*?(\\d+(?:\\.\\d+)?)',
                r'experimental.*?(\\d+(?:\\.\\d+)?)'
            ]
        }

        for criterion, patterns in pattern_sets.items():
            score_found = False
            for pattern in patterns:
                match = re.search(pattern, text.lower())
                if match:
                    try:
                        score = float(match.group(1))
                        if 1 <= score <= 10:  # Validate score range
                            scores[criterion] = score
                            score_found = True
                            break
                    except ValueError:
                        continue

            if not score_found:
                # Look for any number in proximity to criterion words
                criterion_word = criterion if criterion != 'rigor' else 'rigor'
                proximity_pattern = f'{criterion_word}.{{0,50}}?(\\d+)'
                match = re.search(proximity_pattern, text.lower())
                if match:
                    try:
                        score = float(match.group(1))
                        if 1 <= score <= 10:
                            scores[criterion] = score
                        else:
                            scores[criterion] = 6.0  # Default
                    except ValueError:
                        scores[criterion] = 6.0
                else:
                    scores[criterion] = 6.0

        return scores

    def _extract_confidence_flexible(self, text: str) -> float:
        """Extract confidence score using flexible patterns"""

        patterns = [
            r'confidence.*?(\\d+(?:\\.\\d+)?)',
            r'confident.*?(\\d+(?:\\.\\d+)?)',
            r'certainty.*?(\\d+(?:\\.\\d+)?)',
            r'sure.*?(\\d+(?:\\.\\d+)?)'
        ]

        for pattern in patterns:
            match = re.search(pattern, text.lower())
            if match:
                try:
                    confidence = float(match.group(1))
                    if 1 <= confidence <= 10:
                        return confidence
                except ValueError:
                    continue

        return 7.0  # Default confidence

    def _extract_lists_flexible(self, text: str, keywords: List[str]) -> List[str]:
        """Extract bulleted or numbered lists using flexible patterns"""

        items = []

        for keyword in keywords:
            # Find section containing the keyword
            lines = text.split('\\n')
            section_started = False

            for i, line in enumerate(lines):
                if keyword.lower() in line.lower() and ':' in line:
                    section_started = True
                    continue

                if section_started:
                    line = line.strip()

                    # Stop if we hit another section header
                    if line and line.endswith(':') and any(word in line.lower() for word in ['score', 'assessment', 'evaluation', 'summary']):
                        break

                    # Extract bullet points or numbered items
                    if re.match(r'^[-•*]\\s+', line) or re.match(r'^\\d+\\.\\s+', line):
                        cleaned_line = re.sub(r'^[-•*\\d.]\\s*', '', line)
                        if len(cleaned_line) > 5:  # Ensure meaningful content
                            items.append(cleaned_line[:150])  # Limit length

                    # Also catch lines that start with text but are clearly list items
                    elif line and not line.startswith(('**', '#', 'Score', 'Assessment')):
                        if 10 < len(line) < 200:  # Reasonable length for list item
                            items.append(line[:150])

            if items:  # If we found items with this keyword, use them
                break

        # If no structured lists found, try to extract from general text
        if not items:
            # Look for sentences that contain action words or assessment language
            sentences = re.split(r'[.!?]+', text)
            for sentence in sentences:
                sentence = sentence.strip()
                if (any(keyword.lower() in sentence.lower() for keyword in keywords) and
                    len(sentence) > 15 and len(sentence) < 200):
                    items.append(sentence[:150])

        return items[:4] if items else [f"No specific {keywords[0]} identified"]

    def _extract_overall_flexible(self, text: str) -> str:
        """Extract overall assessment using flexible patterns"""

        keywords = ["overall", "summary", "assessment", "conclusion", "evaluation"]

        for keyword in keywords:
            # Look for keyword followed by colon
            pattern = f'{keyword}.*?:(.*?)(?:[A-Z]{{2,}}.*?:|$)'
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                assessment = match.group(1).strip()
                if len(assessment) > 20:
                    return assessment[:400] + "..." if len(assessment) > 400 else assessment

        # Fallback: look for any substantial paragraph
        paragraphs = [p.strip() for p in text.split('\\n\\n') if len(p.strip()) > 50]
        if paragraphs:
            # Take the longest paragraph as likely assessment
            longest = max(paragraphs, key=len)
            return longest[:400] + "..." if len(longest) > 400 else longest

        return "Overall assessment could not be extracted from review."

    def _create_enhanced_fallback_review(self, hypothesis_text: str, hypothesis_id: str, review_type: str) -> HypothesisReview:
        """Create a fallback review whose scores come from keyword heuristics.

        Each criterion gets one of two fixed scores depending on whether the
        lower-cased hypothesis text contains any of a small set of substrings.
        Strengths, weaknesses and recommendations are assembled the same way.

        Args:
            hypothesis_text: Full text of the hypothesis being reviewed.
            hypothesis_id: Identifier stored on the resulting review.
            review_type: Base review type; "_heuristic" is appended to it so
                the result can be recognised as a heuristic review.

        Returns:
            A HypothesisReview with heuristic scores, a fixed confidence level
            of 5.0 and the current local time as an ISO-format timestamp.
        """
        # Imported locally under a private alias: this file binds the global
        # name `datetime` to the class in one cell (`from datetime import
        # datetime`) and to the module in another (`import datetime`), so the
        # global name cannot be relied on to be either one here.
        from datetime import datetime as _datetime

        text_lower = hypothesis_text.lower()

        def _mentions(*words: str) -> bool:
            # Plain substring test, so e.g. 'new' also matches "renewal".
            return any(word in text_lower for word in words)

        # Heuristic scoring based on content analysis
        novelty_score = 6.5 if _mentions('novel', 'new', 'innovative', 'breakthrough') else 5.5
        feasibility_score = 7.0 if _mentions('demonstrated', 'established', 'proven', 'validated') else 6.0
        rigor_score = 7.0 if _mentions('literature', 'evidence', 'studies', 'research') else 5.5
        impact_score = 7.5 if _mentions('therapeutic', 'clinical', 'treatment', 'therapy') else 6.0
        testability_score = 7.0 if _mentions('experiment', 'test', 'measure', 'assay') else 6.0

        criteria = ReviewCriteria(
            novelty_score=novelty_score,
            feasibility_score=feasibility_score,
            scientific_rigor_score=rigor_score,
            impact_potential_score=impact_score,
            testability_score=testability_score,
            novelty_reasoning=f"Heuristic assessment based on text analysis: {novelty_score}/10",
            feasibility_reasoning=f"Heuristic assessment based on text analysis: {feasibility_score}/10",
            scientific_rigor_reasoning=f"Heuristic assessment based on text analysis: {rigor_score}/10",
            impact_potential_reasoning=f"Heuristic assessment based on text analysis: {impact_score}/10",
            testability_reasoning=f"Heuristic assessment based on text analysis: {testability_score}/10"
        )

        # Weighted overall score; the weights sum to 1.0.
        overall_score = (novelty_score * 0.25 + feasibility_score * 0.20 +
                        rigor_score * 0.25 + impact_score * 0.20 + testability_score * 0.10)

        # Generate heuristic strengths and weaknesses
        strengths = ["Addresses important research question"]
        if "epigenetic" in text_lower:
            strengths.append("Focuses on promising epigenetic mechanisms")
        if "therapeutic" in text_lower or "treatment" in text_lower:
            strengths.append("Has clear therapeutic relevance")
        if "experiment" in text_lower:
            strengths.append("Includes experimental validation approach")

        weaknesses = ["Requires detailed expert review for accurate assessment"]
        if len(hypothesis_text) < 200:
            weaknesses.append("Limited detail in hypothesis description")
        if "novel" not in text_lower and "new" not in text_lower:
            weaknesses.append("Novelty could be better articulated")

        recommendations = ["Conduct detailed expert peer review"]
        if "literature" not in text_lower:
            recommendations.append("Strengthen literature foundation")
        if "experiment" not in text_lower:
            recommendations.append("Develop specific experimental protocols")

        return HypothesisReview(
            hypothesis_id=hypothesis_id,
            # Store at most 300 characters of the hypothesis text.
            hypothesis_text=hypothesis_text[:300] + "..." if len(hypothesis_text) > 300 else hypothesis_text,
            criteria=criteria,
            overall_score=overall_score,
            overall_assessment=f"Automated heuristic assessment. The hypothesis shows promise with an overall score of {overall_score:.1f}/10. Manual expert review recommended for detailed evaluation.",
            strengths=strengths,
            weaknesses=weaknesses,
            recommendations=recommendations,
            confidence_level=5.0,  # Medium confidence for heuristic assessment
            review_timestamp=_datetime.now().isoformat(),
            reviewer_type=review_type + "_heuristic"
        )

    def adaptive_batch_review(self, hypotheses: List[Dict[str, Any]], research_goal: str = "",
                            quick_threshold: float = 6.5, detailed_threshold: float = 7.5) -> ReflectionState:
        """Review a batch of hypotheses, escalating strong ones to a detailed review.

        Every hypothesis first gets a quick review. A detailed review replaces
        it only when the quick score reaches ``detailed_threshold`` and the
        quick review's type does not end in "_heuristic". Statistics, quality
        flags and a text summary are then computed over the kept reviews.

        Args:
            hypotheses: Dicts read through their 'id' and 'content' keys; a
                missing id becomes "hypothesis_<position>" (1-based) and a
                missing content becomes ``str(hypothesis)``.
            research_goal: Passed through to the detailed review.
            quick_threshold: Accepted but not used by this method.
            detailed_threshold: Minimum quick score that triggers a detailed
                review.

        Returns:
            A ReflectionState holding one review per hypothesis plus the
            derived statistics, flags and summary.
        """
        logger.info(f"Starting robust batch review of {len(hypotheses)} hypotheses")

        state = ReflectionState()

        for position, entry in enumerate(hypotheses, start=1):
            entry_id = entry.get('id', f"hypothesis_{position}")
            entry_text = entry.get('content', str(entry))

            # The quick pass always runs; it decides whether to escalate.
            first_pass = self.quick_review(entry_text, entry_id)

            escalate = first_pass.overall_score >= detailed_threshold
            if escalate:
                # Heuristic fallbacks are never escalated, whatever they scored.
                escalate = not first_pass.reviewer_type.endswith('_heuristic')

            if not escalate:
                logger.info(f"Hypothesis {entry_id} scored {first_pass.overall_score:.1f}, using quick review")
                state.hypothesis_reviews.append(first_pass)
                continue

            logger.info(f"Hypothesis {entry_id} scored {first_pass.overall_score:.1f}, performing detailed review")
            in_depth = self.detailed_review(entry_text, entry_id, research_goal)
            state.hypothesis_reviews.append(in_depth)

        # Derive the analytics from whatever reviews were kept.
        kept_reviews = state.hypothesis_reviews
        state.review_statistics = self._compute_robust_statistics(kept_reviews)
        state.quality_flags = self._identify_robust_quality_flags(kept_reviews)
        state.batch_summary = self._generate_robust_summary(state)

        logger.info(f"Robust batch review completed: {len(state.hypothesis_reviews)} reviews generated")
        return state

    def _compute_robust_statistics(self, reviews: List[HypothesisReview]) -> Dict[str, float]:
        """Compute robust statistics with better error handling"""

        if not reviews:
            return {'error': 'No reviews available'}

        overall_scores = [r.overall_score for r in reviews if r.overall_score is not None]
        confidence_scores = [r.confidence_level for r in reviews if r.confidence_level is not None]

        if not overall_scores:
            return {'error': 'No valid scores found'}

        criteria_scores = {
            'novelty': [r.criteria.novelty_score for r in reviews if r.criteria and r.criteria.novelty_score is not None],
            'feasibility': [r.criteria.feasibility_score for r in reviews if r.criteria and r.criteria.feasibility_score is not None],
            'rigor': [r.criteria.scientific_rigor_score for r in reviews if r.criteria and r.criteria.scientific_rigor_score is not None],
            'impact': [r.criteria.impact_potential_score for r in reviews if r.criteria and r.criteria.impact_potential_score is not None],
            'testability': [r.criteria.testability_score for r in reviews if r.criteria and r.criteria.testability_score is not None]
        }

        stats = {
            'mean_overall_score': statistics.mean(overall_scores),
            'median_overall_score': statistics.median(overall_scores),
            'std_overall_score': statistics.stdev(overall_scores) if len(overall_scores) > 1 else 0,
            'max_overall_score': max(overall_scores),
            'min_overall_score': min(overall_scores),
            'mean_confidence': statistics.mean(confidence_scores) if confidence_scores else 5.0,
            'excellent_count': len([s for s in overall_scores if s >= 8.5]),
            'high_scoring_count': len([s for s in overall_scores if s >= 7.5]),
            'moderate_scoring_count': len([s for s in overall_scores if 5.0 <= s < 7.5]),
            'low_scoring_count': len([s for s in overall_scores if s < 5.0]),
            'detailed_review_count': len([r for r in reviews if r.reviewer_type == "detailed"]),
            'quick_review_count': len([r for r in reviews if r.reviewer_type == "quick"]),
            'heuristic_review_count': len([r for r in reviews if "heuristic" in r.reviewer_type])
        }

        # Add criteria-specific statistics safely
        for criterion, scores in criteria_scores.items():
            if scores:
                stats[f'mean_{criterion}'] = statistics.mean(scores)
                stats[f'max_{criterion}'] = max(scores)
                stats[f'min_{criterion}'] = min(scores)
            else:
                stats[f'mean_{criterion}'] = 6.0
                stats[f'max_{criterion}'] = 6.0
                stats[f'min_{criterion}'] = 6.0

        return stats

    def _identify_robust_quality_flags(self, reviews: List[HypothesisReview]) -> List[str]:
        """Identify quality flags with robust error handling"""

        flags = []

        if not reviews:
            flags.append("NO_REVIEWS_GENERATED")
            return flags

        # Count review types
        heuristic_count = len([r for r in reviews if "heuristic" in r.reviewer_type])
        total_count = len(reviews)

        if heuristic_count == total_count:
            flags.append("ALL_HEURISTIC_REVIEWS")
        elif heuristic_count > total_count * 0.5:
            flags.append("MAJORITY_HEURISTIC_REVIEWS")

        # Check scores safely
        overall_scores = [r.overall_score for r in reviews if r.overall_score is not None]
        confidence_scores = [r.confidence_level for r in reviews if r.confidence_level is not None]

        if overall_scores:
            mean_score = statistics.mean(overall_scores)
            if mean_score < 5.0:
                flags.append("LOW_AVERAGE_QUALITY")
            elif mean_score >= 8.0:
                flags.append("HIGH_QUALITY_BATCH")

        if confidence_scores:
            mean_confidence = statistics.mean(confidence_scores)
            if mean_confidence < 6.0:
                flags.append("LOW_REVIEWER_CONFIDENCE")
            elif mean_confidence >= 8.5:
                flags.append("HIGH_REVIEWER_CONFIDENCE")

        return flags

    def _generate_robust_summary(self, reflection_state: ReflectionState) -> str:
        """Generate robust summary with safe data access"""

        stats = reflection_state.review_statistics
        flags = reflection_state.quality_flags
        reviews = reflection_state.hypothesis_reviews

        if not reviews:
            return "No reviews were generated. Please check the input data and try again."

        # Safe access to statistics
        total_reviews = len(reviews)
        mean_score = stats.get('mean_overall_score', 0)
        mean_confidence = stats.get('mean_confidence', 0)

        summary = f"""
ROBUST REFLECTION AGENT BATCH SUMMARY

📊 REVIEW OVERVIEW:
• Total hypotheses reviewed: {total_reviews}
• Heuristic reviews: {stats.get('heuristic_review_count', 0)}
• Quick reviews: {stats.get('quick_review_count', 0)}
• Detailed reviews: {stats.get('detailed_review_count', 0)}

📈 PERFORMANCE METRICS:
• Mean overall score: {mean_score:.2f}/10
• Score range: {stats.get('min_overall_score', 0):.1f} - {stats.get('max_overall_score', 0):.1f}
• Reviewer confidence: {mean_confidence:.1f}/10

🎯 SCORE DISTRIBUTION:
• Excellent (≥8.5): {stats.get('excellent_count', 0)} hypotheses
• High (7.5-8.4): {stats.get('high_scoring_count', 0)} hypotheses
• Moderate (5.0-7.4): {stats.get('moderate_scoring_count', 0)} hypotheses
• Low (<5.0): {stats.get('low_scoring_count', 0)} hypotheses

⚠️ QUALITY INDICATORS: {', '.join(flags) if flags else 'None detected'}

🏆 TOP HYPOTHESIS: {self._get_top_hypothesis_summary(reviews)}

💡 NEXT STEPS:
{self._generate_next_steps_recommendations(stats, flags)}
"""

        return summary.strip()

    def _get_top_hypothesis_summary(self, reviews: List[HypothesisReview]) -> str:
        """Safely get top hypothesis summary"""
        if not reviews:
            return "No reviews available"

        valid_reviews = [r for r in reviews if r.overall_score is not None]
        if not valid_reviews:
            return "No valid scores available"

        top_review = max(valid_reviews, key=lambda r: r.overall_score)
        return f"{top_review.hypothesis_id} (Score: {top_review.overall_score:.2f}/10)"

    def _generate_next_steps_recommendations(self, stats: Dict[str, float], flags: List[str]) -> str:
        """Generate actionable next steps"""

        recommendations = []

        mean_score = stats.get('mean_overall_score', 0)
        heuristic_count = stats.get('heuristic_review_count', 0)
        total_count = stats.get('heuristic_review_count', 0) + stats.get('quick_review_count', 0) + stats.get('detailed_review_count', 0)

        # Primary recommendations based on review quality
        if "ALL_HEURISTIC_REVIEWS" in flags:
            recommendations.append("• CRITICAL: All reviews were heuristic - improve LLM prompt parsing")
        elif "MAJORITY_HEURISTIC_REVIEWS" in flags:
            recommendations.append("• IMPROVE: Majority heuristic reviews - enhance LLM response structure")

        # Score-based recommendations
        if mean_score >= 7.5:
            recommendations.append("• PROCEED: High quality hypotheses - ready for Ranking Agent")
        elif mean_score >= 6.0:
            recommendations.append("• CONSIDER: Moderate quality - may benefit from Evolution Agent")
        else:
            recommendations.append("• REVISE: Low quality scores - return to Generation Agent")

        # Confidence-based recommendations
        if "LOW_REVIEWER_CONFIDENCE" in flags:
            recommendations.append("• VALIDATE: Low confidence suggests need for human expert review")

        # Default recommendation
        if not recommendations:
            recommendations.append("• CONTINUE: Proceed with current workflow")

        return '\\n'.join(recommendations)

# ===========================================================================
# TESTING THE ROBUST AGENT
# ===========================================================================

def test_robust_reflection_agent():
    """Smoke-test the robust reflection agent on two sample hypotheses.

    Builds a RobustReflectionAgent from the global ``llm``, runs
    ``adaptive_batch_review`` on two hard-coded liver-fibrosis hypotheses and
    prints the batch summary followed by a breakdown of the first review.

    Returns:
        The state returned by ``adaptive_batch_review``, or None when
        constructing the agent raises NameError.
    """
    section_rule = "-" * 40
    banner_rule = "=" * 55

    def print_numbered(entries):
        # Show at most the first three entries as a 1-based list.
        for number, entry in enumerate(entries[:3], 1):
            print(f"  {number}. {entry}")

    print("🛡️ TESTING ROBUST REFLECTION AGENT")
    print(banner_rule)

    # Test with the same hypotheses but expect better parsing
    epigenetic_text = """
            Novel Hypothesis: Epigenetic Reprogramming of Hepatic Stellate Cells

            We hypothesize that targeted inhibition of DNMT3A in activated hepatic stellate cells will reverse pathological methylation patterns and restore their quiescent phenotype, leading to resolution of liver fibrosis. This innovative approach leverages the reversible nature of epigenetic modifications.

            Scientific Rationale:
            - DNMT3A upregulation is consistently demonstrated in fibrotic liver tissue
            - Hypermethylation of anti-fibrotic genes like PPAR-γ correlates with stellate cell activation
            - Epigenetic targets offer therapeutic advantages due to reversibility

            Experimental Validation:
            1. In vitro DNMT3A knockdown in activated human stellate cells
            2. Genome-wide methylation profiling using bisulfite sequencing
            3. In vivo delivery using stellate cell-specific nanoparticles
            4. Assessment of fibrosis markers and liver function recovery

            Expected Impact: Could lead to first-in-class epigenetic therapy for liver fibrosis with potential for clinical translation within 3-5 years.
            """
    ai_personalized_text = """
            Breakthrough Hypothesis: AI-Driven Personalized Epigenetic Therapy

            We propose developing a machine learning platform that analyzes individual patient epigenetic profiles to predict optimal combination therapies targeting multiple epigenetic enzymes (DNMT, HDAC, BRD4) for personalized liver fibrosis treatment.

            Innovation: This represents the first AI-guided approach to epigenetic combination therapy, addressing the critical need for personalized medicine in liver fibrosis treatment.

            Technical Approach:
            - Multi-omics profiling of fibrosis patients (methylome, transcriptome, clinical data)
            - Deep learning algorithms for treatment response prediction
            - Validation in patient-derived organoid models
            - Clinical pilot study with AI-recommended therapies

            Transformative Potential: Could revolutionize liver fibrosis treatment by enabling precision medicine approaches with significantly improved patient outcomes.
            """
    test_hypotheses = [
        {"id": "hypothesis_epigenetic", "content": epigenetic_text},
        {"id": "hypothesis_ai_personalized", "content": ai_personalized_text},
    ]

    # `llm` is expected as a global created by an earlier setup cell.
    try:
        agent = RobustReflectionAgent(llm)
    except NameError:
        print("❌ Error: LLM not available. Run setup cell first.")
        return

    print(f"📋 Testing with {len(test_hypotheses)} enhanced hypotheses")

    # Test robust batch review
    print("\n🔍 TESTING ROBUST BATCH REVIEW")
    print(section_rule)

    goal = "Discover novel epigenetic targets for liver fibrosis treatment"

    results = agent.adaptive_batch_review(
        test_hypotheses,
        goal,
        quick_threshold=5.0,
        detailed_threshold=7.0
    )

    print("✅ Robust batch review completed")
    print(f"   Total reviews: {len(results.hypothesis_reviews)}")
    print(f"   Quality flags: {len(results.quality_flags)}")

    # Display robust summary
    print("\n📊 ROBUST BATCH SUMMARY")
    print(section_rule)
    print(results.batch_summary)

    # Show sample review with robust parsing
    print("\n🔬 SAMPLE ROBUST REVIEW")
    print(section_rule)

    if results.hypothesis_reviews:
        first = results.hypothesis_reviews[0]
        print(f"Hypothesis: {first.hypothesis_id}")
        print(f"Overall Score: {first.overall_score:.2f}/10")
        print(f"Review Type: {first.reviewer_type}")
        print(f"Confidence: {first.confidence_level:.1f}/10")

        print("\nCriteria Breakdown:")
        criterion_rows = (
            ("Novelty", "novelty_score"),
            ("Feasibility", "feasibility_score"),
            ("Scientific Rigor", "scientific_rigor_score"),
            ("Impact Potential", "impact_potential_score"),
            ("Testability", "testability_score"),
        )
        for label, attribute in criterion_rows:
            print(f"  {label}: {getattr(first.criteria, attribute):.1f}/10")

        print(f"\nRobust Strengths ({len(first.strengths)}):")
        print_numbered(first.strengths)

        print(f"\nIdentified Weaknesses ({len(first.weaknesses)}):")
        print_numbered(first.weaknesses)

        print("\nOverall Assessment:")
        print(f"  {first.overall_assessment[:200]}...")

    print("\n" + banner_rule)
    print("🎉 ROBUST REFLECTION AGENT TESTING COMPLETE!")
    print("🛡️ Enhanced error handling and flexible parsing working")

    return results

# Run the robust test when this code executes as the main module. The return
# value is kept in `robust_results` for later inspection; it is None when the
# agent could not be constructed.
if __name__ == "__main__":
    robust_results = test_robust_reflection_agent()

🛡️ TESTING ROBUST REFLECTION AGENT
📋 Testing with 2 enhanced hypotheses

🔍 TESTING ROBUST BATCH REVIEW
----------------------------------------
✅ Robust batch review completed
   Total reviews: 2
   Quality flags: 1

📊 ROBUST BATCH SUMMARY
----------------------------------------
ROBUST REFLECTION AGENT BATCH SUMMARY

📊 REVIEW OVERVIEW:
• Total hypotheses reviewed: 2
• Heuristic reviews: 0
• Quick reviews: 0
• Detailed reviews: 2

📈 PERFORMANCE METRICS:
• Mean overall score: 4.27/10
• Score range: 1.0 - 7.5
• Reviewer confidence: 7.0/10

🎯 SCORE DISTRIBUTION:
• Excellent (≥8.5): 0 hypotheses
• High (7.5-8.4): 1 hypotheses
• Moderate (5.0-7.4): 0 hypotheses
• Low (<5.0): 1 hypotheses

⚠️ QUALITY INDICATORS: LOW_AVERAGE_QUALITY

🏆 TOP HYPOTHESIS: hypothesis_ai_personalized (Score: 7.55/10)

💡 NEXT STEPS:
• REVISE: Low quality scores - return to Generation Agent

🔬 SAMPLE ROBUST REVIEW
----------------------------------------
Hypothesis: hypothesis_epigenetic
Overall Score: 1.00/10
Review

In [None]:
#@title Ranking Agent - Elo Tournament & Pairwise Comparison System
# ===========================================================================
# RANKING AGENT IMPLEMENTATION
# Elo-Based Tournament System for Hypothesis Ranking
# ===========================================================================

import json
import math
import random
import statistics
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
import datetime # FIX: Added the missing datetime import
import logging
from itertools import combinations
import re # Added missing re import for parsing

# Configure logging.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, and the setup cell at the top of this file calls it as well, so
# this call only takes effect if it is the first one to run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ===========================================================================
# RANKING AGENT DATA STRUCTURES
# ===========================================================================

@dataclass
class EloRating:
    """Elo rating and win/loss/draw record for one hypothesis."""
    hypothesis_id: str
    current_rating: float
    initial_rating: float
    games_played: int
    wins: int
    losses: int
    draws: int
    rating_history: List[float]

    def __post_init__(self):
        # A caller may pass None for the history; start it at the initial rating.
        if self.rating_history is None:
            self.rating_history = [self.initial_rating]

    @property
    def win_rate(self) -> float:
        """Wins as a percentage of all recorded results (0.0 with no results)."""
        recorded = self.wins + self.losses + self.draws
        if recorded > 0:
            return self.wins / recorded * 100
        return 0.0

    def update_rating(self, new_rating: float, result: str):
        """Record one game: store the new rating and bump the matching counter.

        ``result`` is "win" or "loss"; any other value is counted as a draw.
        """
        self.current_rating = new_rating
        self.rating_history.append(new_rating)
        self.games_played += 1

        if result not in ("win", "loss"):
            self.draws += 1
        elif result == "win":
            self.wins += 1
        else:
            self.losses += 1

@dataclass
class PairwiseComparison:
    """Record of a single pairwise comparison between two hypotheses.

    All fields are required; the class defines no defaults and no behavior.
    """
    comparison_id: str  # Identifier of this comparison record
    hypothesis_a_id: str  # Id of the first hypothesis in the pair
    hypothesis_b_id: str  # Id of the second hypothesis in the pair
    winner_id: Optional[str]  # None for draw
    confidence: float  # 1-10 scale
    reasoning: str
    debate_transcript: str
    reflection_influence: float  # How much reflection scores influenced decision
    timestamp: str
    comparison_round: int

@dataclass
class RankingState:
    """Complete state of the ranking tournament.

    Container fields default to None in the signature and are replaced by a
    fresh empty dict or list after construction, so instances never share a
    container.
    """
    elo_ratings: Dict[str, EloRating] = None
    pairwise_comparisons: List[PairwiseComparison] = None
    tournament_rounds: int = 0
    final_rankings: List[Dict[str, Any]] = None
    ranking_statistics: Dict[str, float] = None
    convergence_metrics: Dict[str, float] = None
    tournament_summary: str = ""

    def __post_init__(self):
        # Field name -> factory for the empty container it should start with.
        empty_containers = (
            ("elo_ratings", dict),
            ("pairwise_comparisons", list),
            ("final_rankings", list),
            ("ranking_statistics", dict),
            ("convergence_metrics", dict),
        )
        for field_name, make_empty in empty_containers:
            if getattr(self, field_name) is None:
                setattr(self, field_name, make_empty())

# ===========================================================================
# ELO RATING SYSTEM
# ===========================================================================

class EloSystem:
    """Elo rating arithmetic used by the hypothesis tournament."""

    def __init__(self, k_factor: float = 32, initial_rating: float = 1200):
        self.initial_rating = initial_rating
        # K-factor: the largest rating change a single result can cause.
        self.k_factor = k_factor

    def expected_score(self, rating_a: float, rating_b: float) -> float:
        """Return A's expected score (between 0 and 1) against B."""
        exponent = (rating_b - rating_a) / 400
        return 1 / (1 + 10**exponent)

    def update_ratings(self, rating_a: float, rating_b: float,
                      actual_score_a: float) -> Tuple[float, float]:
        """Return the new ``(rating_a, rating_b)`` after one match.

        ``actual_score_a`` is 1 when A wins, 0 when A loses and 0.5 for a
        draw; B's actual and expected scores are the complements of A's.
        """
        expected_a = self.expected_score(rating_a, rating_b)

        shift_a = self.k_factor * (actual_score_a - expected_a)
        shift_b = self.k_factor * ((1 - actual_score_a) - (1 - expected_a))

        return rating_a + shift_a, rating_b + shift_b

    def rating_difference_to_win_probability(self, rating_diff: float) -> float:
        """Return the win probability implied by a rating lead of ``rating_diff``."""
        odds_against = 10**(-rating_diff / 400)
        return 1 / (1 + odds_against)

# ===========================================================================
# RANKING AGENT IMPLEMENTATION
# ===========================================================================

class RankingAgent:
    """
    Ranking Agent that implements Elo-based tournament system with self-debate
    for pairwise hypothesis comparison and ranking
    """

    def __init__(self, llm, k_factor: float = 32, initial_rating: float = 1200):
        """Keep the LLM client and create the Elo engine used for rating updates.

        Args:
            llm: Chat model client; `pairwise_debate` calls ``llm.invoke(...)``
                and reads ``.content`` from the reply.
            k_factor: Passed to `EloSystem` as the rating step size.
            initial_rating: Base rating before the reflection-score adjustment
                made in `initialize_ratings`.
        """
        self.initial_rating = initial_rating
        self.elo_system = EloSystem(k_factor=k_factor, initial_rating=initial_rating)
        self.llm = llm

    def initialize_ratings(self, hypothesis_reviews: List[HypothesisReview]) -> Dict[str, EloRating]:
        """Build the starting Elo table, one `EloRating` per reviewed hypothesis.

        Args:
            hypothesis_reviews: Reviews providing ``hypothesis_id`` and
                ``overall_score`` for each hypothesis.

        Returns:
            Mapping of hypothesis id to a fresh `EloRating` with zeroed
            win/loss/draw counters and a one-element rating history.
        """
        table = {}

        for item in hypothesis_reviews:
            # Seed rating: base rating shifted by 20 points per reflection-score
            # point above/below 6.0. No clamping is applied to the shift.
            seed = self.initial_rating + (item.overall_score - 6.0) * 20

            table[item.hypothesis_id] = EloRating(
                hypothesis_id=item.hypothesis_id,
                current_rating=seed,
                initial_rating=seed,
                games_played=0,
                wins=0,
                losses=0,
                draws=0,
                rating_history=[seed],
            )

        logger.info(f"Initialized {len(table)} hypothesis ratings")
        return table

    def pairwise_debate(self, hypothesis_a: HypothesisReview, hypothesis_b: HypothesisReview,
                       comparison_round: int) -> PairwiseComparison:
        """Conduct self-debate to compare two hypotheses"""

        logger.info(f"Starting pairwise debate: {hypothesis_a.hypothesis_id} vs {hypothesis_b.hypothesis_id}")

        debate_prompt = f"""You are a senior scientific review panel conducting a rigorous comparison between two research hypotheses. You must engage in self-debate to determine which hypothesis is superior.

HYPOTHESIS A ({hypothesis_a.hypothesis_id}):
Text: {hypothesis_a.hypothesis_text}
Reflection Score: {hypothesis_a.overall_score:.2f}/10
Key Strengths: {'; '.join(hypothesis_a.strengths[:3])}
Key Weaknesses: {'; '.join(hypothesis_a.weaknesses[:3])}

HYPOTHESIS B ({hypothesis_b.hypothesis_id}):
Text: {hypothesis_b.hypothesis_text}
Reflection Score: {hypothesis_b.overall_score:.2f}/10
Key Strengths: {'; '.join(hypothesis_b.strengths[:3])}
Key Weaknesses: {'; '.join(hypothesis_b.weaknesses[:3])}

CONDUCT A STRUCTURED DEBATE:

**ROUND 1 - Advocate for Hypothesis A:**
Present the strongest case for why Hypothesis A is superior. Consider novelty, feasibility, impact potential, and scientific rigor.

**ROUND 2 - Advocate for Hypothesis B:**
Present the strongest case for why Hypothesis B is superior. Consider the same criteria and directly address A's advantages.

**ROUND 3 - Critical Analysis:**
Critically examine both hypotheses. What are the decisive factors? Which hypothesis has the strongest foundation and highest potential for breakthrough science?

**FINAL JUDGMENT:**
Winner: [A or B or DRAW]
Confidence: [1-10 scale]
Reflection Influence: [How much did the reflection scores influence your decision? 1-10 scale]

**Reasoning:** [2-3 sentences explaining the decisive factors in your decision]

Conduct this debate rigorously and choose the hypothesis that would most likely lead to significant scientific advancement."""

        try:
            response = self.llm.invoke([("human", debate_prompt)])
            return self._parse_debate_result(response.content, hypothesis_a, hypothesis_b, comparison_round)

        except Exception as e:
            logger.error(f"Debate failed for {hypothesis_a.hypothesis_id} vs {hypothesis_b.hypothesis_id}: {str(e)}")
            return self._create_fallback_comparison(hypothesis_a, hypothesis_b, comparison_round)

    def _parse_debate_result(self, debate_text: str, hypothesis_a: HypothesisReview,
                           hypothesis_b: HypothesisReview, comparison_round: int) -> PairwiseComparison:
        """Extract winner, confidence, reflection influence and reasoning from a debate.

        Args:
            debate_text: Raw text returned by the LLM for the debate prompt.
            hypothesis_a: Review that was presented as "Hypothesis A".
            hypothesis_b: Review that was presented as "Hypothesis B".
            comparison_round: Tournament round number, stored on the result.

        Returns:
            A `PairwiseComparison` whose ``winner_id`` is the id of A or B, or
            ``None`` for a draw. An explicit draw/tie verdict is kept as a
            draw; reflection scores are only consulted when no verdict could
            be parsed at all. Confidence and reflection influence default to
            7.0 and 5.0 when absent and are clamped to the 1-10 scale.
        """
        # A verdict token is "Hypothesis A/B" (any case), a bare *uppercase*
        # "A"/"B", or "draw"/"tie". Bare lowercase letters are ignored so the
        # article "a" (e.g. "the winner is a tie") is not read as Hypothesis A.
        verdict_token = (
            r'(?:(?i:hypothesis)\s+(?P<named>[AaBb])'
            r'|(?P<letter>[AB])'
            r'|(?P<draw>(?i:draw|tie)))\b'
        )
        # Keywords are tried in priority order; the verdict must follow the
        # keyword on the same line ('.' does not cross newlines here).
        verdict_keywords = [r'winner', r'final\s+judgment', r'decision', r'superior']

        winner_id = None
        verdict_found = False
        for keyword in verdict_keywords:
            match = re.search(rf'(?i:\b{keyword}\b).*?\b{verdict_token}', debate_text)
            if not match:
                continue
            verdict_found = True
            choice = (match.group('named') or match.group('letter') or '').lower()
            if choice == 'a':
                winner_id = hypothesis_a.hypothesis_id
            elif choice == 'b':
                winner_id = hypothesis_b.hypothesis_id
            # Otherwise the draw/tie alternative matched: winner_id stays None.
            break

        # Only when the text contained no verdict at all, fall back to the
        # reflection scores (a gap of more than 0.5 decides; else a draw).
        if not verdict_found:
            if hypothesis_a.overall_score > hypothesis_b.overall_score + 0.5:
                winner_id = hypothesis_a.hypothesis_id
            elif hypothesis_b.overall_score > hypothesis_a.overall_score + 0.5:
                winner_id = hypothesis_b.hypothesis_id

        def scale_value(pattern: str, default: float) -> float:
            """Return the first number matched by *pattern*, clamped to 1-10."""
            found = re.search(pattern, debate_text, re.IGNORECASE)
            if not found:
                return default
            return min(10.0, max(1.0, float(found.group(1))))

        confidence = scale_value(r'confidence.*?(\d+(?:\.\d+)?)', 7.0)
        reflection_influence = scale_value(r'reflection.*?influence.*?(\d+(?:\.\d+)?)', 5.0)

        # Reasoning runs from the label's colon (skipping markdown asterisks
        # and whitespace) up to the next blank line or the end of the text.
        reasoning_patterns = [
            r'reasoning[^:\n]*:[\s*]*(.*?)(?:\n\s*\n|$)',
            r'decisive\s+factors[^:\n]*:[\s*]*(.*?)(?:\n\s*\n|$)',
            r'explanation[^:\n]*:[\s*]*(.*?)(?:\n\s*\n|$)',
        ]

        reasoning = "Reasoning could not be extracted from debate."
        for pattern in reasoning_patterns:
            match = re.search(pattern, debate_text, re.IGNORECASE | re.DOTALL)
            if match and match.group(1).strip():
                reasoning = match.group(1).strip()[:300]
                break

        comparison_id = f"{hypothesis_a.hypothesis_id}_vs_{hypothesis_b.hypothesis_id}_r{comparison_round}"

        # Keep at most the first 1000 characters of the transcript.
        transcript = debate_text[:1000] + "..." if len(debate_text) > 1000 else debate_text

        return PairwiseComparison(
            comparison_id=comparison_id,
            hypothesis_a_id=hypothesis_a.hypothesis_id,
            hypothesis_b_id=hypothesis_b.hypothesis_id,
            winner_id=winner_id,
            confidence=confidence,
            reasoning=reasoning,
            debate_transcript=transcript,
            reflection_influence=reflection_influence,
            # `datetime` is the module here, hence datetime.datetime.
            timestamp=datetime.datetime.now().isoformat(),
            comparison_round=comparison_round
        )

    def _create_fallback_comparison(self, hypothesis_a: HypothesisReview,
                                  hypothesis_b: HypothesisReview, comparison_round: int) -> PairwiseComparison:
        """Decide a comparison purely from the two reflection scores.

        Used when the LLM debate could not be run or parsed. Scores closer
        than 0.3 are a draw; otherwise the higher score wins.
        """
        score_a = hypothesis_a.overall_score
        score_b = hypothesis_b.overall_score
        gap = score_a - score_b

        if abs(gap) < 0.3:
            winner_id = None  # Too close to call
        else:
            winner_id = hypothesis_a.hypothesis_id if gap > 0 else hypothesis_b.hypothesis_id

        return PairwiseComparison(
            comparison_id=f"{hypothesis_a.hypothesis_id}_vs_{hypothesis_b.hypothesis_id}_r{comparison_round}_fallback",
            hypothesis_a_id=hypothesis_a.hypothesis_id,
            hypothesis_b_id=hypothesis_b.hypothesis_id,
            winner_id=winner_id,
            confidence=5.0,  # Fixed, lower than the parsed-debate default
            reasoning=f"Fallback comparison based on reflection scores: {score_a:.2f} vs {score_b:.2f}",
            debate_transcript="Fallback comparison - no debate transcript available",
            reflection_influence=10.0,  # Decision rests entirely on reflection scores
            # `datetime` is the module here, hence datetime.datetime.
            timestamp=datetime.datetime.now().isoformat(),
            comparison_round=comparison_round
        )

    def run_tournament_round(self, hypothesis_reviews: List[HypothesisReview],
                           elo_ratings: Dict[str, EloRating], round_number: int,
                           max_comparisons: int = None) -> List[PairwiseComparison]:
        """Debate hypothesis pairs for one round and update their Elo ratings in place.

        Args:
            hypothesis_reviews: Reviews, looked up by ``hypothesis_id``.
            elo_ratings: Rating table keyed by hypothesis id; its entries are
                mutated via ``update_rating`` after every comparison.
            round_number: Round label passed on to each comparison.
            max_comparisons: Optional cap. When set (truthy) and exceeded, only
                the pairs with the smallest current rating gap are debated.

        Returns:
            The comparisons carried out in this round, in execution order.
        """
        logger.info(f"Starting tournament round {round_number}")

        reviews_by_id = {}
        for item in hypothesis_reviews:
            reviews_by_id[item.hypothesis_id] = item

        # Every unordered pair of rated hypotheses is a candidate match.
        candidate_pairs = list(combinations(list(elo_ratings.keys()), 2))

        if max_comparisons and len(candidate_pairs) > max_comparisons:
            # Closely rated pairs are kept first (the sort is stable).
            candidate_pairs = sorted(
                candidate_pairs,
                key=lambda pair: abs(
                    elo_ratings[pair[0]].current_rating - elo_ratings[pair[1]].current_rating
                ),
            )[:max_comparisons]

        completed = []

        for first_id, second_id in candidate_pairs:
            first_review = reviews_by_id.get(first_id)
            second_review = reviews_by_id.get(second_id)

            if not (first_review and second_review):
                logger.warning(f"Could not find reviews for pair: {first_id}, {second_id}. Skipping.")
                continue

            comparison = self.pairwise_debate(first_review, second_review, round_number)
            completed.append(comparison)

            first_rating = elo_ratings[first_id].current_rating
            second_rating = elo_ratings[second_id].current_rating

            # (score for the first hypothesis, its result label, the opponent's label)
            if comparison.winner_id == first_id:
                first_score, first_result, second_result = 1.0, "win", "loss"
            elif comparison.winner_id == second_id:
                first_score, first_result, second_result = 0.0, "loss", "win"
            else:
                first_score, first_result, second_result = 0.5, "draw", "draw"

            updated_first, updated_second = self.elo_system.update_ratings(
                first_rating, second_rating, first_score
            )

            elo_ratings[first_id].update_rating(updated_first, first_result)
            elo_ratings[second_id].update_rating(updated_second, second_result)

        logger.info(f"Completed round {round_number}: {len(completed)} comparisons")
        return completed

    def run_full_tournament(self, reflection_state: ReflectionState,
                          num_rounds: int = 3, max_comparisons_per_round: int = None) -> RankingState:
        """Run every tournament round and assemble the final `RankingState`.

        Args:
            reflection_state: Source of ``hypothesis_reviews``.
            num_rounds: Number of rounds to play.
            max_comparisons_per_round: Optional per-round cap forwarded to
                `run_tournament_round`.

        Returns:
            A new `RankingState` holding ratings, all comparisons, final
            rankings, statistics, convergence metrics and a text summary.
        """
        logger.info(f"Starting full tournament with {num_rounds} rounds")

        state = RankingState()
        reviews = reflection_state.hypothesis_reviews
        state.elo_ratings = self.initialize_ratings(reviews)

        # Rounds are numbered from 1; each one mutates state.elo_ratings in place.
        for index in range(num_rounds):
            played = self.run_tournament_round(
                reviews, state.elo_ratings, index + 1, max_comparisons_per_round
            )
            state.pairwise_comparisons.extend(played)

        state.tournament_rounds = num_rounds

        # Order matters: the summary reads the three results computed before it.
        state.final_rankings = self._generate_final_rankings(state)
        state.ranking_statistics = self._compute_ranking_statistics(state)
        state.convergence_metrics = self._compute_convergence_metrics(state)
        state.tournament_summary = self._generate_tournament_summary(state)

        logger.info(f"Tournament completed: {len(state.pairwise_comparisons)} total comparisons")
        return state

    def _generate_final_rankings(self, ranking_state: RankingState) -> List[Dict[str, Any]]:
        """Generate final hypothesis rankings"""

        rankings = []

        # Sort hypotheses by final Elo rating
        sorted_hypotheses = sorted(
            ranking_state.elo_ratings.items(),
            key=lambda x: x[1].current_rating,
            reverse=True
        )

        for rank, (hypothesis_id, elo_rating) in enumerate(sorted_hypotheses, 1):
            ranking_entry = {
                'rank': rank,
                'hypothesis_id': hypothesis_id,
                'final_elo_rating': elo_rating.current_rating,
                'initial_elo_rating': elo_rating.initial_rating,
                'rating_change': elo_rating.current_rating - elo_rating.initial_rating,
                'games_played': elo_rating.games_played,
                'wins': elo_rating.wins,
                'losses': elo_rating.losses,
                'draws': elo_rating.draws,
                'win_rate': elo_rating.win_rate,
                'rating_volatility': statistics.stdev(elo_rating.rating_history) if len(elo_rating.rating_history) > 1 else 0
            }
            rankings.append(ranking_entry)

        return rankings

    def _compute_ranking_statistics(self, ranking_state: RankingState) -> Dict[str, float]:
        """Compute comprehensive ranking statistics"""

        if not ranking_state.elo_ratings:
            return {}

        if not ranking_state.pairwise_comparisons:
             return { 'total_hypotheses': len(ranking_state.elo_ratings), 'total_comparisons': 0 }

        current_ratings = [rating.current_rating for rating in ranking_state.elo_ratings.values()]
        initial_ratings = [rating.initial_rating for rating in ranking_state.elo_ratings.values()]
        rating_changes = [current - initial for current, initial in zip(current_ratings, initial_ratings)]

        total_comparisons = len(ranking_state.pairwise_comparisons)
        decisive_wins = len([c for c in ranking_state.pairwise_comparisons if c.winner_id is not None])
        draws = total_comparisons - decisive_wins

        avg_confidence = statistics.mean([c.confidence for c in ranking_state.pairwise_comparisons])
        avg_reflection_influence = statistics.mean([c.reflection_influence for c in ranking_state.pairwise_comparisons])

        return {
            'total_hypotheses': len(ranking_state.elo_ratings),
            'total_comparisons': total_comparisons,
            'decisive_outcomes': decisive_wins,
            'draws': draws,
            'draw_rate': (draws / total_comparisons * 100) if total_comparisons > 0 else 0,
            'mean_final_rating': statistics.mean(current_ratings),
            'rating_spread': max(current_ratings) - min(current_ratings),
            'mean_rating_change': statistics.mean(rating_changes),
            'max_rating_change': max(rating_changes),
            'min_rating_change': min(rating_changes),
            'rating_volatility': statistics.stdev(current_ratings) if len(current_ratings) > 1 else 0,
            'average_confidence': avg_confidence,
            'reflection_influence': avg_reflection_influence,
            'tournament_rounds': ranking_state.tournament_rounds
        }

    def _compute_convergence_metrics(self, ranking_state: RankingState) -> Dict[str, float]:
        """Compute metrics indicating ranking convergence"""

        convergence = {}

        # Rating stability (how much ratings changed in later rounds)
        if ranking_state.tournament_rounds >= 2:
            early_ratings = {}
            late_ratings = {}

            for hypothesis_id, elo_rating in ranking_state.elo_ratings.items():
                history = elo_rating.rating_history
                if len(history) >= 3:
                    early_ratings[hypothesis_id] = history[len(history)//2]
                    late_ratings[hypothesis_id] = history[-1]

            if early_ratings and late_ratings:
                rating_stability = statistics.mean([
                    abs(late_ratings[h_id] - early_ratings[h_id])
                    for h_id in early_ratings.keys()
                ])
                convergence['rating_stability'] = rating_stability
                convergence['converged'] = rating_stability < 50  # Threshold for convergence

        # Ranking consistency (how often the same hypothesis wins)
        if ranking_state.pairwise_comparisons:
            winner_counts = {}
            for comparison in ranking_state.pairwise_comparisons:
                if comparison.winner_id:
                    winner_counts[comparison.winner_id] = winner_counts.get(comparison.winner_id, 0) + 1

            if winner_counts:
                total_wins = sum(winner_counts.values())
                max_wins = max(winner_counts.values())
                convergence['winner_concentration'] = max_wins / total_wins if total_wins > 0 else 0

        return convergence

    def _generate_tournament_summary(self, ranking_state: RankingState) -> str:
        """Generate comprehensive tournament summary"""

        stats = ranking_state.ranking_statistics
        convergence = ranking_state.convergence_metrics
        rankings = ranking_state.final_rankings

        if not stats or not rankings: return "Summary not available."

        # Get top 3 hypotheses
        top_3 = rankings[:3] if len(rankings) >= 3 else rankings

        summary = f"""
RANKING AGENT TOURNAMENT SUMMARY

🏆 TOURNAMENT RESULTS:
• Total hypotheses: {stats.get('total_hypotheses', 0)}
• Tournament rounds: {stats.get('tournament_rounds', 0)}
• Total comparisons: {stats.get('total_comparisons', 0)}
• Decisive outcomes: {stats.get('decisive_outcomes', 0)} ({100-stats.get('draw_rate', 0):.1f}%)
• Draws: {stats.get('draws', 0)} ({stats.get('draw_rate', 0):.1f}%)

📊 RATING ANALYSIS:
• Final rating range: {stats.get('mean_final_rating', 0) - stats.get('rating_volatility', 0)/2:.0f} - {stats.get('mean_final_rating', 0) + stats.get('rating_volatility', 0)/2:.0f}
• Rating spread: {stats.get('rating_spread', 0):.0f} points
• Average rating change: {stats.get('mean_rating_change', 0):+.1f} points
• Biggest winner: {stats.get('max_rating_change', 0):+.1f} points
• Biggest loser: {stats.get('min_rating_change', 0):+.1f} points

🎯 DECISION QUALITY:
• Average confidence: {stats.get('average_confidence', 0):.1f}/10
• Reflection influence: {stats.get('reflection_influence', 0):.1f}/10
• Tournament convergence: {'Yes' if convergence.get('converged', False) else 'Partial'}

🥇 TOP 3 RANKINGS:
{self._format_top_rankings(top_3)}

📈 CONVERGENCE ANALYSIS:
• Rating stability: {convergence.get('rating_stability', 0):.1f} points
• Winner concentration: {convergence.get('winner_concentration', 0)*100:.1f}%
• Tournament quality: {'Excellent' if stats.get('average_confidence', 0) > 7.5 else 'Good' if stats.get('average_confidence', 0) > 6.0 else 'Fair'}

💡 RECOMMENDATIONS:
{self._generate_ranking_recommendations(stats, convergence, rankings)}
"""

        return summary.strip()

    def _format_top_rankings(self, top_rankings: List[Dict[str, Any]]) -> str:
        """Format top rankings for display"""

        formatted = ""
        for ranking in top_rankings:
            formatted += f"""
• #{ranking['rank']}: {ranking['hypothesis_id']}
  Elo: {ranking['final_elo_rating']:.0f} ({ranking['rating_change']:+.0f})
  Record: {ranking['wins']}-{ranking['losses']}-{ranking['draws']} ({ranking['win_rate']:.1f}%)"""

        return formatted

    def _generate_ranking_recommendations(self, stats: Dict[str, float],
                                        convergence: Dict[str, float],
                                        rankings: List[Dict[str, Any]]) -> str:
        """Generate strategic recommendations based on tournament results"""

        recommendations = []

        if not stats or not rankings: return "No recommendations available."

        # Top hypothesis recommendations
        if rankings:
            top_hypothesis = rankings[0]
            if top_hypothesis['final_elo_rating'] > stats.get('mean_final_rating', 1200) + 100:
                recommendations.append(f"• PRIORITY: {top_hypothesis['hypothesis_id']} is clearly superior - proceed to Evolution Agent")
            elif top_hypothesis['win_rate'] > 70:
                recommendations.append(f"• STRONG CANDIDATE: {top_hypothesis['hypothesis_id']} shows consistent performance")

        # Convergence recommendations
        if convergence.get('converged', False):
            recommendations.append("• STABLE: Rankings have converged - results are reliable")
        else:
            recommendations.append("• CONTINUE: Consider additional tournament rounds for stability")

        # Quality recommendations
        avg_confidence = stats.get('average_confidence', 0)
        if avg_confidence < 6.0:
            recommendations.append("• REVIEW: Low confidence suggests difficult comparisons - validate manually")
        elif avg_confidence > 8.0:
            recommendations.append("• CONFIDENT: High confidence in ranking decisions")

        # Draw rate recommendations
        draw_rate = stats.get('draw_rate', 0)
        if draw_rate > 30:
            recommendations.append("• SIMILAR QUALITY: High draw rate indicates comparable hypotheses")
        elif draw_rate < 10:
            recommendations.append("• CLEAR DIFFERENCES: Low draw rate shows distinct quality levels")

        # Next steps
        if len(rankings) >= 3:
            top_3_ratings = [r['final_elo_rating'] for r in rankings[:3]]
            if max(top_3_ratings) - min(top_3_ratings) < 50:
                recommendations.append("• NEXT: Top hypotheses are close - consider parallel development")
            else:
                recommendations.append("• NEXT: Clear winner identified - focus evolution efforts")

        return '\\n'.join(recommendations) if recommendations else "• Proceed with evolved hypotheses as planned"

# ===========================================================================
# INTEGRATION WITH REFLECTION AGENT
# ===========================================================================

def run_ranking_tournament(reflection_state: ReflectionState, tournament_rounds: int = 3,
                          max_comparisons_per_round: int = None) -> RankingState:
    """Rank the reviewed hypotheses by running a full Elo tournament.

    Args:
        reflection_state: Reflection output; must contain hypothesis reviews.
        tournament_rounds: Number of rounds to play.
        max_comparisons_per_round: Optional cap on debates per round.

    Returns:
        The `RankingState` produced by `RankingAgent.run_full_tournament`.

    Raises:
        ValueError: If ``reflection_state.hypothesis_reviews`` is empty.
        RuntimeError: If creating the agent raises NameError (e.g. the global
            ``llm`` has not been defined yet).
    """
    if not reflection_state.hypothesis_reviews:
        raise ValueError("No hypothesis reviews found in reflection state")

    # `llm` is looked up as a global at call time; a missing name is reported
    # as a setup problem rather than a bare NameError.
    try:
        agent = RankingAgent(llm)
    except NameError:
        raise RuntimeError("LLM not available. Run setup cell first.")

    return agent.run_full_tournament(
        reflection_state,
        num_rounds=tournament_rounds,
        max_comparisons_per_round=max_comparisons_per_round,
    )

def integrate_ranking_with_generation_state(generation_state: GenerationState,
                                          ranking_state: RankingState) -> GenerationState:
    """Attach the ranking results to the generation state and mark it ranked.

    Args:
        generation_state: Workflow state to update in place.
        ranking_state: Tournament results to store on it.

    Returns:
        The same ``generation_state`` object, with ``ranking_results`` set to
        ``ranking_state`` and ``status`` set to ``"hypotheses_ranked"``.
    """
    # Plain assignment both overwrites an existing attribute and creates a
    # missing one, so no hasattr() branch is needed.
    generation_state.ranking_results = ranking_state
    generation_state.status = "hypotheses_ranked"

    return generation_state

# ===========================================================================
# TESTING CODE
# ===========================================================================

def test_ranking_agent():
    """Comprehensive test suite for Ranking Agent

    Runs a printed, end-to-end walkthrough rather than assertion-based tests:
    Elo arithmetic, one pairwise debate, one tournament round, a full
    tournament, rankings output, integration with a GenerationState, and the
    tournament summary. Steps 2 onwards call the LLM through the global `llm`.

    Returns:
        The RankingState from the full tournament, or None when `llm` is not
        defined and the run stops after step 1.
    """

    print("🏆 TESTING RANKING AGENT")
    print("=" * 50)

    # First, we need some reflection results to work with
    print("📋 Setting up test data...")

    # Create mock reflection state with multiple hypotheses
    mock_reviews = [
        HypothesisReview(
            hypothesis_id="hyp_epigenetic_1",
            hypothesis_text="Epigenetic reprogramming approach targeting DNMT3A for liver fibrosis treatment...",
            criteria=ReviewCriteria(8.0, 7.5, 7.0, 8.5, 7.5, "Novel", "Feasible", "Well-grounded", "High impact", "Testable"),
            overall_score=7.7,
            overall_assessment="Strong hypothesis with novel approach",
            strengths=["Novel epigenetic target", "Strong literature support", "Clear therapeutic path"],
            weaknesses=["Requires specialized expertise", "Complex delivery challenges"],
            recommendations=["Validate in primary cells", "Develop delivery system"],
            confidence_level=8.5,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        ),
        HypothesisReview(
            hypothesis_id="hyp_ai_personalized",
            hypothesis_text="AI-driven personalized medicine platform for optimized combination therapies...",
            criteria=ReviewCriteria(9.0, 6.0, 6.5, 9.0, 6.0, "Very novel", "Challenging", "Moderate support", "Transformative", "Complex validation"),
            overall_score=7.2,
            overall_assessment="Highly innovative but technically challenging",
            strengths=["Breakthrough AI application", "Personalized approach", "High impact potential"],
            weaknesses=["Technical complexity", "Data requirements", "Validation challenges"],
            recommendations=["Develop proof-of-concept", "Establish data partnerships"],
            confidence_level=7.0,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        ),
        HypothesisReview(
            hypothesis_id="hyp_combination_therapy",
            hypothesis_text="Multi-target combination therapy using BRD4 and HDAC inhibitors...",
            criteria=ReviewCriteria(6.5, 8.0, 7.5, 7.0, 8.0, "Moderate novelty", "Very feasible", "Good support", "Good impact", "Easy to test"),
            overall_score=7.4,
            overall_assessment="Solid, feasible approach with good scientific foundation",
            strengths=["Established targets", "Proven drug classes", "Clear experimental path"],
            weaknesses=["Limited novelty", "Incremental advance", "Competition exists"],
            recommendations=["Focus on unique combinations", "Optimize dosing"],
            confidence_level=8.0,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        ),
        HypothesisReview(
            hypothesis_id="hyp_single_cell",
            hypothesis_text="Single-cell epigenetic mapping to identify therapeutic vulnerabilities...",
            criteria=ReviewCriteria(7.5, 7.0, 8.0, 6.5, 7.0, "Good novelty", "Achievable", "Strong evidence", "Moderate impact", "Good testability"),
            overall_score=7.2,
            overall_assessment="Well-designed discovery approach with strong methodology",
            strengths=["Cutting-edge technology", "Discovery potential", "Strong methodology"],
            weaknesses=["Discovery rather than therapy", "Resource intensive", "Complex analysis"],
            recommendations=["Focus on key cell types", "Develop analysis pipeline"],
            confidence_level=7.5,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        )
    ]

    mock_reflection_state = ReflectionState(
        hypothesis_reviews=mock_reviews,
        review_statistics={'mean_overall_score': 7.375},
        quality_flags=[],
        batch_summary="Mock reflection state for testing"
    )

    print(f"✅ Created mock reflection state with {len(mock_reviews)} hypotheses")

    # Test 1: Elo System (pure arithmetic, no LLM needed)
    print("\n1️⃣ TESTING ELO RATING SYSTEM")
    print("-" * 30)

    elo_system = EloSystem(k_factor=32, initial_rating=1200)

    # Test expected score calculation
    expected = elo_system.expected_score(1200, 1300)
    print(f"Expected score (1200 vs 1300): {expected:.3f}")

    # Test rating update
    new_a, new_b = elo_system.update_ratings(1200, 1300, 1.0)  # A wins
    print(f"Rating update after A wins: {new_a:.1f}, {new_b:.1f}")

    # Test 2: Pairwise Comparison
    print("\n2️⃣ TESTING PAIRWISE COMPARISON")
    print("-" * 30)

    # `llm` is read as a global here; if it has not been defined, the lookup
    # raises NameError and the remaining LLM-dependent steps are skipped.
    try:
        ranking_agent = RankingAgent(llm)

        # Test debate between two hypotheses
        comparison = ranking_agent.pairwise_debate(mock_reviews[0], mock_reviews[1], 1)

        print(f"✅ Pairwise comparison completed")
        print(f"   Comparison: {comparison.hypothesis_a_id} vs {comparison.hypothesis_b_id}")
        print(f"   Winner: {comparison.winner_id if comparison.winner_id else 'DRAW'}")
        print(f"   Confidence: {comparison.confidence:.1f}/10")
        print(f"   Reflection Influence: {comparison.reflection_influence:.1f}/10")
        print(f"   Reasoning: {comparison.reasoning[:100]}...")

    except NameError:
        print("❌ Error: LLM not available. Cannot test pairwise comparison.")
        return

    # Test 3: Tournament Round
    print("\n3️⃣ TESTING TOURNAMENT ROUND")
    print("-" * 30)

    # Initialize ratings
    elo_ratings = ranking_agent.initialize_ratings(mock_reviews)
    print(f"Initial ratings:")
    for hyp_id, rating in elo_ratings.items():
        print(f"   {hyp_id}: {rating.current_rating:.0f}")

    # Run one tournament round; this updates the entries of `elo_ratings` in place
    round_comparisons = ranking_agent.run_tournament_round(
        mock_reviews, elo_ratings, 1, max_comparisons=3
    )

    print(f"\n✅ Tournament round completed")
    print(f"   Comparisons made: {len(round_comparisons)}")
    print(f"   Updated ratings:")
    for hyp_id, rating in elo_ratings.items():
        change = rating.current_rating - rating.initial_rating
        print(f"   {hyp_id}: {rating.current_rating:.0f} ({change:+.0f})")

    # Test 4: Full Tournament
    # run_full_tournament initializes its own ratings from the reflection
    # state, so it does not continue from the ratings of Test 3.
    print("\n4️⃣ TESTING FULL TOURNAMENT")
    print("-" * 30)

    ranking_state = ranking_agent.run_full_tournament(
        mock_reflection_state,
        num_rounds=2,
        max_comparisons_per_round=4
    )

    print(f"✅ Full tournament completed")
    print(f"   Tournament rounds: {ranking_state.tournament_rounds}")
    print(f"   Total comparisons: {len(ranking_state.pairwise_comparisons)}")
    print(f"   Final rankings generated: {len(ranking_state.final_rankings)}")

    # Test 5: Rankings Analysis
    print("\n5️⃣ TESTING RANKINGS ANALYSIS")
    print("-" * 30)

    print("📊 Final Rankings:")
    for ranking in ranking_state.final_rankings:
        print(f"   #{ranking['rank']}: {ranking['hypothesis_id']}")
        print(f"      Elo: {ranking['final_elo_rating']:.0f} ({ranking['rating_change']:+.0f})")
        print(f"      Record: {ranking['wins']}-{ranking['losses']}-{ranking['draws']} ({ranking['win_rate']:.1f}%)")

    print(f"\n📈 Tournament Statistics:")
    stats = ranking_state.ranking_statistics
    for key, value in stats.items():
        # Floats are shown with two decimals; counts are printed as-is
        if isinstance(value, float):
            print(f"   {key}: {value:.2f}")
        else:
            print(f"   {key}: {value}")

    # Test 6: Integration
    print("\n6️⃣ TESTING INTEGRATION")
    print("-" * 30)

    # Create mock generation state
    mock_generation_state = GenerationState(
        research_goal="Test ranking integration",
        status="hypotheses_reviewed"
    )

    # Integrate ranking results
    integrated_state = integrate_ranking_with_generation_state(mock_generation_state, ranking_state)

    print(f"✅ Integration successful")
    print(f"   State status: {integrated_state.status}")
    print(f"   Has ranking results: {hasattr(integrated_state, 'ranking_results')}")

    # Test 7: Tournament Summary
    print("\n7️⃣ TOURNAMENT SUMMARY")
    print("-" * 30)
    print(ranking_state.tournament_summary)

    print("\n" + "=" * 50)
    print("🎉 ALL RANKING AGENT TESTS COMPLETED SUCCESSFULLY!")
    print("🏆 The Ranking Agent is ready for pairwise hypothesis comparison")

    return ranking_state

# ===========================================================================
# RUN TESTS
# ===========================================================================

# Run the walkthrough when this code executes as the main program. The result
# (the final RankingState, or None if the LLM was unavailable) is kept in
# `test_results` for later inspection.
if __name__ == "__main__":
    test_results = test_ranking_agent()

🏆 TESTING RANKING AGENT
📋 Setting up test data...
✅ Created mock reflection state with 4 hypotheses

1️⃣ TESTING ELO RATING SYSTEM
------------------------------
Expected score (1200 vs 1300): 0.360
Rating update after A wins: 1220.5, 1279.5

2️⃣ TESTING PAIRWISE COMPARISON
------------------------------
✅ Pairwise comparison completed
   Comparison: hyp_epigenetic_1 vs hyp_ai_personalized
   Winner: hyp_epigenetic_1
   Confidence: 7.0/10
   Reflection Influence: 5.0/10
   Reasoning: ** Although Hypothesis B has the potential to be revolutionary, Hypothesis A provides a more grounde...

3️⃣ TESTING TOURNAMENT ROUND
------------------------------
Initial ratings:
   hyp_epigenetic_1: 1234
   hyp_ai_personalized: 1224
   hyp_combination_therapy: 1228
   hyp_single_cell: 1224

✅ Tournament round completed
   Comparisons made: 3
   Updated ratings:
   hyp_epigenetic_1: 1234 (+0)
   hyp_ai_personalized: 1193 (-31)
   hyp_combination_therapy: 1227 (-1)
   hyp_single_cell: 1256 (+32)

4️⃣ TES

In [None]:
#@title Evolution Agent - Multi-Strategy Hypothesis Evolution
# ===========================================================================
# EVOLUTION AGENT IMPLEMENTATION
# Multi-Strategy Hypothesis Evolution and Improvement System
# ===========================================================================

import json
import random
import statistics
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
import datetime # FIX: Added the missing datetime import
import logging
from enum import Enum
import re # Added missing re import for parsing

# Configure logging
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers. The setup cell at the top of this file calls it as well, so this
# repeated call only takes effect if that cell has not been run.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every EvolutionAgent method in this cell.
logger = logging.getLogger(__name__)

# ===========================================================================
# EVOLUTION AGENT DATA STRUCTURES
# ===========================================================================

class EvolutionStrategy(Enum):
    """Types of evolution strategies available.

    The string value of each member is embedded (via ``.value``) in the
    generated step IDs, evolved-hypothesis IDs and fallback texts.
    """
    SIMPLIFICATION = "simplification"  # applied by EvolutionAgent.simplify_hypothesis
    COMBINATION = "combination"  # applied by EvolutionAgent.combine_hypotheses
    ANALOGICAL_REASONING = "analogical_reasoning"  # applied by EvolutionAgent.analogical_reasoning
    RADICAL_VARIATION = "radical_variation"  # applied by EvolutionAgent.radical_variation
    CONSTRAINT_RELAXATION = "constraint_relaxation"  # applied by EvolutionAgent.constraint_relaxation
    DOMAIN_TRANSFER = "domain_transfer"  # NOTE(review): no strategy method in the visible part of EvolutionAgent uses this member

@dataclass
class EvolutionStep:
    """Record of a single evolution step.

    Instances are produced by EvolutionAgent._parse_evolution_response (from
    an LLM reply) or EvolutionAgent._create_fallback_evolution_step (when the
    LLM call or the parsing fails).
    """
    # "evolution_<YYYYmmdd_HHMMSS>_<strategy>" or "fallback_<...>_<strategy>"
    step_id: str
    # IDs of the hypothesis/hypotheses this step was derived from
    parent_hypothesis_ids: List[str]
    strategy_used: EvolutionStrategy
    # The agent currently stores a fixed label here ("Evolution prompt
    # executed" / "Fallback evolution step"), not the prompt text itself.
    evolution_prompt: str
    # Evolved hypothesis text (the parser truncates it to 1000 characters)
    evolved_content: str
    # Why the evolved version is better (the parser truncates it to 500 characters)
    improvement_rationale: str
    expected_benefits: List[str]
    potential_risks: List[str]
    novelty_increase: float  # -5 to +5 scale
    feasibility_change: float  # -5 to +5 scale
    impact_increase: float  # -5 to +5 scale
    confidence: float  # 1-10 scale
    # ISO-8601 string from datetime.datetime.now().isoformat()
    timestamp: str

@dataclass
class EvolvedHypothesis:
    """Complete evolved hypothesis with lineage.

    Built by EvolutionAgent._create_evolved_hypothesis from one EvolutionStep.
    """
    # "evolved_<strategy>_<time stamp>", generated when the hypothesis is created
    hypothesis_id: str
    evolved_content: str
    parent_hypothesis_ids: List[str]
    # Steps that produced this hypothesis; the agent stores exactly one step
    evolution_lineage: List[EvolutionStep]
    # Value of the agent's generation_counter when the hypothesis was created
    generation_number: int
    # Keys written by the agent: 'novelty', 'feasibility', 'scientific_rigor',
    # 'impact_potential', 'testability'
    predicted_scores: Dict[str, float]  # Predicted reflection scores
    # Copied from the step's improvement_rationale
    improvement_summary: str
    # First three entries of the step's expected_benefits
    competitive_advantages: List[str]
    # Three-phase template chosen by strategy
    implementation_roadmap: List[str]

@dataclass
class EvolutionState:
    """Aggregate result of one evolution run.

    The container fields default to ``None`` and are replaced by fresh empty
    containers in ``__post_init__``, so instances never share mutable state.
    """
    evolved_hypotheses: List[EvolvedHypothesis] = None
    evolution_statistics: Dict[str, float] = None
    strategy_effectiveness: Dict[str, float] = None
    evolution_tree: Dict[str, Any] = None
    generation_summary: str = ""
    best_evolved_hypothesis: Optional[str] = None

    def __post_init__(self):
        # (attribute name, factory for its empty default)
        container_defaults = (
            ("evolved_hypotheses", list),
            ("evolution_statistics", dict),
            ("strategy_effectiveness", dict),
            ("evolution_tree", dict),
        )
        for attr_name, make_empty in container_defaults:
            if getattr(self, attr_name) is None:
                setattr(self, attr_name, make_empty())

# ===========================================================================
# EVOLUTION AGENT IMPLEMENTATION
# ===========================================================================

class EvolutionAgent:
    """
    Evolution Agent that applies multiple strategies to improve and evolve hypotheses
    """

    def __init__(self, llm):
        """Store the language model and initialise the agent's bookkeeping.

        Args:
            llm: Chat model object. The strategy methods call
                ``llm.invoke([("human", prompt)])`` and read ``.content``
                from the result.
        """
        self.llm = llm
        # Incremented once per evolve_top_hypotheses() call and stamped on
        # each EvolvedHypothesis as its generation_number.
        self.generation_counter = 0
        # Starts empty; no method in the visible part of this class writes to it.
        self.evolution_history = []

    def simplify_hypothesis(self, hypothesis_review: HypothesisReview,
                          ranking_info: Dict[str, Any]) -> EvolutionStep:
        """Strategy 1: Simplify and focus the hypothesis.

        Builds a simplification prompt from the review (ID, text, overall
        score, first three strengths and weaknesses) and the ranking's
        ``'rank'`` entry, sends it to the LLM as a single human message and
        parses the reply with ``_parse_evolution_response``.

        Args:
            hypothesis_review: Review of the hypothesis to simplify.
            ranking_info: Ranking record; only ``'rank'`` is read
                (``'Unknown'`` if missing).

        Returns:
            The parsed EvolutionStep, or the step produced by
            ``_create_fallback_evolution_step`` if the LLM call raises.
        """

        logger.info(f"Applying simplification strategy to {hypothesis_review.hypothesis_id}")

        # The prompt is built outside the try block below, so an error while
        # formatting the review fields propagates to the caller.
        simplification_prompt = f"""You are a strategic scientific advisor specializing in hypothesis refinement. Your task is to SIMPLIFY and FOCUS this research hypothesis to make it more achievable and impactful.

ORIGINAL HYPOTHESIS ({hypothesis_review.hypothesis_id}):
{hypothesis_review.hypothesis_text}

CURRENT PERFORMANCE:
- Reflection Score: {hypothesis_review.overall_score:.2f}/10
- Tournament Ranking: #{ranking_info.get('rank', 'Unknown')}
- Key Strengths: {'; '.join(hypothesis_review.strengths[:3])}
- Key Weaknesses: {'; '.join(hypothesis_review.weaknesses[:3])}

SIMPLIFICATION STRATEGY:
Your goal is to create a more focused, achievable version that:
1. Reduces complexity while maintaining core innovation
2. Focuses on the most promising aspect
3. Makes the approach more feasible and testable
4. Maintains or increases potential impact

EVOLVED HYPOTHESIS:
[Provide the simplified, focused version of the hypothesis]

IMPROVEMENT RATIONALE:
[Explain specifically how this simplification improves the original - 2-3 sentences]

EXPECTED BENEFITS:
- [Benefit 1: How feasibility improves]
- [Benefit 2: How testability improves]
- [Benefit 3: How focus improves impact]

POTENTIAL RISKS:
- [Risk 1: What might be lost in simplification]
- [Risk 2: Any new limitations introduced]

PREDICTED SCORE CHANGES (-5 to +5):
- Novelty Change: [score change and brief reason]
- Feasibility Change: [score change and brief reason]
- Impact Change: [score change and brief reason]

CONFIDENCE: [1-10 in this improvement]"""

        try:
            response = self.llm.invoke([("human", simplification_prompt)])
            return self._parse_evolution_response(
                response.content, [hypothesis_review.hypothesis_id],
                EvolutionStrategy.SIMPLIFICATION
            )
        except Exception as e:
            # Best effort: log the failure and return a placeholder step so
            # the overall evolution run can continue.
            logger.error(f"Simplification failed for {hypothesis_review.hypothesis_id}: {str(e)}")
            return self._create_fallback_evolution_step(
                [hypothesis_review.hypothesis_id], EvolutionStrategy.SIMPLIFICATION
            )

    def combine_hypotheses(self, hypothesis_reviews: List[HypothesisReview],
                          ranking_infos: List[Dict[str, Any]]) -> EvolutionStep:
        """Strategy 2: Combine multiple top hypotheses.

        Summarises each review (first 300 characters of its text, first two
        strengths, rank and overall score), asks the LLM for a unified hybrid
        hypothesis and parses the reply with ``_parse_evolution_response``.

        Args:
            hypothesis_reviews: Reviews of the hypotheses to merge.
            ranking_infos: Ranking records paired positionally with
                ``hypothesis_reviews`` via ``zip``; only ``'rank'`` is read.

        Returns:
            The parsed EvolutionStep whose parents are all input hypothesis
            IDs, or a fallback step if the LLM call raises.
        """

        hypothesis_ids = [hr.hypothesis_id for hr in hypothesis_reviews]
        logger.info(f"Applying combination strategy to {', '.join(hypothesis_ids)}")

        # Prepare hypothesis summaries
        # (zip stops at the shorter list, so unpaired entries are not summarised)
        hypothesis_summaries = []
        for i, (review, ranking) in enumerate(zip(hypothesis_reviews, ranking_infos)):
            summary = f"""
HYPOTHESIS {i+1} ({review.hypothesis_id}):
Content: {review.hypothesis_text[:300]}...
Strengths: {'; '.join(review.strengths[:2])}
Ranking: #{ranking.get('rank', 'Unknown')} (Score: {review.overall_score:.1f}/10)"""
            hypothesis_summaries.append(summary)

        # Built outside the try block: formatting errors propagate to the caller.
        combination_prompt = f"""You are an innovative scientific researcher expert at synthesizing breakthrough ideas. Your task is to COMBINE the best elements from these top-performing hypotheses into a superior unified approach.

TOP HYPOTHESES TO COMBINE:
{''.join(hypothesis_summaries)}

COMBINATION STRATEGY:
Create a novel hybrid hypothesis that:
1. Integrates the strongest elements from each hypothesis
2. Creates synergistic effects between different approaches
3. Eliminates redundancies and weaknesses
4. Results in greater impact than individual components
5. Maintains feasibility while increasing innovation

COMBINED EVOLVED HYPOTHESIS:
[Present the unified hypothesis that combines the best elements]

INTEGRATION RATIONALE:
[Explain how you combined the hypotheses and why this synthesis is superior - 3-4 sentences]

SYNERGISTIC BENEFITS:
- [Benefit 1: How combination creates new capabilities]
- [Benefit 2: How different approaches reinforce each other]
- [Benefit 3: How combined approach increases impact]
- [Benefit 4: How integration reduces individual limitations]

POTENTIAL INTEGRATION RISKS:
- [Risk 1: Complexity from combination]
- [Risk 2: Potential conflicts between approaches]

PREDICTED SCORE CHANGES (-5 to +5):
- Novelty Change: [score change - combinations often increase novelty]
- Feasibility Change: [score change - may decrease due to complexity]
- Impact Change: [score change - typically increases significantly]

CONFIDENCE: [1-10 in this combination approach]"""

        try:
            response = self.llm.invoke([("human", combination_prompt)])
            return self._parse_evolution_response(
                response.content, hypothesis_ids, EvolutionStrategy.COMBINATION
            )
        except Exception as e:
            # Best effort: log and return a placeholder step.
            logger.error(f"Combination failed for {hypothesis_ids}: {str(e)}")
            return self._create_fallback_evolution_step(hypothesis_ids, EvolutionStrategy.COMBINATION)

    def analogical_reasoning(self, hypothesis_review: HypothesisReview,
                           ranking_info: Dict[str, Any]) -> EvolutionStep:
        """Strategy 3: Apply analogical reasoning from other successful domains.

        Builds a prompt from the review (ID, text, overall score, first three
        strengths, first two weaknesses) and the ranking's ``'rank'`` entry,
        asks the LLM to evolve the hypothesis using cross-domain analogies and
        parses the reply with ``_parse_evolution_response``.

        Args:
            hypothesis_review: Review of the hypothesis to evolve.
            ranking_info: Ranking record; only ``'rank'`` is read
                (``'Unknown'`` if missing).

        Returns:
            The parsed EvolutionStep, or a fallback step if the LLM call
            raises.
        """

        logger.info(f"Applying analogical reasoning to {hypothesis_review.hypothesis_id}")

        # Built outside the try block: formatting errors propagate to the caller.
        analogical_prompt = f"""You are a cross-disciplinary innovation expert skilled at applying successful patterns from one domain to create breakthroughs in another. Your task is to evolve this hypothesis using ANALOGICAL REASONING from other successful scientific domains.

ORIGINAL HYPOTHESIS ({hypothesis_review.hypothesis_id}):
{hypothesis_review.hypothesis_text}

CURRENT PERFORMANCE:
- Reflection Score: {hypothesis_review.overall_score:.2f}/10
- Tournament Ranking: #{ranking_info.get('rank', 'Unknown')}
- Strengths: {'; '.join(hypothesis_review.strengths[:3])}
- Areas for improvement: {'; '.join(hypothesis_review.weaknesses[:2])}

ANALOGICAL REASONING STRATEGY:
1. Identify successful patterns from other domains (immunology, engineering, computer science, ecology, etc.)
2. Find analogous challenges that have been solved elegantly elsewhere
3. Adapt those successful approaches to this biomedical context
4. Create novel solutions inspired by cross-domain insights

EVOLVED HYPOTHESIS WITH ANALOGICAL INSPIRATION:
[Present the evolved hypothesis incorporating cross-domain insights]

ANALOGICAL INSPIRATION:
[Describe the specific analogy/domain that inspired this evolution - 2-3 sentences]

CROSS-DOMAIN BENEFITS:
- [Benefit 1: What successful pattern was adapted]
- [Benefit 2: How this analogy solves current limitations]
- [Benefit 3: What new capabilities this enables]
- [Benefit 4: How this increases novelty and innovation]

ADAPTATION RISKS:
- [Risk 1: Challenges in domain transfer]
- [Risk 2: Potential misalignment with biomedical context]

PREDICTED SCORE CHANGES (-5 to +5):
- Novelty Change: [score change - analogical reasoning typically increases novelty significantly]
- Feasibility Change: [score change - depends on analogy complexity]
- Impact Change: [score change - cross-domain insights often increase impact]

CONFIDENCE: [1-10 in this analogical approach]"""

        try:
            response = self.llm.invoke([("human", analogical_prompt)])
            return self._parse_evolution_response(
                response.content, [hypothesis_review.hypothesis_id],
                EvolutionStrategy.ANALOGICAL_REASONING
            )
        except Exception as e:
            # Best effort: log and return a placeholder step.
            logger.error(f"Analogical reasoning failed for {hypothesis_review.hypothesis_id}: {str(e)}")
            return self._create_fallback_evolution_step(
                [hypothesis_review.hypothesis_id], EvolutionStrategy.ANALOGICAL_REASONING
            )

    def radical_variation(self, hypothesis_review: HypothesisReview,
                         ranking_info: Dict[str, Any]) -> EvolutionStep:
        """Strategy 4: Create radical variations exploring very different approaches.

        Builds a prompt from the review (ID, text, overall score, first two
        strengths and weaknesses), asks the LLM for a fundamentally different
        approach to the same problem and parses the reply with
        ``_parse_evolution_response``.

        Args:
            hypothesis_review: Review of the hypothesis to vary.
            ranking_info: Accepted for signature parity with the other
                strategies; this method does not read it.

        Returns:
            The parsed EvolutionStep, or a fallback step if the LLM call
            raises.
        """

        logger.info(f"Applying radical variation to {hypothesis_review.hypothesis_id}")

        # Built outside the try block: formatting errors propagate to the caller.
        radical_prompt = f"""You are a visionary scientific researcher known for paradigm-shifting breakthroughs. Your task is to create a RADICAL VARIATION of this hypothesis that explores a completely different approach while addressing the same fundamental problem.

ORIGINAL HYPOTHESIS ({hypothesis_review.hypothesis_id}):
{hypothesis_review.hypothesis_text}

CURRENT APPROACH ANALYSIS:
- Current Score: {hypothesis_review.overall_score:.2f}/10
- Current Strengths: {'; '.join(hypothesis_review.strengths[:2])}
- Current Limitations: {'; '.join(hypothesis_review.weaknesses[:2])}

RADICAL VARIATION STRATEGY:
Create a fundamentally different approach that:
1. Addresses the same core problem from a completely new angle
2. Challenges conventional assumptions in the field
3. Explores cutting-edge technologies or methodologies
4. Has potential for paradigm-shifting impact
5. May be higher risk but also higher reward

Think beyond incremental improvements - imagine a breakthrough that would make current approaches obsolete.

RADICAL EVOLVED HYPOTHESIS:
[Present the radically different approach to the same problem]

PARADIGM SHIFT RATIONALE:
[Explain what fundamental assumptions you're challenging and why this radical approach could be superior - 3-4 sentences]

BREAKTHROUGH POTENTIAL:
- [Revolutionary aspect 1: What paradigm this challenges]
- [Revolutionary aspect 2: What new possibilities this opens]
- [Revolutionary aspect 3: How this could transform the field]
- [Revolutionary aspect 4: What conventional limitations this bypasses]

HIGH-RISK FACTORS:
- [Risk 1: Technical challenges of radical approach]
- [Risk 2: Acceptance and validation difficulties]
- [Risk 3: Resource and timeline implications]

PREDICTED SCORE CHANGES (-5 to +5):
- Novelty Change: [score change - should be strongly positive for radical innovation]
- Feasibility Change: [score change - may decrease due to cutting-edge nature]
- Impact Change: [score change - potential for transformative impact]

CONFIDENCE: [1-10 in this radical approach - may be lower due to higher uncertainty]"""

        try:
            response = self.llm.invoke([("human", radical_prompt)])
            return self._parse_evolution_response(
                response.content, [hypothesis_review.hypothesis_id],
                EvolutionStrategy.RADICAL_VARIATION
            )
        except Exception as e:
            # Best effort: log and return a placeholder step.
            logger.error(f"Radical variation failed for {hypothesis_review.hypothesis_id}: {str(e)}")
            return self._create_fallback_evolution_step(
                [hypothesis_review.hypothesis_id], EvolutionStrategy.RADICAL_VARIATION
            )

    def constraint_relaxation(self, hypothesis_review: HypothesisReview,
                            ranking_info: Dict[str, Any],
                            original_constraints: List[str]) -> EvolutionStep:
        """Strategy 5: Relax constraints to explore new possibilities.

        Builds a prompt from the review (ID, text, overall score, first three
        weaknesses) and the given constraints (one bullet per constraint),
        asks the LLM to evolve the hypothesis with one or two constraints
        relaxed and parses the reply with ``_parse_evolution_response``.

        Args:
            hypothesis_review: Review of the hypothesis to evolve.
            ranking_info: Accepted for signature parity with the other
                strategies; this method does not read it.
            original_constraints: Constraint descriptions listed in the prompt.

        Returns:
            The parsed EvolutionStep, or a fallback step if the LLM call
            raises.
        """

        logger.info(f"Applying constraint relaxation to {hypothesis_review.hypothesis_id}")

        # Built outside the try block: formatting errors propagate to the caller.
        # chr(10) is "\n"; it is used because a backslash cannot appear inside
        # an f-string expression on older Python versions.
        constraint_prompt = f"""You are a strategic research planner exploring expanded possibilities. Your task is to evolve this hypothesis by RELAXING KEY CONSTRAINTS to unlock new potential approaches.

ORIGINAL HYPOTHESIS ({hypothesis_review.hypothesis_id}):
{hypothesis_review.hypothesis_text}

CURRENT CONSTRAINTS:
{chr(10).join([f"- {constraint}" for constraint in original_constraints])}

CURRENT PERFORMANCE:
- Score: {hypothesis_review.overall_score:.2f}/10
- Main limitations: {'; '.join(hypothesis_review.weaknesses[:3])}

CONSTRAINT RELAXATION STRATEGY:
1. Identify which constraints are most limiting the hypothesis potential
2. Explore what becomes possible if we relax 1-2 key constraints
3. Develop evolved approach that leverages this expanded possibility space
4. Maintain core innovation while expanding scope

EVOLVED HYPOTHESIS WITH RELAXED CONSTRAINTS:
[Present the evolved hypothesis with expanded possibilities]

CONSTRAINTS RELAXED:
[Specify which 1-2 constraints you relaxed and why]

EXPANDED POSSIBILITIES:
- [Possibility 1: What new approaches become available]
- [Possibility 2: What additional impact becomes possible]
- [Possibility 3: What technological opportunities open up]
- [Possibility 4: How timeline or scope can be optimized]

TRADE-OFF ANALYSIS:
- [Trade-off 1: What becomes more challenging]
- [Trade-off 2: What additional resources might be needed]

PREDICTED SCORE CHANGES (-5 to +5):
- Novelty Change: [score change and reasoning]
- Feasibility Change: [score change - may decrease with relaxed constraints]
- Impact Change: [score change - typically increases with expanded scope]

CONFIDENCE: [1-10 in this constraint relaxation approach]"""

        try:
            response = self.llm.invoke([("human", constraint_prompt)])
            return self._parse_evolution_response(
                response.content, [hypothesis_review.hypothesis_id],
                EvolutionStrategy.CONSTRAINT_RELAXATION
            )
        except Exception as e:
            # Best effort: log and return a placeholder step.
            logger.error(f"Constraint relaxation failed for {hypothesis_review.hypothesis_id}: {str(e)}")
            return self._create_fallback_evolution_step(
                [hypothesis_review.hypothesis_id], EvolutionStrategy.CONSTRAINT_RELAXATION
            )

    def _parse_evolution_response(self, response_text: str, parent_ids: List[str],
                                strategy: EvolutionStrategy) -> EvolutionStep:
        """Parse an LLM evolution response into a structured EvolutionStep.

        Extraction is heuristic and regex based:

        * evolved content - text between the first "...hypothesis...:" header
          and the next rationale/benefits keyword, truncated to 1000 chars;
        * rationale - text between the first "rationale...:" header and the
          next benefits/expected/predicted keyword, truncated to 500 chars;
        * benefits / risks - bullet lists via ``_extract_list_items``;
        * score changes - via ``_extract_score_changes``;
        * confidence - first number that follows the word "confidence" on
          the same line, defaulting to 7.0 when none is found.

        Args:
            response_text: Raw text of the LLM reply.
            parent_ids: IDs of the hypothesis/hypotheses that were evolved.
            strategy: Strategy that produced the reply.

        Returns:
            The populated EvolutionStep, or the step from
            ``_create_fallback_evolution_step`` if anything raises while
            parsing.
        """

        try:
            # Extract evolved hypothesis content; the first matching pattern wins
            content_patterns = [
                r'evolved hypothesis.*?:(.*?)(?:improvement rationale|rationale|benefits)',
                r'hypothesis.*?:(.*?)(?:improvement|rationale|benefits)',
                r'combined.*?hypothesis.*?:(.*?)(?:integration|rationale|benefits)'
            ]

            evolved_content = "Evolution content could not be extracted"
            for pattern in content_patterns:
                match = re.search(pattern, response_text, re.IGNORECASE | re.DOTALL)
                if match:
                    evolved_content = match.group(1).strip()[:1000]
                    break

            # Extract rationale; the first matching pattern wins
            rationale_patterns = [
                r'rationale.*?:(.*?)(?:benefits|expected|predicted)',
                r'improvement.*?rationale.*?:(.*?)(?:benefits|expected|predicted)',
                r'integration.*?rationale.*?:(.*?)(?:benefits|expected|predicted)'
            ]

            improvement_rationale = "Rationale could not be extracted"
            for pattern in rationale_patterns:
                match = re.search(pattern, response_text, re.IGNORECASE | re.DOTALL)
                if match:
                    improvement_rationale = match.group(1).strip()[:500]
                    break

            # Extract benefits and risks
            benefits = self._extract_list_items(response_text, ["benefit", "benefits", "advantages"])
            risks = self._extract_list_items(response_text, ["risk", "risks", "challenge", "challenges"])

            # Extract predicted score changes
            score_changes = self._extract_score_changes(response_text)

            # Extract confidence.
            # In a raw string the digit class is a single-backslash "\d"; the
            # previous doubled form matched a literal backslash followed by
            # "d", so no number was ever found and confidence was always 7.0.
            confidence_match = re.search(r'confidence.*?(\d+(?:\.\d+)?)', response_text, re.IGNORECASE)
            confidence = float(confidence_match.group(1)) if confidence_match else 7.0

            # `datetime` is the module here (this cell uses `import datetime`)
            step_id = f"evolution_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}_{strategy.value}"

            return EvolutionStep(
                step_id=step_id,
                parent_hypothesis_ids=parent_ids,
                strategy_used=strategy,
                evolution_prompt="Evolution prompt executed",
                evolved_content=evolved_content,
                improvement_rationale=improvement_rationale,
                expected_benefits=benefits,
                potential_risks=risks,
                novelty_increase=score_changes.get('novelty', 0.0),
                feasibility_change=score_changes.get('feasibility', 0.0),
                impact_increase=score_changes.get('impact', 0.0),
                confidence=confidence,
                timestamp=datetime.datetime.now().isoformat()
            )

        except Exception as e:
            # Best effort: any parsing problem degrades to a placeholder step.
            logger.error(f"Failed to parse evolution response: {str(e)}")
            return self._create_fallback_evolution_step(parent_ids, strategy)

    def _extract_list_items(self, text: str, keywords: List[str]) -> List[str]:
        """Extract bulleted list items from text"""

        items = []
        for keyword in keywords:
            pattern = f'{keyword}.*?:(.*?)(?:[A-Z]{{2,}}.*?:|$)'
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                section_text = match.group(1)
                lines = section_text.split('\\n')
                for line in lines:
                    line = line.strip()
                    if re.match(r'^[-•*]\\s+', line) or re.match(r'^\\d+\\.\\s+', line):
                        cleaned_line = re.sub(r'^[-•*\\d.]\\s*', '', line)
                        if len(cleaned_line) > 10:
                            items.append(cleaned_line[:150])
                break

        return items[:4] if items else [f"No specific {keywords[0]} identified"]

    def _extract_score_changes(self, text: str) -> Dict[str, float]:
        """Extract predicted score changes from text"""

        changes = {}
        criteria = ['novelty', 'feasibility', 'impact']

        for criterion in criteria:
            pattern = f'{criterion}.*?change.*?([+-]?\\d+(?:\\.\\d+)?)'
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    changes[criterion] = float(match.group(1))
                except ValueError:
                    changes[criterion] = 0.0
            else:
                changes[criterion] = 0.0

        return changes

    def _create_fallback_evolution_step(self, parent_ids: List[str],
                                      strategy: EvolutionStrategy) -> EvolutionStep:
        """Build a placeholder EvolutionStep for a failed LLM call or parse.

        The step carries generic texts that name the strategy, fixed score
        deltas (+0.5 novelty, 0.0 feasibility, +0.5 impact) and a confidence
        of 5.0.
        """
        strategy_label = strategy.value
        # `datetime` is the module here (this cell uses `import datetime`)
        created_at = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

        placeholder_fields = dict(
            step_id=f"fallback_{created_at}_{strategy_label}",
            parent_hypothesis_ids=parent_ids,
            strategy_used=strategy,
            evolution_prompt="Fallback evolution step",
            evolved_content=f"Evolved hypothesis using {strategy_label} strategy (fallback generated)",
            improvement_rationale=f"Applied {strategy_label} strategy to improve the original hypothesis",
            expected_benefits=[
                f"Expected benefits from {strategy_label}",
                "Improved hypothesis quality",
            ],
            potential_risks=["Fallback evolution may need manual refinement"],
            novelty_increase=0.5,
            feasibility_change=0.0,
            impact_increase=0.5,
            confidence=5.0,
            timestamp=datetime.datetime.now().isoformat(),
        )
        return EvolutionStep(**placeholder_fields)

    def evolve_top_hypotheses(self, ranking_state: RankingState,
                            reflection_state: ReflectionState,
                            original_constraints: List[str] = None,
                            top_n: int = 3) -> EvolutionState:
        """Evolve the top N ranked hypotheses and summarise the outcome.

        Strategy assignment is driven by each ranking's ``'rank'`` value:

        * rank 1 - simplification, then radical variation;
        * rank 2 - analogical reasoning, then constraint relaxation;
        * any other rank - simplification only.

        If reviews exist for both of the two highest-ranked hypotheses, a
        combined hypothesis is added as well. Rankings without a matching
        review are skipped with a warning.

        Args:
            ranking_state: Provides ``final_rankings``, a sequence of dicts
                with at least ``'hypothesis_id'`` and ``'rank'``.
            reflection_state: Provides ``hypothesis_reviews``.
            original_constraints: Constraints for the relaxation strategy;
                two default constraints are used when None.
            top_n: Number of leading rankings to evolve.

        Returns:
            An EvolutionState holding the evolved hypotheses, statistics,
            per-strategy effectiveness, best hypothesis and summary.
        """

        logger.info(f"Starting evolution of top {top_n} hypotheses")

        if original_constraints is None:
            original_constraints = ["Must be clinically applicable", "Focus on translational approaches"]

        evolution_state = EvolutionState()
        self.generation_counter += 1

        # Leading slice of the tournament ranking plus an ID -> review index
        top_rankings = ranking_state.final_rankings[:top_n]
        review_lookup = {}
        for review_entry in reflection_state.hypothesis_reviews:
            review_lookup[review_entry.hypothesis_id] = review_entry

        for ranking in top_rankings:
            hypothesis_id = ranking['hypothesis_id']
            review = review_lookup.get(hypothesis_id)
            if not review:
                logger.warning(f"Could not find review for hypothesis {hypothesis_id}, skipping evolution.")
                continue

            rank = ranking['rank']
            logger.info(f"Evolving #{rank}: {hypothesis_id}")

            # All steps for a hypothesis are generated before any of them is
            # turned into an EvolvedHypothesis.
            if rank == 1:
                # Broader applicability first, then breakthrough potential
                steps = [
                    self.simplify_hypothesis(review, ranking),
                    self.radical_variation(review, ranking),
                ]
            elif rank == 2:
                # Novel insights first, then an expanded possibility space
                steps = [
                    self.analogical_reasoning(review, ranking),
                    self.constraint_relaxation(review, ranking, original_constraints),
                ]
            else:
                # Remaining ranks: simplify to increase feasibility
                steps = [self.simplify_hypothesis(review, ranking)]

            for evolution_step in steps:
                evolution_state.evolved_hypotheses.append(
                    self._create_evolved_hypothesis(evolution_step, review)
                )

        # Combine the two leaders when both of them have a review
        if len(top_rankings) >= 2:
            leading_pair = top_rankings[:2]
            pair_reviews = []
            for entry in leading_pair:
                if entry['hypothesis_id'] in review_lookup:
                    pair_reviews.append(review_lookup[entry['hypothesis_id']])

            if len(pair_reviews) >= 2:
                combined_step = self.combine_hypotheses(pair_reviews, leading_pair)
                evolution_state.evolved_hypotheses.append(
                    self._create_evolved_hypothesis(combined_step, None)
                )

        # Analytics and summary, computed in this fixed order
        evolution_state.evolution_statistics = self._compute_evolution_statistics(evolution_state)
        evolution_state.strategy_effectiveness = self._analyze_strategy_effectiveness(evolution_state)
        evolution_state.best_evolved_hypothesis = self._identify_best_evolved(evolution_state)
        evolution_state.generation_summary = self._generate_evolution_summary(evolution_state)

        logger.info(f"Evolution completed: {len(evolution_state.evolved_hypotheses)} evolved hypotheses generated")
        return evolution_state

    def _create_evolved_hypothesis(self, evolution_step: EvolutionStep,
                                 original_review: Optional[HypothesisReview]) -> EvolvedHypothesis:
        """Create an EvolvedHypothesis from a single evolution step.

        Predicted scores are derived as follows, always clamped to the 1-10
        scale:

        * with ``original_review`` - the review's novelty, feasibility and
          impact scores plus the step's deltas; scientific rigor and
          testability are copied unchanged;
        * without a review (combinations) - fixed baselines (novelty 8.0,
          feasibility 7.0, rigor 7.5, impact 8.5, testability 7.0) plus the
          step's deltas.

        Args:
            evolution_step: Step describing the evolution.
            original_review: Review of the single parent hypothesis, or None
                for a combination of several parents.

        Returns:
            The EvolvedHypothesis, tagged with the agent's current
            ``generation_counter``.
        """

        def clamp_score(value):
            # Keep every predicted score on the 1-10 reflection scale.
            return min(10, max(1, value))

        # Microseconds (%f) are included so that two hypotheses evolved with
        # the same strategy within one second (e.g. on the fast fallback
        # path) do not receive identical IDs.
        hypothesis_id = f"evolved_{evolution_step.strategy_used.value}_{datetime.datetime.now().strftime('%H%M%S%f')}"

        # Predict new scores based on evolution step
        if original_review:
            criteria = original_review.criteria
            predicted_scores = {
                'novelty': clamp_score(criteria.novelty_score + evolution_step.novelty_increase),
                'feasibility': clamp_score(criteria.feasibility_score + evolution_step.feasibility_change),
                'scientific_rigor': criteria.scientific_rigor_score,  # Usually preserved
                'impact_potential': clamp_score(criteria.impact_potential_score + evolution_step.impact_increase),
                'testability': criteria.testability_score  # Usually preserved
            }
        else:
            # For combinations, estimate higher baseline scores. These were
            # previously unclamped, so a +5 delta could yield e.g. 13.0.
            predicted_scores = {
                'novelty': clamp_score(8.0 + evolution_step.novelty_increase),
                'feasibility': clamp_score(7.0 + evolution_step.feasibility_change),
                'scientific_rigor': 7.5,
                'impact_potential': clamp_score(8.5 + evolution_step.impact_increase),
                'testability': 7.0
            }

        # Generate implementation roadmap
        roadmap = self._generate_implementation_roadmap(evolution_step)
        competitive_advantages = evolution_step.expected_benefits[:3]

        return EvolvedHypothesis(
            hypothesis_id=hypothesis_id,
            evolved_content=evolution_step.evolved_content,
            parent_hypothesis_ids=evolution_step.parent_hypothesis_ids,
            evolution_lineage=[evolution_step],
            generation_number=self.generation_counter,
            predicted_scores=predicted_scores,
            improvement_summary=evolution_step.improvement_rationale,
            competitive_advantages=competitive_advantages,
            implementation_roadmap=roadmap
        )

    def _generate_implementation_roadmap(self, evolution_step: EvolutionStep) -> List[str]:
        """Return the three-phase roadmap template for the step's strategy.

        Strategies without a dedicated template get a generic
        validate / optimize / scale roadmap.
        """
        generic_phases = [
            "Phase 1: Validate evolved approach",
            "Phase 2: Optimize implementation",
            "Phase 3: Scale for maximum impact"
        ]

        phases_by_strategy = {
            EvolutionStrategy.SIMPLIFICATION: [
                "Phase 1: Validate simplified approach in proof-of-concept studies",
                "Phase 2: Optimize focused methodology for maximum impact",
                "Phase 3: Scale up simplified approach for broader application"
            ],
            EvolutionStrategy.COMBINATION: [
                "Phase 1: Validate synergistic effects between combined approaches",
                "Phase 2: Optimize integration protocols and workflows",
                "Phase 3: Implement unified approach in comprehensive studies"
            ],
            EvolutionStrategy.ANALOGICAL_REASONING: [
                "Phase 1: Validate cross-domain adaptation in target context",
                "Phase 2: Refine approach based on domain-specific requirements",
                "Phase 3: Demonstrate superior performance vs conventional methods"
            ],
            EvolutionStrategy.RADICAL_VARIATION: [
                "Phase 1: Establish proof-of-concept for paradigm-shifting approach",
                "Phase 2: Address technical challenges and validation requirements",
                "Phase 3: Develop breakthrough approach for transformative impact"
            ],
            EvolutionStrategy.CONSTRAINT_RELAXATION: [
                "Phase 1: Explore expanded possibilities with relaxed constraints",
                "Phase 2: Optimize approach within new possibility space",
                "Phase 3: Demonstrate enhanced impact from expanded scope"
            ]
        }

        chosen_strategy = evolution_step.strategy_used
        if chosen_strategy in phases_by_strategy:
            return phases_by_strategy[chosen_strategy]
        return generic_phases

    def _compute_evolution_statistics(self, evolution_state: EvolutionState) -> Dict[str, float]:
        """Compute aggregate statistics over all evolved hypotheses.

        Every hypothesis contributes a weighted overall score built from its
        ``predicted_scores`` (novelty 0.25, feasibility 0.20, scientific_rigor
        0.25, impact_potential 0.20, testability 0.10). Improvement metrics and
        the applied strategy are read from the first step of
        ``evolution_lineage``; hypotheses without a lineage are skipped for
        those metrics only.

        Args:
            evolution_state: State whose ``evolved_hypotheses`` are summarised.

        Returns:
            A dict of summary values, or ``{}`` when there are no evolved
            hypotheses. Lineage-based means are ``0.0`` when no hypothesis
            carries a lineage.
        """

        hypotheses = evolution_state.evolved_hypotheses
        if not hypotheses:
            return {}

        def _mean_or_zero(values):
            # statistics.mean raises StatisticsError on empty input.
            return statistics.mean(values) if values else 0.0

        predicted_overall_scores = []
        novelty_improvements = []
        impact_improvements = []
        feasibility_changes = []
        confidence_levels = []
        strategies_seen = set()

        for evolved_hyp in hypotheses:
            # Calculate predicted overall score
            scores = evolved_hyp.predicted_scores
            overall = (scores['novelty'] * 0.25 + scores['feasibility'] * 0.20 +
                      scores['scientific_rigor'] * 0.25 + scores['impact_potential'] * 0.20 +
                      scores['testability'] * 0.10)
            predicted_overall_scores.append(overall)

            # Lineage is optional: only hypotheses that have one feed the
            # improvement metrics and the strategy count.
            if evolved_hyp.evolution_lineage:
                step = evolved_hyp.evolution_lineage[0]
                novelty_improvements.append(step.novelty_increase)
                impact_improvements.append(step.impact_increase)
                feasibility_changes.append(step.feasibility_change)
                confidence_levels.append(step.confidence)
                strategies_seen.add(step.strategy_used)

        return {
            'total_evolved_hypotheses': len(hypotheses),
            'mean_predicted_score': statistics.mean(predicted_overall_scores),
            'max_predicted_score': max(predicted_overall_scores),
            'min_predicted_score': min(predicted_overall_scores),
            'mean_novelty_improvement': _mean_or_zero(novelty_improvements),
            'mean_impact_improvement': _mean_or_zero(impact_improvements),
            'mean_feasibility_change': _mean_or_zero(feasibility_changes),
            'mean_evolution_confidence': _mean_or_zero(confidence_levels),
            'strategies_applied': len(strategies_seen),
            'generation_number': self.generation_counter
        }

    def _analyze_strategy_effectiveness(self, evolution_state: EvolutionState) -> Dict[str, float]:
        """Summarise, per evolution strategy, how the evolved hypotheses fared.

        Hypotheses are grouped by the strategy of their first lineage step
        (hypotheses without a lineage are ignored). For each strategy the
        result holds the average novelty gain, impact gain, confidence and
        weighted predicted score, plus the number of applications, under keys
        prefixed with the strategy's value.
        """

        per_strategy = {}

        for hypothesis in evolution_state.evolved_hypotheses:
            if not hypothesis.evolution_lineage:
                continue

            first_step = hypothesis.evolution_lineage[0]
            name = first_step.strategy_used.value

            # Create the accumulator the first time a strategy is seen.
            bucket = per_strategy.setdefault(name, {
                'count': 0,
                'total_novelty_gain': 0,
                'total_impact_gain': 0,
                'total_confidence': 0,
                'predicted_scores': []
            })

            bucket['count'] += 1
            bucket['total_novelty_gain'] += first_step.novelty_increase
            bucket['total_impact_gain'] += first_step.impact_increase
            bucket['total_confidence'] += first_step.confidence

            # Weighted overall score predicted for this hypothesis.
            predicted = hypothesis.predicted_scores
            weighted = (predicted['novelty'] * 0.25 + predicted['feasibility'] * 0.20 +
                        predicted['scientific_rigor'] * 0.25 + predicted['impact_potential'] * 0.20 +
                        predicted['testability'] * 0.10)
            bucket['predicted_scores'].append(weighted)

        # Turn the accumulated totals into per-strategy averages.
        report = {}
        for name, bucket in per_strategy.items():
            applications = bucket['count']
            if not applications > 0:
                continue
            report[f"{name}_avg_novelty_gain"] = bucket['total_novelty_gain'] / applications
            report[f"{name}_avg_impact_gain"] = bucket['total_impact_gain'] / applications
            report[f"{name}_avg_confidence"] = bucket['total_confidence'] / applications
            report[f"{name}_avg_predicted_score"] = statistics.mean(bucket['predicted_scores'])
            report[f"{name}_applications"] = applications

        return report

    def _identify_best_evolved(self, evolution_state: EvolutionState) -> Optional[str]:
        """Return the ID of the evolved hypothesis with the highest mean predicted score.

        The mean is the plain (unweighted) average of all values in
        ``predicted_scores``. Returns ``None`` when there are no evolved
        hypotheses.
        """

        candidates = evolution_state.evolved_hypotheses
        if not candidates:
            return None

        def average_predicted(hypothesis):
            return statistics.mean(list(hypothesis.predicted_scores.values()))

        winner = max(candidates, key=average_predicted)
        return winner.hypothesis_id

    def _generate_evolution_summary(self, evolution_state: EvolutionState) -> str:
        """Generate comprehensive evolution summary"""

        stats = evolution_state.evolution_statistics
        strategy_eff = evolution_state.strategy_effectiveness
        best_id = evolution_state.best_evolved_hypothesis

        # Find best hypothesis for details
        best_hyp = None
        if best_id:
            best_hyp = next((h for h in evolution_state.evolved_hypotheses if h.hypothesis_id == best_id), None)

        if not stats: return "Evolution summary could not be generated."

        summary = f"""
EVOLUTION AGENT GENERATION SUMMARY

🧬 EVOLUTION OVERVIEW:
• Generation Number: {stats.get('generation_number', 1)}
• Evolved Hypotheses: {stats.get('total_evolved_hypotheses', 0)}
• Strategies Applied: {stats.get('strategies_applied', 0)}
• Evolution Confidence: {stats.get('mean_evolution_confidence', 0):.1f}/10

📈 IMPROVEMENT METRICS:
• Mean Predicted Score: {stats.get('mean_predicted_score', 0):.2f}/10
• Score Range: {stats.get('min_predicted_score', 0):.1f} - {stats.get('max_predicted_score', 0):.1f}
• Average Novelty Gain: {stats.get('mean_novelty_improvement', 0):+.1f} points
• Average Impact Gain: {stats.get('mean_impact_improvement', 0):+.1f} points
• Feasibility Change: {stats.get('mean_feasibility_change', 0):+.1f} points

🎯 STRATEGY EFFECTIVENESS:
{self._format_strategy_effectiveness(strategy_eff)}

🏆 BEST EVOLVED HYPOTHESIS:
{self._format_best_hypothesis(best_hyp)}

🚀 IMPLEMENTATION RECOMMENDATIONS:
{self._generate_implementation_recommendations(evolution_state)}
"""

        return summary.strip()

    def _format_strategy_effectiveness(self, strategy_effectiveness: Dict[str, float]) -> str:
        """Format strategy effectiveness for display"""

        strategies = set()
        for key in strategy_effectiveness.keys():
            if '_avg_predicted_score' in key:
                strategies.add(key.replace('_avg_predicted_score', ''))

        formatted = ""
        for strategy in sorted(list(strategies)):
            score = strategy_effectiveness.get(f"{strategy}_avg_predicted_score", 0)
            novelty = strategy_effectiveness.get(f"{strategy}_avg_novelty_gain", 0)
            impact = strategy_effectiveness.get(f"{strategy}_avg_impact_gain", 0)
            count = strategy_effectiveness.get(f"{strategy}_applications", 0)

            formatted += f"\n• {strategy.title()}: Score {score:.1f} (+{novelty:.1f} novelty, +{impact:.1f} impact) [{int(count)} applications]"

        return formatted if formatted else "No strategy effectiveness data available"

    def _format_best_hypothesis(self, best_hyp: Optional[EvolvedHypothesis]) -> str:
        """Render the best evolved hypothesis as a short bullet list.

        Shows its ID, the strategy of its first lineage step ("unknown" when
        it has no lineage), the unweighted mean of its predicted scores, its
        first two competitive advantages and its parent IDs. Returns a fixed
        placeholder sentence when ``best_hyp`` is falsy.
        """

        if not best_hyp:
            return "No best hypothesis identified"

        if best_hyp.evolution_lineage:
            strategy_name = best_hyp.evolution_lineage[0].strategy_used.value
        else:
            strategy_name = "unknown"
        average_score = statistics.mean(list(best_hyp.predicted_scores.values()))

        # The leading empty entry reproduces the newline that opens the block.
        bullet_lines = [
            "",
            f"• ID: {best_hyp.hypothesis_id}",
            f"• Strategy Used: {strategy_name.title()}",
            f"• Predicted Score: {average_score:.2f}/10",
            f"• Key Advantages: {'; '.join(best_hyp.competitive_advantages[:2])}",
            f"• Parent(s): {', '.join(best_hyp.parent_hypothesis_ids)}",
        ]
        return "\n".join(bullet_lines)

    def _generate_implementation_recommendations(self, evolution_state: EvolutionState) -> str:
        """Build the implementation recommendations shown in the summary.

        Recommendations are derived from the stored evolution statistics: a
        priority pointer to the best evolved hypothesis (if any), a quality
        tier based on the mean predicted score, an optional note on the
        novelty/feasibility trade-off, and a next-step hint once at least
        three hypotheses have been evolved.

        Args:
            evolution_state: State providing ``evolution_statistics``,
                ``best_evolved_hypothesis`` and ``evolved_hypotheses``.

        Returns:
            The bullets joined by real newlines (one per line), or a fixed
            sentence when no statistics are available.
        """

        stats = evolution_state.evolution_statistics

        if not stats:
            return "No recommendations available."

        recommendations = []

        # Best hypothesis recommendations
        if evolution_state.best_evolved_hypothesis:
            recommendations.append(f"• PRIORITY: Focus on {evolution_state.best_evolved_hypothesis} for detailed development")

        # Strategy-based recommendations (exactly one tier is always added)
        mean_score = stats.get('mean_predicted_score', 0)
        if mean_score > 8.0:
            recommendations.append("• EXCELLENT: High-quality evolved hypotheses - proceed to validation")
        elif mean_score > 7.0:
            recommendations.append("• STRONG: Good evolution results - consider further refinement")
        else:
            recommendations.append("• ITERATE: Consider additional evolution rounds")

        # Novelty vs feasibility trade-offs
        novelty_gain = stats.get('mean_novelty_improvement', 0)
        feasibility_change = stats.get('mean_feasibility_change', 0)

        if novelty_gain > 1.5 and feasibility_change < -1.0:
            recommendations.append("• BALANCE: High novelty gained but feasibility decreased - validate technical approach")
        elif novelty_gain > 2.0:
            recommendations.append("• BREAKTHROUGH: Significant novelty increases - potential for high impact")

        # Next steps
        if len(evolution_state.evolved_hypotheses) >= 3:
            recommendations.append("• NEXT: Run reflection and ranking on evolved hypotheses for comparison")

        # Join with an actual newline; the previous '\\n' emitted a literal
        # backslash-n between bullets. The list is never empty here because
        # the score tier above always contributes one entry.
        return '\n'.join(recommendations)

# ===========================================================================
# INTEGRATION FUNCTIONS
# ===========================================================================

def run_evolution_process(ranking_state: RankingState, reflection_state: ReflectionState,
                         original_constraints: List[str] = None, top_n: int = 3) -> EvolutionState:
    """Evolve the top-ranked hypotheses and return the resulting state.

    Builds an ``EvolutionAgent`` around the notebook-global ``llm`` and
    delegates to its ``evolve_top_hypotheses`` method.

    Raises:
        ValueError: if ``ranking_state.final_rankings`` is empty.
        RuntimeError: if constructing the agent raises ``NameError``
            (typically because ``llm`` has not been defined yet).
    """

    if not ranking_state.final_rankings:
        raise ValueError("No rankings found in ranking state")

    # `llm` is a global created by the setup cell; its absence shows up as NameError.
    try:
        agent = EvolutionAgent(llm)
    except NameError:
        raise RuntimeError("LLM not available. Run setup cell first.")

    return agent.evolve_top_hypotheses(
        ranking_state, reflection_state, original_constraints, top_n
    )

def integrate_evolution_with_generation_state(generation_state: GenerationState,
                                            evolution_state: EvolutionState) -> GenerationState:
    """Attach evolution results to the generation state and mark it evolved.

    Stores ``evolution_state`` on ``generation_state.evolution_results``
    (creating the attribute if needed), sets ``status`` to
    ``"hypotheses_evolved"`` and returns the same, mutated object.
    """

    # Plain assignment covers both the "attribute exists" and "attribute
    # missing" cases.
    generation_state.evolution_results = evolution_state
    generation_state.status = "hypotheses_evolved"
    return generation_state

# ===========================================================================
# TESTING CODE
# ===========================================================================

def test_evolution_agent():
    """Comprehensive test suite for Evolution Agent

    Builds three mock hypothesis reviews plus matching reflection and ranking
    states, then exercises the agent end to end while printing progress:
    individual strategies (simplification, analogical reasoning, combination),
    the full ``evolve_top_hypotheses`` run, the resulting statistics and
    strategy effectiveness, the evolved hypotheses, integration with a
    ``GenerationState`` and the generated summary.

    Nothing is asserted; results are only printed for manual inspection.

    Returns:
        The ``EvolutionState`` produced by the full run, or ``None`` when
        constructing the agent raises ``NameError`` (e.g. the global ``llm``
        is not defined).
    """

    print("🧬 TESTING EVOLUTION AGENT")
    print("=" * 50)

    # Create comprehensive test data
    print("📋 Setting up test data...")

    # Mock reflection and ranking states
    # ReviewCriteria is built positionally: five numeric scores followed by
    # five text rationales.
    mock_reviews = [
        HypothesisReview(
            hypothesis_id="top_hypothesis_1",
            hypothesis_text="Advanced epigenetic reprogramming approach targeting DNMT3A and HDAC2 in activated hepatic stellate cells for liver fibrosis reversal. This approach combines selective inhibition with targeted delivery systems.",
            criteria=ReviewCriteria(8.5, 7.0, 8.0, 9.0, 7.5, "Highly novel", "Challenging but feasible", "Strong evidence", "Transformative impact", "Well testable"),
            overall_score=8.2,
            overall_assessment="Exceptional hypothesis with breakthrough potential",
            strengths=["Novel dual-target approach", "Strong mechanistic foundation", "Clear therapeutic pathway"],
            weaknesses=["Complex delivery requirements", "Regulatory challenges"],
            recommendations=["Develop targeted delivery", "Validate in animal models"],
            confidence_level=8.5,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        ),
        HypothesisReview(
            hypothesis_id="top_hypothesis_2",
            hypothesis_text="AI-powered personalized medicine platform that analyzes patient epigenetic profiles to optimize combination therapies for liver fibrosis treatment.",
            criteria=ReviewCriteria(9.0, 6.0, 7.0, 8.5, 6.5, "Breakthrough innovation", "Technical challenges", "Good foundation", "High impact", "Complex validation"),
            overall_score=7.6,
            overall_assessment="Innovative AI approach with significant potential",
            strengths=["Revolutionary AI application", "Personalized approach", "Scalable platform"],
            weaknesses=["Technical complexity", "Data requirements", "Validation challenges"],
            recommendations=["Develop proof-of-concept", "Secure data partnerships"],
            confidence_level=7.5,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        ),
        HypothesisReview(
            hypothesis_id="top_hypothesis_3",
            hypothesis_text="Single-cell epigenomic mapping combined with machine learning to identify stage-specific therapeutic vulnerabilities in liver fibrosis progression.",
            criteria=ReviewCriteria(7.5, 7.5, 8.5, 7.0, 8.0, "Good novelty", "Very feasible", "Excellent methodology", "Solid impact", "Highly testable"),
            overall_score=7.7,
            overall_assessment="Well-designed discovery approach with strong scientific foundation",
            strengths=["Cutting-edge technology", "Discovery potential", "Rigorous methodology"],
            weaknesses=["Discovery focus", "Resource intensive", "Timeline uncertainty"],
            recommendations=["Focus on key stages", "Develop analysis pipeline"],
            confidence_level=8.0,
            review_timestamp=datetime.datetime.now().isoformat(),
            reviewer_type="detailed"
        )
    ]

    mock_reflection_state = ReflectionState(
        hypothesis_reviews=mock_reviews,
        review_statistics={'mean_overall_score': 7.83},
        quality_flags=[],
        batch_summary="Mock reflection state for evolution testing"
    )

    # Ranking entries reference the mock reviews above by hypothesis_id.
    mock_rankings = [
        {'rank': 1, 'hypothesis_id': 'top_hypothesis_1', 'final_elo_rating': 1350, 'win_rate': 75.0},
        {'rank': 2, 'hypothesis_id': 'top_hypothesis_2', 'final_elo_rating': 1280, 'win_rate': 60.0},
        {'rank': 3, 'hypothesis_id': 'top_hypothesis_3', 'final_elo_rating': 1220, 'win_rate': 50.0}
    ]

    mock_ranking_state = RankingState(
        final_rankings=mock_rankings,
        ranking_statistics={'mean_final_rating': 1283}
    )

    print(f"✅ Created test data with {len(mock_reviews)} hypotheses")

    # Test individual evolution strategies
    # `llm` is expected to be a global from an earlier cell; bail out quietly
    # if it is missing instead of failing the whole cell.
    try:
        evolution_agent = EvolutionAgent(llm)
    except NameError:
        print("❌ Error: LLM not available. Cannot test evolution strategies.")
        return

    print("\n1️⃣ TESTING INDIVIDUAL EVOLUTION STRATEGIES")
    print("-" * 40)

    # Test simplification
    print("Testing Simplification Strategy...")
    simplification_step = evolution_agent.simplify_hypothesis(mock_reviews[0], mock_rankings[0])
    print(f"✅ Simplification: {simplification_step.strategy_used.value}")
    print(f"   Confidence: {simplification_step.confidence:.1f}/10")
    print(f"   Novelty change: {simplification_step.novelty_increase:+.1f}")
    print(f"   Impact change: {simplification_step.impact_increase:+.1f}")

    # Test analogical reasoning
    print("\nTesting Analogical Reasoning Strategy...")
    analogical_step = evolution_agent.analogical_reasoning(mock_reviews[1], mock_rankings[1])
    print(f"✅ Analogical Reasoning: {analogical_step.strategy_used.value}")
    print(f"   Benefits: {len(analogical_step.expected_benefits)}")
    print(f"   Risks: {len(analogical_step.potential_risks)}")

    # Test combination
    # Combination takes the first two reviews/rankings as its parents.
    print("\nTesting Combination Strategy...")
    combination_step = evolution_agent.combine_hypotheses(mock_reviews[:2], mock_rankings[:2])
    print(f"✅ Combination: {combination_step.strategy_used.value}")
    print(f"   Parent IDs: {combination_step.parent_hypothesis_ids}")
    print(f"   Expected impact: {combination_step.impact_increase:+.1f}")

    print("\n2️⃣ TESTING FULL EVOLUTION PROCESS")
    print("-" * 40)

    # Test complete evolution process
    original_constraints = [
        "Must be clinically applicable",
        "Focus on translational approaches",
        "Consider regulatory pathways"
    ]

    evolution_state = evolution_agent.evolve_top_hypotheses(
        mock_ranking_state,
        mock_reflection_state,
        original_constraints,
        top_n=3
    )

    print(f"✅ Evolution process completed")
    print(f"   Evolved hypotheses: {len(evolution_state.evolved_hypotheses)}")
    print(f"   Best evolved: {evolution_state.best_evolved_hypothesis}")
    print(f"   Generation number: {evolution_state.evolution_statistics.get('generation_number', 1)}")

    print("\n3️⃣ TESTING EVOLUTION ANALYSIS")
    print("-" * 40)

    print("📊 Evolution Statistics:")
    stats = evolution_state.evolution_statistics
    for key, value in stats.items():
        # Floats get two decimals; counts and other values print as-is.
        if isinstance(value, float):
            print(f"   {key}: {value:.2f}")
        else:
            print(f"   {key}: {value}")

    print(f"\n🎯 Strategy Effectiveness:")
    strategy_eff = evolution_state.strategy_effectiveness
    for key, value in strategy_eff.items():
        print(f"   {key}: {value:.2f}")

    print("\n4️⃣ TESTING EVOLVED HYPOTHESES")
    print("-" * 40)

    # Show at most the first three evolved hypotheses.
    for i, evolved_hyp in enumerate(evolution_state.evolved_hypotheses[:3], 1):
        strategy = evolved_hyp.evolution_lineage[0].strategy_used.value if evolved_hyp.evolution_lineage else "unknown"
        # Unweighted mean of all predicted criterion scores.
        predicted_score = statistics.mean(list(evolved_hyp.predicted_scores.values()))

        print(f"\nEvolved Hypothesis {i}:")
        print(f"   ID: {evolved_hyp.hypothesis_id}")
        print(f"   Strategy: {strategy}")
        print(f"   Predicted Score: {predicted_score:.2f}/10")
        print(f"   Parents: {', '.join(evolved_hyp.parent_hypothesis_ids)}")
        print(f"   Key Advantages: {'; '.join(evolved_hyp.competitive_advantages[:2])}")
        print(f"   Content Preview: {evolved_hyp.evolved_content[:150]}...")

    print("\n5️⃣ TESTING INTEGRATION")
    print("-" * 40)

    # Test integration with generation state
    mock_generation_state = GenerationState(
        research_goal="Test evolution integration",
        status="hypotheses_ranked"
    )

    integrated_state = integrate_evolution_with_generation_state(mock_generation_state, evolution_state)

    print(f"✅ Integration successful")
    print(f"   State status: {integrated_state.status}")
    print(f"   Has evolution results: {hasattr(integrated_state, 'evolution_results')}")

    print("\n6️⃣ EVOLUTION SUMMARY")
    print("-" * 40)
    print(evolution_state.generation_summary)

    print("\n" + "=" * 50)
    print("🎉 ALL EVOLUTION AGENT TESTS COMPLETED SUCCESSFULLY!")
    print("🧬 The Evolution Agent is ready to improve top hypotheses")

    return evolution_state

# ===========================================================================
# RUN TESTS
# ===========================================================================

# Run the evolution agent test suite when this cell/module is executed
# directly; the returned state (or None) is kept in `test_results`.
if __name__ == "__main__":
    test_results = test_evolution_agent()

🧬 TESTING EVOLUTION AGENT
📋 Setting up test data...
✅ Created test data with 3 hypotheses

1️⃣ TESTING INDIVIDUAL EVOLUTION STRATEGIES
----------------------------------------
Testing Simplification Strategy...
✅ Simplification: simplification
   Confidence: 7.0/10
   Novelty change: -1.0
   Impact change: +2.0

Testing Analogical Reasoning Strategy...
✅ Analogical Reasoning: analogical_reasoning
   Benefits: 1
   Risks: 1

Testing Combination Strategy...
✅ Combination: combination
   Parent IDs: ['top_hypothesis_1', 'top_hypothesis_2']
   Expected impact: +4.0

2️⃣ TESTING FULL EVOLUTION PROCESS
----------------------------------------
✅ Evolution process completed
   Evolved hypotheses: 6
   Best evolved: evolved_combination_010933
   Generation number: 1

3️⃣ TESTING EVOLUTION ANALYSIS
----------------------------------------
📊 Evolution Statistics:
   total_evolved_hypotheses: 6
   mean_predicted_score: 8.25
   max_predicted_score: 9.27
   min_predicted_score: 7.70
   mean_novelty_

In [None]:
#@title 🚀 Meta-Review Agent Enhancements (Fixed - v2)
# ===========================================================================
# FIXES:
# 1. More robust TF-IDF parameters for small datasets
# 2. Better correlation calculation with variance checking
# 3. Enhanced text preprocessing
# 4. Improved error handling for edge cases
# 5. Corrected deprecated datetime.utcnow() calls
# 6. Corrected a SyntaxError from an invalid escape character
# ===========================================================================

import asyncio
import json
import pickle
import hashlib
import numpy as np
import logging
import warnings
from typing import Union, Protocol, List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field, asdict
import datetime
from pathlib import Path
from collections import Counter

# Suppress numpy warnings for cleaner output
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ===========================================================================
# 1️⃣ CORE DATACLASSES (Same as before)
# ===========================================================================

@dataclass
class PatternInsight:
    """Represents a discovered pattern in hypothesis reviews."""
    description: str  # Text describing the pattern.
    frequency: int  # Presumably how many reviews exhibit the pattern — confirm against the producer.
    sample_hypotheses: List[str]  # Presumably example hypothesis IDs or texts — TODO confirm which.
    confidence: float = 0.0  # Defaults to 0.0; the scale is not established in this block.

@dataclass
class ClusteringMetrics:
    """Metrics about the clustering process."""
    n_samples: int  # Number of documents the clustering routine reports having handled.
    n_clusters_requested: int  # Cluster count asked for by the caller.
    n_clusters_actual: int  # Cluster count actually produced.
    clustering_method: str  # Label of the code path used, e.g. "sklearn_agglomerative_improved" or "simple_fallback".
    silhouette_score: Optional[float] = None  # None when the score was not (or could not be) computed.

@dataclass
class AdvancedMetrics:
    """Extended metrics for deeper analysis."""
    # NOTE(review): field semantics below are inferred from the names only;
    # confirm against the code that populates this dataclass.
    diversity_score: float  # Single numeric diversity measure.
    consensus_score: float  # Single numeric consensus measure.
    trend_analysis: Dict[str, float]  # Named trend values.
    outlier_detection: List[str]  # Presumably identifiers of outlier items — TODO confirm.
    confidence_intervals: Dict[str, Tuple[float, float]]  # Name -> pair of floats, presumably (lower, upper).

@dataclass
class MetaReviewState:
    """Complete state of meta-review analysis."""
    pattern_insights: List[PatternInsight]  # Patterns discovered in the reviews.
    criterion_correlations: Dict[str, float]  # Presumably review criterion name -> correlation with ranking — confirm.
    actionable_for_generation: str  # Presumably feedback text aimed at the generation stage — confirm.
    actionable_for_reflection: str  # Presumably feedback text aimed at the reflection stage — confirm.
    clustering_metrics: ClusteringMetrics  # Metrics describing the clustering run.
    created_at: str  # Creation timestamp stored as a string; format not established here.
    analysis_quality: str = "high"  # Quality label; defaults to "high".
    advanced_metrics: Optional[AdvancedMetrics] = None  # Extended metrics, absent by default.

@dataclass
class MetaReviewConfig:
    """Configuration for meta-review agent."""
    max_clusters: int = 6
    min_cluster_size: int = 2
    cache_enabled: bool = True
    async_processing: bool = False
    enable_advanced_metrics: bool = True

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'MetaReviewConfig':
        """Build a config from a dict, silently ignoring unknown keys."""
        known_fields = cls.__annotations__
        accepted = {}
        for name, value in config_dict.items():
            if name in known_fields:
                accepted[name] = value
        return cls(**accepted)

    def to_dict(self) -> dict:
        """Return the configuration as a plain dict."""
        return asdict(self)

# ===========================================================================
# 2️⃣ ENHANCED TEXT PREPROCESSING
# ===========================================================================

def _preprocess_text_robust(text: str) -> str:
    """Enhanced text preprocessing for better clustering."""
    import re

    if not text:
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove special characters but keep spaces
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove very short words (less than 3 characters)
    words = [word for word in text.split() if len(word) >= 3]

    return ' '.join(words)

def _extract_text_features(corpus: List[str]) -> List[str]:
    """Return the preprocessed, non-empty texts of ``corpus``.

    Each text is normalised with ``_preprocess_text_robust``; texts that are
    empty after normalisation are dropped. The result can therefore be
    shorter than ``corpus``, and positions in it do not necessarily match
    positions in ``corpus``.

    The earlier implementation also counted word frequencies, but the counts
    never influenced the return value, so that dead computation is removed.

    Args:
        corpus: Raw documents.

    Returns:
        The cleaned documents in their original relative order, or ``[]``
        when ``corpus`` is empty or nothing survives preprocessing.
    """
    if not corpus:
        return []

    cleaned = (_preprocess_text_robust(text) for text in corpus)
    return [text for text in cleaned if text.strip()]

# ===========================================================================
# 3️⃣ ROBUST CLUSTERING IMPLEMENTATION
# ===========================================================================

def _improved_text_clustering(corpus: List[str], n_clusters: int) -> Tuple[Dict[int, List[int]], ClusteringMetrics]:
    """Cluster texts with TF-IDF + agglomerative clustering, with fallbacks.

    Documents are preprocessed first; those that end up empty are left out of
    the clustering. Every index in the returned clusters is a position in
    ``corpus`` (not in the filtered list), so results stay aligned with the
    caller's data even when some documents were dropped.

    Fallback order: scikit-learn agglomerative clustering, then
    ``_enhanced_simple_clustering`` if scikit-learn is missing or fails, and
    ``_simple_grouping_fallback`` when there is no usable text or too few
    samples for at least two clusters.

    Args:
        corpus: Raw documents to cluster.
        n_clusters: Requested number of clusters; reduced for small inputs.

    Returns:
        ``(clusters, metrics)`` where ``clusters`` maps a cluster id to the
        list of ``corpus`` positions assigned to it.
    """

    if not corpus:
        return {}, ClusteringMetrics(0, 0, 0, "no_data")

    if len(corpus) == 1:
        return {0: [0]}, ClusteringMetrics(1, 1, 1, "single_sample")

    # Preprocess texts, remembering which corpus position each surviving
    # text came from so cluster members can be reported as corpus indices.
    kept_indices = []
    processed_texts = []
    for corpus_idx, raw_text in enumerate(corpus):
        cleaned = _preprocess_text_robust(raw_text)
        if cleaned.strip():
            kept_indices.append(corpus_idx)
            processed_texts.append(cleaned)

    if not processed_texts:
        logger.warning("No meaningful text content found for clustering")
        return _simple_grouping_fallback(len(corpus), n_clusters)

    try:
        # Try sklearn clustering with improved parameters
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.cluster import AgglomerativeClustering

        # FIXED: More robust TF-IDF parameters
        vectorizer = TfidfVectorizer(
            max_features=min(100, len(processed_texts) * 5),  # Reduced max features
            stop_words="english",
            min_df=1,  # Keep all terms that appear at least once
            max_df=1.0,  # Keep all terms (no upper limit)
            ngram_range=(1, 2),  # Include bigrams for better features
            lowercase=True,
            token_pattern=r'\b[a-zA-Z]{3,}\b'  # Only words with 3+ letters
        )

        try:
            X = vectorizer.fit_transform(processed_texts)
            logger.info(f"TF-IDF vectorization successful: {X.shape[0]} docs, {X.shape[1]} features")

            if X.shape[1] == 0:
                raise ValueError("No features extracted after vectorization")
        except Exception as e:
            # Logged here for context; the outer handler switches to the fallback.
            logger.warning(f"TF-IDF vectorization failed: {e}")
            raise

        # Adjust cluster count for small datasets
        actual_n_clusters = min(n_clusters, len(processed_texts) - 1, max(1, len(processed_texts) // 2))

        if actual_n_clusters < 2:
            logger.info("Too few samples for meaningful clustering, using simple grouping")
            return _simple_grouping_fallback(len(corpus), n_clusters)

        # Perform clustering
        clustering = AgglomerativeClustering(
            n_clusters=actual_n_clusters,
            metric='cosine',
            linkage='average'
        )

        X_dense = X.toarray()
        labels = clustering.fit_predict(X_dense)

        # Organize results, translating filtered positions back to corpus positions
        clusters = {}
        for local_idx, label in enumerate(labels):
            clusters.setdefault(int(label), []).append(kept_indices[local_idx])

        # Calculate silhouette score safely
        silhouette_score = None
        try:
            if len(set(labels)) > 1 and len(processed_texts) > 2:
                from sklearn.metrics import silhouette_score as calc_silhouette
                silhouette_score = float(calc_silhouette(X_dense, labels, metric='cosine'))
        except Exception as e:
            logger.debug(f"Silhouette score calculation failed: {e}")

        metrics = ClusteringMetrics(
            n_samples=len(corpus),
            n_clusters_requested=n_clusters,
            n_clusters_actual=len(clusters),
            clustering_method="sklearn_agglomerative_improved",
            silhouette_score=silhouette_score
        )

        logger.info(f"Sklearn clustering successful: {len(clusters)} clusters created")
        return clusters, metrics

    except ImportError:
        logger.info("Sklearn not available, using simple clustering")
    except Exception as e:
        logger.warning(f"Advanced clustering failed: {e}, using simple clustering")

    # Enhanced fallback clustering; its indices are relative to
    # processed_texts, so map them back to corpus positions as well.
    local_clusters, fallback_metrics = _enhanced_simple_clustering(processed_texts, n_clusters)
    remapped_clusters = {
        cluster_id: [kept_indices[local_idx] for local_idx in members]
        for cluster_id, members in local_clusters.items()
    }
    return remapped_clusters, fallback_metrics

def _enhanced_simple_clustering(texts: List[str], n_clusters: int) -> Tuple[Dict[int, List[int]], ClusteringMetrics]:
    """Greedy word-overlap clustering used when scikit-learn is unavailable.

    Walks the texts in order; each unassigned text seeds a new cluster and
    pulls in every later unassigned text whose Jaccard similarity with the
    seed's word set exceeds 0.2. Once ``n_clusters`` clusters exist, all
    remaining texts are appended to cluster 0.
    """

    if not texts:
        return {}, ClusteringMetrics(0, 0, 0, "no_data")

    # One lower-cased word set per text (empty set for empty text).
    word_sets = [set(text.lower().split()) if text else set() for text in texts]

    total = len(texts)
    taken = [False] * total
    groups = {}

    for seed in range(total):
        if taken[seed]:
            continue

        taken[seed] = True
        members = [seed]
        seed_words = word_sets[seed]

        for other in range(seed + 1, total):
            if taken[other]:
                continue

            # Jaccard similarity between the seed and the candidate.
            shared = len(seed_words.intersection(word_sets[other]))
            combined = len(seed_words.union(word_sets[other]))

            if combined > 0 and shared / combined > 0.2:  # Threshold for grouping
                members.append(other)
                taken[other] = True

        # Cluster ids are assigned sequentially in creation order.
        groups[len(groups)] = members

        # Cap reached: dump everything still unassigned into the first cluster.
        if len(groups) >= n_clusters:
            leftovers = [idx for idx in range(total) if not taken[idx]]
            groups[0].extend(leftovers)
            break

    summary = ClusteringMetrics(
        n_samples=total,
        n_clusters_requested=n_clusters,
        n_clusters_actual=len(groups),
        clustering_method="enhanced_simple_similarity"
    )

    return groups, summary

def _simple_grouping_fallback(n_samples: int, n_groups: int) -> Tuple[Dict[int, List[int]], ClusteringMetrics]:
    """Split sample indices into consecutive, roughly equal groups.

    Used when text clustering is not possible. Indices ``0..n_samples-1`` are
    assigned in order to groups of ``n_samples // n_groups`` items (at least
    one); any remainder lands in the last group.

    Args:
        n_samples: Number of items to distribute.
        n_groups: Requested number of groups. Values below 1 are treated as 1
            so the function no longer raises ``ZeroDivisionError``.

    Returns:
        ``(groups, metrics)`` where ``groups`` maps group id to the list of
        sample indices; ``metrics.n_clusters_requested`` keeps the value
        originally passed in.
    """
    # Guard against zero/negative requests, which would divide by zero below.
    effective_groups = max(1, n_groups)
    group_size = max(1, n_samples // effective_groups)

    groups = {}
    for i in range(n_samples):
        # Clamp so overflow items join the last group instead of creating extras.
        group_id = min(i // group_size, effective_groups - 1)
        groups.setdefault(group_id, []).append(i)

    metrics = ClusteringMetrics(
        n_samples=n_samples,
        n_clusters_requested=n_groups,
        n_clusters_actual=len(groups),
        clustering_method="simple_fallback"
    )

    return groups, metrics

# ===========================================================================
# 4️⃣ ROBUST CORRELATION CALCULATION
# ===========================================================================

def _compute_correlations_robust(reflection_state, ranking_state) -> Dict[str, float]:
    """Correlate every review criterion score with the ranking order.

    Ranking positions are taken from the first non-empty attribute among
    ``final_rankings``, ``leaderboard`` and ``rankings`` on ``ranking_state``;
    the position of an entry in that list (1-based) is its rank. Ranks are
    negated before correlating so that a positive coefficient means "higher
    criterion score goes with a better rank".

    Returns:
        A dict with one Pearson coefficient per criterion. A criterion gets
        ``0.0`` when fewer than two usable scores exist, when either series
        has zero variance, or when the coefficient is NaN/inf. An empty dict
        is returned when there is no ranking data, no reviews, or an
        unexpected error occurs.
    """
    score_fields = ("novelty_score", "feasibility_score", "scientific_rigor_score",
                    "impact_potential_score", "testability_score")

    try:
        # --- ranking: hypothesis id -> 1-based position -------------------
        position_of = {}
        for source_name in ('final_rankings', 'leaderboard', 'rankings'):
            ranked_items = getattr(ranking_state, source_name, None)
            if not ranked_items:
                continue
            if isinstance(ranked_items, list):
                for position, entry in enumerate(ranked_items, start=1):
                    fallback_id = f'hyp_{position - 1}'
                    if isinstance(entry, dict):
                        key = entry.get('hypothesis_id', fallback_id)
                    else:
                        key = getattr(entry, 'hypothesis_id', fallback_id)
                    position_of[key] = position
            # Only the first populated attribute is consulted, list or not.
            break

        if not position_of:
            logger.warning("No ranking data available")
            return {}

        # --- reviews ---------------------------------------------------------
        all_reviews = getattr(reflection_state, 'hypothesis_reviews', [])
        if not all_reviews:
            return {}

        # --- one coefficient per criterion ---------------------------------
        results = {}
        for field_name in score_fields:
            xs = []
            ys = []

            for review in all_reviews:
                review_id = getattr(review, 'hypothesis_id', '')
                if review_id not in position_of:
                    continue
                scored = getattr(review, 'criteria', None)
                if not scored:
                    continue
                raw = getattr(scored, field_name, None)
                if raw is None:
                    continue
                try:
                    value = float(raw)
                except (ValueError, TypeError):
                    continue
                xs.append(value)
                # Lower rank number means better, so flip the sign.
                ys.append(-position_of[review_id])

            if len(xs) < 2:
                results[field_name] = 0.0
                continue

            try:
                spread_x = np.var(xs)
                spread_y = np.var(ys)

                if spread_x == 0 or spread_y == 0:
                    # Pearson r is undefined for a constant series.
                    results[field_name] = 0.0
                    logger.debug(f"No variance in {field_name}, correlation set to 0")
                else:
                    r = np.corrcoef(xs, ys)[0, 1]
                    results[field_name] = 0.0 if (np.isnan(r) or np.isinf(r)) else float(r)

            except Exception as e:
                logger.debug(f"Correlation calculation failed for {field_name}: {e}")
                results[field_name] = 0.0

    except Exception as e:
        logger.error(f"Error computing correlations: {e}")
        return {}

    return results

# ===========================================================================
# 5️⃣ HELPER FUNCTIONS (Improved)
# ===========================================================================

def _safe_get_attribute(obj, attr_path: str, default=None):
    """Resolve a dotted path through nested objects and dicts without raising.

    Each segment of ``attr_path`` is resolved against the current value:
    first as a dict key (when the value is a dict containing that key),
    otherwise as an attribute. Checking dict keys first matters because every
    dict has attributes such as ``items``/``keys``/``values``; with the
    attribute check first, a key with one of those names resolved to the
    bound dict method instead of the stored value.

    Args:
        obj: Root object or dict to start from.
        attr_path: Dot-separated path, e.g. ``"criteria.novelty_score"``.
        default: Value returned when a segment cannot be resolved (including
            a missing dict key) or when any exception occurs.

    Returns:
        The resolved value, or ``default``. A segment that exists but holds
        ``None`` yields ``None`` (or ``default`` if further segments follow).
    """
    try:
        current = obj
        for attr in attr_path.split('.'):
            if isinstance(current, dict) and attr in current:
                current = current[attr]
            elif hasattr(current, attr):
                current = getattr(current, attr)
            else:
                return default
        return current
    except Exception:
        # Best-effort accessor: misbehaving properties or a non-string path
        # fall back to the default rather than propagating.
        return default

def _extract_critique_corpus_robust(reflection_state) -> List[str]:
    """Build one text document per review from its free-text fields.

    Reviews are read from the first non-empty attribute among
    ``hypothesis_reviews``, ``reviews`` and ``hypothesis_evaluations``. For
    each review the criteria reasoning fields, the ``strengths`` /
    ``weaknesses`` / ``recommendations`` items and the ``overall_assessment``
    are joined with single spaces.

    Note:
        A review whose combined text is 10 characters or shorter is skipped,
        so positions in the returned list do not necessarily match positions
        in the review list.

    Returns:
        The list of combined documents. On an unexpected error the documents
        collected so far are returned and the error is logged.
    """
    reasoning_fields = (
        'novelty_reasoning', 'feasibility_reasoning',
        'scientific_rigor_reasoning', 'impact_potential_reasoning',
        'testability_reasoning'
    )
    list_fields = ('strengths', 'weaknesses', 'recommendations')

    corpus = []

    try:
        reviews = None
        for attr_name in ['hypothesis_reviews', 'reviews', 'hypothesis_evaluations']:
            reviews = _safe_get_attribute(reflection_state, attr_name, [])
            if reviews:
                break

        if not reviews:
            logger.warning("No hypothesis reviews found")
            return []

        for review in reviews:
            texts = []

            # Extract criteria reasoning ('stub' is a placeholder value, not content)
            criteria = _safe_get_attribute(review, 'criteria')
            if criteria:
                for field in reasoning_fields:
                    text = _safe_get_attribute(criteria, field, '')
                    if text and str(text).strip() and str(text) != 'stub':
                        texts.append(str(text))

            # Extract list-valued text fields
            for attr in list_fields:
                items = _safe_get_attribute(review, attr, [])
                if not items:
                    continue
                if isinstance(items, str):
                    # A bare string is a single item; iterating it would
                    # yield individual characters.
                    items = [items]
                texts.extend([str(item) for item in items if str(item)])

            # Extract overall assessment
            overall = _safe_get_attribute(review, 'overall_assessment', '')
            if overall and str(overall).strip():
                texts.append(str(overall))

            # Combine texts; very short documents carry no usable signal
            combined_text = " ".join(filter(None, texts))
            if combined_text and len(combined_text.strip()) > 10:
                corpus.append(combined_text)

    except Exception as e:
        logger.error(f"Error extracting corpus: {e}")

    logger.info(f"Extracted corpus with {len(corpus)} documents")
    return corpus

def _summarize_clusters_robust(clusters: Dict[int, List[int]],
                              reflection_state,
                              corpus: List[str]) -> List[PatternInsight]:
    """Turn clusters of corpus indices into human-readable pattern insights.

    For every non-empty cluster the most frequent meaningful words become the
    theme description, up to three hypothesis ids are sampled, and a heuristic
    confidence (capped at 0.9) is computed.

    Args:
        clusters: Mapping of cluster id to indices into ``corpus``.
        reflection_state: Object exposing ``hypothesis_reviews``; used only to
            look up sample hypothesis ids by index.
        corpus: The documents that were clustered.

    Returns:
        Insights sorted by ``(frequency, confidence)`` descending. On an
        unexpected error the insights built so far are returned.
    """
    # Compared against the lowercased token; the previous check used the
    # original casing, so capitalised stop words ("This", "With") slipped
    # through and showed up as themes.
    stop_words = frozenset(['that', 'this', 'with', 'from', 'they', 'were', 'been', 'have'])

    insights = []

    try:
        # Loop-invariant lookup; _safe_get_attribute does not raise.
        reviews = _safe_get_attribute(reflection_state, 'hypothesis_reviews', [])

        for cluster_id, indices in clusters.items():
            if not indices:
                continue

            # Get cluster texts (indices outside the corpus are ignored)
            cluster_texts = [corpus[i] for i in indices if i < len(corpus)]
            if not cluster_texts:
                continue

            # Extract meaningful words: purely alphabetic, longer than 3 chars
            all_words = []
            for text in cluster_texts:
                for token in str(text).split():
                    lowered = token.lower()
                    if len(token) > 3 and token.isalpha() and lowered not in stop_words:
                        all_words.append(lowered)

            word_counts = Counter(all_words)
            # Only keep words that appear more than once in the cluster
            # (a single-document cluster keeps everything).
            top_words = [word for word, count in word_counts.most_common(10)
                        if count > 1 or len(cluster_texts) == 1]

            # Create meaningful description
            if top_words:
                pattern_desc = f"Cluster {cluster_id + 1}: Themes - {', '.join(top_words[:5])}"
            else:
                pattern_desc = f"Cluster {cluster_id + 1}: Unique patterns"

            # Get sample hypothesis IDs
            sample_hypotheses = []
            try:
                for i in indices[:3]:
                    if i < len(reviews):
                        hyp_id = _safe_get_attribute(reviews[i], 'hypothesis_id', f'hyp_{i}')
                        sample_hypotheses.append(str(hyp_id))
            except Exception:
                sample_hypotheses = [f"sample_{i}" for i in indices[:3]]

            # Heuristic confidence: cluster size relative to the number of
            # clusters, share of the 10 theme slots filled, and a small bonus
            # for clusters backed by more than one document.
            confidence = min(0.9,
                           len(indices) / max(1, len(clusters)) * 0.4 +
                           len(top_words) / 10 * 0.4 +
                           (1.0 if len(cluster_texts) > 1 else 0.8) * 0.2)

            insights.append(PatternInsight(
                description=pattern_desc[:200],
                frequency=len(indices),
                sample_hypotheses=sample_hypotheses,
                confidence=confidence
            ))

    except Exception as e:
        logger.error(f"Error in cluster summarization: {e}")

    # Sort by frequency and confidence
    insights.sort(key=lambda x: (x.frequency, x.confidence), reverse=True)
    return insights

def compute_advanced_metrics_robust(reflection_state, pattern_insights, correlations) -> AdvancedMetrics:
    """Derive diversity, consensus and confidence intervals for a meta-review.

    * Diversity: ``1 - largest_cluster / total``, computed over the insight
      frequencies (0.0 when there are none).
    * Consensus: mean absolute value of the finite correlations (0.0 when
      there are none).
    * Confidence intervals: ``corr +/- 1.96 * 0.15`` clipped to ``[-1, 1]``;
      non-finite correlations get ``(-0.15, 0.15)``.

    Trend analysis and outlier detection are filled with fixed placeholder
    values. Any exception yields an all-zero ``AdvancedMetrics``.
    """

    def _finite(value):
        return not (np.isnan(value) or np.isinf(value))

    def _package(diversity, consensus, intervals):
        return AdvancedMetrics(
            diversity_score=diversity,
            consensus_score=consensus,
            trend_analysis={"improving": 0.0, "declining": 0.0, "stable": 1.0},
            outlier_detection=[],
            confidence_intervals=intervals
        )

    try:
        # Diversity: how far the largest cluster is from holding everything.
        sizes = [insight.frequency for insight in pattern_insights]
        if sizes and sum(sizes) > 0:
            diversity = 1 - (max(sizes) / sum(sizes))
        else:
            diversity = 0.0

        # Consensus: average strength of the usable correlations.
        strengths = []
        if correlations:
            strengths = [abs(value) for value in correlations.values() if _finite(value)]
        consensus = np.mean(strengths) if strengths else 0.0

        # Intervals use a fixed, conservative standard error.
        margin = 1.96 * 0.15
        intervals = {}
        for name, value in correlations.items():
            if _finite(value):
                intervals[name] = (max(-1, value - margin), min(1, value + margin))
            else:
                intervals[name] = (-0.15, 0.15)  # default uncertainty

        return _package(diversity, consensus, intervals)

    except Exception as e:
        logger.error(f"Error computing advanced metrics: {e}")
        return _package(0.0, 0.0, {})

# ===========================================================================
# 6️⃣ MAIN ENHANCED FUNCTION
# ===========================================================================

def run_enhanced_meta_review(reflection_state,
                           ranking_state,
                           config: MetaReviewConfig = None) -> MetaReviewState:
    """Run the full meta-review pipeline and package the result.

    Steps: extract the critique corpus, cluster it, summarise the clusters
    into pattern insights, correlate criterion scores with the ranking, and
    optionally compute advanced metrics.

    Args:
        reflection_state: Source of hypothesis reviews.
        ranking_state: Source of the ranking used for correlations.
        config: Optional settings; a default ``MetaReviewConfig`` is used when
            omitted. ``max_clusters`` and ``enable_advanced_metrics`` are read.

    Returns:
        A ``MetaReviewState``. If any step raises, a state with
        ``analysis_quality="failed"`` and empty results is returned instead of
        propagating the exception.
    """
    # Imported locally under a private alias: this notebook binds the global
    # name `datetime` both to the class (`from datetime import datetime`) and
    # to the module (`import datetime`) in different cells, so
    # `datetime.datetime.now()` only works for one cell execution order.
    from datetime import datetime as _datetime

    config = config or MetaReviewConfig()

    logger.info("Starting enhanced meta-review analysis (v2)")

    try:
        # Extract and process corpus
        corpus = _extract_critique_corpus_robust(reflection_state)

        # Perform improved clustering
        clusters, metrics = _improved_text_clustering(corpus, config.max_clusters)

        # Summarize patterns
        insights = _summarize_clusters_robust(clusters, reflection_state, corpus)

        # Compute robust correlations
        correlations = _compute_correlations_robust(reflection_state, ranking_state)

        # Generate feedback
        mean_strength = np.mean([abs(c) for c in correlations.values()]) if correlations else 0
        feedback_gen = f"Analysis complete: {len(insights)} patterns identified from {len(corpus)} documents"
        feedback_ref = f"Correlation analysis: {len(correlations)} criteria evaluated, mean correlation strength: {mean_strength:.3f}"

        # Compute advanced metrics
        advanced_metrics = None
        if config.enable_advanced_metrics:
            advanced_metrics = compute_advanced_metrics_robust(reflection_state, insights, correlations)

        # Assess quality. Branch order is significant: a fallback clustering
        # is reported as "medium" before the sample-size checks are consulted.
        quality = "high"
        if metrics.clustering_method.endswith("fallback"):
            quality = "medium"
        elif metrics.n_samples < 3:
            quality = "low"
        elif not insights and not correlations:
            quality = "low"

        result = MetaReviewState(
            pattern_insights=insights,
            criterion_correlations=correlations,
            actionable_for_generation=feedback_gen,
            actionable_for_reflection=feedback_ref,
            clustering_metrics=metrics,
            created_at=_datetime.now().isoformat(),
            analysis_quality=quality,
            advanced_metrics=advanced_metrics
        )

        logger.info(f"Enhanced meta-review completed successfully: {quality} quality")
        return result

    except Exception as e:
        # Top-level boundary: degrade to a "failed" state rather than crash
        # the surrounding workflow.
        logger.error(f"Enhanced meta-review failed: {e}")

        return MetaReviewState(
            pattern_insights=[],
            criterion_correlations={},
            actionable_for_generation="Meta-review analysis failed. Manual review recommended.",
            actionable_for_reflection="Meta-review analysis failed. Continue with current rubric.",
            clustering_metrics=ClusteringMetrics(0, 0, 0, "failed"),
            created_at=_datetime.now().isoformat(),
            analysis_quality="failed"
        )

# ===========================================================================
# 7️⃣ VISUALIZATION AND TESTING
# ===========================================================================

def generate_analysis_report(meta_review_state: MetaReviewState) -> dict:
    """Flatten a meta-review state into a plain, printable report dict.

    Args:
        meta_review_state: Result of the meta-review; its pattern insights,
            correlations, clustering metrics, optional advanced metrics and
            the two feedback strings are read.

    Returns:
        A dict with the keys ``summary``, ``patterns``, ``correlations``,
        ``advanced_metrics`` (empty when no advanced metrics are present) and
        ``recommendations``. Floats are rendered with three decimals, other
        values with ``str()``, and ``None`` as ``"N/A"``.
    """

    def safe_format(value, default="N/A"):
        """Render a value for the report; never raises for ordinary errors."""
        try:
            if isinstance(value, float):
                return f"{value:.3f}"
            return str(value) if value is not None else default
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.
            return default

    advanced = meta_review_state.advanced_metrics
    if advanced:
        advanced_section = {
            "diversity_score": safe_format(advanced.diversity_score),
            "consensus_score": safe_format(advanced.consensus_score)
        }
    else:
        advanced_section = {}

    return {
        "summary": {
            "analysis_quality": meta_review_state.analysis_quality,
            "patterns_found": len(meta_review_state.pattern_insights),
            "clustering_method": meta_review_state.clustering_metrics.clustering_method,
            "sample_size": meta_review_state.clustering_metrics.n_samples
        },
        "patterns": [
            {
                "description": insight.description,
                "frequency": insight.frequency,
                "confidence": safe_format(insight.confidence),
                "sample_hypotheses": insight.sample_hypotheses[:3]
            }
            for insight in meta_review_state.pattern_insights
        ],
        "correlations": {
            criterion: safe_format(corr)
            for criterion, corr in meta_review_state.criterion_correlations.items()
        },
        "advanced_metrics": advanced_section,
        "recommendations": {
            "for_generation": meta_review_state.actionable_for_generation,
            "for_reflection": meta_review_state.actionable_for_reflection
        }
    }

def test_enhanced_meta_review_v2():
    """Smoke-test the enhanced meta-review on seven mock reviews.

    Builds mock reflection and ranking states, runs
    ``run_enhanced_meta_review`` with advanced metrics enabled, prints a short
    summary and returns ``(result, report)``.
    """
    separator = "=" * 55
    print("🧠 TESTING ENHANCED META-REVIEW AGENT (v2 - FIXED)")
    print(separator)

    # ---- mock data tables ---------------------------------------------------
    # Five wording variants per reasoning field; review i uses variant i % 5.
    reasoning_variants = {
        'novelty_reasoning': ["Novel approach", "Innovative method", "Creative solution", "Unique perspective", "Original idea"],
        'feasibility_reasoning': ["Technically feasible", "Practically achievable", "Resource efficient", "Implementation ready", "Scalable approach"],
        'scientific_rigor_reasoning': ["Well-grounded", "Methodologically sound", "Evidence-based", "Statistically robust", "Peer-reviewed"],
        'impact_potential_reasoning': ["High impact", "Transformative potential", "Significant benefit", "Wide applicability", "Game-changing"],
        'testability_reasoning': ["Easily testable", "Measurable outcomes", "Clear metrics", "Validation ready", "Hypothesis driven"],
    }
    # Base score per criterion, shifted by a per-variant offset and clamped to [1, 10].
    base_score_of = {
        'novelty_score': 8.0,
        'feasibility_score': 7.0,
        'scientific_rigor_score': 8.5,
        'impact_potential_score': 9.0,
        'testability_score': 7.5,
    }
    score_offsets = [0, 0.5, -0.5, 1.0, -1.0]
    # Diverse content so the clustering has something to separate.
    themes = [
        ['Novel approach', 'High impact', 'Machine learning'],
        ['Innovative method', 'Clinical application', 'Drug discovery'],
        ['Creative solution', 'Biomarker development', 'Personalized medicine'],
        ['Unique perspective', 'Therapeutic target', 'Precision oncology'],
        ['Original idea', 'Diagnostic tool', 'Computational biology']
    ]
    weakness_options = ('Needs validation', 'Resource intensive')
    ranks = [1, 2, 3, 3, 4, 5, 6]  # rank 3 appears twice
    sample_count = 7

    # ---- mock objects ------------------------------------------------------
    class MockCriteria:
        def __init__(self, variant=0):
            slot = variant % 5
            for field_name, phrases in reasoning_variants.items():
                setattr(self, field_name, phrases[slot])
            for field_name, base in base_score_of.items():
                setattr(self, field_name, max(1, min(10, base + score_offsets[slot])))

    class MockReview:
        def __init__(self, i):
            first_strength, second_strength, focus = themes[i % len(themes)]
            self.hypothesis_id = f'hyp_{i}'
            self.hypothesis_text = f'Test hypothesis {i} with specific focus'
            self.criteria = MockCriteria(i)
            self.strengths = [first_strength, second_strength]
            self.weaknesses = [weakness_options[i % 2]]
            self.overall_assessment = f'Good hypothesis with {focus} focus'

    class MockReflectionState:
        def __init__(self):
            self.hypothesis_reviews = [MockReview(i) for i in range(sample_count)]

    class MockRankingState:
        def __init__(self):
            self.final_rankings = [
                {'hypothesis_id': f'hyp_{i}', 'rank': rank}
                for i, rank in enumerate(ranks)
            ]

    # ---- run ----------------------------------------------------------------
    result = run_enhanced_meta_review(
        MockReflectionState(),
        MockRankingState(),
        MetaReviewConfig(
            max_clusters=4,
            cache_enabled=False,
            async_processing=False,
            enable_advanced_metrics=True
        ),
    )
    report = generate_analysis_report(result)

    # ---- summary ------------------------------------------------------------
    print(f"✅ Analysis Quality: {result.analysis_quality}")
    print(f"📊 Patterns Found: {len(result.pattern_insights)}")
    print(f"🔗 Correlations: {len(result.criterion_correlations)}")
    print(f"🏷️ Clustering Method: {result.clustering_metrics.clustering_method}")

    correlation_items = list(result.criterion_correlations.items())
    if correlation_items:
        print("📈 Sample Correlations:")
        for criterion, corr in correlation_items[:3]:
            print(f"   • {criterion}: {corr:.3f}")

    advanced = result.advanced_metrics
    if advanced:
        print(f"📊 Diversity Score: {advanced.diversity_score:.3f}")
        print(f"🤝 Consensus Score: {advanced.consensus_score:.3f}")

    print("\n" + separator)
    print("🎉 ENHANCED META-REVIEW AGENT (v2) TEST COMPLETED!")
    print("🛠️  Issues fixed: TF-IDF parameters, correlation calculation, clustering robustness")

    return result, report

# Run the improved test when this code is executed directly (not on import).
# The result and report are kept in module-level names so they stay available
# for inspection after the run.
if __name__ == "__main__":
    test_result_v2, test_report_v2 = test_enhanced_meta_review_v2()

    # Show the highest-sorted pattern, if any pattern was found.
    if test_result_v2.pattern_insights:
        print(f"\n📋 Sample Pattern: {test_result_v2.pattern_insights[0].description}")

    # Echo the two headline fields from the report's "summary" section.
    print(f"\n📊 Report Summary:")
    print(f"   Clustering: {test_report_v2['summary']['clustering_method']}")
    print(f"   Quality: {test_report_v2['summary']['analysis_quality']}")

🧠 TESTING ENHANCED META-REVIEW AGENT (v2 - FIXED)
✅ Analysis Quality: high
📊 Patterns Found: 3
🔗 Correlations: 5
🏷️ Clustering Method: sklearn_agglomerative_improved
📈 Sample Correlations:
   • novelty_score: 0.000
   • feasibility_score: 0.000
   • scientific_rigor_score: 0.000
📊 Diversity Score: 0.571
🤝 Consensus Score: 0.000

🎉 ENHANCED META-REVIEW AGENT (v2) TEST COMPLETED!
🛠️  Issues fixed: TF-IDF parameters, correlation calculation, clustering robustness

📋 Sample Pattern: Cluster 1: Themes - hypothesis, validation, good, focus, creative

📊 Report Summary:
   Clustering: sklearn_agglomerative_improved
   Quality: high


In [None]:
#@title 🌐 Proximity Agent - Hypothesis Clustering & Deduplication
# ===========================================================================
# PROXIMITY AGENT IMPLEMENTATION
# Hypothesis Clustering and Deduplication
# ===========================================================================

# Install required packages for sentence embeddings
!pip install -q sentence-transformers scikit-learn

import numpy as np
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
import datetime # FIX: Added the missing datetime import
import logging

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ===========================================================================
# DATA STRUCTURES
# ===========================================================================

@dataclass
class HypothesisCluster:
    """Represents a cluster of similar hypotheses.

    One instance is created per cluster label by
    ``ProximityAgent._select_representatives``.
    """
    # Cluster label produced by the clustering step, stored as a plain int.
    cluster_id: int
    # 'id' of every proposal in the cluster, the representative included.
    hypothesis_ids: List[str]
    # 'id' of the proposal kept to stand for the whole cluster.
    representative_id: str
    # 'content' of that representative proposal.
    representative_text: str
    similarity_score: float  # Average intra-cluster similarity (1.0 for a single-member cluster)

@dataclass
class ProximityState:
    """Holds the results of the proximity analysis.

    All fields have defaults, so an empty state can be created with
    ``ProximityState()`` and filled in afterwards.
    """
    # One HypothesisCluster per cluster found.
    clusters: List[HypothesisCluster] = field(default_factory=list)
    # The proposals that were kept: one representative per cluster.
    unique_hypotheses: List[Dict[str, Any]] = field(default_factory=list)
    # Number of input proposals minus number of kept proposals.
    removed_count: int = 0
    # ISO-8601 timestamp string set by ProximityAgent.run; empty until then.
    analysis_timestamp: str = ""

# ===========================================================================
# PROXIMITY AGENT
# ===========================================================================

class ProximityAgent:
    """
    Clusters similar hypotheses and removes duplicates to maintain idea diversity.

    Hypothesis texts are embedded with a sentence-transformer model, grouped
    with average-linkage agglomerative clustering on cosine distance, and one
    representative per cluster (the member closest to the cluster centroid)
    is kept.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2', distance_threshold: float = 0.2):
        """
        Initializes the agent.

        Args:
            model_name: The sentence-transformer model to use for embeddings.
            distance_threshold: The clustering distance. Lower values mean more clusters
                                and stricter similarity for grouping. (0.2 is a good start)
        """
        self.model = SentenceTransformer(model_name)
        # The threshold for AgglomerativeClustering is based on distance (1 - similarity)
        self.distance_threshold = distance_threshold
        logger.info(f"ProximityAgent initialized with model '{model_name}' and distance threshold {distance_threshold}")

    @staticmethod
    def _now_iso() -> str:
        """Return the current local time as an ISO-8601 string."""
        # Local import under a private alias: the notebook binds the global
        # name `datetime` to the class in one cell and to the module in
        # another, so a global lookup depends on cell execution order.
        from datetime import datetime as _datetime
        return _datetime.now().isoformat()

    def run(self, proposals: List[Dict[str, Any]]) -> ProximityState:
        """
        Runs the full clustering and deduplication process.

        Args:
            proposals: A list of hypothesis dictionaries from the GenerationAgent.
                       Each dict must have 'id' and 'content' keys.

        Returns:
            A ProximityState object containing the results. With fewer than
            two proposals no clustering is performed and the input is
            returned unchanged (as a copy) in ``unique_hypotheses``.
        """
        if not proposals or len(proposals) < 2:
            logger.warning("Not enough hypotheses to run proximity analysis. Skipping.")
            # Copy so the state never aliases the caller's list; None becomes [].
            state = ProximityState(unique_hypotheses=list(proposals or []))
            state.analysis_timestamp = self._now_iso()
            return state

        logger.info(f"Running proximity analysis on {len(proposals)} hypotheses.")

        # 1. Embed all hypotheses
        embeddings = self._embed_hypotheses(proposals)

        # 2. Cluster the embeddings
        cluster_labels = self._cluster_hypotheses(embeddings)

        # 3. Organize hypotheses into clusters and select representatives
        proximity_state = self._select_representatives(proposals, cluster_labels, embeddings)

        proximity_state.analysis_timestamp = self._now_iso()
        logger.info(f"Proximity analysis complete. Found {len(proximity_state.clusters)} clusters. "
                    f"Kept {len(proximity_state.unique_hypotheses)} unique hypotheses, removed {proximity_state.removed_count}.")

        return proximity_state

    def _embed_hypotheses(self, proposals: List[Dict[str, Any]]) -> np.ndarray:
        """Converts hypothesis text into numerical embeddings.

        A missing or ``None`` 'content' is embedded as the empty string.
        """
        texts = [p.get('content') or '' for p in proposals]
        return self.model.encode(texts, show_progress_bar=False)

    def _cluster_hypotheses(self, embeddings: np.ndarray) -> np.ndarray:
        """Groups similar hypothesis embeddings together.

        Returns one integer cluster label per embedding row.
        """
        # We use distance_threshold, so n_clusters is None.
        # The metric is cosine distance (1 - cosine_similarity).
        clustering_model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=self.distance_threshold,
            metric='cosine',
            linkage='average'
        )
        return clustering_model.fit_predict(embeddings)

    def _select_representatives(self, proposals: List[Dict[str, Any]], labels: np.ndarray, embeddings: np.ndarray) -> ProximityState:
        """
        For each cluster, selects the best representative and drops the others.

        The representative is the member whose embedding is most similar to
        the cluster centroid. Dropped members are not stored, but their ids
        remain listed in the cluster's ``hypothesis_ids``.

        Args:
            proposals: The input proposals, aligned row-for-row with ``embeddings``.
            labels: Cluster label per proposal.
            embeddings: Embedding matrix, one row per proposal.

        Returns:
            A ProximityState with clusters, kept proposals and the removed count.
        """
        state = ProximityState()
        labels = np.asarray(labels)

        # np.unique yields labels in ascending order, so the output order is
        # deterministic (iterating a set gives no such guarantee).
        for label in np.unique(labels):
            indices = np.flatnonzero(labels == label)

            cluster_proposals = [proposals[i] for i in indices]
            cluster_embeddings = embeddings[indices]

            if len(cluster_proposals) > 1:
                centroid = np.mean(cluster_embeddings, axis=0)
                # cosine_similarity expects 2-D inputs, hence the reshape.
                to_centroid = cosine_similarity(cluster_embeddings, centroid.reshape(1, -1))
                representative_index = int(np.argmax(to_centroid))

                # Average over distinct pairs only. The full matrix includes
                # each member's similarity with itself (1.0) on the diagonal,
                # which inflated the score.
                pairwise = cosine_similarity(cluster_embeddings)
                distinct_pairs = np.triu_indices(len(cluster_proposals), k=1)
                avg_similarity = float(pairwise[distinct_pairs].mean())
            else:
                representative_index = 0
                avg_similarity = 1.0

            representative_proposal = cluster_proposals[representative_index]
            state.unique_hypotheses.append(representative_proposal)

            cluster_info = HypothesisCluster(
                cluster_id=int(label),
                hypothesis_ids=[p['id'] for p in cluster_proposals],
                representative_id=representative_proposal['id'],
                # Same tolerant lookup as in _embed_hypotheses.
                representative_text=representative_proposal.get('content') or '',
                similarity_score=float(avg_similarity)
            )
            state.clusters.append(cluster_info)

        state.removed_count = len(proposals) - len(state.unique_hypotheses)
        return state

# ===========================================================================
# INTEGRATION FUNCTION
# ===========================================================================

def run_proximity_agent(generation_state, distance_threshold: float = 0.25) -> Any:
    """
    Integrates the Proximity Agent into the main workflow.

    Args:
        generation_state: The state object, which must have a `generated_proposals` attribute.
        distance_threshold: Cosine-distance threshold passed to ``ProximityAgent``.
            Defaults to 0.25, the value that was previously hard-coded.

    Returns:
        The same generation_state object, modified in place: `generated_proposals`
        holds the deduplicated proposals and the analysis result is appended to
        its `proximity_results` list. If there are no proposals the state is
        returned untouched.
    """
    logger.info("🚀 Engaging Proximity Agent...")

    # Check for proposals in generation_state
    # Using getattr for compatibility with both dataclasses and SimpleNamespace from stubs
    proposals = getattr(generation_state, 'generated_proposals', [])
    if not proposals:
        logger.warning("No proposals found in generation_state for proximity analysis.")
        return generation_state

    # Initialize and run the agent
    proximity_agent = ProximityAgent(distance_threshold=distance_threshold)
    proximity_results = proximity_agent.run(proposals)

    # Update the generation_state with the unique hypotheses
    generation_state.generated_proposals = proximity_results.unique_hypotheses

    # Store proximity results for logging/meta-review if needed.
    # Treat an attribute that exists but is None like a missing one, so the
    # append below cannot fail on it.
    if getattr(generation_state, 'proximity_results', None) is None:
        setattr(generation_state, 'proximity_results', [])
    generation_state.proximity_results.append(proximity_results)

    logger.info(f"✅ Proximity Agent finished. {proximity_results.removed_count} redundant hypotheses removed.")
    return generation_state

# ===========================================================================
# TESTING CODE
# ===========================================================================

def test_proximity_agent():
    """Exercise `run_proximity_agent` end-to-end on six hand-written hypotheses.

    The fixture contains three statements about PKC-alpha, two about the gut
    microbiome and one about nanoparticle delivery. After the agent runs, the
    closing assertions require exactly three hypotheses to remain and three to
    have been removed. Progress is printed along the way.

    Returns:
        The state object handed back by `run_proximity_agent`.
    """
    rule = "=" * 50
    print("🌐 TESTING PROXIMITY AGENT")
    print(rule)

    # 1. Fixture: (id, content) pairs with deliberate near-duplicates.
    fixture = [
        ("hyp_a1", "Targeting protein kinase C alpha will reduce neuroinflammation in Alzheimer's disease."),
        ("hyp_a2", "Inhibiting the PKC-alpha enzyme can alleviate brain inflammation associated with Alzheimer's."),
        ("hyp_b1", "Using gut microbiome modulation is a viable strategy to slow Parkinson's disease progression."),
        ("hyp_c1", "A novel nanoparticle delivery system can cross the blood-brain barrier to deliver drugs."),
        ("hyp_a3", "PKC-alpha inhibition is a key mechanism for controlling neuroinflammatory responses in Alzheimer's."),
        ("hyp_b2", "Altering gut bacteria composition could be a therapeutic method for slowing down Parkinson's."),
    ]
    mock_hypotheses = [{"id": hyp_id, "content": text} for hyp_id, text in fixture]

    class MockGenerationState:
        """Bare-bones stand-in for the real generation state."""

        def __init__(self, proposals):
            self.generated_proposals = proposals
            self.research_goal = "Test Goal"

    mock_state = MockGenerationState(mock_hypotheses)

    print(f"📋 Initial state: {len(mock_state.generated_proposals)} hypotheses.")
    for hyp in mock_state.generated_proposals:
        print(f"  - [{hyp['id']}] {hyp['content'][:40]}...")

    # 2. Hand the mock state to the integration function.
    print("\n🚀 Running integrated proximity agent...")
    final_state = run_proximity_agent(mock_state)

    # 3. Report on the first recorded proximity run.
    print("\n📊 Proximity Analysis Results:")

    recorded_runs = getattr(final_state, 'proximity_results', [ProximityState()])
    proximity_results = recorded_runs[0]

    print(f"  - Clusters found: {len(proximity_results.clusters)}")
    print(f"  - Hypotheses removed: {proximity_results.removed_count}")
    print(f"  - Unique hypotheses remaining: {len(proximity_results.unique_hypotheses)}")

    print("\n🔍 Cluster Breakdown:")
    # Order by representative id so repeated runs print in the same order.
    ordered_clusters = list(proximity_results.clusters)
    ordered_clusters.sort(key=lambda entry: entry.representative_id)
    for cluster in ordered_clusters:
        print(f"\n  Cluster with representative [{cluster.representative_id}]:")
        print(f"    - Member IDs: {', '.join(cluster.hypothesis_ids)}")
        print(f"    - Avg. Similarity: {cluster.similarity_score:.2f}")

    print("\n✅ Final Deduplicated Hypotheses:")
    final_hypotheses = list(proximity_results.unique_hypotheses)
    final_hypotheses.sort(key=lambda hyp: hyp['id'])
    for hyp in final_hypotheses:
        print(f"  - [{hyp['id']}] {hyp['content']}")

    # Assertions
    assert len(final_hypotheses) == 3
    assert proximity_results.removed_count == 3

    print("\n" + rule)
    print("🎉 ALL PROXIMITY AGENT TESTS COMPLETED SUCCESSFULLY!")
    print("🌐 The Proximity Agent is ready for hypothesis deduplication.")

    return final_state

# Run the test
# When this code executes as __main__, run the proximity test suite and keep
# the state it returns in the module-level name `proximity_test_results`.
if __name__ == "__main__":
    proximity_test_results = test_proximity_agent()

🌐 TESTING PROXIMITY AGENT
📋 Initial state: 6 hypotheses.
  - [hyp_a1] Targeting protein kinase C alpha will re...
  - [hyp_a2] Inhibiting the PKC-alpha enzyme can alle...
  - [hyp_b1] Using gut microbiome modulation is a via...
  - [hyp_c1] A novel nanoparticle delivery system can...
  - [hyp_a3] PKC-alpha inhibition is a key mechanism ...
  - [hyp_b2] Altering gut bacteria composition could ...

🚀 Running integrated proximity agent...

📊 Proximity Analysis Results:
  - Clusters found: 3
  - Hypotheses removed: 3
  - Unique hypotheses remaining: 3

🔍 Cluster Breakdown:

  Cluster with representative [hyp_a3]:
    - Member IDs: hyp_a1, hyp_a2, hyp_a3
    - Avg. Similarity: 0.86

  Cluster with representative [hyp_b2]:
    - Member IDs: hyp_b1, hyp_b2
    - Avg. Similarity: 0.93

  Cluster with representative [hyp_c1]:
    - Member IDs: hyp_c1
    - Avg. Similarity: 1.00

✅ Final Deduplicated Hypotheses:
  - [hyp_a3] PKC-alpha inhibition is a key mechanism for controlling neuroinflammato

In [None]:
#@title 🕴️ Supervisor Agent (v5 — final demo with accurate n_hyp counts)
# ================================================================
# This cell orchestrates a “generate → reflect → rank → evolve”
# research loop.  It includes *self-contained* stub agents so it
# will run even before you wire in the real ones.
#
# Key improvements over v4:
# • `run_proximity_agent` / `run_evolution_agent` stubs are wrapped
#   in `_preserve_proposals()` so the `.proposals` list survives,
#   giving correct `n_hyp` numbers in the report.
# • Minor refactor for clarity; behaviour unchanged once real
#   agent functions replace the stubs.
# ================================================================

from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Any
from types import SimpleNamespace
import datetime, numpy as np

# ---------- 1️⃣  Dataclasses --------------------------------------------------

@dataclass
class SupervisorConfig:
    """Settings read by `run_supervisor` for a single run."""
    research_goal: str  # passed to run_generation_agent on every generation call
    max_cycles: int = 5  # upper bound on the number of loop iterations
    generation_per_cycle: int = 1  # generation calls made per cycle
    evolution_every: int = 2  # evolution runs when cycle % evolution_every == 0
    proximity_every: int = 1  # proximity runs when cycle % proximity_every == 0
    meta_every: int = 2  # meta-review runs when cycle % meta_every == 0
    no_improve_patience: int = 2  # stop once the stagnation counter reaches this value

@dataclass
class CycleStats:
    """One per-cycle record appended to `SupervisorState.cycles` by `run_supervisor`."""
    cycle_id: int  # 1-based cycle number
    n_hypotheses: int  # result of _count_props on the generation state at cycle end
    best_rank_text: str  # str() of the first leaderboard entry's hypothesis_id ("None" if empty)
    timestamp: str  # datetime.datetime.utcnow().isoformat() taken when the record is built

@dataclass
class SupervisorState:
    """Mutable state that `run_supervisor` builds up and returns."""
    config: SupervisorConfig  # the configuration the run was started with
    cycles: List[CycleStats] = field(default_factory=list)  # one entry per completed cycle
    generation_state: Any = None  # latest output of generation / proximity / evolution
    reflection_state: Any = None  # latest output of run_reflection_agent
    ranking_state: Any = None  # latest output of run_ranking_agent
    meta_state: Any = None  # latest output of run_meta_review_agent, if it ran
    finished: bool = False  # set to True before run_supervisor returns
    finish_reason: str = ""  # "No improvement" or "Reached max_cycles"

# ---------- 2️⃣  Helper: guarantee proposals survive -------------------------

def _preserve_proposals(obj):
    """Return an object (SimpleNamespace) that always has a .proposals list."""
    if hasattr(obj, "proposals"):
        return obj
    if isinstance(obj, dict) and "proposals" in obj:
        return SimpleNamespace(**obj)
    # fabricate an empty list so len()==0 is intentional
    return SimpleNamespace(proposals=[])

# ---------- 3️⃣  Stub builder (only if real agents absent) -------------------

def _ensure_stub(name: str):
    if name in globals():
        return
    print(f"ℹ️  Creating stub for missing agent: {name}")

    if name == "run_generation_agent":
        def fn(goal, constraints=None):
            return SimpleNamespace(
                proposals=[f"hyp_{i}" for i in range(5)],
                ts=datetime.datetime.utcnow().isoformat()
            )

    elif name == "run_reflection_agent":
        def fn(gen_state):
            reviews = []
            for i, hyp in enumerate(gen_state.proposals):
                criteria = SimpleNamespace(
                    novelty_reasoning="stub", feasibility_reasoning="stub",
                    scientific_rigor_reasoning="stub", impact_potential_reasoning="stub",
                    testability_reasoning="stub",
                    novelty_score=float(np.random.rand()*5),
                    feasibility_score=float(np.random.rand()*5),
                    scientific_rigor_score=float(np.random.rand()*5),
                    impact_potential_score=float(np.random.rand()*5),
                    testability_score=float(np.random.rand()*5)
                )
                reviews.append(SimpleNamespace(
                    hypothesis_id=f"H{i}",
                    hypothesis_text=hyp,
                    criteria=criteria,
                    weaknesses=["needs data"], strengths=["novel"]
                ))
            return SimpleNamespace(hypothesis_reviews=reviews)

    elif name == "run_ranking_agent":
        def fn(ref_state):
            leaders = [SimpleNamespace(hypothesis_id=rev.hypothesis_id)
                       for rev in ref_state.hypothesis_reviews]
            return SimpleNamespace(leaderboard=leaders)

    elif name == "run_proximity_agent":
        def fn(gen_state):
            return _preserve_proposals(gen_state)  # keeps list alive

    elif name == "run_evolution_agent":
        def fn(gen_state, rank_state):
            return _preserve_proposals(gen_state)  # keeps list alive

    elif name == "run_meta_review_agent":
        def fn(ref_state, rank_state):
            return SimpleNamespace(
                actionable_for_generation="(stub) improve specificity",
                actionable_for_reflection="(stub) add novelty rubric",
                pattern_insights=[],
                criterion_correlations={}
            )

    globals()[name] = fn

# Make sure every agent entry point the supervisor calls exists in this
# namespace. _ensure_stub returns immediately for names that are already
# defined, so only the missing ones receive a stub.
for fn_name in [
    "run_generation_agent", "run_reflection_agent", "run_ranking_agent",
    "run_proximity_agent", "run_evolution_agent", "run_meta_review_agent"
]:
    _ensure_stub(fn_name)

# ---------- 4️⃣  Utility getters ---------------------------------------------

def _safe_get(obj, attr, default=None):
    if isinstance(obj, dict):
        return obj.get(attr, default)
    return getattr(obj, attr, default)

def _count_props(obj) -> int:
    return len(_safe_get(obj, "proposals", []))

# ---------- 5️⃣  Supervisor loop ---------------------------------------------

def run_supervisor(config: SupervisorConfig) -> SupervisorState:
    """Drive the generate → proximity → reflect → rank → evolve → meta-review loop.

    Runs up to `config.max_cycles` cycles. Each cycle records a `CycleStats`
    entry. The loop ends early once the top leaderboard id has stayed the same
    for `config.no_improve_patience` consecutive comparisons.

    Args:
        config: Run settings; the `*_every` fields gate the optional stages.

    Returns:
        The populated SupervisorState with `finished` set and `finish_reason`
        equal to "No improvement" or "Reached max_cycles".
    """
    state = SupervisorState(config=config)
    best_leader = None
    stagnant = 0

    for cycle in range(1, config.max_cycles + 1):
        print(f"\n🔄 Cycle {cycle} / {config.max_cycles}")

        # Generation: every pass overwrites generation_state, so only the
        # result of the last call in this cycle is kept.
        for _attempt in range(config.generation_per_cycle):
            state.generation_state = run_generation_agent(config.research_goal)

        # Proximity (deduplication), on its own schedule.
        if cycle % config.proximity_every == 0:
            state.generation_state = run_proximity_agent(state.generation_state)

        # Reflection feeds ranking.
        state.reflection_state = run_reflection_agent(state.generation_state)
        state.ranking_state = run_ranking_agent(state.reflection_state)

        # Evolution, on its own schedule.
        if cycle % config.evolution_every == 0:
            state.generation_state = run_evolution_agent(
                state.generation_state, state.ranking_state
            )

        # Meta-review needs reviews to look at.
        if cycle % config.meta_every == 0 and hasattr(state.reflection_state, "hypothesis_reviews"):
            state.meta_state = run_meta_review_agent(
                state.reflection_state, state.ranking_state
            )

        # Per-cycle statistics.
        leaderboard = _safe_get(state.ranking_state, "leaderboard", [])
        top_id = None
        if leaderboard:
            top_id = _safe_get(leaderboard[0], "hypothesis_id", None)
        n_hyp = _count_props(state.generation_state)

        record = CycleStats(
            cycle_id=cycle,
            n_hypotheses=n_hyp,
            best_rank_text=str(top_id),
            timestamp=datetime.datetime.utcnow().isoformat(),
        )
        state.cycles.append(record)

        # Early-stop bookkeeping: count consecutive cycles with the same leader.
        if top_id == best_leader:
            stagnant += 1
        else:
            stagnant = 0
        if not best_leader:
            best_leader = top_id
        if top_id and top_id != best_leader:
            best_leader = top_id
            stagnant = 0

        if stagnant >= config.no_improve_patience:
            state.finished = True
            state.finish_reason = "No improvement"
            break

    if not state.finished:
        state.finished = True
        state.finish_reason = "Reached max_cycles"

    print(f"\n✅ Supervisor finished: {state.finish_reason}")
    return state

# ---------- 6️⃣  Smoke-test ---------------------------------------------------

# Demo run: the `or True` makes this guard unconditional, so the block executes
# whenever the cell runs. It leaves `cfg` and `sup_state` as module-level names
# and prints one summary line per recorded cycle.
if __name__ == "__main__" or True:  # auto-run in notebook
    cfg = SupervisorConfig(research_goal="Stubbed demo", max_cycles=3)
    sup_state = run_supervisor(cfg)

    print("\n📊 Cycle summary:")
    for c in sup_state.cycles:
        print(f"  • Cycle {c.cycle_id}: n_hyp={c.n_hypotheses}  best={c.best_rank_text}")




ℹ️  Creating stub for missing agent: run_generation_agent
ℹ️  Creating stub for missing agent: run_reflection_agent
ℹ️  Creating stub for missing agent: run_ranking_agent
ℹ️  Creating stub for missing agent: run_evolution_agent
ℹ️  Creating stub for missing agent: run_meta_review_agent

🔄 Cycle 1 / 3

🔄 Cycle 2 / 3

🔄 Cycle 3 / 3

✅ Supervisor finished: No improvement

📊 Cycle summary:
  • Cycle 1: n_hyp=5  best=H0
  • Cycle 2: n_hyp=5  best=H0
  • Cycle 3: n_hyp=5  best=H0


In [None]:
#@title 🧬 Full System Integration: The AI Co-Scientist Loop (v3 - Resilient)
# ===========================================================================
# This cell integrates all previously defined agents into a single,
# automated, multi-cycle research loop.
#
# Key Fixes in this version:
# 1. Reorders the Supervisor loop to a more logical flow (Generate -> ... -> Evolve).
# 2. Makes the Ranking Agent's skip-logic more robust by creating a
#    properly populated RankingState object even when there's only one hypothesis.
# 3. Further improves the Generation Agent's prompt for better multi-output.
# 4. Corrected a critical runtime error in the Generation Agent.
# ===========================================================================

import time
import re
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from types import SimpleNamespace # Used for mock objects
import datetime # FIX: Added the missing datetime import

# ---------------------------------------------------------------------------
# 1. UNIFIED STATE & CONFIGURATION (Unchanged)
# ---------------------------------------------------------------------------

@dataclass
class SupervisorConfig:
    """Configuration for the supervisor's run."""
    research_goal: str  # forwarded to the generation and reflection steps
    constraints: List[str] = field(default_factory=list)  # forwarded to generation and evolution
    max_cycles: int = 3  # upper bound on the number of cycles
    evolution_every: int = 2  # evolution runs when cycle % evolution_every == 0
    meta_every: int = 2  # meta-review runs when cycle % meta_every == 0
    proximity_every: int = 1  # proximity runs when cycle % proximity_every == 0
    no_improve_patience: int = 2  # stop once the stagnation counter reaches this value

@dataclass
class FullSystemState:
    """A unified state object to hold all data for the integrated run."""
    config: SupervisorConfig  # settings the run was started with
    # Current hypothesis pool: extended by generation, replaced by the
    # proximity and evolution steps.
    proposals: List[Dict[str, Any]] = field(default_factory=list)
    reviews: List[Any] = field(default_factory=list)  # hypothesis_reviews from the last reflection
    rankings: List[Dict[str, Any]] = field(default_factory=list)  # final_rankings from the last ranking
    # Raw result objects of the most recent run of each agent (None until it runs).
    generation_agent_state: Optional[Any] = None
    reflection_agent_state: Optional[Any] = None
    ranking_agent_state: Optional[Any] = None
    evolution_agent_state: Optional[Any] = None
    meta_review_agent_state: Optional[Any] = None
    cycle_history: List[Dict[str, Any]] = field(default_factory=list)  # one summary dict per cycle
    is_finished: bool = False  # set to True before IntegratedSupervisor.run returns
    finish_reason: str = ""  # human-readable reason the run ended

# ---------------------------------------------------------------------------
# 2. CORRECTED GENERATION AGENT (Improved Prompt & Fixed Timestamp)
# ---------------------------------------------------------------------------
class CorrectedGenerationAgent(EnhancedGenerationAgent):
    """Inherits from the original and fixes the hypothesis parsing."""

    @staticmethod
    def _parse_hypotheses(content: str) -> List[str]:
        """Split raw LLM output into individual hypothesis statements.

        Hypotheses are separated by a line that contains only three or more
        dashes. A leading "Hypothesis N:" label (any case) is removed from each
        piece, and pieces that end up empty are dropped.

        The previous patterns were raw strings with doubled backslashes
        (r'\\n---\\n', r'\\d+'), which match a literal backslash rather than a
        newline or digit. As a result the text was never split and the label
        was never stripped.
        """
        statements = []
        for chunk in re.split(r'^[ \t]*-{3,}[ \t]*\r?$', content, flags=re.MULTILINE):
            cleaned = re.sub(r'^Hypothesis\s*\d+\s*:\s*', '', chunk.strip(), flags=re.IGNORECASE)
            if cleaned:
                statements.append(cleaned)
        return statements

    def generate_hypotheses(self, state: GenerationState) -> GenerationState:
        """Ask the LLM for hypotheses and store them as a list of proposal dicts.

        Args:
            state: Generation state; reads `synthesized_knowledge`,
                `research_goal`, `constraints`, `iteration` and
                `literature_findings`.

        Returns:
            The same state with `generated_proposals` set (empty when there is
            no synthesized knowledge or the LLM call/parsing fails). On every
            path except the early no-knowledge return, `status` is set to
            "hypotheses_generated" and `iteration` is incremented.
        """
        print("💡 Generating hypotheses from multi-source research...")
        if not state.synthesized_knowledge:
            state.generated_proposals = []
            return state
        try:
            # Improved prompt to strongly encourage multiple, distinct outputs.
            prompt = f"""Based on this multi-source research synthesis, generate 2-3 novel and distinct hypotheses for: {state.research_goal}
Constraints: {', '.join(state.constraints)}

Multi-Source Knowledge Base:
{state.synthesized_knowledge}

Structure your response with each distinct hypothesis separated by '---'. Each hypothesis must be a clear, testable statement.
Example:
Hypothesis 1: [Statement for hypothesis 1]
---
Hypothesis 2: [Statement for hypothesis 2]
"""
            response = self.llm.invoke([("human", prompt)])
            created_at = int(time.time())
            state.generated_proposals = [
                {
                    "id": f"gen_hyp_{state.iteration}_{position}_{created_at}",
                    "content": statement,
                    # `datetime` is the module in this cell, hence datetime.datetime.
                    "timestamp": datetime.datetime.now().isoformat(),
                    "based_on_sources": len(state.literature_findings),
                }
                for position, statement in enumerate(
                    self._parse_hypotheses(response.content), start=1
                )
            ]
        except Exception as e:
            # Best-effort: a failed generation yields an empty proposal list
            # instead of aborting the whole workflow.
            logger.error(f"Hypothesis generation failed: {str(e)}")
            state.generated_proposals = []
        state.status = "hypotheses_generated"
        state.iteration += 1
        print(f"✅ Generated {len(state.generated_proposals)} distinct hypotheses.")
        return state

def _run_generation_step(state: FullSystemState) -> FullSystemState:
    """Build and run the four-node generation graph, then merge its proposals.

    A new CorrectedGenerationAgent and StateGraph are created on every call.
    Proposals found under 'generated_proposals' in the graph's result are
    appended to `state.proposals`, and the raw result is stored in
    `state.generation_agent_state`.
    """
    print("  -> Running Generation Agent...")
    agent = CorrectedGenerationAgent()
    workflow = StateGraph(GenerationState)

    # Linear pipeline: each stage feeds the next one.
    pipeline = [
        ("generate_queries", agent.generate_search_queries),
        ("explore_literature", agent.literature_exploration),
        ("synthesize_knowledge", agent.synthesize_knowledge),
        ("generate_hypotheses", agent.generate_hypotheses),
    ]
    for node_name, handler in pipeline:
        workflow.add_node(node_name, handler)

    route = [START] + [node_name for node_name, _ in pipeline] + [END]
    for source, target in zip(route, route[1:]):
        workflow.add_edge(source, target)

    app = workflow.compile()
    seed = GenerationState(
        research_goal=state.config.research_goal,
        constraints=state.config.constraints,
    )
    generation_state = app.invoke(seed)

    if not generation_state:
        print("     ! Generation Agent failed to produce output.")
        return state

    new_proposals = generation_state.get('generated_proposals', [])
    state.proposals.extend(new_proposals)
    state.generation_agent_state = generation_state
    print(f"     + Added {len(new_proposals)} new hypotheses to the pool.")
    return state

# ---------------------------------------------------------------------------
# 3. ROBUST AGENT ADAPTERS (Ranking Step Fixed)
# ---------------------------------------------------------------------------

def _run_proximity_step(state: FullSystemState) -> FullSystemState:
    """Deduplicate `state.proposals` through `run_proximity_agent`.

    With fewer than two proposals there is nothing to compare and the state is
    returned unchanged. Otherwise the pool is wrapped in a SimpleNamespace
    exposing `generated_proposals`, passed to the agent, and replaced by the
    agent's deduplicated list.
    """
    print("  -> Running Proximity Agent...")
    before = len(state.proposals)
    if before < 2:
        print("     - Skipping, not enough hypotheses to compare.")
        return state

    wrapper = SimpleNamespace(generated_proposals=state.proposals)
    deduplicated = run_proximity_agent(wrapper)
    state.proposals = deduplicated.generated_proposals

    kept = len(state.proposals)
    print(f"     - Deduplicated proposals. Removed: {before - kept}, Kept: {kept}.")
    return state

def _run_reflection_step(state: FullSystemState) -> FullSystemState:
    """Review every proposal in the pool with a RobustReflectionAgent.

    On success `state.reviews` holds the agent's `hypothesis_reviews` and
    `state.reflection_agent_state` holds the full result. With an empty pool
    the state is returned untouched, so earlier reviews stay in place.
    """
    print("  -> Running Reflection Agent...")
    if state.proposals:
        reviewer = RobustReflectionAgent(llm)
        outcome = reviewer.adaptive_batch_review(
            hypotheses=state.proposals,
            research_goal=state.config.research_goal,
        )
        state.reviews = outcome.hypothesis_reviews
        state.reflection_agent_state = outcome
        print(f"     - Reviewed {len(state.reviews)} hypotheses.")
    else:
        print("     ! No proposals to reflect on.")
    return state

def _run_ranking_step(state: FullSystemState) -> FullSystemState:
    """Rank the reviewed hypotheses and store the leaderboard on `state`.

    With two or more reviews a two-round tournament is run. With fewer, the
    tournament is skipped but `state.ranking_agent_state` is still set to a
    RankingState (holding the single review at rank 1, or empty) so later
    steps never see None.
    """
    print("  -> Running Ranking Agent...")

    if len(state.reviews) >= 2:
        tournament = run_ranking_tournament(state.reflection_agent_state, tournament_rounds=2)
        state.rankings = tournament.final_rankings
        state.ranking_agent_state = tournament
        print(f"     - Ranked {len(state.rankings)} hypotheses.")
        return state

    print("     - Skipping, not enough hypotheses to rank.")
    if not state.reviews:
        # Nothing was reviewed at all: record an empty ranking.
        state.rankings = []
        state.ranking_agent_state = RankingState()
        return state

    # Exactly one review: it is the leader by default.
    sole_ranking = [{'rank': 1, 'hypothesis_id': state.reviews[0].hypothesis_id}]
    state.rankings = sole_ranking
    state.ranking_agent_state = RankingState(final_rankings=sole_ranking)
    return state

def _run_evolution_step(state: FullSystemState) -> FullSystemState:
    """Evolve the top-ranked hypotheses and make them the new proposal pool.

    Skipped when there are no rankings or no ranking state. Otherwise
    `run_evolution_process` is called with `top_n=2`, and `state.proposals` is
    replaced wholesale by `{"id", "content"}` dicts built from the evolved
    hypotheses; the previous pool is not kept.
    """
    print("  -> Running Evolution Agent...")
    if not (state.rankings and state.ranking_agent_state):
        print("     ! No ranked hypotheses to evolve. Skipping.")
        return state

    evolution_state = run_evolution_process(
        ranking_state=state.ranking_agent_state,
        reflection_state=state.reflection_agent_state,
        original_constraints=state.config.constraints,
        top_n=2,
    )

    evolved_proposals = []
    for evolved in evolution_state.evolved_hypotheses:
        evolved_proposals.append(
            {"id": evolved.hypothesis_id, "content": evolved.evolved_content}
        )

    state.proposals = evolved_proposals
    state.evolution_agent_state = evolution_state
    print(f"     + Evolved top hypotheses into {len(evolved_proposals)} new proposals.")
    return state

def _run_meta_review_step(state: FullSystemState) -> FullSystemState:
    """Run the meta-review agent and store its feedback on `state`.

    Requires a reflection state, a ranking state and a non-empty ranking list;
    otherwise the step is skipped. The rankings are repackaged into a
    temporary namespace exposing both `leaderboard` (one namespace per ranking
    dict) and `final_rankings` (dicts holding only `hypothesis_id`), which is
    what `run_enhanced_meta_review` is given as its ranking state.
    """
    print("  -> Running Meta-review Agent...")
    ready = state.reflection_agent_state and state.ranking_agent_state and state.rankings
    if not ready:
        print("     ! Insufficient data for meta-review.")
        return state

    leaderboard = [SimpleNamespace(**entry) for entry in state.rankings]
    temp_rank_state = SimpleNamespace(
        leaderboard=leaderboard,
        # Provided alongside the leaderboard for the correlation code downstream.
        final_rankings=[{'hypothesis_id': item.hypothesis_id} for item in leaderboard],
    )

    state.meta_review_agent_state = run_enhanced_meta_review(
        reflection_state=state.reflection_agent_state,
        ranking_state=temp_rank_state,
    )
    print("     - Generated system-wide feedback.")
    return state

# ---------------------------------------------------------------------------
# 4. THE INTEGRATED SUPERVISOR (Reordered Loop)
# ---------------------------------------------------------------------------
class IntegratedSupervisor:
    """Runs the multi-cycle Generate -> Reflect -> Rank -> Evolve research loop."""

    def __init__(self, config: SupervisorConfig):
        self.config = config

    def run(self) -> FullSystemState:
        """Run up to `config.max_cycles` cycles and return the final state.

        Every cycle generates, optionally deduplicates, reflects and ranks.
        On evolution cycles the evolved pool is reflected on and ranked again.
        The run stops early once the same hypothesis id has led for
        `config.no_improve_patience` consecutive cycles after it first appeared.
        """
        cfg = self.config
        banner = "=" * 60
        print(f"🚀 Starting AI Co-Scientist Run: '{cfg.research_goal}'")
        print(banner)

        state = FullSystemState(config=cfg)
        best_leader_id = None
        stagnation_counter = 0

        for cycle in range(1, cfg.max_cycles + 1):
            print(f"\n🔄 CYCLE {cycle} / {cfg.max_cycles}")
            started = time.time()

            # 1. Generation: add fresh ideas to the pool.
            state = _run_generation_step(state)

            # 2. Proximity: drop near-duplicates.
            if cycle % cfg.proximity_every == 0:
                state = _run_proximity_step(state)

            # 3. Reflection followed by ranking.
            state = _run_reflection_step(state)
            state = _run_ranking_step(state)

            # 4. Evolution: the evolved pool replaces the old one, so it is
            #    reviewed and ranked again to find this cycle's leader.
            if cycle % cfg.evolution_every == 0:
                state = _run_evolution_step(state)
                print("  -> Post-Evolution Re-evaluation:")
                state = _run_reflection_step(state)
                state = _run_ranking_step(state)

            # 5. Meta-review.
            if cycle % cfg.meta_every == 0:
                state = _run_meta_review_step(state)

            # 6. Cycle log entry.
            current_leader_id = None
            if state.rankings:
                current_leader_id = state.rankings[0]['hypothesis_id']
            summary = {
                'cycle': cycle,
                'num_proposals': len(state.proposals),
                'best_hypothesis_id': current_leader_id,
                'duration_sec': time.time() - started,
            }
            state.cycle_history.append(summary)
            print(f"  -> Cycle {cycle} complete. Best hypothesis: {current_leader_id}")

            # Early stopping: a missing leader never counts as stagnation.
            leader_unchanged = bool(current_leader_id) and current_leader_id == best_leader_id
            if not leader_unchanged:
                best_leader_id = current_leader_id
                stagnation_counter = 0
            else:
                stagnation_counter += 1

            if stagnation_counter >= cfg.no_improve_patience:
                state.is_finished = True
                state.finish_reason = f"No improvement in top hypothesis for {stagnation_counter} cycles."
                break

        if not state.is_finished:
            state.is_finished = True
            state.finish_reason = "Reached max cycles."
        print("\n" + banner)
        print(f"✅ AI Co-Scientist Run Finished. Reason: {state.finish_reason}")
        return state

# ---------------------------------------------------------------------------
# 5. TESTING THE FULLY INTEGRATED SYSTEM
# ---------------------------------------------------------------------------
def test_full_system():
    """Run the integrated supervisor end-to-end on a sample goal and print a report.

    Builds a ``SupervisorConfig`` for a liver-fibrosis research goal
    (``max_cycles=2``, evolution every cycle, meta-review every second cycle),
    runs ``IntegratedSupervisor`` with it, and then prints: the research goal,
    the number of recorded cycles, the final proposal count, the top-ranked
    hypothesis (first 250 characters of its content), the meta-review feedback
    if present, and the per-cycle history.

    Returns:
        The final state object returned by ``IntegratedSupervisor.run()``.
    """
    print("="*60)
    print("🔬 TESTING THE FULLY INTEGRATED AI CO-SCIENTIST SYSTEM (v3)")
    print("="*60)

    config = SupervisorConfig(
        research_goal="Discover novel epigenetic-based therapeutic strategies for liver fibrosis",
        constraints=[
            "Must be translatable to clinical practice within 5-7 years.",
            "Focus on reversing fibrosis, not just halting progression.",
            "Consider novel delivery mechanisms for targeted therapy."
        ],
        max_cycles=2,
        evolution_every=1, # Run every cycle for a thorough test
        meta_every=2
    )
    supervisor = IntegratedSupervisor(config)
    final_state = supervisor.run()

    print("\n\n" + "="*60)
    print("📈 FINAL REPORT")
    print("="*60)
    print(f"Research Goal: {final_state.config.research_goal}")
    print(f"Total Cycles: {len(final_state.cycle_history)}")
    print(f"Final Proposal Count: {len(final_state.proposals)}")

    if final_state.rankings:
        # The first ranking entry is treated as the leader; look up its text
        # in the proposal pool by id.
        top_hypothesis_id = final_state.rankings[0]['hypothesis_id']
        top_hypothesis_content = "Content not found in final proposals."
        for prop in final_state.proposals:
            if prop.get('id') == top_hypothesis_id:
                # A matching proposal may have no 'content' key (or hold
                # None); keep the placeholder in that case so the slice
                # below cannot raise TypeError after a long run.
                content = prop.get('content')
                if content is not None:
                    top_hypothesis_content = str(content)
                break
        print(f"\n🏆 Top-Ranked Hypothesis: [{top_hypothesis_id}]")
        print(f"   Content: {top_hypothesis_content[:250]}...")
    else:
        print("\n🏆 No final hypotheses were ranked.")

    if final_state.meta_review_agent_state and hasattr(final_state.meta_review_agent_state, 'actionable_for_generation'):
        print("\n🧠 Final Meta-Review Feedback for Generation Agent:")
        print(f"   - {final_state.meta_review_agent_state.actionable_for_generation}")
    else:
        print("\n🧠 No meta-review was generated in this short run.")

    print("\n📋 Cycle History:")
    for entry in final_state.cycle_history:
        print(f"  - Cycle {entry['cycle']}: {entry['num_proposals']} proposals, Best ID: {entry['best_hypothesis_id']}, Duration: {entry['duration_sec']:.1f}s")
    print("\n" + "="*60)
    print("🎉 FULL SYSTEM TEST COMPLETE")
    print("="*60)

    return final_state

# --- Run the test ---
# Entry point: run the end-to-end test only when this module is executed as
# the main program (not when it is imported). The returned state is bound to a
# module-level name so it remains available for inspection afterwards.
if __name__ == "__main__":
    final_system_state = test_full_system()




🔬 TESTING THE FULLY INTEGRATED AI CO-SCIENTIST SYSTEM (v3)
🚀 Starting AI Co-Scientist Run: 'Discover novel epigenetic-based therapeutic strategies for liver fibrosis'

🔄 CYCLE 1 / 2
  -> Running Generation Agent...
🔍 Generating multi-source search queries...
✅ Generated 4 optimized queries
   1. AI personalized medicine neurodegenerative diseases
   2. machine learning precision medicine Alzheimer
   3. computational biomarkers neurodegeneration
   4. artificial intelligence clinical decision support
📚 Searching multiple literature sources...


ERROR:__main__:Web academic search failed for 'AI personalized medicine neurodegenerative diseases': https://lite.duckduckgo.com/lite/ RuntimeError: error sending request for url (https://lite.duckduckgo.com/lite/): operation timed out

Caused by:
    operation timed out
ERROR:__main__:Web academic search failed for 'machine learning precision medicine Alzheimer': https://lite.duckduckgo.com/lite/ RuntimeError: error sending request for url (https://lite.duckduckgo.com/lite/): operation timed out

Caused by:
    operation timed out
ERROR:__main__:Web academic search failed for 'computational biomarkers neurodegeneration': https://html.duckduckgo.com/html RuntimeError: error sending request for url (https://html.duckduckgo.com/html): operation timed out

Caused by:
    operation timed out
ERROR:__main__:Web academic search failed for 'artificial intelligence clinical decision support': https://html.duckduckgo.com/html RuntimeError: error sending request for url (https://html.duckduckgo.

✅ Found 12 papers from 1 sources:
   • PubMed: 12 papers
🧠 Synthesizing knowledge from multiple sources...
✅ Multi-source knowledge synthesis complete
💡 Generating hypotheses from multi-source research...
✅ Generated 1 distinct hypotheses.
     + Added 1 new hypotheses to the pool.
  -> Running Proximity Agent...
     - Skipping, not enough hypotheses to compare.
  -> Running Reflection Agent...
     - Reviewed 1 hypotheses.
  -> Running Ranking Agent...
     - Skipping, not enough hypotheses to rank.
  -> Running Evolution Agent...
     + Evolved top hypotheses into 2 new proposals.
  -> Post-Evolution Re-evaluation:
  -> Running Reflection Agent...
     - Reviewed 2 hypotheses.
  -> Running Ranking Agent...
     - Ranked 2 hypotheses.
  -> Cycle 1 complete. Best hypothesis: evolved_simplification_013933

🔄 CYCLE 2 / 2
  -> Running Generation Agent...
🔍 Generating multi-source search queries...
✅ Generated 4 optimized queries
   1. AI personalized medicine neurodegenerative diseases
 

ERROR:__main__:Web academic search failed for 'AI personalized medicine neurodegenerative diseases': https://html.duckduckgo.com/html RuntimeError: error sending request for url (https://html.duckduckgo.com/html): operation timed out

Caused by:
    operation timed out
ERROR:__main__:CrossRef search failed for 'machine learning precision medicine Alzheimer': HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
ERROR:__main__:Web academic search failed for 'machine learning precision medicine Alzheimer': https://html.duckduckgo.com/html RuntimeError: error sending request for url (https://html.duckduckgo.com/html): operation timed out

Caused by:
    operation timed out
ERROR:__main__:Web academic search failed for 'computational biomarkers neurodegeneration': https://html.duckduckgo.com/html RuntimeError: error sending request for url (https://html.duckduckgo.com/html): operation timed out

Caused by:
    operation timed out
ERROR:__main__:Web acade

✅ Found 12 papers from 1 sources:
   • PubMed: 12 papers
🧠 Synthesizing knowledge from multiple sources...
✅ Multi-source knowledge synthesis complete
💡 Generating hypotheses from multi-source research...
✅ Generated 1 distinct hypotheses.
     + Added 1 new hypotheses to the pool.
  -> Running Proximity Agent...
     - Deduplicated proposals. Removed: 0, Kept: 3.
  -> Running Reflection Agent...
     - Reviewed 3 hypotheses.
  -> Running Ranking Agent...
     - Ranked 3 hypotheses.
  -> Running Evolution Agent...


  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 52
}
].
ERROR:__main__:Analogical reasoning failed for gen_hyp_0_1_1750038142: 429 You exceeded your current quota. Please migrate to Gemini 2.0 Flash Preview (Image Generation) (models/gemini-2.0-flash-preview-image-generation) for higher quota limits. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {
    key: "model"
    value: 

     + Evolved top hypotheses into 5 new proposals.
  -> Post-Evolution Re-evaluation:
  -> Running Reflection Agent...


ERROR:__main__:Quick review failed for evolved_simplification_014307: 429 You exceeded your current quota. Please migrate to Gemini 2.0 Flash Preview (Image Generation) (models/gemini-2.0-flash-preview-image-generation) for higher quota limits. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 43
}
]
  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {
    key: "model"
    value: "g

     - Reviewed 5 hypotheses.
  -> Running Ranking Agent...


  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 48
}
].
ERROR:__main__:Debate failed for evolved_simplification_014307 vs evolved_radical_variation_014307: 429 You exceeded your current quota. Please migrate to Gemini 2.0 Flash Preview (Image Generation) (models/gemini-2.0-flash-preview-image-generation) for higher quota limits. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_requests_per_model"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel"
  quota_dimensions {

     - Ranked 5 hypotheses.
  -> Running Meta-review Agent...
     - Generated system-wide feedback.
  -> Cycle 2 complete. Best hypothesis: evolved_simplification_014307

✅ AI Co-Scientist Run Finished. Reason: Reached max cycles.


📈 FINAL REPORT
Research Goal: Discover novel epigenetic-based therapeutic strategies for liver fibrosis
Total Cycles: 2
Final Proposal Count: 5

🏆 Top-Ranked Hypothesis: [evolved_simplification_014307]
   Content: **

A federated learning model trained on longitudinal plasma neurofilament light chain (NfL) data can predict conversion to mild cognitive impairment (MCI) in healthy controls with greater accuracy than baseline NfL levels alone.

**...

🧠 Final Meta-Review Feedback for Generation Agent:
   - Analysis complete: 2 patterns identified from 5 documents

📋 Cycle History:
  - Cycle 1: 2 proposals, Best ID: evolved_simplification_013933, Duration: 172.3s
  - Cycle 2: 5 proposals, Best ID: evolved_simplification_014307, Duration: 298.3s

🎉 FULL SYSTEM 