In [None]:
# Notebook 3: Agent 3 - RAG-based Explanation Engine (Google Drive Version)
# Purpose: Use RAG to explain docking scores in biological context with OpenAI

# ============================================================================
# CELL 1: Mount Google Drive
# ============================================================================
# Colab-only module: mounting makes the user's Drive visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Status marker restored to a real check mark (it was stored mis-encoded).
print("✓ Google Drive mounted successfully!")

# ============================================================================
# CELL 2: Setup Base Directory and Install Libraries
# ============================================================================
# Set base directory (must match previous agents)
BASE_DIR = '/content/drive/MyDrive/ProteinDocking'

import os
os.makedirs(f'{BASE_DIR}/data/rag', exist_ok=True)

print(f"‚úì Using project directory: {BASE_DIR}")
print("\nInstalling RAG libraries (this may take a minute)...")

# Install dependencies
!pip install langchain langchain-community langchain-openai chromadb sentence-transformers openai -q

print("‚úì Libraries installed successfully!")

# ============================================================================
# CELL 3: Import Libraries
# ============================================================================
import json
from datetime import datetime

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import Document

print("✓ Libraries imported successfully!")

# ============================================================================
# CELL 4: Setup OpenAI API Key
# ============================================================================
import getpass

print("🔑 Enter your OpenAI API Key")
print("   (Get one at: https://platform.openai.com/api-keys)")
print("   Your key will not be displayed for security.\n")

# getpass hides the typed/pasted value. Strip surrounding whitespace so a
# stray space or newline picked up while pasting does not end up in the key.
openai_api_key = getpass.getpass("Enter OpenAI API Key: ").strip()

# Verify the key is entered
if openai_api_key:
    print("✓ API key received!")
    # Also export it so libraries that read the environment can find it.
    os.environ["OPENAI_API_KEY"] = openai_api_key
else:
    # Deliberately non-fatal: the user is asked to re-run the cell.
    print("✗ No API key provided. Please run this cell again.")

# ============================================================================
# CELL 5: Create Knowledge Base for RAG
# ============================================================================
def create_knowledge_base():
    """
    Create a knowledge base about protein docking and vaccine development.

    The entries give the LLM background context for explaining docking scores.
    Each entry is written as an indented triple-quoted string for readability;
    that source-code indentation and the blank first/last lines are removed
    before the text is stored, so they do not leak into embeddings or prompts.

    Returns:
        list: One LangChain ``Document`` per entry. ``page_content`` holds the
        cleaned text and ``metadata`` holds the ``source`` and ``category``
        labels of the entry.
    """
    # Local import keeps this block self-contained (stdlib only).
    from textwrap import dedent

    knowledge_docs = [
        {
            "content": """
            Protein Docking Fundamentals:
            Protein-protein docking is a computational method to predict how two proteins interact and bind together.
            The docking process calculates binding affinity, measured in kcal/mol (kilocalories per mole).
            Lower (more negative) values indicate stronger binding. A binding affinity of -7 kcal/mol or lower
            is generally considered good binding. Values below -10 kcal/mol indicate excellent, very strong binding.
            Values above -5 kcal/mol suggest weak binding that may not be therapeutically useful.
            """,
            "source": "Docking Basics",
            "category": "fundamentals"
        },
        {
            "content": """
            Interpreting Binding Affinity Scores:
            - Excellent binding: < -10 kcal/mol (very strong interaction, ideal for drug/vaccine design)
            - Good binding: -10 to -7 kcal/mol (strong interaction, promising for therapeutics)
            - Moderate binding: -7 to -5 kcal/mol (moderate interaction, may need optimization)
            - Weak binding: > -5 kcal/mol (weak interaction, generally not useful therapeutically)

            These thresholds are based on empirical data from successful drug-protein and antibody-antigen interactions.
            """,
            "source": "Binding Score Interpretation Guide",
            "category": "scoring"
        },
        {
            "content": """
            RMSD (Root Mean Square Deviation) in Docking:
            RMSD measures how different the predicted binding poses are from each other or from a reference structure.
            Lower RMSD values (< 2 Angstroms) indicate high confidence in the binding pose prediction.
            Higher RMSD values suggest more variability and less certainty about the exact binding configuration.
            RMSD is important for validating that multiple docking runs produce consistent results.
            """,
            "source": "RMSD Explanation",
            "category": "metrics"
        },
        {
            "content": """
            Vaccine Development and Protein Docking:
            In vaccine development, protein docking helps predict how antibodies or immune proteins will interact with
            viral or bacterial antigens. Strong binding (low/negative kcal/mol scores) suggests that an antibody or
            vaccine candidate will effectively neutralize the pathogen. For COVID-19 vaccines, researchers used docking
            to predict how antibodies would bind to the spike protein's receptor binding domain (RBD). Binding affinities
            below -8 kcal/mol were associated with effective neutralizing antibodies.
            """,
            "source": "Vaccine Development Applications",
            "category": "vaccine"
        },
        {
            "content": """
            AutoDock Vina Scoring Function:
            AutoDock Vina uses a scoring function that estimates the binding affinity between two molecules.
            The score represents the predicted free energy of binding. Multiple binding modes are generated,
            ranked by their predicted affinity. The best (most negative) score represents the most favorable
            binding configuration. Vina's predictions correlate well with experimental binding data, with typical
            accuracy within 2-3 kcal/mol of experimental values.
            """,
            "source": "AutoDock Vina Documentation",
            "category": "tools"
        },
        {
            "content": """
            Practical Implications for Drug Discovery:
            A binding affinity of -9 kcal/mol roughly corresponds to a dissociation constant (Kd) of approximately
            250 nanomolar, which is considered good affinity for a therapeutic candidate. Each additional -1.4 kcal/mol
            in binding energy corresponds to roughly a 10-fold improvement in binding affinity. This means small
            improvements in binding scores can represent significant improvements in therapeutic efficacy.
            """,
            "source": "Drug Discovery Guidelines",
            "category": "application"
        },
        {
            "content": """
            Comparing Multiple Binding Modes:
            Docking algorithms typically generate multiple binding modes (poses) to account for flexibility and
            uncertainty. The top-ranked mode has the best predicted affinity, but examining multiple modes is important.
            If several modes have similar good scores (within 2 kcal/mol), this suggests a robust binding interaction.
            If only one mode has a good score and others are much worse, this may indicate less reliable binding.
            """,
            "source": "Multiple Pose Analysis",
            "category": "analysis"
        },
        {
            "content": """
            SARS-CoV-2 and Spike Protein:
            The SARS-CoV-2 spike protein's receptor binding domain (RBD) is a key target for vaccine and antibody
            development. Successful neutralizing antibodies typically show binding affinities between -8 to -12 kcal/mol
            to the spike RBD. The spike protein binds to human ACE2 receptor with an affinity around -11 kcal/mol,
            so therapeutic antibodies need comparable or better affinity to effectively block viral entry.
            """,
            "source": "COVID-19 Research",
            "category": "case_study"
        }
    ]

    # Convert to LangChain Document objects. dedent() drops the common leading
    # indentation and strip() removes the blank first/last lines of each literal.
    documents = [
        Document(
            page_content=dedent(doc["content"]).strip(),
            metadata={"source": doc["source"], "category": doc["category"]},
        )
        for doc in knowledge_docs
    ]

    return documents

print("Creating knowledge base...")
# Module-level list of Documents; indexed into the vector store later on.
documents = create_knowledge_base()
print(f"✓ Created {len(documents)} knowledge documents")

# ============================================================================
# CELL 6: Setup Vector Database (ChromaDB)
# ============================================================================
print("\n📚 Setting up vector database...")

# Initialize embeddings (using free HuggingFace model)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

# Create vector store
persist_directory = f'{BASE_DIR}/data/rag/chroma_db'

# Give every document a stable id. Without ids, each run of this cell adds a
# fresh copy of every document to the persisted collection, so the retriever
# ends up returning several copies of the same passage. With fixed ids a re-run
# is expected to overwrite the existing entries instead (LangChain's Chroma
# wrapper upserts by id — confirm against the installed version).
# NOTE(review): copies written by earlier id-less runs stay in the collection;
# delete the chroma_db folder once to clear them.
document_ids = [f"kb-{index}" for index in range(len(documents))]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=document_ids,
    persist_directory=persist_directory
)

print("✓ Vector database created and saved to Google Drive")
# Show the path relative to the Drive root, which is what the user sees in Drive.
print(f"  Location: {persist_directory.replace('/content/drive/MyDrive/', '')}")

# ============================================================================
# CELL 7: Load Docking Results from Agent 2
# ============================================================================
print("\n📂 Loading docking results from Agent 2...")

agent2_file = f'{BASE_DIR}/data/agent2_output.json'

try:
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(agent2_file, 'r', encoding='utf-8') as f:
        docking_data = json.load(f)
    print(f"✓ Loaded docking data from: {agent2_file.replace('/content/drive/MyDrive/', '')}")
except FileNotFoundError:
    print("✗ Error: Agent 2 output not found!")
    print("   Make sure you ran Notebook 2 first.")
    print(f"   Looking for: {agent2_file}")
    # Re-raise: nothing below can run without the docking data.
    raise

# Display docking summary (only the file names of receptor/ligand are shown)
print("\n📊 Docking Summary:")
print(f"   Best Affinity: {docking_data['best_affinity']} kcal/mol")
print(f"   Total Modes: {len(docking_data['binding_modes'])}")
print(f"   Receptor: {os.path.basename(docking_data['receptor'])}")
print(f"   Ligand: {os.path.basename(docking_data['ligand'])}")

# ============================================================================
# CELL 8: Setup OpenAI LLM and RAG Chain
# ============================================================================
print("\n🤖 Setting up OpenAI LLM with RAG...")

# Initialize OpenAI LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo",  # You can change to "gpt-4" for better results
    temperature=0.3,  # Lower = more focused, higher = more creative
    openai_api_key=openai_api_key
)

# Create custom prompt template.
# {context} and {question} are the two placeholders declared in
# input_variables below.
prompt_template = """You are an expert in computational biology and vaccine development, specifically in protein docking analysis.

Use the following context from scientific literature to help explain the docking results:

{context}

Based on this context and the docking data provided, please answer the following question in a clear, scientific manner.
Explain what the binding affinity scores mean for vaccine development and therapeutic potential.

Question: {question}

Provide a detailed but accessible explanation that would be suitable for a college-level research project. Include:
1. Interpretation of the binding scores
2. What this means for vaccine/therapeutic development
3. Comparison to known successful therapeutic interactions
4. Any recommendations or next steps

Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create RAG chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),  # Retrieve top 4 relevant docs
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True  # later cells read result['source_documents']
)

print("✓ RAG chain configured successfully!")

# ============================================================================
# CELL 9: Generate Explanation for Docking Results
# ============================================================================
def explain_docking_results(docking_data, qa_chain):
    """
    Use RAG + LLM to explain docking results.

    Builds a natural-language question from the docking summary, sends it
    through the chain, prints the answer and the sources it drew on, and
    returns both.

    Args:
        docking_data: dict with a 'best_mode' dict (keys 'affinity_kcal_mol',
            'rmsd_lower_bound', 'rmsd_upper_bound') and a 'binding_modes' list
            whose items each carry a numeric 'affinity_kcal_mol'.
        qa_chain: object whose ``invoke({"query": ...})`` returns a mapping
            with 'result' (the answer text) and 'source_documents' (items
            exposing ``.metadata`` with 'source' and 'category').

    Returns:
        dict with 'explanation' (answer text), 'sources' (list of source
        names) and 'timestamp' (ISO-8601 string of the local time).

    Raises:
        KeyError: if an expected key is missing from the inputs.
        ValueError: if 'binding_modes' is empty (no min/max can be computed).
    """
    divider = "=" * 60

    print("\n" + divider)
    print("AGENT 3: RAG-BASED EXPLANATION ENGINE")
    print(divider + "\n")

    # Prepare docking data summary for the question
    best_mode = docking_data['best_mode']
    binding_modes = docking_data['binding_modes']
    all_affinities = [mode['affinity_kcal_mol'] for mode in binding_modes]

    # The RMSD unit is the angstrom sign "Å"; it used to be stored mis-encoded,
    # which put garbage characters into the text sent to the LLM.
    question = f"""
    I performed protein docking between two proteins with the following results:

    Best binding affinity: {best_mode['affinity_kcal_mol']} kcal/mol
    RMSD lower bound: {best_mode['rmsd_lower_bound']} Å
    RMSD upper bound: {best_mode['rmsd_upper_bound']} Å

    Total binding modes found: {len(binding_modes)}
    Range of affinities: {min(all_affinities):.2f} to {max(all_affinities):.2f} kcal/mol

    Please explain:
    1. What does this binding affinity score tell us about the protein-protein interaction?
    2. Is this binding strength suitable for vaccine development or therapeutic applications?
    3. How does this compare to successful therapeutic antibodies or vaccines?
    4. What would be the next steps in developing this as a vaccine candidate?
    """

    print("🔍 Querying RAG system with OpenAI LLM...\n")

    # Get explanation from RAG chain
    result = qa_chain.invoke({"query": question})

    explanation = result['result']
    source_docs = result['source_documents']

    print(divider)
    print("AI EXPLANATION OF DOCKING RESULTS")
    print(divider + "\n")
    print(explanation)
    print("\n" + divider)

    print("\n📚 Sources consulted by RAG:")
    for i, doc in enumerate(source_docs, 1):
        print(f"   {i}. {doc.metadata['source']} ({doc.metadata['category']})")

    return {
        'explanation': explanation,
        'sources': [doc.metadata['source'] for doc in source_docs],
        'timestamp': datetime.now().isoformat()
    }

# Generate explanation: runs the RAG query once and keeps the returned dict
# (explanation text, source names, timestamp) for the report and JSON output.
explanation_result = explain_docking_results(docking_data, qa_chain)

# ============================================================================
# CELL 10: Ask Custom Questions (Interactive)
# ============================================================================

# Extract all affinities for custom questions: one 'affinity_kcal_mol' value
# per binding mode, in the order the modes appear in the docking data.
all_affinities = [mode['affinity_kcal_mol'] for mode in docking_data['binding_modes']]

def ask_custom_question(question, qa_chain):
    """
    Ask a custom question about the docking results.

    Echoes the question, sends it through the chain and prints the answer
    between divider lines.

    Args:
        question: the question text to send.
        qa_chain: object whose ``invoke({"query": ...})`` returns a mapping
            containing the answer text under 'result'.

    Returns:
        The answer text taken from ``result['result']``.
    """
    divider = "=" * 60

    print(f"\n❓ Question: {question}\n")
    print("🤔 Thinking...\n")

    result = qa_chain.invoke({"query": question})
    answer = result['result']

    print(divider)
    print("ANSWER")
    print(divider + "\n")
    print(answer)
    print("\n" + divider)

    return answer

# Two sample questions built from this run's docking numbers; edit or add
# more calls to ask_custom_question() to explore further.
banner = "=" * 60
print("\n\n" + banner)
print("INTERACTIVE Q&A - Ask Custom Questions")
print(banner + "\n")

# Question 1: put the best score next to approved antibody drugs
fda_question = (
    f"The binding affinity we found is {docking_data['best_affinity']} kcal/mol. "
    "How does this compare to FDA-approved antibody therapeutics?"
)
custom_answer1 = ask_custom_question(fda_question, qa_chain)

# Question 2: what several strong poses say about reliability
print("\n\n")
strong_mode_count = sum(1 for affinity in all_affinities if affinity < -7)
reliability_question = (
    f"We have {strong_mode_count} binding modes with affinity better than -7 kcal/mol. "
    "What does having multiple good binding modes tell us about the reliability of this interaction?"
)
custom_answer2 = ask_custom_question(reliability_question, qa_chain)

# ============================================================================
# CELL 11: Generate Comprehensive Report
# ============================================================================
def generate_comprehensive_report(docking_data, explanation_result):
    """
    Generate a complete plain-text report combining all agent outputs.

    Args:
        docking_data: dict with
            'receptor', 'ligand' - path strings (only the part after the last
                '/' is shown);
            'docking_box' - dict with 'center' and 'size', each indexable at
                0..2 with numbers;
            'best_mode' - dict with 'affinity_kcal_mol', 'rmsd_lower_bound',
                'rmsd_upper_bound';
            'binding_modes' - list of dicts with 'mode' and 'affinity_kcal_mol';
            'interpretation' - dict with 'category' and 'interpretation';
            'best_affinity' - number compared against the -7 cut-off.
        explanation_result: dict with 'explanation' (text) and 'sources'
            (iterable of source names).

    Returns:
        str: the report text. It contains non-ASCII symbols (Å, ✓, ⚠), so
        write it to disk with a Unicode encoding such as UTF-8.

    Raises:
        KeyError / IndexError: if an expected key or coordinate is missing.
    """
    rule = '=' * 80
    shown_modes = 5  # the listing is capped; the remainder is summarised

    box = docking_data['docking_box']
    center = box['center']
    size = box['size']
    best_mode = docking_data['best_mode']
    binding_modes = docking_data['binding_modes']
    interpretation = docking_data['interpretation']

    # Units use the angstrom sign "Å"; it used to be stored mis-encoded.
    sections = [f"""
{rule}
PROTEIN DOCKING ANALYSIS - COMPREHENSIVE REPORT
{rule}

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

{rule}
SECTION 1: DOCKING PARAMETERS
{rule}

Receptor Protein: {docking_data['receptor'].split('/')[-1]}
Ligand Protein: {docking_data['ligand'].split('/')[-1]}

Docking Box Configuration:
  Center: ({center[0]:.2f},
           {center[1]:.2f},
           {center[2]:.2f}) Å
  Size: ({size[0]:.2f},
         {size[1]:.2f},
         {size[2]:.2f}) Å

{rule}
SECTION 2: DOCKING RESULTS
{rule}

Best Binding Mode:
  Affinity: {best_mode['affinity_kcal_mol']} kcal/mol
  RMSD (lower bound): {best_mode['rmsd_lower_bound']} Å
  RMSD (upper bound): {best_mode['rmsd_upper_bound']} Å
  Assessment: {interpretation['category']} binding

All Binding Modes:
"""]

    # Collect the pieces in a list and join once instead of repeated "+=".
    sections.extend(
        f"  Mode {mode['mode']}: {mode['affinity_kcal_mol']} kcal/mol\n"
        for mode in binding_modes[:shown_modes]
    )

    hidden_modes = len(binding_modes) - shown_modes
    if hidden_modes > 0:
        sections.append(f"  ... and {hidden_modes} more modes\n")

    sections.append(f"""
{rule}
SECTION 3: AI-POWERED INTERPRETATION
{rule}

{explanation_result['explanation']}

Knowledge Sources Consulted:
""")

    sections.extend(
        f"  {i}. {source}\n"
        for i, source in enumerate(explanation_result['sources'], 1)
    )

    sections.append(f"""
{rule}
SECTION 4: CONCLUSIONS & RECOMMENDATIONS
{rule}

Based on the docking analysis:

Binding Strength: {interpretation['category']}
Clinical Potential: {interpretation['interpretation']}

Recommendation for Vaccine Development:
""")

    # More negative means stronger binding; anything not below -7 kcal/mol
    # gets the "optimize" advice.
    if docking_data['best_affinity'] < -7:
        sections.append("""  ✓ PROCEED - This binding affinity shows promise for therapeutic development.
  ✓ Recommend experimental validation with surface plasmon resonance (SPR) or
    isothermal titration calorimetry (ITC).
  ✓ Consider optimization through rational design or directed evolution.
""")
    else:
        sections.append("""  ⚠ OPTIMIZE - Binding affinity may be insufficient for therapeutic use.
  ⚠ Recommend protein engineering to improve binding.
  ⚠ Consider screening alternative protein variants.
""")

    sections.append(f"""
{rule}
END OF REPORT
{rule}
""")

    return "".join(sections)

# Generate report and echo it in the notebook; the text is written to Drive later.
print("\n📄 Generating comprehensive report...\n")
report = generate_comprehensive_report(docking_data, explanation_result)
print(report)

# ============================================================================
# CELL 12: Save Final Report to Google Drive
# ============================================================================
# Save report. Explicit UTF-8: the report text contains non-ASCII symbols, so
# the file must not depend on the platform's default encoding.
report_file = f'{BASE_DIR}/data/final_report.txt'
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(report)

# Save agent 3 output (docking summary plus the AI explanation) as JSON
agent3_output = {
    'agent': 'Agent 3 - RAG Explanation Engine',
    'docking_summary': {
        'best_affinity': docking_data['best_affinity'],
        'best_mode': docking_data['best_mode'],
        'interpretation': docking_data['interpretation']
    },
    'ai_explanation': explanation_result,
    'timestamp': datetime.now().isoformat()
}

agent3_file = f'{BASE_DIR}/data/agent3_output.json'
with open(agent3_file, 'w', encoding='utf-8') as f:
    json.dump(agent3_output, f, indent=2)

# Paths are shown relative to the Drive root, as the user sees them in Drive.
print("\n✓ Comprehensive report saved to:")
print(f"  {report_file.replace('/content/drive/MyDrive/', '')}")
print("\n✓ Agent 3 output saved to:")
print(f"  {agent3_file.replace('/content/drive/MyDrive/', '')}")

print("\n" + "="*60)
print("🎉 ALL AGENTS COMPLETED SUCCESSFULLY!")
print("="*60)
print("\n📁 Your complete project is saved in Google Drive at:")
# Derived from BASE_DIR so the message stays correct if the folder is renamed.
print(f"   {BASE_DIR.replace('/content/drive/', '')}/")
print("\nProject files:")
print("  ✓ data/agent1_output.json - Cleaned protein data")
print("  ✓ data/agent2_output.json - Docking results")
print("  ✓ data/agent3_output.json - AI explanation")
print("  ✓ data/final_report.txt - Comprehensive report")
print("  ✓ data/cleaned/ - Cleaned PDB files")
print("  ✓ data/docking/ - Docking visualizations")
print("  ✓ data/rag/ - RAG knowledge base")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully!
✓ Using project directory: /content/drive/MyDrive/ProteinDocking

Installing RAG libraries (this may take a minute)...
✓ Libraries installed successfully!
✓ Libraries imported successfully!
🔑 Enter your OpenAI API Key
   (Get one at: https://platform.openai.com/api-keys)
   Your key will not be displayed for security.

Enter OpenAI API Key: ··········
✓ API key received!
Creating knowledge base...
✓ Created 8 knowledge documents

📚 Setting up vector database...
✓ Vector database created and saved to Google Drive
  Location: ProteinDocking/data/rag/chroma_db

📂 Loading docking results from Agent 2...
✓ Loaded docking data from: ProteinDocking/data/agent2_output.json

📊 Docking Summary:
   Best Affinity: -8.374 kcal/mol
   Total Modes: 9
   Receptor: 6M0J_clean.pdb
   Ligand: 1R42_clean.