# In [None]:
# Notebook 1: Agent 1 - Protein Data Cleaning Agent (Google Drive Version)
# Purpose: Clean and prepare protein structures for docking

# ============================================================================
# CELL 1: Mount Google Drive
# ============================================================================
from google.colab import drive

# Mount Google Drive at /content/drive; every path used later in this notebook
# lives under /content/drive/MyDrive/. If mounting raises, the cell stops here
# and the confirmation messages below are never printed.
drive.mount('/content/drive')

print("✓ Google Drive mounted successfully!")
print("Your Drive is now accessible at: /content/drive/MyDrive/")

# ============================================================================
# CELL 2: Setup Base Directory and Install Libraries
# ============================================================================
# Set base directory in Google Drive
BASE_DIR = '/content/drive/MyDrive/ProteinDocking'

# Create necessary folders
import os
os.makedirs(f'{BASE_DIR}/data/raw', exist_ok=True)
os.makedirs(f'{BASE_DIR}/data/cleaned', exist_ok=True)
os.makedirs(f'{BASE_DIR}/data/docking', exist_ok=True)

print(f"‚úì Project directory created: {BASE_DIR}")
print("\nInstalling required libraries...")

# Install dependencies
!pip install biopython requests -q

print("‚úì Libraries installed successfully!")

# ============================================================================
# CELL 3: Import Libraries
# ============================================================================
from Bio.PDB import PDBParser, PDBIO, Select
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings
import requests

# Ignore every PDBConstructionWarning for the rest of the session so that
# parser warnings do not flood the notebook output.
warnings.simplefilter('ignore', PDBConstructionWarning)

print("✓ Libraries imported successfully!")

# ============================================================================
# CELL 4: Define Protein Cleaner Class
# ============================================================================
class ProteinCleaner(Select):
    """
    Selection filter passed to ``PDBIO.save`` when writing a cleaned structure.

    - Residues are written only when their hetero flag (``residue.id[0]``) is
      a single blank, which leaves out water molecules (HOH) and heteroatom
      residues (ligands, ions).
    - Atoms whose element is 'H' are left out.
    """

    def accept_atom(self, atom):
        # Hydrogen atoms are dropped; every other element is kept.
        return atom.element != 'H'

    def accept_residue(self, residue):
        # A blank hetero flag marks a standard (non-HETATM, non-water) residue.
        hetero_flag = residue.id[0]
        return hetero_flag == ' '

# Confirmation that the selector class above is now available in the session.
print("✓ ProteinCleaner class defined!")

# ============================================================================
# CELL 5: Function to Download PDB Files
# ============================================================================
def download_pdb(pdb_id, output_dir=f'{BASE_DIR}/data/raw', timeout=30):
    """
    Download PDB file from RCSB PDB database

    Args:
        pdb_id: 4-character PDB ID (e.g., '1A2K')
        output_dir: Directory to save the file (created if it does not exist)
        timeout: Seconds to wait for the server before giving up

    Returns:
        Path to downloaded file, or None when the download fails
        (network error, timeout, or a non-200 HTTP status)
    """
    os.makedirs(output_dir, exist_ok=True)

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    output_path = os.path.join(output_dir, f"{pdb_id}.pdb")

    print(f"Downloading {pdb_id} from RCSB PDB...")
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Callers test the return value for None, so report the network
        # failure instead of letting the exception escape.
        print(f"✗ Failed to download {pdb_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"✗ Failed to download {pdb_id} (HTTP {response.status_code})")
        return None

    # Write the body as raw bytes so the saved file matches what the server
    # sent, independent of charset guessing or platform newline translation.
    with open(output_path, 'wb') as f:
        f.write(response.content)
    print(f"✓ Downloaded to {output_path}")
    return output_path

# ============================================================================
# CELL 6: Function to Clean Protein Structure
# ============================================================================
def clean_protein(input_pdb, output_pdb):
    """
    Clean protein structure by removing water and heteroatoms

    The structure is parsed, written back out through the ProteinCleaner
    selector, and the written file is parsed again to count what remains.

    Args:
        input_pdb: Path to input PDB file
        output_pdb: Path to save cleaned PDB file

    Returns:
        Dictionary with cleaning statistics: 'input_file', 'output_file',
        'atoms_before', 'atoms_after', 'atoms_removed' and 'chains'
        (the chain count of the cleaned file)
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', input_pdb)

    # Count atoms before cleaning
    atoms_before = sum(1 for _ in structure.get_atoms())

    # A bare filename has an empty dirname, and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_pdb)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save cleaned structure; ProteinCleaner decides what gets written.
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(output_pdb, ProteinCleaner())

    # Count atoms after cleaning by re-reading the file that was just written
    cleaned_structure = parser.get_structure('cleaned', output_pdb)
    atoms_after = sum(1 for _ in cleaned_structure.get_atoms())

    return {
        'input_file': input_pdb,
        'output_file': output_pdb,
        'atoms_before': atoms_before,
        'atoms_after': atoms_after,
        'atoms_removed': atoms_before - atoms_after,
        'chains': sum(1 for _ in cleaned_structure.get_chains()),
    }

# ============================================================================
# CELL 7: Function to Validate Protein Structure
# ============================================================================
def validate_structure(pdb_file):
    """
    Collect basic sanity figures for a PDB file.

    Args:
        pdb_file: Path to PDB file

    Returns:
        Dictionary with 'has_structure', 'num_models', 'num_chains',
        'num_residues' and 'num_atoms', plus one 'chain_<id>_gaps' entry for
        every chain holding more than one residue. Each gaps entry is a list
        of (residue_number, next_residue_number) pairs where the numbering
        jumps by more than one.
    """
    structure = PDBParser(QUIET=True).get_structure('protein', pdb_file)

    atom_total = sum(1 for _ in structure.get_atoms())
    report = {
        'has_structure': atom_total > 0,
        'num_models': sum(1 for _ in structure.get_models()),
        'num_chains': sum(1 for _ in structure.get_chains()),
        'num_residues': sum(1 for _ in structure.get_residues()),
        'num_atoms': atom_total,
    }

    # Look for jumps in residue numbering (missing residues) chain by chain.
    for chain in structure.get_chains():
        numbering = [res.id[1] for res in chain.get_residues()]
        if len(numbering) < 2:
            # Nothing to compare against; such chains get no gaps entry.
            continue
        report[f'chain_{chain.id}_gaps'] = [
            (here, following)
            for here, following in zip(numbering, numbering[1:])
            if following - here > 1
        ]

    return report

# ============================================================================
# CELL 8: Main Workflow Function
# ============================================================================
def process_protein(pdb_id, download=True):
    """
    Complete workflow for Agent 1: obtain, clean and validate one structure

    Args:
        pdb_id: PDB ID to process
        download: Whether to download from RCSB (True) or use local file (False)

    Returns:
        Tuple of (path to cleaned PDB file, dictionary merging the cleaning
        statistics with the validation results). Returns (None, None) when
        the input file cannot be obtained (download failed or the local
        file is missing).
    """
    print(f"\n{'='*60}")
    print(f"AGENT 1: PROTEIN DATA CLEANING - {pdb_id}")
    print(f"{'='*60}\n")

    # Step 1: Get PDB file
    if download:
        input_pdb = download_pdb(pdb_id)
        if not input_pdb:
            return None, None
    else:
        input_pdb = f"{BASE_DIR}/data/raw/{pdb_id}.pdb"
        # Mirror the download branch: a missing input is reported and
        # signalled with (None, None) rather than crashing in the parser.
        if not os.path.isfile(input_pdb):
            print(f"✗ Local PDB file not found: {input_pdb}")
            return None, None

    # Step 2: Clean protein
    output_pdb = f"{BASE_DIR}/data/cleaned/{pdb_id}_clean.pdb"
    print("\nCleaning protein structure...")
    stats = clean_protein(input_pdb, output_pdb)

    print("\n✓ Cleaning completed:")
    print(f"  - Atoms before: {stats['atoms_before']}")
    print(f"  - Atoms after: {stats['atoms_after']}")
    print(f"  - Atoms removed: {stats['atoms_removed']}")
    print(f"  - Chains: {stats['chains']}")

    # Step 3: Validate
    print("\nValidating cleaned structure...")
    validation = validate_structure(output_pdb)

    print("\n✓ Validation results:")
    print(f"  - Models: {validation['num_models']}")
    print(f"  - Chains: {validation['num_chains']}")
    print(f"  - Residues: {validation['num_residues']}")
    print(f"  - Atoms: {validation['num_atoms']}")

    # Check for gaps: only the per-chain '..._gaps' entries that are non-empty
    gap_found = False
    for key, value in validation.items():
        if 'gaps' in key and value:
            print(f"  ⚠ Warning: Sequence gaps found in {key}: {value}")
            gap_found = True

    if not gap_found:
        print("  ✓ No sequence gaps detected")

    print(f"\n{'='*60}")
    print(f"Agent 1 Output: {output_pdb}")
    # Strip the Colab mount prefix so the path reads as it appears in Drive.
    print(f"Saved in Google Drive: {output_pdb.replace('/content/drive/MyDrive/', '')}")
    print(f"{'='*60}\n")

    # Validation keys are merged after the cleaning statistics.
    return output_pdb, {**stats, **validation}

# ============================================================================
# CELL 9: Example Usage - Process Multiple Proteins for Docking
# ============================================================================
# For docking, you need two proteins (receptor and ligand)
# Example: SARS-CoV-2 spike protein and ACE2 receptor
# NOTE(review): the per-entry comments below did not agree with this example;
# confirm both entries (and which one should be the receptor) on RCSB.

protein_pairs = [
    ("6M0J", "receptor"),  # SARS-CoV-2 spike RBD (NOTE(review): believed to be the RBD-ACE2 complex entry - confirm)
    ("1R42", "ligand")     # NOTE(review): believed to be human ACE2, not an antibody fragment - confirm
]

print("Processing proteins for docking experiment...")
processed_proteins = {}  # role -> {'pdb_id', 'cleaned_file', 'stats'}; read by the save cell
failed_proteins = []     # (role, pdb_id) for every entry that could not be processed

for pdb_id, role in protein_pairs:
    print(f"\n{'#'*60}")
    print(f"Processing {role.upper()}: {pdb_id}")
    print(f"{'#'*60}")

    cleaned_file, results = process_protein(pdb_id, download=True)

    if not cleaned_file:
        # process_protein signals failure with (None, None); remember it so
        # the summary does not claim success for a protein that was skipped.
        failed_proteins.append((role, pdb_id))
        continue

    processed_proteins[role] = {
        'pdb_id': pdb_id,
        'cleaned_file': cleaned_file,
        'stats': results
    }

print("\n" + "="*60)
if failed_proteins:
    print(f"AGENT 1 SUMMARY: {len(processed_proteins)} of {len(protein_pairs)} proteins processed")
else:
    print("AGENT 1 SUMMARY: All proteins processed")
print("="*60)
for role, data in processed_proteins.items():
    print(f"\n{role.upper()}: {data['pdb_id']}")
    print(f"  File: {data['cleaned_file'].replace('/content/drive/MyDrive/', '')}")
    print(f"  Atoms: {data['stats']['atoms_after']}")

for role, pdb_id in failed_proteins:
    print(f"\n{role.upper()}: {pdb_id}")
    print("  FAILED: no cleaned file was produced")

# ============================================================================
# CELL 10: Save Results for Next Agent
# ============================================================================
import json

# Save processing results for Agent 2
output_file = f'{BASE_DIR}/data/agent1_output.json'
output_data = {
    'agent': 'Agent 1 - Protein Cleaner',
    'base_directory': BASE_DIR,
    'proteins': processed_proteins
}

# Explicit encoding keeps the file identical regardless of the platform default.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print("\n✓ Results saved to Google Drive:")
print(f"  {output_file.replace('/content/drive/MyDrive/', '')}")
if processed_proteins:
    print("\n🎯 Ready for Agent 2 (Docking)!")
else:
    # An empty result set is still written, but say so instead of claiming readiness.
    print("\n⚠ No proteins were processed; Agent 2 has nothing to dock yet.")
print("\nYou can find all files in your Google Drive at:")
# Derived from BASE_DIR so the hint stays correct if the base folder changes.
print(f"  {BASE_DIR.replace('/content/drive/', '')}/data/")

# ============================================================================
# CELL 11: View Files in Google Drive (Optional)
# ============================================================================
print("\nüìÅ Files created in Google Drive:\n")
!ls -lh {BASE_DIR}/data/
print("\nüìÅ Raw PDB files:")
!ls -lh {BASE_DIR}/data/raw/
print("\nüìÅ Cleaned PDB files:")
!ls -lh {BASE_DIR}/data/cleaned/

# [captured cell output] ValueError: mount failed