In [None]:
#!/usr/bin/env python3
"""
Dataset Merger Using Exact Same Approach as Working Code
========================================================

This script uses the EXACT same PDB processing approach as your working
binding site extraction code, but creates negative examples instead.

Requirements: pip install biopython pandas numpy tqdm

Usage:
    python exact_same_approach_merger.py

    So far, negative.csv contains PDB IDs from the following categories:
    Fabs (antibodies): bind only proteins, no small-molecule ligands
    Structural proteins:
        Elastin
        Fibrin
        Keratin
    Nucleases: bind only DNA, no small-molecule ligands

    

"""

import pandas as pd
import numpy as np
import os
import time
import warnings
import json
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging

# Silence every Python warning for the whole process (this filter is global,
# so it affects all imported libraries, not only BioPython).
warnings.filterwarnings('ignore')

# Set up logging: INFO level on the root logger, plus a module-level logger
# used by the free functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check and import BioPython (EXACT SAME as working code).
# BIOPYTHON_AVAILABLE records whether the import succeeded so the rest of the
# module can report a friendly error instead of failing at import time.
try:
    from Bio.PDB import PDBParser, PDBList
    from Bio.PDB.NeighborSearch import NeighborSearch
    from Bio.SeqUtils import seq1
    BIOPYTHON_AVAILABLE = True
    print("✓ BioPython available")
except ImportError:
    print("✗ BioPython not found. Install with: pip install biopython")
    BIOPYTHON_AVAILABLE = False

class NegativeExampleExtractor:
    """
    Build "negative" (non-binding) examples from PDB structures.

    For every PDB ID the structure is downloaded (or read from the local
    cache directory), the sequence of each chain is extracted, and a record
    is produced in the same layout as the positive binding-site records, but
    with an all-zero binding array and empty ligand/contact fields.

    Work is fanned out over a thread pool. Results accumulate in
    ``self.negative_examples`` and every failure (download, empty structure,
    parse error) accumulates in ``self.failed_pdbs``; both lists are only
    mutated while holding ``self.lock``.
    """

    def __init__(self, output_dir="negative_pdb_structures", max_workers=4):
        """
        Args:
            output_dir: Directory where downloaded PDB files are cached.
                Created if it does not exist.
            max_workers: Number of worker threads used by process_threaded().
        """
        self.output_dir = output_dir
        self.max_workers = max_workers

        os.makedirs(output_dir, exist_ok=True)

        # Thread-safe storage
        self.negative_examples = []
        self.failed_pdbs = []
        self.lock = threading.Lock()

        # Initialize shared parser and downloader (EXACT SAME as working code).
        # NOTE: self.parser is kept for backward compatibility only; worker
        # threads build their own parser in process_single_pdb() because a
        # PDBParser keeps per-parse state and must not be shared.
        if BIOPYTHON_AVAILABLE:
            self.parser = PDBParser(QUIET=True)
            self.pdb_list = PDBList()

        print(f"Initialized negative extractor with {max_workers} threads")

    def clean_pdb_ids(self, pdb_list):
        """
        Extract valid PDB IDs from an iterable of raw strings.

        Each string may hold several IDs separated by ',', ';', '|' or
        whitespace. Non-string items are ignored. An ID is kept when it is
        exactly four alphanumeric characters; it is upper-cased.

        Returns:
            De-duplicated list of IDs in first-seen order. The order is
            deterministic so that truncating the list (``max_structures``)
            selects the same structures on every run.
        """
        clean_ids = []

        for pdb_string in pdb_list:
            if not isinstance(pdb_string, str):
                continue

            # Normalise every supported separator to a space, then split on
            # whitespace (which also drops empty tokens and padding).
            for separator in (',', ';', '|'):
                pdb_string = pdb_string.replace(separator, ' ')

            for pdb_id in pdb_string.split():
                pdb_id = pdb_id.upper()
                if len(pdb_id) == 4 and pdb_id.isalnum():
                    clean_ids.append(pdb_id)

        # dict.fromkeys de-duplicates while preserving insertion order;
        # list(set(...)) would give a different order on every interpreter run.
        return list(dict.fromkeys(clean_ids))

    def download_pdb_structure(self, pdb_id):
        """
        Return the local path of the PDB file for *pdb_id*, downloading it
        into ``self.output_dir`` if it is not cached yet.

        Returns:
            Path to the file, or None when the download failed for any reason.
        """
        expected_filename = os.path.join(self.output_dir, f"pdb{pdb_id.lower()}.ent")

        # Reuse a previously downloaded file instead of hitting the server.
        if os.path.exists(expected_filename):
            return expected_filename

        try:
            filename = self.pdb_list.retrieve_pdb_file(
                pdb_id, pdir=self.output_dir, file_format='pdb'
            )
        except Exception:
            # Best effort: any download problem is reported to the caller as None.
            return None

        return filename if filename and os.path.exists(filename) else None

    def extract_full_sequences(self, structure):
        """
        Extract the one-letter sequence of every chain in *structure*.

        Only residues whose hetero flag (``res.id[0]``) is a blank are used,
        ordered by residue number. Chains that yield an empty sequence are
        omitted. All models are visited; a chain ID seen in a later model
        overwrites the entry from an earlier one.

        Returns:
            dict mapping chain ID -> {'sequence', 'length', 'residue_numbers'}.
        """
        sequences = {}

        for model in structure:
            for chain in model:
                # Get all protein residues in order
                protein_residues = [res for res in chain if res.id[0] == ' ']
                protein_residues.sort(key=lambda res: res.id[1])

                letters = []
                for residue in protein_residues:
                    try:
                        letters.append(seq1(residue.resname))
                    except KeyError:
                        letters.append('X')  # Unknown amino acid
                sequence = "".join(letters)

                if sequence:
                    sequences[chain.id] = {
                        'sequence': sequence,
                        'length': len(sequence),
                        'residue_numbers': [res.id[1] for res in protein_residues]
                    }

        return sequences

    def create_negative_binding_info(self, chain_sequences):
        """
        Build per-chain binding annotations for a negative example.

        Args:
            chain_sequences: Mapping as returned by extract_full_sequences().

        Returns:
            dict mapping chain ID -> annotation dict whose binding array is
            all zeros (one entry per sequence position) and whose binding
            position lists are empty.
        """
        binding_info = {}

        for chain_id, seq_info in chain_sequences.items():
            # Create all-zero binding array (negative example)
            binding_array = np.zeros(seq_info['length'], dtype=int)

            binding_info[chain_id] = {
                'sequence': seq_info['sequence'],
                'binding_array': binding_array.tolist(),
                'binding_positions': [],  # Empty for negative examples
                'binding_residue_numbers': [],  # Empty for negative examples
                'num_binding_residues': 0  # Zero for negative examples
            }

        return binding_info

    def extract_negative_example_info(self, pdb_id, chain_sequences):
        """
        Assemble the full negative-example record for one structure.

        The record has the same keys as a positive binding-site record, with
        placeholder ligand fields ('NONE', empty strings, zeros).
        """
        return {
            'pdb_id': pdb_id,
            'ligand_name': 'NONE',  # No ligand for negative examples
            'ligand_chain': '',
            'ligand_number': '',
            'num_contact_residues': 0,
            'contact_residues': [],  # Empty for negative examples
            'contact_sequence': '',
            'ligand_center': {'x': 0, 'y': 0, 'z': 0},  # No ligand center
            'binding_site_volume': 0.0,
            'chain_sequences': self.create_negative_binding_info(chain_sequences)
        }

    def _record_failure(self, pdb_id, reason):
        """Append a failure entry to self.failed_pdbs (under the lock) and return an equal result dict."""
        failure = {'pdb_id': pdb_id, 'status': 'error', 'reason': reason}
        with self.lock:
            self.failed_pdbs.append(failure)
        return dict(failure)

    def process_single_pdb(self, pdb_id):
        """
        Download, parse and convert one PDB entry into a negative example.

        Safe to call from several threads at once. On success the example is
        appended to ``self.negative_examples``; on any failure an entry is
        appended to ``self.failed_pdbs``.

        Returns:
            dict with 'pdb_id' and 'status' ('success' or 'error'). Success
            results also carry 'negative_example' and 'num_chains'; error
            results carry 'reason'.
        """
        if not BIOPYTHON_AVAILABLE:
            return self._record_failure(pdb_id, 'BioPython not available')

        try:
            # Download structure (EXACT SAME as working code)
            filename = self.download_pdb_structure(pdb_id)
            if not filename:
                return self._record_failure(pdb_id, 'Download failed')

            # Use a parser private to this call: PDBParser keeps the structure
            # under construction on the instance, so sharing one parser between
            # threads lets concurrent parses corrupt each other. QUIET=True
            # already suppresses construction warnings, so no extra
            # warnings.catch_warnings() (which is not thread-safe) is needed.
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure(pdb_id, filename)

            # Extract sequences (EXACT SAME as working code)
            chain_sequences = self.extract_full_sequences(structure)
            if not chain_sequences:
                return self._record_failure(pdb_id, 'No protein chains found')

            # Create negative example (modified from working code)
            negative_example = self.extract_negative_example_info(pdb_id, chain_sequences)

            # Thread-safe storage (EXACT SAME as working code)
            with self.lock:
                self.negative_examples.append(negative_example)

            return {
                'pdb_id': pdb_id,
                'status': 'success',
                'negative_example': negative_example,
                'num_chains': len(chain_sequences)
            }

        except Exception as e:
            # Collapse BioPython's verbose "... defined twice" messages into one label.
            error_msg = "Duplicate residue numbering" if "defined twice" in str(e) else str(e)
            return self._record_failure(pdb_id, error_msg)

    def process_threaded(self, pdb_ids, max_structures=None, verbose=True):
        """
        Process many PDB IDs concurrently.

        Args:
            pdb_ids: Iterable of raw ID strings (see clean_pdb_ids()).
            max_structures: If truthy, only the first *max_structures*
                cleaned IDs are processed.
            verbose: Print progress every 10 structures and each failure
                other than a failed download.

        Returns:
            Tuple ``(self.negative_examples, self.failed_pdbs)``.
        """
        clean_ids = self.clean_pdb_ids(pdb_ids)

        if max_structures:
            clean_ids = clean_ids[:max_structures]

        total = len(clean_ids)
        print(f"Processing {total} unique PDB IDs with {self.max_workers} threads...")

        completed = 0
        succeeded = 0
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_pdb = {executor.submit(self.process_single_pdb, pdb_id): pdb_id
                             for pdb_id in clean_ids}

            for future in as_completed(future_to_pdb):
                pdb_id = future_to_pdb[future]
                completed += 1

                try:
                    # The future is already finished here, so no timeout is needed.
                    result = future.result()
                except Exception as e:
                    # process_single_pdb handles its own errors; this is a last-resort net.
                    self._record_failure(pdb_id, str(e))
                    if verbose:
                        print(f"Exception processing {pdb_id}: {str(e)}")
                    continue

                if result['status'] == 'success':
                    succeeded += 1

                if verbose and completed % 10 == 0:
                    print(f"Completed {completed}/{total} structures... "
                          f"Found {len(self.negative_examples)} negative examples so far")

                if result['status'] != 'success' and verbose and 'Download failed' not in result['reason']:
                    print(f"Failed {pdb_id}: {result['reason']}")

        # Report counts for this run only; self.failed_pdbs may also hold
        # entries from earlier calls on the same extractor.
        print(f"\nCompleted! Found {len(self.negative_examples)} negative examples")
        print(f"Successfully processed: {succeeded}/{total} structures")
        print(f"Failed: {total - succeeded} structures")

        return self.negative_examples, self.failed_pdbs

    def export_negative_results(self, output_file="negative_examples.csv"):
        """
        Flatten the collected negative examples to one CSV row per structure.

        Each row holds the shared ligand/contact columns plus, for every
        chain, ``chain_<id>_*`` columns (sequence, length and JSON-encoded
        binding arrays/positions).

        Returns:
            The exported DataFrame, or None when there is nothing to export.
        """
        if not self.negative_examples:
            print("No negative examples to export")
            return None

        print(f"Exporting {len(self.negative_examples)} negative examples...")

        flattened_data = []
        for example in self.negative_examples:
            center = example['ligand_center']

            # Base information (EXACT SAME structure as working code)
            row = {
                'pdb_id': example['pdb_id'],
                'ligand_name': example['ligand_name'],
                'ligand_chain': example['ligand_chain'],
                'ligand_number': example['ligand_number'],
                'num_contact_residues': example['num_contact_residues'],
                'contact_sequence': example['contact_sequence'],
                'ligand_center_x': center['x'],
                'ligand_center_y': center['y'],
                'ligand_center_z': center['z'],
                'binding_site_volume': example['binding_site_volume'],
                # Contact residue details (empty for negative examples)
                'contact_residues_list': '',
                'contact_residue_numbers': '',
                'contact_distances': ''
            }

            # Full sequence information for each chain (EXACT SAME as working code)
            for chain_id, chain_info in example['chain_sequences'].items():
                prefix = f'chain_{chain_id}'
                row[f'{prefix}_sequence'] = chain_info['sequence']
                row[f'{prefix}_length'] = len(chain_info['sequence'])
                row[f'{prefix}_binding_array'] = json.dumps(chain_info['binding_array'])
                row[f'{prefix}_binding_positions'] = json.dumps(chain_info['binding_positions'])
                row[f'{prefix}_binding_residue_numbers'] = json.dumps(chain_info['binding_residue_numbers'])
                row[f'{prefix}_num_binding_residues'] = chain_info['num_binding_residues']

            flattened_data.append(row)

        # Export to CSV (EXACT SAME as working code)
        df = pd.DataFrame(flattened_data)
        df.to_csv(output_file, index=False)

        print(f"✓ Exported to {output_file}")
        return df

def load_negative_pdb_ids(negative_csv_path):
    """
    Read the comma-separated entries stored in *negative_csv_path*.

    The whole file is read as text and split on commas; each piece is
    stripped of surrounding whitespace and empty pieces are dropped.

    Returns:
        List of non-empty entries, or an empty list if the file could not be
        read (the error is logged, not raised).
    """
    try:
        with open(negative_csv_path, 'r') as handle:
            raw_text = handle.read().strip()

        entries = []
        for token in raw_text.split(','):
            token = token.strip()
            if token:
                entries.append(token)
        return entries
    except Exception as e:
        logger.error(f"Error loading negative PDB IDs: {str(e)}")
        return []

def merge_with_positive_dataset(negative_df, positive_csv_path, output_path):
    """
    Merge negative examples with the positive dataset and write one CSV.

    Positive rows are labelled ``is_binding = 1`` and negative rows
    ``is_binding = 0``. Columns present in only one of the two frames are
    added to the other and filled with empty strings. The combined rows are
    shuffled with a fixed seed and written to *output_path*.

    Args:
        negative_df: DataFrame of negative examples. It is not modified.
        positive_csv_path: Path of the CSV holding the positive examples.
        output_path: Destination path of the merged CSV.
    """
    logger.info("Loading positive dataset...")
    positive_df = pd.read_csv(positive_csv_path, low_memory=False)
    logger.info(f"Loaded {len(positive_df)} positive examples")

    # Add binding labels. assign() returns new frames, so the caller's
    # negative_df is left untouched.
    positive_df = positive_df.assign(is_binding=1)
    negative_df = negative_df.assign(is_binding=0)

    # Ensure consistent columns: positive columns first, then the columns that
    # exist only in the negative frame, in their original order. Building the
    # order from lists (not a set) keeps the output layout identical between runs.
    negative_only = [col for col in negative_df.columns if col not in positive_df.columns]
    column_order = list(positive_df.columns) + negative_only

    # One reindex per frame adds all missing columns at once instead of
    # inserting them one by one.
    positive_df = positive_df.reindex(columns=column_order, fill_value='')
    negative_df = negative_df.reindex(columns=column_order, fill_value='')

    # Combine and shuffle
    merged_df = pd.concat([positive_df, negative_df], ignore_index=True)
    merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save result
    merged_df.to_csv(output_path, index=False)

    # Print summary
    positive_final = int((merged_df['is_binding'] == 1).sum())
    negative_final = int((merged_df['is_binding'] == 0).sum())

    logger.info("Merge complete!")
    logger.info(f"Final dataset: {len(merged_df)} total examples")
    logger.info(f"  - Positive (binding): {positive_final}")
    logger.info(f"  - Negative (non-binding): {negative_final}")
    if positive_final:
        logger.info(f"  - Balance ratio: {negative_final/positive_final:.2f}")
    else:
        # An empty positive dataset would otherwise raise ZeroDivisionError
        # after the merged file has already been written.
        logger.info("  - Balance ratio: n/a (no positive examples)")

def main():
    """
    Run the full pipeline: load negative PDB IDs, build negative examples,
    export them, and merge them with the positive dataset.

    All file names and limits are fixed in the configuration section at the
    top of the function. The function returns early (after printing an
    error) when BioPython is missing, when no IDs could be loaded, or when
    no negative example could be produced.
    """
    if not BIOPYTHON_AVAILABLE:
        print("ERROR: BioPython is required. Install with: pip install biopython")
        return

    # Configuration
    negative_csv = "negative.csv"
    positive_csv = "binding_sites_with_sequences_30000.csv"
    output_csv = "merged_protein_dataset.csv"
    max_structures = 30000  # upper bound on the number of cleaned PDB IDs processed
    max_workers = 10

    rule = "=" * 60
    banner = (
        rule,
        "DATASET MERGER USING EXACT SAME APPROACH",
        rule,
        f"Negative CSV: {negative_csv}",
        f"Positive CSV: {positive_csv}",
        f"Output CSV: {output_csv}",
        f"Max structures: {max_structures}",
        f"Threads: {max_workers}",
    )
    for line in banner:
        print(line)

    # Step 1: read the raw ID entries from the negative list
    logger.info("Loading negative PDB IDs...")
    negative_pdb_ids = load_negative_pdb_ids(negative_csv)
    logger.info(f"Found {len(negative_pdb_ids)} negative PDB IDs")

    if not negative_pdb_ids:
        print("ERROR: No negative PDB IDs found!")
        return

    # Step 2: download/parse the structures and build the negative examples
    extractor = NegativeExampleExtractor(max_workers=max_workers)
    negative_examples, failed_pdbs = extractor.process_threaded(
        negative_pdb_ids, max_structures=max_structures
    )

    # Step 3: write the negative examples to an intermediate CSV
    negative_df = extractor.export_negative_results("negative_examples_temp.csv")

    nothing_exported = negative_df is None or len(negative_df) == 0
    if nothing_exported:
        print("ERROR: No negative examples were successfully processed!")
        return

    # Step 4: combine with the positive examples and save the final dataset
    logger.info("Merging with positive dataset...")
    merge_with_positive_dataset(negative_df, positive_csv, output_csv)

    print("\n" + rule)
    print(f"SUCCESS! Merged dataset saved to: {output_csv}")
    print(f"Processed {len(negative_examples)} negative examples")
    print(f"Failed: {len(failed_pdbs)} structures")
    print(rule)

# Run the pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()

INFO:__main__:Loading negative PDB IDs...
INFO:__main__:Found 14180 negative PDB IDs


✓ BioPython available
DATASET MERGER USING EXACT SAME APPROACH
Negative CSV: negative.csv
Positive CSV: binding_sites_with_sequences_30000.csv
Output CSV: merged_protein_dataset.csv
Max structures: 5000
Threads: 4
Initialized negative extractor with 4 threads
Processing 5000 unique PDB IDs with 4 threads...
Downloading PDB structure '6vy5'...
Downloading PDB structure '6qbt'...
Downloading PDB structure '6n2x'...
Downloading PDB structure '1kzk'...
Downloading PDB structure '9hjo'...
Downloading PDB structure '3h5b'...
Downloading PDB structure '4gw4'...
Downloading PDB structure '4z0v'...
Downloading PDB structure '8k62'...
Downloading PDB structure '7wcq'...
Downloading PDB structure '8twa'...
Downloading PDB structure '7g3t'...
Downloading PDB structure '7g4x'...
Completed 10/5000 structures... Found 10 negative examples so far
Downloading PDB structure '4n96'...
Desired structure doesn't exist
Downloading PDB structure '8ez8'...
Downloading PDB structure '7r8o'...
Downloading PDB s

INFO:__main__:Merging with positive dataset...
INFO:__main__:Loading positive dataset...


✓ Exported to negative_examples_temp.csv


INFO:__main__:Loaded 35698 positive examples
INFO:__main__:Merge complete!
INFO:__main__:Final dataset: 40251 total examples
INFO:__main__:  - Positive (binding): 35698
INFO:__main__:  - Negative (non-binding): 4553
INFO:__main__:  - Balance ratio: 0.13



SUCCESS! Merged dataset saved to: merged_protein_dataset.csv
Processed 4553 negative examples
Failed: 110 structures
