In [2]:
import os
import glob
import pandas as pd

# --- CONFIGURATION ---
# Folder whose per-protein subdirectories supply the FDA benchmark scores.
fda_folder_name = "FDA_approved_drugs"
# Folders whose per-protein subdirectories are searched for ligands that
# score better than the FDA benchmark. Folders that do not exist are skipped.
custom_folders = [
    "FDA_drugs", 
    "FDA_New_drugs"
]

# Subfolder (inside every protein directory) that is globbed for *.csv result files.
subfolder_name = "07_admet_analysis"

def parse_filename_details(filename):
    """
    Splits a ligand filename such as 'SNVK-125_fpocket_pocket1_ligand.pdb'
    into its descriptive pieces.

    Returns a 3-tuple (Name, Tool, Pocket):
      * Name   - the text before the first underscore
      * Tool   - 'fpocket' or 'p2rank' if that word occurs anywhere in the
                 filename (case-insensitive; 'fpocket' is checked first)
      * Pocket - the first underscore-separated token that starts with
                 'pocket' (case-insensitive), returned exactly as written

    Any piece that cannot be determined is "Unknown"; a non-string or empty
    input yields ("Unknown", "Unknown", "Unknown").
    """
    unknown = "Unknown"

    if not (isinstance(filename, str) and filename):
        return unknown, unknown, unknown

    tokens = filename.split('_')

    # str.split always returns at least one token, so index 0 is safe
    ligand_name = tokens[0]

    # Tool keyword is searched in the whole filename, not per token
    lowered = filename.lower()
    detected_tool = next(
        (keyword for keyword in ('fpocket', 'p2rank') if keyword in lowered),
        unknown,
    )

    # First token beginning with 'pocket' wins
    pocket_id = next(
        (token for token in tokens if token.lower().startswith('pocket')),
        unknown,
    )

    return ligand_name, detected_tool, pocket_id

def get_fda_benchmark(protein_path):
    """
    Finds the BEST (lowest) FDA docking score to serve as the benchmark.

    Every CSV matching <protein_path>/<subfolder_name>/*.csv is examined, so
    the result does not depend on the arbitrary order in which glob lists
    the files. Column names are stripped of surrounding whitespace, and
    'Docking Score' values that are not numeric are ignored.

    Args:
        protein_path: Path of one protein directory.

    Returns:
        (score, name) where score is the lowest numeric 'Docking Score' found
        and name is the first element returned by parse_filename_details for
        that row's 'Filename' value. Returns (None, None) when no CSV exists
        or none of them contains a usable score.
    """
    search_path = os.path.join(protein_path, subfolder_name, "*.csv")
    # glob gives no ordering guarantee; sort so ties resolve reproducibly
    csv_files = sorted(glob.glob(search_path))

    best_score = None
    best_fname = ''

    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
            df.columns = [str(col).strip() for col in df.columns]

            if 'Docking Score' not in df.columns:
                continue

            # Non-numeric cells become NaN and are dropped, so a stray text
            # value can neither win the comparison nor abort the lookup
            scores = pd.to_numeric(df['Docking Score'], errors='coerce').dropna()
        except (OSError, ValueError, TypeError) as e:
            # One unreadable file must not hide the benchmark in the others
            print(f"Error reading FDA CSV '{csv_file}': {e}")
            continue

        if scores.empty:
            continue

        # Lowest score is best
        best_idx = scores.idxmin()
        score = scores.loc[best_idx]

        if best_score is None or score < best_score:
            best_score = score
            best_fname = df.loc[best_idx].get('Filename', '')

    if best_score is None:
        return None, None

    # We only need the simple name for the FDA benchmark display
    clean_name, _, _ = parse_filename_details(best_fname)

    return best_score, clean_name

def find_better_candidates(protein_path, threshold_score):
    """
    Finds ALL rows where Docking Score < threshold_score.

    Every CSV matching <protein_path>/<subfolder_name>/*.csv is read
    independently: a file that cannot be read or parsed is reported and
    skipped without discarding the remaining files.

    Args:
        protein_path: Path of one protein directory.
        threshold_score: Score to beat; only strictly lower scores qualify.

    Returns:
        A list of pandas Series, one per qualifying row (empty when nothing
        qualifies or no CSV exists). Column names are whitespace-stripped and
        the 'Docking Score' value of each row is numeric.
    """
    search_path = os.path.join(protein_path, subfolder_name, "*.csv")

    candidates = []

    # Sorted so the order of the returned rows is reproducible
    for csv_file in sorted(glob.glob(search_path)):
        try:
            df = pd.read_csv(csv_file)
            df.columns = [str(col).strip() for col in df.columns]

            if 'Docking Score' not in df.columns:
                continue

            # Coerce so a stray text cell turns into NaN (which never passes
            # the comparison) instead of making the whole file fail
            df['Docking Score'] = pd.to_numeric(df['Docking Score'], errors='coerce')

            # Filter: strictly better (lower) than FDA
            better_df = df[df['Docking Score'] < threshold_score]
        except (OSError, ValueError, TypeError) as e:
            print(f"Error reading custom CSV '{csv_file}': {e}")
            continue

        candidates.extend(row for _, row in better_df.iterrows())

    return candidates

def main():
    """
    Compares custom ligands against the FDA benchmark for every protein that
    has a directory in the FDA folder and in each existing custom folder.

    For each such protein the benchmark score is obtained with
    get_fda_benchmark, every custom folder is searched with
    find_better_candidates, and one result record is built per better row.
    The records are sorted by protein and ligand score, written to
    'better_than_fda_detailed.csv' and printed as a table. Nothing is
    written when no candidate beats the benchmark.
    """
    def protein_dirs(root):
        # Names of the immediate subdirectories of `root`
        return {entry for entry in os.listdir(root) if os.path.isdir(os.path.join(root, entry))}

    # 1. Path Verification
    if not os.path.exists(fda_folder_name):
        print(f"Error: FDA folder '{fda_folder_name}' not found.")
        return

    # 2. Proteins present in the FDA folder and in every existing custom folder
    common_proteins = protein_dirs(fda_folder_name)
    for folder in custom_folders:
        if not os.path.exists(folder):
            continue
        common_proteins = common_proteins & protein_dirs(folder)

    print(f"Analyzing {len(common_proteins)} common proteins...")

    # 3. Collect one record per custom row that beats the benchmark
    records = []
    for protein in common_proteins:
        fda_score, fda_name = get_fda_benchmark(os.path.join(fda_folder_name, protein))
        if fda_score is None:
            # No usable benchmark for this protein
            continue

        for folder in custom_folders:
            hits = find_better_candidates(os.path.join(folder, protein), fda_score)

            for hit in hits:
                ligand_score = hit['Docking Score']
                ligand, tool, pocket = parse_filename_details(hit.get('Filename', ''))

                # Key order defines the column order of the saved CSV
                records.append({
                    'Protein': protein,
                    'FDA Name': fda_name,
                    'FDA Score': fda_score,
                    'Better Ligand': ligand,
                    'Tool': tool,
                    'Pocket': pocket,
                    'Ligand Score': ligand_score,
                    # Negative value = how far below the benchmark the ligand scored
                    'Improvement': round(ligand_score - fda_score, 2),
                    'Decision': hit.get('Final Decision', 'N/A'),
                    'SA Score': hit.get('SA Score', 'N/A'),
                    'QED': hit.get('QED', 'N/A'),
                })

    # 4. Output
    if not records:
        print("No candidates found that outperform the FDA benchmark.")
        return

    df = pd.DataFrame(records).sort_values(by=['Protein', 'Ligand Score'])

    output_file = "better_than_fda_detailed.csv"
    df.to_csv(output_file, index=False)

    print("\n=== BETTER CANDIDATES FOUND ===")
    print(f"Total candidates outperforming FDA: {len(df)}")
    print("-" * 120)

    # The console table omits the SA Score and QED columns kept in the CSV
    display_columns = [
        'Protein', 'FDA Name', 'FDA Score', 'Better Ligand', 'Tool',
        'Pocket', 'Ligand Score', 'Improvement', 'Decision',
    ]
    print(df[display_columns].to_string(index=False))

    print(f"\nDetailed CSV saved to: {output_file}")

# Run the comparison only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Analyzing 13 common proteins...

=== BETTER CANDIDATES FOUND ===
Total candidates outperforming FDA: 1102
------------------------------------------------------------------------------------------------------------------------
Protein     FDA Name  FDA Score Better Ligand    Tool   Pocket  Ligand Score  Improvement                  Decision
   AMPK    Metformin      -4.62      SKNKS-16 fpocket  pocket2        -10.05        -5.43   REJECT (DILI High Risk)
   AMPK    Metformin      -4.62      SKNKS-16 fpocket  pocket8        -10.05        -5.43   REJECT (DILI High Risk)
   AMPK    Metformin      -4.62      SKNKS-16  p2rank  pocket1        -10.03        -5.41   REJECT (DILI High Risk)
   AMPK    Metformin      -4.62      SNVK-130 fpocket  pocket8         -9.97        -5.35      REJECT (PAINS Alert)
   AMPK    Metformin      -4.62      SNVK-130  p2rank  pocket1         -9.96        -5.34      REJECT (PAINS Alert)
   AMPK    Metformin      -4.62      SNVK-130 fpocket  pocket2         -9.81 

In [3]:
import os
import subprocess

def convert_sdf_to_pdbqt(input_folder, output_folder=None):
    """
    Converts every .sdf file in a folder to .pdbqt with the Open Babel CLI.

    Each input file is passed to `obabel` with hydrogens added (-h) and
    Gasteiger partial charges assigned. A [SUCCESS] or [ERROR] line is
    printed per file. A conversion only counts as successful when obabel
    exits with status 0 AND a non-empty output file exists afterwards.

    Args:
        input_folder: Folder scanned (non-recursively) for files whose name
            ends in '.sdf' (case-insensitive).
        output_folder: Folder that receives the .pdbqt files; created if
            missing. Defaults to input_folder.

    Returns:
        None. Processing stops early if the 'obabel' executable is not found.
    """
    # 1. Setup paths
    # If no output folder is defined, save files in the same folder as input
    if output_folder is None:
        output_folder = input_folder

    # exist_ok avoids the race between checking for and creating the folder
    os.makedirs(output_folder, exist_ok=True)

    print(f"Processing files in: {input_folder}")

    # 2. Loop through all files (sorted: os.listdir order is arbitrary)
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(".sdf"):
            continue

        input_path = os.path.join(input_folder, filename)

        # Create the output filename (replace extension)
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(output_folder, f"{base_name}.pdbqt")

        # 3. Construct the Open Babel command
        # -isdf : Input format
        # -opdbqt : Output format
        # -O : Output file path
        # -h : Add hydrogens (Required for docking)
        # --partialcharge gasteiger : Add charges (Required for docking)
        command = [
            "obabel",
            "-isdf", input_path,
            "-opdbqt", "-O", output_path,
            "-h",
            "--partialcharge", "gasteiger",
        ]

        # 4. Run the command
        try:
            # DEVNULL hides the verbose logs from obabel
            subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError:
            print(f"[ERROR] Failed to convert: {filename}")
            continue
        except FileNotFoundError:
            print("Error: 'obabel' command not found. Is Open Babel installed and added to your PATH?")
            break

        # 5. Verify the result: a zero exit status alone is not proof that a
        # molecule was written, so require a non-empty output file too
        if os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
            print(f"[SUCCESS] Converted: {filename}")
        else:
            print(f"[ERROR] Failed to convert: {filename}")

# --- USAGE ---
# UPDATE THESE PATHS to point to your own folders.
# Raw strings (r"") keep Windows backslashes from being read as escape sequences.
# Folder scanned for .sdf files.
my_input_folder = r"SKNKS_Series 3 Mol format"
# Folder that receives the generated .pdbqt files (created if it does not exist).
my_output_folder = r"pdbqt_new_v2"

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    convert_sdf_to_pdbqt(my_input_folder, my_output_folder)

Processing files in: SKNKS_Series 3 Mol format
[SUCCESS] Converted: SKNKS 1.sdf
[SUCCESS] Converted: SKNKS-10.sdf
[SUCCESS] Converted: SKNKS-11.sdf
[SUCCESS] Converted: SKNKS-12.sdf
[SUCCESS] Converted: SKNKS-13.sdf
[SUCCESS] Converted: SKNKS-14.sdf
[SUCCESS] Converted: SKNKS-15.sdf
[SUCCESS] Converted: SKNKS-16.sdf
[SUCCESS] Converted: SKNKS-17.sdf
[SUCCESS] Converted: SKNKS-18.sdf
[SUCCESS] Converted: SKNKS-19.sdf
[SUCCESS] Converted: SKNKS-2.sdf
[SUCCESS] Converted: SKNKS-20.sdf
[SUCCESS] Converted: SKNKS-3.sdf
[SUCCESS] Converted: SKNKS-4.sdf
[SUCCESS] Converted: SKNKS-5.sdf
[SUCCESS] Converted: SKNKS-6.sdf
[SUCCESS] Converted: SKNKS-7.sdf
[SUCCESS] Converted: SKNKS-8.sdf
[SUCCESS] Converted: SKNKS-9.sdf
