In [None]:
import bblean
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools


# Clustering configuration shared by the cells below.
THRESHOLD = 0.7            # passed to bblean.BitBirch(threshold=...)
BRANCHING = 128            # BitBirch branching_factor; adjust to memory
NBITS = 2048               # fingerprint length in bits (ECFP4 fingerprints)

In [None]:
# Read the filtered reference set and keep its SMILES column as a plain list.
df = pd.read_csv("reference_set_filtered.csv")
smiles_list = df["SMILES"].to_list()

In [None]:
# Encode every SMILES as an ECFP4 fingerprint (pack=True). The fingerprint
# length is taken from NBITS so the configuration cell is the single source of
# truth instead of a second hard-coded 2048 here.
fps = bblean.fps_from_smiles(
    smiles_list,
    kind="ecfp4",
    pack=True,
    n_features=NBITS,
)

# Build the (not yet fitted) BitBirch clusterer from the configured parameters.
bb = bblean.BitBirch(
    threshold=THRESHOLD,
    branching_factor=BRANCHING,
    merge_criterion="diameter",
)


In [None]:
# Cluster the fingerprints; the result is read back from `bb` in the next cell.
bb.fit(fps)

In [None]:
# One entry per cluster; the label-vector cell iterates each entry as the
# row indices (into `fps` / `smiles_list`) of that cluster's members.
clusters = bb.get_cluster_mol_ids()

In [None]:
import numpy as np
import pandas as pd

# Convert the per-cluster member lists into one cluster label per molecule.
# Start from -1 (not 0) so a molecule that is missing from every cluster can be
# detected instead of being silently reported as a member of cluster 0.
labels = np.full(len(fps), -1, dtype=np.int32)
for cid, members in enumerate(clusters):
    # dtype=intp keeps an empty member list usable as an index array.
    labels[np.asarray(members, dtype=np.intp)] = cid

unassigned = int((labels < 0).sum())
if unassigned:
    raise ValueError(
        f"{unassigned} of {len(labels)} molecules were not assigned to any cluster"
    )

# Persist SMILES + cluster label; the representative-selection cells re-read this file.
df = pd.DataFrame({
    "SMILES": smiles_list,
    "cluster": labels,
})

df.to_csv("reference_set_clusters.csv", index=False)


# Representative selection

In [None]:
# External compound libraries used to prioritise cluster representatives.
# CSV exports are read with pandas, SDF files with RDKit's PandasTools.
drugbank_smiles = pd.read_csv("drugbank_smiles.csv")
drugRepurposing_hub = pd.read_csv("drug_repurposing_hub.csv")
Antibacterial = PandasTools.LoadSDF("Antibacterial-Library-13880.sdf")
chebi = PandasTools.LoadSDF("chebi.sdf")



In [None]:
from rdkit import Chem
from rdkit.Chem.rdinchi import MolToInchiKey
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdinchi import MolToInchiKey
from rdkit.Chem.MolStandardize import rdMolStandardize
import pandas as pd

# --- RDKit standardizer instances ---
# Created once at module scope and reused by every call to
# standardize_and_get_inchikey() instead of being rebuilt per molecule.
lfc, normalizer, uncharger = (
    rdMolStandardize.LargestFragmentChooser(),
    rdMolStandardize.Normalizer(),
    rdMolStandardize.Uncharger(),
)

def standardize_and_get_inchikey(mol, strip_stereo=True):
    """
    Standardize a molecule and derive its identifiers.

    Pipeline:
      - Largest fragment (strips salts/solvents)
      - Cleanup, then largest fragment again: per the RDKit documentation
        Cleanup can split a molecule (e.g. by disconnecting covalently drawn
        metals), which would otherwise leave counter-ions in the result
      - Normalize & Uncharge
      - Optional stereochemistry stripping

    Args:
        mol: RDKit molecule, or None.
        strip_stereo: if True, stereochemistry is removed before the
            identifiers are generated and the SMILES is written non-isomeric.

    Returns:
        (mol_clean, inchikey, smiles).
        All three are None when `mol` is None or when any RDKit step raises.
        `inchikey` alone is None when RDKit hands back an empty key, so a
        failed key can never be mistaken for a real one in set lookups.
    """
    if mol is None:
        return None, None, None

    try:
        # 1. Largest fragment (remove salts/solvents)
        mol = lfc.choose(mol)

        # 2. Cleanup, then drop any fragments that Cleanup itself produced
        mol = rdMolStandardize.Cleanup(mol)
        mol = lfc.choose(mol)

        # 3. Normalize + uncharge
        mol = normalizer.normalize(mol)
        mol = uncharger.uncharge(mol)

        # 4. Sanitize structure
        Chem.SanitizeMol(mol)

        # 5. Optionally collapse stereochemistry (modifies `mol` in place)
        if strip_stereo:
            Chem.RemoveStereochemistry(mol)

        # 6. Generate InChIKey and canonical SMILES; '' is normalized to None
        ik = MolToInchiKey(mol) or None
        smiles = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=not strip_stereo)
    except Exception:
        # Deliberate best-effort: one unparseable structure must not abort the
        # processing of a whole library; callers count the None results.
        return None, None, None

    return mol, ik, smiles

def process_dataframe_chemistry(df, smiles_col, new_inchikey_col, strip_stereo=True,
                                canonical_smiles_col='Canonical_SMILES'):
    """
    Standardize the SMILES of a dataframe and attach InChIKey / canonical SMILES.

    Args:
        df: dataframe to annotate. It is modified IN PLACE (two columns are
            added or overwritten) and the same object is returned.
        smiles_col: name of the column holding the input SMILES.
        new_inchikey_col: name of the column that receives the InChIKey.
        strip_stereo: forwarded to standardize_and_get_inchikey().
        canonical_smiles_col: name of the column that receives the canonical
            SMILES (defaults to the historical 'Canonical_SMILES').

    Returns:
        `df` itself. Rows whose SMILES is missing, unparseable or fails
        standardization get None in both new columns.

    Works on empty dataframes (the new columns are simply empty) and no longer
    uses a temporary 'Mol_Temp' column, so an existing column of that name is
    left untouched.
    """
    print(f"Processing {smiles_col}...")

    inchikeys = []
    canonical_smiles = []
    for raw_smiles in df[smiles_col]:
        # Missing values are passed on as None rather than parsed as the text "nan".
        mol = Chem.MolFromSmiles(str(raw_smiles)) if pd.notna(raw_smiles) else None
        _, inchikey, smiles = standardize_and_get_inchikey(mol, strip_stereo=strip_stereo)
        inchikeys.append(inchikey)
        canonical_smiles.append(smiles)

    # Reusing df.index keeps the assignment positional; dtype=object keeps the
    # None markers (and a stable dtype when the frame is empty).
    df[new_inchikey_col] = pd.Series(inchikeys, index=df.index, dtype=object)
    df[canonical_smiles_col] = pd.Series(canonical_smiles, index=df.index, dtype=object)

    valid_count = df[new_inchikey_col].count()
    print(f"Standardized {valid_count}/{len(df)} molecules.")
    return df

# Standardize every external library and attach the shared
# 'INCHIKEY-Canonical' column. Each library keeps its SMILES under a
# differently spelled column name, hence the explicit (frame, column) pairs.
drugRepurposing_hub, drugbank_smiles, chebi, Antibacterial = [
    process_dataframe_chemistry(
        library,
        smiles_col=column,
        new_inchikey_col='INCHIKEY-Canonical',
    )
    for library, column in (
        (drugRepurposing_hub, 'smiles'),
        (drugbank_smiles, 'Smiles'),
        (chebi, 'SMILES'),
        (Antibacterial, 'Smile'),
    )
]


In [None]:
# Re-load the saved cluster assignments and attach standardized InChIKeys so
# the clustered molecules can be matched against the external libraries.
df_clusters = process_dataframe_chemistry(
    pd.read_csv("reference_set_clusters.csv"),
    smiles_col="SMILES",
    new_inchikey_col="INCHIKEY-Canonical",
)

In [None]:
# One set of non-null InChIKeys per library, for fast membership tests.
db_keys, dr_keys, ab_keys, cb_keys = [
    set(library['INCHIKEY-Canonical'].dropna())
    for library in (drugbank_smiles, drugRepurposing_hub, Antibacterial, chebi)
]

def assign_priority(ik):
    """Return the rank of the best library that contains `ik` (lower is better).

    1 = DrugBank, 2 = Drug Repurposing Hub, 3 = ChemDiv Antibiotics,
    4 = ChEBI, 5 = found in none of them (ChEMBL / default).
    """
    ranked_libraries = (
        (1, db_keys),
        (2, dr_keys),
        (3, ab_keys),
        (4, cb_keys),
    )
    # Libraries are checked best-first, so the first hit is the final rank.
    for rank, library_keys in ranked_libraries:
        if ik in library_keys:
            return rank
    return 5

# Rank every clustered molecule by the best library it appears in.
df_clusters['priority'] = df_clusters['INCHIKEY-Canonical'].map(assign_priority)

# --- THE SELECTION STEP ---
# Order rows by cluster ID, then by priority (1 beats 5), and keep only the
# first row of each cluster, i.e. its best-ranked member.
representatives = (
    df_clusters
    .sort_values(by=['cluster', 'priority'])
    .drop_duplicates(subset=['cluster'], keep='first')
)

# Save the final list
representatives.to_csv("cluster_representatives_final.csv", index=False)

print(f"Selected {len(representatives)} representatives for {df_clusters['cluster'].nunique()} clusters.")

In [None]:

# Alternative selection that ignores library priority: the first molecule
# encountered for each cluster label becomes that cluster's representative.
# NOTE(review): this rebinds `representatives`, which the previous cell defined
# as a DataFrame, to a dict of {cluster label: row index} — confirm intended.
representatives = {}
for position, cluster_id in enumerate(labels):
    # setdefault only stores the position the first time a label is seen.
    representatives.setdefault(cluster_id, position)

rep_indices = np.array(list(representatives.values()))
rep_smiles = [smiles_list[row] for row in rep_indices]

df_rep = pd.DataFrame({"SMILES": rep_smiles})
df_rep.to_csv("chembl_bitbirch_representatives_3.csv", index=False)
