In [1]:
import re, sys
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs, RDLogger
from rdkit.Chem import rdChemReactions, AllChem, Draw, PandasTools
RDLogger.DisableLog('rdApp.*')
import warnings
warnings.filterwarnings('ignore')

#### Utils

In [2]:
def ref_bonds(mol):
    """Find the two bonds that separate the lactam core from its substituents.

    The SMARTS below is matched against ``mol`` and only the first match is
    used. From that match, the first two atom indices form the bond before the
    core (A-B) and the last two atom indices form the bond after it (B-C).

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to search.

    Returns
    -------
    (bond_list, ref_labels)
        ``bond_list`` holds two tuples of atom indices and ``ref_labels`` the
        matching label pairs ``[(12, 21), (23, 32)]``. Both are empty lists
        when the pattern does not match.
    """
    # Core definition: exit vectors are defined on the lactam FG (B) to get A and C
    core_smarts = '[*]-!@[#6;R;x2]-@[#7;$([#7]@[#6;$([#6]=[O])])]-!@[*]'
    core_query = Chem.MolFromSmarts(core_smarts)
    matches = list(mol.GetSubstructMatches(core_query))

    # No lactam core found: nothing to cut.
    if not matches:
        return [], []

    first_match = matches[0]
    # bond 1 before core A-B, bond 2 after core B-C
    bond_list = [first_match[:2], first_match[-2:]]
    ref_labels = [(12, 21), (23, 32)]
    return bond_list, ref_labels

def format_bbs_df(bbs_df, mols_bbs_df, main_ppties):
    """Build the per-building-block summary table.

    Parameters
    ----------
    bbs_df : pandas.DataFrame
        Building-block table; only the ``bb_smi`` and ``bb_id`` columns are kept.
    mols_bbs_df : pandas.DataFrame
        Molecule table passed through to ``get_ppty_median_count_from_bb_id``.
    main_ppties : iterable of str
        Property column names to summarise.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``bb_smi``, ``bb_id``, ``bb_tag`` (first character of
        ``bb_id``) and, for every property, ``<ppty>_count``,
        ``<ppty>_median`` and ``<ppty>_score`` (the latter two rounded to two
        decimals). The caller's ``bbs_df`` is not modified.

    Notes
    -----
    The name of each property is printed once it has been processed.
    """
    # Explicit copy: the columns added below must go onto an independent frame,
    # not onto a slice of the caller's table (chained assignment).
    bbs_df = bbs_df[['bb_smi', 'bb_id']].copy()
    bbs_df['bb_tag'] = bbs_df['bb_id'].str[0].astype(str)

    for ppty in main_ppties:
        # Only bb_id is needed per row, so iterate that column instead of
        # materialising a full row Series with iterrows().
        for index, bb_id in bbs_df['bb_id'].items():
            median, count, score = get_ppty_median_count_from_bb_id(mols_bbs_df, bb_id, ppty)
            bbs_df.at[index, ppty+'_count'] = int(count)
            bbs_df.at[index, ppty+'_median'] = round(median, 2)
            bbs_df.at[index, ppty+'_score'] = round(score, 2)
        print(ppty)

    return bbs_df

def get_ppty_median_count_from_bb_id(df, bb_id, ppty):
    """Summarise one property over all molecules that contain a building block.

    Parameters
    ----------
    df : pandas.DataFrame
        Molecule table with ``A_id``, ``B_id``, ``C_id`` columns, optionally a
        ``D_id`` column, and the property column ``ppty``.
    bb_id : str
        Building-block identifier to look for in any of the id columns.
    ppty : str
        Name of the property column to summarise.

    Returns
    -------
    (median, count, score)
        ``median`` and ``count`` (non-null values) of ``ppty`` over the
        matching rows. ``score`` is ``median + cv`` for the properties listed
        in ``add_cv_ppties`` and ``median - cv`` otherwise, where ``cv`` is the
        coefficient of variation in percent (``std / mean * 100``). ``score``
        is NaN whenever ``cv`` is NaN (e.g. fewer than two values).

    Raises
    ------
    KeyError
        If ``A_id``, ``B_id``, ``C_id`` or ``ppty`` is missing from ``df``.
    """
    # currently handle 3 or 4 BBs: D_id is optional, so test for it explicitly
    # instead of catching every exception around the column lookup.
    id_cols = ['A_id', 'B_id', 'C_id']
    if 'D_id' in df.columns:
        id_cols.append('D_id')
    filt = df[id_cols].eq(bb_id).any(axis=1)

    values = df.loc[filt, ppty]
    median = values.median()
    count = values.count()
    cv = (values.std() / values.mean()) * 100

    # NOTE(review): this list differs from the "unfavorable gain" list used in
    # rank_bbs_ppties ('EC50_main', 'OX1_IP1_EC50', 'HLM') -- confirm which
    # property names are intended here.
    add_cv_ppties = ['IP1_EC50', 'HLM', 'MDR1_efflux', 'logD']
    # A NaN cv propagates through the arithmetic, so score is NaN in that case;
    # callers round the score, hence NaN (not None) is returned.
    if ppty in add_cv_ppties:
        score = median + cv
    else:
        score = median - cv

    return median, count, score

def rank_bbs_ppties(bbs_df, ppties_list):
    """Rank building blocks per position on the median of each property.

    Rows are grouped by position: ``bb_id`` containing ``'A'``, ``'B'`` or
    ``'C'``. Within each group a ``<ppty>_rank`` column is added for every
    property in ``ppties_list``, computed from ``<ppty>_median`` with
    ``method='min'``. Properties in ``unfavorable_gain_ppties`` are ranked
    ascending (smallest median gets rank 1); all others descending. Missing
    medians get rank 0.

    Parameters
    ----------
    bbs_df : pandas.DataFrame
        Building-block table with a ``bb_id`` column and one ``<ppty>_median``
        column per property.
    ppties_list : iterable of str
        Properties to rank.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ranked A, B and C groups, in that order. Rows
        whose ``bb_id`` matches none of the three tags are not included. The
        input frame is left untouched.
    """
    unfavorable_gain_ppties = ['EC50_main', 'OX1_IP1_EC50', 'HLM']

    ranked_groups = []
    for tag in ('A', 'B', 'C'):
        # Copy so the rank columns are written to an independent frame rather
        # than to a boolean-mask slice of the input.
        group = bbs_df[bbs_df['bb_id'].str.contains(tag)].copy()
        for ppty in ppties_list:
            ascending = ppty in unfavorable_gain_ppties
            group[ppty+'_rank'] = (
                group[ppty+'_median']
                .rank(method='min', ascending=ascending)
                .fillna(0)
                .astype(int)
            )
        ranked_groups.append(group)

    return pd.concat(ranked_groups)

#### Data Processing: fragment the Chemical Series into its constituent BBs
##### Load dataset

In [3]:
# Load the series from the SD file: molecule objects go to the 'ROMol' column
# and the SMILES written by PandasTools to 'mol_smi'.
df = PandasTools.LoadSDF(
    'DORA_Lactam_mols.sdf',
    smilesName='mol_smi',
    molColName='ROMol',
)

##### Assign BBs from rules

In [4]:
# Canonicalise the SMILES from the SD file and keep the compound IDs in the same order.
smi = [Chem.CanonSmiles(s) for s in df['mol_smi']]
idx = list(df['ID'])

# ps == settings for smi conversion to mol
ps = Chem.SmilesParserParams()
ps.removeHs = False
# NOTE(review): confirm that SmilesParserParams exposes `allHsExplicit` in the
# RDKit version in use.
ps.allHsExplicit = True
mol = [Chem.MolFromSmiles(s, ps) for s in smi]
molH = [Chem.RWMol(Chem.AddHs(m, explicitOnly=True)) for m in mol]
df['molH'] = molH

smis = []   # one [A, B, C] list of fragment SMILES per molecule
it = []     # not used in the visible cells; kept in case other cells rely on it
uncut = 0   # number of molecules that could not be split into three fragments
for m in molH:
    mbonds, ref_labels = ref_bonds(m)

    if len(mbonds) != 2:  # if rules does not work, returns empty bbs
        smis.append(['', '', ''])  # len(list)= 3 fragments coz two bonds/ exit v
        uncut += 1
        continue

    bidx = []
    labels = []
    for imb, mb in enumerate(mbonds):
        # Look the bond up directly instead of scanning every bond of the molecule.
        bond = m.GetBondBetweenAtoms(mb[0], mb[1])
        if bond is None:
            continue
        bidx.append(bond.GetIdx())
        # The label pair is reversed when the bond is stored in the same
        # direction as the matched atom pair, and used as-is otherwise.
        if mb[0] == bond.GetBeginAtomIdx():
            labels.append(ref_labels[imb][::-1])
        else:
            labels.append(ref_labels[imb])

    mol_f = Chem.FragmentOnBonds(m, tuple(bidx), dummyLabels=labels)  # fragment the mol based on defined bonds
    # Separate name so the `smi` list above is not overwritten by a string.
    frag_smi = Chem.MolToSmiles(mol_f)
    # Keep only the fragments that carry a dummy atom.
    # NOTE(review): fragments are assigned to A/B/C purely by their order in the
    # dot-separated SMILES; confirm this order always matches the position
    # implied by the dummy labels.
    rsmi = [s for s in frag_smi.split('.') if "*" in s]

    if len(rsmi) == 3:
        smis.append(rsmi)  # appends all frag_smi together
    else:
        smis.append(['', '', ''])
        uncut += 1

mol_df = pd.DataFrame(smis, columns=['A_smi', 'B_smi', 'C_smi'])
mol_df['ID'] = idx
mol_df['mol'] = molH

In [5]:
# here, a dataframe is created containing the bb smiles, their ids and counts
smi_cols = [c for c in mol_df.columns if "_smi" in c]

bb_frames = []
for c in smi_cols:
    # Molecules that could not be cut carry '' and are ignored.
    cut_smi = mol_df.loc[mol_df[c] != '', c]
    bb_smi = cut_smi.unique()  # unique should not change the order of the df
    # Ids are the position letter plus a 1-based running number, e.g. 'A1'.
    bb_ids = [c.replace("_smi", "") + str(i + 1) for i in range(len(bb_smi))]
    # One pass over the column instead of one full-frame scan per building block.
    counts = cut_smi.value_counts().reindex(bb_smi).tolist()
    bb_frames.append(pd.DataFrame({'bb_id': bb_ids, 'bb_smi': bb_smi, 'count': counts}))

# pd.concat replaces the private DataFrame._append; as before, each position
# keeps its own 0-based index (no ignore_index).
bb_df = pd.concat(bb_frames)

In [6]:
# Attach the building-block id of every position (A_id, B_id, C_id) to each molecule.
for c in smi_cols:
    tag = c[0]
    # Only building blocks of the same position are valid matches. The
    # previously unused `ismis` selection is now the right-hand table of the merge.
    ismis = bb_df[bb_df['bb_id'].str.contains(tag)]  # same position
    mol_df_t = mol_df.merge(
        ismis,
        how='left',
        # regex=False: "*" is meant literally, whatever the pandas default is.
        left_on=mol_df[c].str.replace("*", "", regex=False),
        right_on=ismis['bb_smi'].str.replace("*", "", regex=False),
    ).rename(columns={'bb_id': tag + '_id'})
    # Drop the helper columns brought in by the merge; only the id is kept.
    mol_df = mol_df_t.drop(columns=['count', 'key_0', 'bb_smi'])
# The old index is kept as an 'index' column.
bb_df = bb_df.reset_index()

##### Write mols_bbs

In [7]:
# Join the loaded data with the per-molecule building-block assignment on the compound ID.
mols_bbs_df = df.merge(mol_df, on='ID')

In [8]:
# Keep only the exported columns. The chained rename returns an independent
# frame, so the assignments below do not act on a slice of the merged table.
mols_bbs_df = mols_bbs_df[[
    'ID', 'mol_smi', 'EC50_main', 'A_id', 'B_id', 'C_id', 'OX1_IP1_EC50', 'HLM', 'CYP_testo',
    'ratio_hOX1R_hOX2R', 'mol_flag',
]].rename(columns={'ID': 'mol_id'})
# fix B22/68 stereo: this is done to distinguish the enantio pure BB from the racemic for later analysis
# NOTE(review): 'B22' is a running number assigned from the order of the input
# data; confirm it still points at the intended building block if the SD file changes.
mols_bbs_df.loc[(mols_bbs_df['B_id'] == 'B22') & (mols_bbs_df['mol_flag'] == 'this enantiomer R'), 'B_id'] = 'B68'
# One list [A_id, B_id, C_id] per molecule.
mols_bbs_df['bbs_id'] = mols_bbs_df[['A_id', 'B_id', 'C_id']].values.tolist()
mols_bbs_df['sub_series'] = ''
mols_bbs_df.to_json('DORA_Lactam_mols_bbs.json', orient='records')

##### Write bbs

In [9]:
# insert B68: this is done to distinguish the enantio pure BB from the racemic for later analysis
# NOTE(review): the index (323) and both counts (15 and 87) are hard-coded;
# confirm them against the current data set.
new_row = dict(
    index=323,
    bb_id='B68',
    bb_smi='[21*][C@]1([H])C[C@@]([H])(O)C(=O)N1[23*]',
    count=15,
)
bb_df = pd.concat([bb_df, pd.DataFrame([new_row])], ignore_index=True)
# Overwrite the count stored for B22.
bb_df.loc[bb_df['bb_id'].eq('B22'), 'count'] = 87

In [10]:
# Extract the letter part (e.g., 'A', 'B', etc.) and numeric part separately.
# Raw strings avoid the invalid '\d' escape; expand=False returns a Series
# rather than a one-column DataFrame.
bb_df['bb_letter'] = bb_df['bb_id'].str.extract(r'([A-Z]+)', expand=False)
bb_df['bb_number'] = bb_df['bb_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Sort by letter, then numerically by number; the helper columns are dropped afterwards.
bb_df = bb_df.sort_values(by=['bb_letter', 'bb_number'])
bb_df = bb_df.drop(columns=['bb_letter', 'bb_number'])

In [11]:
# Strict casts: these columns must already hold values convertible to float.
for col in ('EC50_main', 'OX1_IP1_EC50', 'ratio_hOX1R_hOX2R'):
    mols_bbs_df[col] = mols_bbs_df[col].astype(float)

# Lenient casts: entries that cannot be parsed become NaN.
for col in ('CYP_testo', 'HLM'):
    mols_bbs_df[col] = pd.to_numeric(mols_bbs_df[col], errors='coerce')

# Per-building-block count / median / score for every property.
bbs_df = format_bbs_df(
    bb_df,
    mols_bbs_df,
    ['EC50_main', 'CYP_testo', 'OX1_IP1_EC50', 'HLM', 'ratio_hOX1R_hOX2R'],
)

EC50_main
CYP_testo
OX1_IP1_EC50
HLM
ratio_hOX1R_hOX2R


In [12]:
# Add one <ppty>_rank column per property, ranked within each BB position.
bbs_df = rank_bbs_ppties(
    bbs_df,
    ['EC50_main', 'CYP_testo', 'OX1_IP1_EC50', 'HLM', 'ratio_hOX1R_hOX2R'],
)

In [13]:
# Tag every building block with its project and chemical series.
bbs_df = bbs_df.assign(project='DORA', series='Lactam')

In [14]:
# Export the building-block table, one JSON record per building block.
bbs_df.to_json(
    '../data/DORA_Lactam_bbs.json',
    orient='records',
)