In [103]:
import csv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
from decimal import Decimal
from collections import Counter
from IPython.display import display
import psycopg2
import os
import warnings

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import MolFromSmiles, RDKFingerprint, RemoveHs
from rdkit.Chem import DataStructs
from rdkit.Chem import inchi
from rdkit import RDLogger

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
from nltk import pos_tag

warnings.filterwarnings("ignore")
RDLogger.DisableLog('rdApp.*')

In [75]:
# Import Data 
fda_approved = pd.read_csv('FDA_Approved.csv', header = None, names = ['ID', 'Drug Name']) # 2331 rows
dc_compounds = pd.read_csv('DC_Compounds.csv') # 4099 rows
dc_drug_tar = pd.read_csv('Drug_Target.csv') # 19378 rows

# Merge files on 'ID' columns with 'left' join
merged_df = pd.merge(dc_compounds, fda_approved, on = 'ID', how = 'left') 

# Create column 'Approved' indicating whether the drug has FDA approval or not (0 or 1) by checking for ID in fda_approved
# Retain results without FDA approval
merged_df['Approved'] = merged_df['ID'].isin(fda_approved['ID']).replace({True: 1, False: 0})
merged_df.loc[merged_df['Approved'] != 1, 'Approved'] = 0

# Compare merged_df with Drug_Target CSV, create new resulting df 
drug_central = pd.merge(dc_drug_tar, merged_df, left_on = 'STRUCT_ID', right_on = 'ID', how = 'left')
drug_central = drug_central.drop(columns = ['Drug Name', 'INN'])

# Count the occurrences of '1' in the 'Approved' column
counts = merged_df['Approved'].value_counts().get(1, 0)

# Display and export
display(drug_central)
drug_central.to_csv('drug_central.csv', index = False)
print(f"'1' appears in the 'Approved' column: {counts}") # FDA approved

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,SMILES,InChI,InChIKey,ID,CAS_RN,Approved
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.890,,IC50,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.790,,IC50,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19373,samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,OPRD_HUMAN,8.590,,Ki,...,,PARTIAL AGONIST,Tclin,Homo sapiens,C1CC1CN2CCC34CC(=O)CCC3(C2CC5=C4C(=C(C=C5)C(=O...,InChI=1S/C21H26N2O4/c22-19(26)15-4-3-13-9-16-2...,RYIDHLJADOKWFM-UHFFFAOYSA-N,5460.0,852626-89-2,1.0
19374,sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,7.030,,IC50,...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,Tchem,Homo sapiens,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5461.0,2296729-00-3,1.0
19375,ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,O13428_CANAX,8.350,,IC50,...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida albicans,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5462.0,1207753-03-4,1.0
19376,ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,Q6FTN8_CANGA,7.830,,IC50,...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida glabrata,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5462.0,1207753-03-4,1.0


'1' appears in the 'Approved' column: 1859


## SGC Donated Chemical Probes

In [76]:
sgc_compounds = pd.read_csv('SGC_Compounds.csv', skiprows = 1) #123 rows x 24 columns
sgc_compounds = sgc_compounds.dropna(subset=['SMILES (unique cis trans)']) # none to drop
sgc_compounds

Unnamed: 0,Compound name,Alternative names,Molecular formula,SMILES (unique cis trans),Molecular weight [g/mol],DOI,ChEMBL ID,PubChem CID,CAS,Mode of action,...,References PMID,Patents,Compound name.1,Alternative names.1,Molecular formula.1,SMILES (unique cis trans).1,Molecular weight [g/mol].1,ChEMBL ID.1,PubChem CID.1,CAS.1
0,(R)-9s,TP-009,C15H13ClN4O,C[C@H](c1cccc(C#N)c1)N1C=C(C=C(C1=N)C(N)=O)[Cl],300.1,10.6019/CHEMBL4507301,CHEMBL3799292,44470113.0,1191908-24-3,Antagonist,...,PMID: 26954848,,(S)-9s,,C15H13ClN4O,C[C@@H](c1cccc(C#N)c1)N1C=C(C=C(C1=N)C(N)=O)[Cl],300.10,CHEMBL3797887,127046693.0,1884353-55-2
1,(R)-ZINC-3573,,C18H21N5,CN(C)[C@@H]1CCN(C1)c1cc(c2ccccc2)nc2ccnn12,307.2,10.6019/CHEMBL4507322,CHEMBL4520293,95882507.0,2089389-15-9,Agonist,...,PMID: 28288109,,(S)-ZINC-3573,,C18H21N5,CN(C)[C@H]1CCN(C1)c1cc(c2ccccc2)nc2ccnn12,307.20,CHEMBL4534980,95882508.0,2095596-11-3
2,8RK64,,C14H16N8O2S,C1CN(Cc2c1nc(NC([C@H]1CCN(C1)C#N)=O)s2)C(CN=[N...,360.11,10.6019/CHEMBL4800724,CHEMBL4802050,162677482.0,2705841-52-5,Covalent inhibitor,...,PMID: 32886496,,JYQ88,,C14H16N8O2S,C1CN(Cc2c1nc(NC([C@@H]1CCN(C1)C#N)=O)s2)C(CN=[...,360.11,CHEMBL4802053,162677485.0,2987636-71-3
3,A-079,A-967079,C12H14FNO,CC\C(C(\C)=C\c1ccc(cc1)F)=N/O,207.25,10.6019/CHEMBL4507256,CHEMBL3697701,60150207.0,1170613-55-4,Antagonist,...,"PMID: 22319196, PMID: 21402443",,A-226,A-1115226,C13H16FNO,CC\C(C(\C)=C\c1ccc(c(C)c1)F)=N/O,221.10,CHEMBL3701197,87522699.0,1170614-31-9
4,A-1155463,,C35H32FN5O4S2,CN(C)CC#Cc1ccc(c(c1)F)OCCCc1c(C(O)=O)nc(N2CCc3...,669.2,10.6019/CHEMBL4507311,CHEMBL3342332,59447577.0,1235034-55-5,Inhibitor,...,"PMID: 25313317, PMID: 25787766, PMID: 28946654",,A-1107969,,C34H26N8O4S2,C1CN(Cc2c1cccc2C(Nc1nc2ccccc2s1)=O)c1nc(C(O)=O...,674.20,CHEMBL3342191,46836243.0,1235033-39-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,TP-060,"T-036, Glucosylceramide synthase-IN-1",C24H20F4N2O3,CC(C)(c1ccc(cc1)N1Cc2c(ccnc2c2ccc(cc2OCC(F)(F)...,460.14,,CHEMBL5078078,155595464.0,2601393-20-6,Allosteric inhibitor (Noncompetitive against U...,...,"PMID: 35188773 (T-036), PMID: 34398463 (T-036...",,TP-060n,,C20H16N2O2,COc1cccc(c1)c1c2CN(C(c2ccn1)=O)c1ccccc1,316.12,CHEMBL5085407,166634512.0,2766971-12-2
119,UCSF924,9-6-24,C20H22N2O2,Cc1ccc2c(c1)C(C=C(CNCCCOc1ccccc1)N2)=O,322.17,10.6019/CHEMBL4507309,CHEMBL3480577,72901200.0,1434515-70-4,Agonist,...,PMID: 29051383 (Compound 9-6-24),,UCSF924NC,,C19H21N3O,Cc1ccc2c(c1)C(C=C(CNCCCc1ccncc1)N2)=O,307.17,CHEMBL4579981,72898300.0,1434696-12-4
120,VZ185,,C53H67FN8O8S,Cc1c(c2ccc(CNC([C@@H]3C[C@H](CN3C([C@H](C(C)(C...,994.48,,CHEMBL5182441,138454768.0,2306193-61-1,Degrader (PROTAC),...,PMID: 30540463 (compound 51),,cisVZ185,,C53H67FN8O8S,Cc1c(c2ccc(CNC([C@H]3C[C@H](CN3C([C@@H](C(C)(C...,994.48,,,2306193-98-4
121,WEB2086,"Apafant, WEB 2086BS",C22H22ClN5O2S,Cc1nnc2CN=C(c3ccccc3[Cl])c3cc(CCC(N4CCOCC4)=O)...,455.12,,CHEMBL280164,65889.0,105219-56-5,Inhibitor,...,"PMID: 3598913, PMID: 3342883, PMID: 15286429",,WEB2387,(R)-bepafant,C23H22ClN5O2S,Cc1nnc2CN=C(c3ccccc3[Cl])c3c4C[C@H](Cc4sc3n12)...,467.12,,14071229.0,


## Chemical Probes Portal

In [77]:
chem_probes = pd.read_csv('ChemicalProbesPortal.csv') 
chem_probes

Unnamed: 0,Probe name,Rating in cell,Rating in organism,Number of Ratings,Unsuitable,URL,Target name,Target class,Target subclass,Published on,...,Reference (PMID),Mechanism of action,Potency value,Potency assay,Potency value in cells,Potency assay (cells),Potency in cells,Organism,Dose,Control compound
0,Please cite the Chemical Probes Portal,,,,,,,,,,...,,,,,,,,,,
1,SGC0946,3.8,0.0,4.0,No,https://www.chemicalprobes.org/sgc0946,DOT1L,Epigenetic,Protein methyltransferase,2015-10-02,...,"www.doi.org/10.1038/ncomms2304, http://www.ncb...",Inhibitor,0.06 nM,SPR; IC50 = 0.3 nM in enzymatic assay,8.8 nM,in-cell western assay assessing methylation of...,IC50,,,SGC0649
2,EPZ020411,2.7,2.7,3.0,No,https://www.chemicalprobes.org/epz020411,PRMT6,Epigenetic,Protein methyltransferase,2015-11-05,...,"www.doi.org/10.1021/acsmedchemlett.5b00071, ht...",Inhibitor,0.010 uM,Inhibition of 3H-SAM-dependent labeling of pep...,0.0637 uM,H3R2 methylation in PRMT6-overexpressing A375 ...,IC50,Other,5 mg/mL,
3,I-BRD9,3.0,0.0,3.0,No,https://www.chemicalprobes.org/i-brd9,BRD9,Epigenetic,Bromodomain,2016-07-29,...,http://pubs.acs.org/doi/abs/10.1021/acs.jmedch...,Antagonist,BRD9 7.3,"TR-FRET, and BROMOscan profiling with N-methyl...",,"In HUT-78 cells, chemoproteomic competitive bi...",,,,Not available
4,CFI-400945,1.7,1.7,3.0,No,https://www.chemicalprobes.org/cfi-400945,PLK4,Protein kinase,Other,2015-12-10,...,http://www.sciencedirect.com/science/article/p...,ATP-competitive inhibitor,0.26 nM,In vitro kinase assay IC50 = 2.8 nM,12.3 nM,Inhibition of PLK4 autophosphorylation in PLK4...,EC50,Other,2.4-20 mg/kg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,YKL-5-124,3.0,0.0,2.0,No,https://www.chemicalprobes.org/ykl-5-124,CDK7,Kinase,CMGC,2023-12-21,...,,Covalent Inhibitor,"9.7 nM, 53.3 nM, 2.2 nM","Invitrogen, biochemical assay, Geyer, P32 bioc...",<62.5 nM,Inhibition of RNA Pol II phosphorylation,IC50,,,"N-[(1S)-2-(dimethylamino)-1-phenyl-ethyl]-6,6-..."
990,AZD0156,4.0,3.0,1.0,No,https://www.chemicalprobes.org/azd0156,ATM,Kinase,PI3/PI4,2024-01-18,...,,Inhibitor,0.04 nM,Enzymatic assay (HTRF),0.57 nM,Detection of pATM via western blot,IC50,Mouse,5 mg/Kg,
991,DKY709,0.0,0.0,0.0,No,https://www.chemicalprobes.org/dky709,IKZF2,Transcription factor,Zinc Finger,2024-01-26,...,,Degrader (PROTAC),"130 nM, 190 nM","Biochemical CBRN binding assay, SPR of the tri...","4 nM, 73 nM, 11 nM","Cellular degradation assay Dmax 53%, CRBN cell...","DC50, IC50","Mouse, Monkey (Cynomolgus)","2 mg/Kg, 3 mg/Kg, 0.3 mg/Kg, 1.0 mg/Kg",
992,BAY-8400,2.0,1.0,1.0,No,https://www.chemicalprobes.org/bay-8400,PRKDC,Kinase,PI3/PI4,2024-01-18,...,,Inhibitor,81 nM,Biochemical DNA-PK Activity Assay (TR-FRET),69 nM,Phospho-H2AX Assay to Determine Inhibition of ...,IC50,"Rat, Mouse","0.3 mg/kg IV, 0.6 mg/kg PO, 0.3 mg/kg",


In [78]:
def smiles_to_inchi(smiles):
    if isinstance(smiles, float):
        return None

    mol = Chem.MolFromSmiles(str(smiles))
    
    if mol is not None:
        inchi_string = inchi.MolToInchi(mol)
        return inchi_string
    else:
        return None

In [79]:
# Create 'InChI' column using the 'SMILES' column
drug_central['InChI (cleaned)'] = drug_central['SMILES'].apply(smiles_to_inchi)

# Display
display(drug_central)











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACTION_TYPE,TDL,ORGANISM,SMILES,InChI,InChIKey,ID,CAS_RN,Approved,InChI (cleaned)
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.890,,IC50,...,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.790,,IC50,...,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,BLOCKER,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,...,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,...,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4.0,27262-47-1,1.0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19373,samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,OPRD_HUMAN,8.590,,Ki,...,PARTIAL AGONIST,Tclin,Homo sapiens,C1CC1CN2CCC34CC(=O)CCC3(C2CC5=C4C(=C(C=C5)C(=O...,InChI=1S/C21H26N2O4/c22-19(26)15-4-3-13-9-16-2...,RYIDHLJADOKWFM-UHFFFAOYSA-N,5460.0,852626-89-2,1.0,InChI=1S/C21H26N2O4/c22-19(26)15-4-3-13-9-16-2...
19374,sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,7.030,,IC50,...,INHIBITOR,Tchem,Homo sapiens,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5461.0,2296729-00-3,1.0,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...
19375,ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,O13428_CANAX,8.350,,IC50,...,INHIBITOR,,Candida albicans,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5462.0,1207753-03-4,1.0,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...
19376,ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,Q6FTN8_CANGA,7.830,,IC50,...,INHIBITOR,,Candida glabrata,C[C@H](C(C)C)[C@]1(CC[C@@]2([C@H]3CC[C@H]4[C@]...,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,BODYFEUFKHPRCK-ZCZMVWJSSA-N,5462.0,1207753-03-4,1.0,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...


In [80]:
# Create 'InChI' column using the 'SMILES (unique cis trans)' column
sgc_compounds['InChI (cleaned)'] = sgc_compounds['SMILES (unique cis trans)'].apply(smiles_to_inchi)

# Display
display(sgc_compounds)







[18:12:41] SMILES Parse Error: extra open parentheses for input: 'C=CC(Nc1ccc2c(c1)N(C1CCN(CC1)Cc1ccc(cc1)c1c(cc3C(NC=Cc3n1)=O)'



[18:12:41] SMILES Parse Error: syntax error while parsing: Cc1c(c2ccc(CNC([C@@H]3C[C@H](CN3C([C@H](C(C)(C)C)NC(C3(CC3)F)=O)=O)O)=O)c(c2)OCCCCCN2CCN(CC2)Cc2c(cc(cc2OC)C2=
[18:12:41] SMILES Parse Error: Failed parsing SMILES 'Cc1c(c2ccc(CNC([C@@H]3C[C@H](CN3C([C@H](C(C)(C)C)NC(C3(CC3)F)=O)=O)O)=O)c(c2)OCCCCCN2CCN(CC2)Cc2c(cc(cc2OC)C2=' for input: 'Cc1c(c2ccc(CNC([C@@H]3C[C@H](CN3C([C@H](C(C)(C)C)NC(C3(CC3)F)=O)=O)O)=O)c(c2)OCCCCCN2CCN(CC2)Cc2c(cc(cc2OC)C2='


Unnamed: 0,Compound name,Alternative names,Molecular formula,SMILES (unique cis trans),Molecular weight [g/mol],DOI,ChEMBL ID,PubChem CID,CAS,Mode of action,...,Patents,Compound name.1,Alternative names.1,Molecular formula.1,SMILES (unique cis trans).1,Molecular weight [g/mol].1,ChEMBL ID.1,PubChem CID.1,CAS.1,InChI (cleaned)
0,(R)-9s,TP-009,C15H13ClN4O,C[C@H](c1cccc(C#N)c1)N1C=C(C=C(C1=N)C(N)=O)[Cl],300.1,10.6019/CHEMBL4507301,CHEMBL3799292,44470113.0,1191908-24-3,Antagonist,...,,(S)-9s,,C15H13ClN4O,C[C@@H](c1cccc(C#N)c1)N1C=C(C=C(C1=N)C(N)=O)[Cl],300.10,CHEMBL3797887,127046693.0,1884353-55-2,InChI=1S/C15H13ClN4O/c1-9(11-4-2-3-10(5-11)7-1...
1,(R)-ZINC-3573,,C18H21N5,CN(C)[C@@H]1CCN(C1)c1cc(c2ccccc2)nc2ccnn12,307.2,10.6019/CHEMBL4507322,CHEMBL4520293,95882507.0,2089389-15-9,Agonist,...,,(S)-ZINC-3573,,C18H21N5,CN(C)[C@H]1CCN(C1)c1cc(c2ccccc2)nc2ccnn12,307.20,CHEMBL4534980,95882508.0,2095596-11-3,InChI=1S/C18H21N5/c1-21(2)15-9-11-22(13-15)18-...
2,8RK64,,C14H16N8O2S,C1CN(Cc2c1nc(NC([C@H]1CCN(C1)C#N)=O)s2)C(CN=[N...,360.11,10.6019/CHEMBL4800724,CHEMBL4802050,162677482.0,2705841-52-5,Covalent inhibitor,...,,JYQ88,,C14H16N8O2S,C1CN(Cc2c1nc(NC([C@@H]1CCN(C1)C#N)=O)s2)C(CN=[...,360.11,CHEMBL4802053,162677485.0,2987636-71-3,InChI=1S/C14H16N8O2S/c15-8-21-3-1-9(6-21)13(24...
3,A-079,A-967079,C12H14FNO,CC\C(C(\C)=C\c1ccc(cc1)F)=N/O,207.25,10.6019/CHEMBL4507256,CHEMBL3697701,60150207.0,1170613-55-4,Antagonist,...,,A-226,A-1115226,C13H16FNO,CC\C(C(\C)=C\c1ccc(c(C)c1)F)=N/O,221.10,CHEMBL3701197,87522699.0,1170614-31-9,InChI=1S/C12H14FNO/c1-3-12(14-15)9(2)8-10-4-6-...
4,A-1155463,,C35H32FN5O4S2,CN(C)CC#Cc1ccc(c(c1)F)OCCCc1c(C(O)=O)nc(N2CCc3...,669.2,10.6019/CHEMBL4507311,CHEMBL3342332,59447577.0,1235034-55-5,Inhibitor,...,,A-1107969,,C34H26N8O4S2,C1CN(Cc2c1cccc2C(Nc1nc2ccccc2s1)=O)c1nc(C(O)=O...,674.20,CHEMBL3342191,46836243.0,1235033-39-2,InChI=1S/C35H32FN5O4S2/c1-40(2)17-6-8-22-14-15...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,TP-060,"T-036, Glucosylceramide synthase-IN-1",C24H20F4N2O3,CC(C)(c1ccc(cc1)N1Cc2c(ccnc2c2ccc(cc2OCC(F)(F)...,460.14,,CHEMBL5078078,155595464.0,2601393-20-6,Allosteric inhibitor (Noncompetitive against U...,...,,TP-060n,,C20H16N2O2,COc1cccc(c1)c1c2CN(C(c2ccn1)=O)c1ccccc1,316.12,CHEMBL5085407,166634512.0,2766971-12-2,"InChI=1S/C24H20F4N2O3/c1-23(2,32)14-3-6-16(7-4..."
119,UCSF924,9-6-24,C20H22N2O2,Cc1ccc2c(c1)C(C=C(CNCCCOc1ccccc1)N2)=O,322.17,10.6019/CHEMBL4507309,CHEMBL3480577,72901200.0,1434515-70-4,Agonist,...,,UCSF924NC,,C19H21N3O,Cc1ccc2c(c1)C(C=C(CNCCCc1ccncc1)N2)=O,307.17,CHEMBL4579981,72898300.0,1434696-12-4,InChI=1S/C20H22N2O2/c1-15-8-9-19-18(12-15)20(2...
120,VZ185,,C53H67FN8O8S,Cc1c(c2ccc(CNC([C@@H]3C[C@H](CN3C([C@H](C(C)(C...,994.48,,CHEMBL5182441,138454768.0,2306193-61-1,Degrader (PROTAC),...,,cisVZ185,,C53H67FN8O8S,Cc1c(c2ccc(CNC([C@H]3C[C@H](CN3C([C@@H](C(C)(C...,994.48,,,2306193-98-4,
121,WEB2086,"Apafant, WEB 2086BS",C22H22ClN5O2S,Cc1nnc2CN=C(c3ccccc3[Cl])c3cc(CCC(N4CCOCC4)=O)...,455.12,,CHEMBL280164,65889.0,105219-56-5,Inhibitor,...,,WEB2387,(R)-bepafant,C23H22ClN5O2S,Cc1nnc2CN=C(c3ccccc3[Cl])c3c4C[C@H](Cc4sc3n12)...,467.12,,14071229.0,,InChI=1S/C22H22ClN5O2S/c1-14-25-26-19-13-24-21...


In [81]:
chem_probes['InChI (cleaned)'] = chem_probes['SMILES'].apply(smiles_to_inchi)

# Display
display(chem_probes)
chem_probes.to_csv('chem_probes.csv')











































































Unnamed: 0,Probe name,Rating in cell,Rating in organism,Number of Ratings,Unsuitable,URL,Target name,Target class,Target subclass,Published on,...,Mechanism of action,Potency value,Potency assay,Potency value in cells,Potency assay (cells),Potency in cells,Organism,Dose,Control compound,InChI (cleaned)
0,Please cite the Chemical Probes Portal,,,,,,,,,,...,,,,,,,,,,
1,SGC0946,3.8,0.0,4.0,No,https://www.chemicalprobes.org/sgc0946,DOT1L,Epigenetic,Protein methyltransferase,2015-10-02,...,Inhibitor,0.06 nM,SPR; IC50 = 0.3 nM in enzymatic assay,8.8 nM,in-cell western assay assessing methylation of...,IC50,,,SGC0649,InChI=1S/C28H40BrN7O4/c1-16(2)35(12-6-11-31-27...
2,EPZ020411,2.7,2.7,3.0,No,https://www.chemicalprobes.org/epz020411,PRMT6,Epigenetic,Protein methyltransferase,2015-11-05,...,Inhibitor,0.010 uM,Inhibition of 3H-SAM-dependent labeling of pep...,0.0637 uM,H3R2 methylation in PRMT6-overexpressing A375 ...,IC50,Other,5 mg/mL,,InChI=1S/C25H38N4O3/c1-26-10-11-29(2)18-21-17-...
3,I-BRD9,3.0,0.0,3.0,No,https://www.chemicalprobes.org/i-brd9,BRD9,Epigenetic,Bromodomain,2016-07-29,...,Antagonist,BRD9 7.3,"TR-FRET, and BROMOscan profiling with N-methyl...",,"In HUT-78 cells, chemoproteomic competitive bi...",,,,Not available,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...
4,CFI-400945,1.7,1.7,3.0,No,https://www.chemicalprobes.org/cfi-400945,PLK4,Protein kinase,Other,2015-12-10,...,ATP-competitive inhibitor,0.26 nM,In vitro kinase assay IC50 = 2.8 nM,12.3 nM,Inhibition of PLK4 autophosphorylation in PLK4...,EC50,Other,2.4-20 mg/kg,,InChI=1S/C33H34N4O3/c1-20-17-37(18-21(2)40-20)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,YKL-5-124,3.0,0.0,2.0,No,https://www.chemicalprobes.org/ykl-5-124,CDK7,Kinase,CMGC,2023-12-21,...,Covalent Inhibitor,"9.7 nM, 53.3 nM, 2.2 nM","Invitrogen, biochemical assay, Geyer, P32 bioc...",<62.5 nM,Inhibition of RNA Pol II phosphorylation,IC50,,,"N-[(1S)-2-(dimethylamino)-1-phenyl-ethyl]-6,6-...",InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...
990,AZD0156,4.0,3.0,1.0,No,https://www.chemicalprobes.org/azd0156,ATM,Kinase,PI3/PI4,2024-01-18,...,Inhibitor,0.04 nM,Enzymatic assay (HTRF),0.57 nM,Detection of pATM via western blot,IC50,Mouse,5 mg/Kg,,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...
991,DKY709,0.0,0.0,0.0,No,https://www.chemicalprobes.org/dky709,IKZF2,Transcription factor,Zinc Finger,2024-01-26,...,Degrader (PROTAC),"130 nM, 190 nM","Biochemical CBRN binding assay, SPR of the tri...","4 nM, 73 nM, 11 nM","Cellular degradation assay Dmax 53%, CRBN cell...","DC50, IC50","Mouse, Monkey (Cynomolgus)","2 mg/Kg, 3 mg/Kg, 0.3 mg/Kg, 1.0 mg/Kg",,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...
992,BAY-8400,2.0,1.0,1.0,No,https://www.chemicalprobes.org/bay-8400,PRKDC,Kinase,PI3/PI4,2024-01-18,...,Inhibitor,81 nM,Biochemical DNA-PK Activity Assay (TR-FRET),69 nM,Phospho-H2AX Assay to Determine Inhibition of ...,IC50,"Rat, Mouse","0.3 mg/kg IV, 0.6 mg/kg PO, 0.3 mg/kg",,InChI=1S/C21H17F2N5O/c22-20(23)16-8-15(10-24-1...


In [82]:
drug_central_inchi = drug_central[['InChI (cleaned)', 'DRUG_NAME']]
sgc_inchi = sgc_compounds[['InChI (cleaned)', 'Compound name']]
chem_probes_inchi = chem_probes[['InChI (cleaned)', 'Probe name']]

display(drug_central_inchi)
display(sgc_inchi)
display(chem_probes_inchi)

Unnamed: 0,InChI (cleaned),DRUG_NAME
0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
1,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
2,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
3,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
4,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
...,...,...
19373,InChI=1S/C21H26N2O4/c22-19(26)15-4-3-13-9-16-2...,samidorphan
19374,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,sotorasib
19375,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,ibrexafungerp
19376,InChI=1S/C44H67N5O4/c1-27(2)28(3)39(7)18-19-41...,ibrexafungerp


Unnamed: 0,InChI (cleaned),Compound name
0,InChI=1S/C15H13ClN4O/c1-9(11-4-2-3-10(5-11)7-1...,(R)-9s
1,InChI=1S/C18H21N5/c1-21(2)15-9-11-22(13-15)18-...,(R)-ZINC-3573
2,InChI=1S/C14H16N8O2S/c15-8-21-3-1-9(6-21)13(24...,8RK64
3,InChI=1S/C12H14FNO/c1-3-12(14-15)9(2)8-10-4-6-...,A-079
4,InChI=1S/C35H32FN5O4S2/c1-40(2)17-6-8-22-14-15...,A-1155463
...,...,...
118,"InChI=1S/C24H20F4N2O3/c1-23(2,32)14-3-6-16(7-4...",TP-060
119,InChI=1S/C20H22N2O2/c1-15-8-9-19-18(12-15)20(2...,UCSF924
120,,VZ185
121,InChI=1S/C22H22ClN5O2S/c1-14-25-26-19-13-24-21...,WEB2086


Unnamed: 0,InChI (cleaned),Probe name
0,,Please cite the Chemical Probes Portal
1,InChI=1S/C28H40BrN7O4/c1-16(2)35(12-6-11-31-27...,SGC0946
2,InChI=1S/C25H38N4O3/c1-26-10-11-29(2)18-21-17-...,EPZ020411
3,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,I-BRD9
4,InChI=1S/C33H34N4O3/c1-20-17-37(18-21(2)40-20)...,CFI-400945
...,...,...
989,InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...,YKL-5-124
990,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...,AZD0156
991,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...,DKY709
992,InChI=1S/C21H17F2N5O/c22-20(23)16-8-15(10-24-1...,BAY-8400


## Concatination to determine compound overlap

In [83]:
df = pd.concat([drug_central_inchi, sgc_inchi, chem_probes_inchi], ignore_index=True)

# Rename the columns
df.rename(columns={'DRUG_NAME': 'Drug Central', 'Compound name': 'SGC', 'Probe name': 'Chemical Probes'}, inplace=True)
df

Unnamed: 0,InChI (cleaned),Drug Central,SGC,Chemical Probes
0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
1,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
2,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
3,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
4,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
...,...,...,...,...
20490,InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...,,,YKL-5-124
20491,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...,,,AZD0156
20492,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...,,,DKY709
20493,InChI=1S/C21H17F2N5O/c22-20(23)16-8-15(10-24-1...,,,BAY-8400


In [84]:
melted_df = pd.melt(df, id_vars=['InChI (cleaned)'], var_name='Source', value_name='Compound')
melted_df

Unnamed: 0,InChI (cleaned),Source,Compound
0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,Drug Central,levobupivacaine
1,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,Drug Central,levobupivacaine
2,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,Drug Central,levobupivacaine
3,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,Drug Central,levobupivacaine
4,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,Drug Central,levobupivacaine
...,...,...,...
61480,InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...,Chemical Probes,YKL-5-124
61481,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...,Chemical Probes,AZD0156
61482,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...,Chemical Probes,DKY709
61483,InChI=1S/C21H17F2N5O/c22-20(23)16-8-15(10-24-1...,Chemical Probes,BAY-8400


In [85]:
# Groupby and aggregate
first_result = melted_df.groupby('InChI (cleaned)', as_index=False).agg({'Source': ', '.join, 'Compound': 'first'}) #3271 rows
# combines source values within each group (same inchi)
first_result

Unnamed: 0,InChI (cleaned),Source,Compound
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...,"Drug Central, SGC, Chemical Probes",Tris-DBA palladium
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3,"Drug Central, Drug Central, SGC, SGC, Chemical...",chromic chloride
2,InChI=1S/As2O3/c3-1-5-2-4,"Drug Central, Drug Central, SGC, SGC, Chemical...",arsenic trioxide
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...,"Drug Central, Drug Central, Drug Central, SGC,...",ziconotide
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...,"Drug Central, Drug Central, Drug Central, Drug...",cenobamate
...,...,...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)","Drug Central, Drug Central, Drug Central, Drug...",hydroxycarbamide
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)","Drug Central, SGC, Chemical Probes",guanidine
3268,InChI=1S/ClH/h1H,"Drug Central, SGC, Chemical Probes",hydrochloric acid
3269,InChI=1S/NO/c1-2,"Drug Central, SGC, Chemical Probes",nitric oxide


In [86]:
# List result
list_result = melted_df.groupby('InChI (cleaned)', as_index=False).agg({'Source': ', '.join, 'Compound': list}) # list is a more optimal structure
list_result

Unnamed: 0,InChI (cleaned),Source,Compound
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...,"Drug Central, SGC, Chemical Probes","[nan, nan, Tris-DBA palladium]"
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3,"Drug Central, Drug Central, SGC, SGC, Chemical...","[chromic chloride, chromic chloride, nan, nan,..."
2,InChI=1S/As2O3/c3-1-5-2-4,"Drug Central, Drug Central, SGC, SGC, Chemical...","[arsenic trioxide, arsenic trioxide, nan, nan,..."
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...,"Drug Central, Drug Central, Drug Central, SGC,...","[ziconotide, ziconotide, ziconotide, nan, nan,..."
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...,"Drug Central, Drug Central, Drug Central, Drug...","[cenobamate, cenobamate, cenobamate, cenobamat..."
...,...,...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)","Drug Central, Drug Central, Drug Central, Drug...","[hydroxycarbamide, hydroxycarbamide, hydroxyca..."
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)","Drug Central, SGC, Chemical Probes","[guanidine, nan, nan]"
3268,InChI=1S/ClH/h1H,"Drug Central, SGC, Chemical Probes","[hydrochloric acid, nan, nan]"
3269,InChI=1S/NO/c1-2,"Drug Central, SGC, Chemical Probes","[nitric oxide, nan, nan]"


In [87]:
def remove_nan_sources(row):
    sources = row['Source'].split(', ')
    compounds = row['Compound']
    
    updated_sources = [source for source, compound in zip(sources, compounds) if not pd.isna(compound)]
    
    return ', '.join(updated_sources)

# Apply to update 'Source' column
list_result['Source'] = list_result.apply(remove_nan_sources, axis=1)

# Drop NaN values from the 'Compound' column
list_result['Compound'] = list_result['Compound'].apply(lambda x: [comp for comp in x if not pd.isna(comp)])

# Display
display(list_result)

#3,271

Unnamed: 0,InChI (cleaned),Source,Compound
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...,Chemical Probes,[Tris-DBA palladium]
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3,"Drug Central, Drug Central","[chromic chloride, chromic chloride]"
2,InChI=1S/As2O3/c3-1-5-2-4,"Drug Central, Drug Central","[arsenic trioxide, arsenic trioxide]"
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...,"Drug Central, Drug Central, Drug Central","[ziconotide, ziconotide, ziconotide]"
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...,"Drug Central, Drug Central, Drug Central, Drug...","[cenobamate, cenobamate, cenobamate, cenobamate]"
...,...,...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)","Drug Central, Drug Central, Drug Central, Drug...","[hydroxycarbamide, hydroxycarbamide, hydroxyca..."
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)",Drug Central,[guanidine]
3268,InChI=1S/ClH/h1H,Drug Central,[hydrochloric acid]
3269,InChI=1S/NO/c1-2,Drug Central,[nitric oxide]


In [88]:
# Create copies to avoid SettingWithCopyWarning
drug_central_inchi_copy = drug_central_inchi.copy()
sgc_inchi_copy = sgc_inchi.copy()
chem_probes_inchi_copy = chem_probes_inchi.copy()

# Remove duplicates based on 'InChI' column for drug_central_inchi_copy
drug_central_inchi_no_dupes = drug_central_inchi_copy.drop_duplicates(subset=['InChI (cleaned)'])

# Remove duplicates based on 'InChI' column for sgc_inchi_copy
sgc_inchi_no_dupes = sgc_inchi_copy.drop_duplicates(subset=['InChI (cleaned)'])

# Remove duplicates based on 'InChI' column for chem_probes_inchi_copy
chem_probes_inchi_no_dupes = chem_probes_inchi_copy.drop_duplicates(subset=['InChI (cleaned)'])

# Display the DataFrames without duplicates based on 'InChI' column
print("drug_central_inchi without duplicates:")
display(drug_central_inchi_no_dupes)

print("\nsgc_inchi without duplicates:")
display(sgc_inchi_no_dupes)

print("\nchem_probes_inchi without duplicates:")
display(chem_probes_inchi_no_dupes)

drug_central_inchi without duplicates:


Unnamed: 0,InChI (cleaned),DRUG_NAME
0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine
8,InChI=1S/C26H29N3O6/c1-17-22(25(30)34-4)24(20-...,(S)-nicardipine
9,InChI=1S/C18H20N2O6/c1-5-26-18(22)15-11(3)19-1...,(S)-nitrendipine
15,InChI=1S/C18H23NO3/c1-13(2-3-14-4-7-16(20)8-5-...,levdobutamine
16,InChI=1S/C19H20N8O5/c20-15-14-16(27-19(21)26-1...,aminopterin
...,...,...
19301,InChI=1S/C18H24O4/c1-18-7-6-12-11-5-3-10(19)8-...,estetrol
19310,InChI=1S/C18H23FN4O8/c19-13-6-4-10(9-21-13)15(...,piflufolastat F-18
19311,InChI=1S/C26H31Cl2N7O3/c1-5-34-10-12-35(13-11-...,infigratinib
19371,InChI=1S/C21H26N2O4/c22-19(26)15-4-3-13-9-16-2...,samidorphan



sgc_inchi without duplicates:


Unnamed: 0,InChI (cleaned),Compound name
0,InChI=1S/C15H13ClN4O/c1-9(11-4-2-3-10(5-11)7-1...,(R)-9s
1,InChI=1S/C18H21N5/c1-21(2)15-9-11-22(13-15)18-...,(R)-ZINC-3573
2,InChI=1S/C14H16N8O2S/c15-8-21-3-1-9(6-21)13(24...,8RK64
3,InChI=1S/C12H14FNO/c1-3-12(14-15)9(2)8-10-4-6-...,A-079
4,InChI=1S/C35H32FN5O4S2/c1-40(2)17-6-8-22-14-15...,A-1155463
...,...,...
117,InChI=1S/C29H31FO6S/c1-19-14-26(36-24-10-12-37...,TP-051
118,"InChI=1S/C24H20F4N2O3/c1-23(2,32)14-3-6-16(7-4...",TP-060
119,InChI=1S/C20H22N2O2/c1-15-8-9-19-18(12-15)20(2...,UCSF924
121,InChI=1S/C22H22ClN5O2S/c1-14-25-26-19-13-24-21...,WEB2086



chem_probes_inchi without duplicates:


Unnamed: 0,InChI (cleaned),Probe name
0,,Please cite the Chemical Probes Portal
1,InChI=1S/C28H40BrN7O4/c1-16(2)35(12-6-11-31-27...,SGC0946
2,InChI=1S/C25H38N4O3/c1-26-10-11-29(2)18-21-17-...,EPZ020411
3,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,I-BRD9
4,InChI=1S/C33H34N4O3/c1-20-17-37(18-21(2)40-20)...,CFI-400945
...,...,...
987,InChI=1S/C20H32N6O2S/c1-13-12-28-10-9-26(13)17...,CERALASERTIB
989,InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...,YKL-5-124
990,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...,AZD0156
991,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...,DKY709


In [89]:
df = pd.concat([drug_central_inchi_no_dupes, sgc_inchi_no_dupes, chem_probes_inchi_no_dupes], ignore_index=True)

# Rename the columns
df.rename(columns={'DRUG_NAME': 'Drug Central', 'Compound name': 'SGC', 'Probe name': 'Chemical Probes'}, inplace=True)
df

Unnamed: 0,InChI (cleaned),Drug Central,SGC,Chemical Probes
0,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,levobupivacaine,,
1,InChI=1S/C26H29N3O6/c1-17-22(25(30)34-4)24(20-...,(S)-nicardipine,,
2,InChI=1S/C18H20N2O6/c1-5-26-18(22)15-11(3)19-1...,(S)-nitrendipine,,
3,InChI=1S/C18H23NO3/c1-13(2-3-14-4-7-16(20)8-5-...,levdobutamine,,
4,InChI=1S/C19H20N8O5/c20-15-14-16(27-19(21)26-1...,aminopterin,,
...,...,...,...,...
3443,InChI=1S/C20H32N6O2S/c1-13-12-28-10-9-26(13)17...,,,CERALASERTIB
3444,InChI=1S/C28H33N7O3/c1-6-23(36)29-20-14-12-19(...,,,YKL-5-124
3445,InChI=1S/C26H31N5O3/c1-29(2)11-4-12-34-24-8-6-...,,,AZD0156
3446,InChI=1S/C25H27N3O3/c29-23-9-8-22(24(30)26-23)...,,,DKY709


In [90]:
result_df = melted_df.groupby('InChI (cleaned)', as_index=False).agg({'Source': ', '.join, 'Compound': list})

def remove_nan_sources(row):
    sources = row['Source'].split(', ')
    compounds = row['Compound']
    
    updated_sources = [source for source, 
                       compound in zip(sources, compounds)
                       if not pd.isna(compound)]
    
    return ', '.join(updated_sources)

# Apply to update 'Source' column
result_df['Source'] = result_df.apply(remove_nan_sources, axis=1)

# Drop NaN values from the 'Compound' column
result_df['Compound'] = result_df['Compound'].apply(lambda x: [comp for comp in x if not pd.isna(comp)])

# Convert 'Compound' column to string format
result_df['Compound'] = result_df['Compound'].apply(lambda x: ', '.join(x))

# Reorganize columns for aethetics
result_df = result_df[['InChI (cleaned)', 'Compound', 'Source']]
all_inchi = result_df[['InChI (cleaned)']]

# Select ONLY the overlapping InChI
selected_inchi = result_df[(result_df['Source'] == 'Drug Central, Chemical Probes') | (result_df['Source'] == 'SGC, Chemical Probes')]
selected_inchi = selected_inchi[['InChI (cleaned)']]

# Display
display(result_df)
display(all_inchi)
display(selected_inchi)
result_df.to_csv('result_df.csv')

Unnamed: 0,InChI (cleaned),Compound,Source
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...,Tris-DBA palladium,Chemical Probes
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3,"chromic chloride, chromic chloride","Drug Central, Drug Central"
2,InChI=1S/As2O3/c3-1-5-2-4,"arsenic trioxide, arsenic trioxide","Drug Central, Drug Central"
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...,"ziconotide, ziconotide, ziconotide","Drug Central, Drug Central, Drug Central"
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...,"cenobamate, cenobamate, cenobamate, cenobamate","Drug Central, Drug Central, Drug Central, Drug..."
...,...,...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)","hydroxycarbamide, hydroxycarbamide, hydroxycar...","Drug Central, Drug Central, Drug Central, Drug..."
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)",guanidine,Drug Central
3268,InChI=1S/ClH/h1H,hydrochloric acid,Drug Central
3269,InChI=1S/NO/c1-2,nitric oxide,Drug Central


Unnamed: 0,InChI (cleaned)
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3
2,InChI=1S/As2O3/c3-1-5-2-4
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...
...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)"
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)"
3268,InChI=1S/ClH/h1H
3269,InChI=1S/NO/c1-2


Unnamed: 0,InChI (cleaned)
157,InChI=1S/C11H19N5/c1-7(2)9-5-10(15-11(13)14-9)...
183,InChI=1S/C12H14FNO/c1-3-12(14-15)9(2)8-10-4-6-...
279,InChI=1S/C13H15F4NO/c14-9-4-6-10(7-5-9)18-11-3...
330,InChI=1S/C13H8N2OS/c16-12-9-5-1-2-7-11(9)17-13...
393,InChI=1S/C14H16N8O2S/c15-8-21-3-1-9(6-21)13(24...
...,...
2758,InChI=1S/C35H33FN4O4/c36-31-13-11-28(22-32(31)...
2829,InChI=1S/C40H43ClFN5O5S/c1-24-36-33-11-10-32(4...
2895,InChI=1S/C46H62N2O10/c1-51-39-20-18-32(28-40(3...
2904,InChI=1S/C47H54ClN7O7S/c1-47(2)18-16-34(41(27-...


In [91]:
def get_inchikey(inchi_string):
    try:
        mol = Chem.MolFromInchi(inchi_string)
        inchikey = Chem.InchiToInchiKey(inchi_string)
        return inchikey
    except (TypeError, ValueError):
        return None

def inchi_to_inchikey(inchi_dataframe):
    inchi_copy = inchi_dataframe.copy() 
    inchi_copy['InChIKey'] = inchi_copy['InChI (cleaned)'].apply(get_inchikey)
    return inchi_copy

all_inchikey = inchi_to_inchikey(all_inchi)
all_inchikey = all_inchikey[['InChIKey']]

selected_inchikey = inchi_to_inchikey(selected_inchi)
selected_inchikey = selected_inchikey[['InChIKey']]
adjusted_inchikey = selected_inchikey.copy()
adjusted_inchikey['InChIKey']= 'InChIKey=' + adjusted_inchikey['InChIKey']

display(all_inchikey) 
display(selected_inchikey)
display(adjusted_inchikey)
selected_inchi.to_csv('selected_inchi.csv')

#174

Unnamed: 0,InChIKey
0,CYPYTURSJDMMMP-WVCUSYJESA-N
1,QSWDMMVNRMROPK-UHFFFAOYSA-K
2,IKWTVSLWAPBBKU-UHFFFAOYSA-N
3,BPKIMPVREBSLAJ-QTBYCLKRSA-N
4,GFHAXPJGXSQLPT-VIFPVBQESA-N
...,...
3266,VSNHCAURESNICA-UHFFFAOYSA-N
3267,ZRALSGWEFCBTJO-UHFFFAOYSA-N
3268,VEXZGXHMUGYJMC-UHFFFAOYSA-N
3269,MWUXSHHQAYIFBG-UHFFFAOYSA-N


Unnamed: 0,InChIKey
157,COOGVHJHSCBOQT-MRVPVSSYSA-N
183,HKROEBDHHKMNBZ-CHBKHGQFSA-N
279,IESAJAZKMLPVIB-VXGBXAGGSA-N
330,GBAKVEWPYUIGHN-UHFFFAOYSA-N
393,KIWKRCCIHSGWQS-VIFPVBQESA-N
...,...
2758,JWZSSEWMVYKYKW-UHFFFAOYSA-N
2829,LUVOYGUFQRWXGQ-UHFFFAOYSA-N
2895,ZDBWLRLGUBSLPG-FDHYQTMZSA-N
2904,CSBKUBOVPUXFLO-MAVVKCOWSA-N


Unnamed: 0,InChIKey
157,InChIKey=COOGVHJHSCBOQT-MRVPVSSYSA-N
183,InChIKey=HKROEBDHHKMNBZ-CHBKHGQFSA-N
279,InChIKey=IESAJAZKMLPVIB-VXGBXAGGSA-N
330,InChIKey=GBAKVEWPYUIGHN-UHFFFAOYSA-N
393,InChIKey=KIWKRCCIHSGWQS-VIFPVBQESA-N
...,...
2758,InChIKey=JWZSSEWMVYKYKW-UHFFFAOYSA-N
2829,InChIKey=LUVOYGUFQRWXGQ-UHFFFAOYSA-N
2895,InChIKey=ZDBWLRLGUBSLPG-FDHYQTMZSA-N
2904,InChIKey=CSBKUBOVPUXFLO-MAVVKCOWSA-N


In [92]:
result_df

Unnamed: 0,InChI (cleaned),Compound,Source
0,InChI=1S/3C17H14O.2Pd/c3*18-17(13-11-15-7-3-1-...,Tris-DBA palladium,Chemical Probes
1,InChI=1S/3ClH.Cr/h3*1H;/q;;;+3/p-3,"chromic chloride, chromic chloride","Drug Central, Drug Central"
2,InChI=1S/As2O3/c3-1-5-2-4,"arsenic trioxide, arsenic trioxide","Drug Central, Drug Central"
3,InChI=1S/C102H172N36O32S7/c1-50(2)34-63-91(161...,"ziconotide, ziconotide, ziconotide","Drug Central, Drug Central, Drug Central"
4,InChI=1S/C10H10ClN5O2/c11-8-4-2-1-3-7(8)9(18-1...,"cenobamate, cenobamate, cenobamate, cenobamate","Drug Central, Drug Central, Drug Central, Drug..."
...,...,...,...
3266,"InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)","hydroxycarbamide, hydroxycarbamide, hydroxycar...","Drug Central, Drug Central, Drug Central, Drug..."
3267,"InChI=1S/CH5N3/c2-1(3)4/h(H5,2,3,4)",guanidine,Drug Central
3268,InChI=1S/ClH/h1H,hydrochloric acid,Drug Central
3269,InChI=1S/NO/c1-2,nitric oxide,Drug Central


In [93]:
compounds_cv = pd.read_csv("compounds_cv.csv", encoding='utf-8')
compounds_cv

Unnamed: 0,Key,Virtual Compound Preferred Name,EUbOPEN Compound ID,Compound SMILES,Compound InChi,Compound InChi Key,Protein Family,Target ID,NCBI Gene ID,UniProt ID,...,Affinity On-target Cellular Assay Type,Affinity on-target cellular Source Knowledge,Affinity On-target Cellular Relation,Selectivity Number of Off-targets,Selectivity Platform,Selectivity Platform Number of Targets,Selectivity Remarks,Selectivity Source Knowledge,CG-Set,Recommended Concentration
0,EUB0000326a_EP300@Acetyltransferase,A-486,EUB0000326a,C[C@H](N(C(CN1C([C@]2(OC1=O)CCC3=C2C=C(NC(NC)=...,"InChI=1S/C25H24F4N4O5/c1-14(25(27,28)29)32(12-...",MTTJOZOOUCZVHO-BSEYFRJRSA-N,HAT,EP300@Acetyltransferase,2033.0,Q09472,...,,,,,,,,,Epigenetic set,1 µM
1,EUB0000326a_CREBBP@Acetyltransferase,A-486,EUB0000326a,C[C@H](N(C(CN1C([C@]2(OC1=O)CCC3=C2C=C(NC(NC)=...,"InChI=1S/C25H24F4N4O5/c1-14(25(27,28)29)32(12-...",MTTJOZOOUCZVHO-BSEYFRJRSA-N,HAT,CREBBP@Acetyltransferase,1387.0,Q92793,...,,,,,,,,,Epigenetic set,1 µM
2,EUB0000327a_MYST3@Acetyltransferase,WM-2474,EUB0000327a,FC1=CC=C(C2=CC=NN=C2)C=C1C(NNS(C3=CC=CC=C3)(=O...,InChI=1S/C17H13FN4O3S/c18-16-7-6-12(13-8-9-19-...,WYMCVPPNOFFNGE-UHFFFAOYSA-N,HAT,MYST3@Acetyltransferase,7994.0,Q92794,...,,,,,,,,,Epigenetic set,1 µM
3,EUB0000327a_MYST4@Acetyltransferase,WM-2474,EUB0000327a,FC1=CC=C(C2=CC=NN=C2)C=C1C(NNS(C3=CC=CC=C3)(=O...,InChI=1S/C17H13FN4O3S/c18-16-7-6-12(13-8-9-19-...,WYMCVPPNOFFNGE-UHFFFAOYSA-N,HAT,MYST4@Acetyltransferase,23522.0,Q8WYB5,...,,,,,,,,,Epigenetic set,1 µM
4,EUB0000328a_EP300@Acetyltransferase,A-485,EUB0000328a,C[C@H](N(C(CN1C([C@]2(OC1=O)CCC3=CC(NC(NC)=O)=...,"InChI=1S/C25H24F4N4O5/c1-14(25(27,28)29)32(12-...",VRVJKILQRBSEAG-BSEYFRJRSA-N,HAT,EP300@Acetyltransferase,2033.0,Q09472,...,,,,,,,"Selective against other available HATs (MYST3,...","https://doi.org/10.1038/nature24028, https://w...",Epigenetic set,1 µM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
788,EUB0001573a_OGA,TP-040n,EUB0001573a,CN1CCN(c2ccnc(NCCc3ccccc3)n2)CC1,InChI=1S/C17H23N5/c1-21-11-13-22(14-12-21)16-8...,BNXNWRSVYQHTOH-UHFFFAOYSA-N,,OGA,10724.0,O60502,...,,,,1.0,GPCR panel (PDSP screen),45.0,"Screened at 10 µM, in-vitro potencies of close...",https://www.sgc-ffm.uni-frankfurt.de/chemProbe...,Other targets set,1 µM
789,EUB0001556a_GRIN2A,TP-050,EUB0001556a,Cc1cn([C@H](C)CC#N)c(=O)c2cc(Cn3nc(C(F)F)cc3Cl...,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,BEBDKSYNJJVZSO-SECBINFHSA-N,Ion Channel,GRIN2A,2903.0,Q12879,...,Ca2+ influx assay (using CHO cells expressing ...,https://www.sciencedirect.com/science/article/...,0,0.0,NMDA receptor panel,3.0,Screened in Ca2+ influx assay (using CHO cells...,https://www.sciencedirect.com/science/article/...,Ion Channel,1 µM
790,EUB0001557a_GRIN2A,TP-050n,EUB0001557a,Cc1cn([C@@H](C)CC#N)c(=O)c2cc(Cn3nc(C(F)F)cc3C...,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,BEBDKSYNJJVZSO-VIFPVBQESA-N,Ion Channel,GRIN2A,2903.0,Q12879,...,Ca2+ influx assay (using CHO cells expressing ...,https://www.sciencedirect.com/science/article/...,>,,,,Screened in Ca2+ influx assay (using CHO cells...,https://www.sciencedirect.com/science/article/...,Ion Channel,1 µM
791,EUB0001554a_FFAR1,TP-051,EUB0001554a,Cc1cc(OC2CCS(=O)(=O)CC2)cc(C)c1-c1cccc(COc2ccc...,InChI=1S/C29H31FO6S/c1-19-14-26(36-24-10-12-37...,RPAHCZZXEGWBDL-UHFFFAOYSA-N,GPCR,FFAR1,2864.0,O14842,...,FLIPR assay (Ca influx activity of CHO cells e...,https://pubs.acs.org/doi/10.1021/jm2016123,0,0.0,Eurofins Panlabs panel,118.0,"Screened at 10 µM, closest targets as % of inh...",https://www.sgc-ffm.uni-frankfurt.de/chemProbe...,GPCR set,1 µM


In [94]:
eubopen_merge = pd.merge(compounds_cv, result_df, left_on='Compound InChi', right_on='InChI (cleaned)')
eubopen_merge

Unnamed: 0,Key,Virtual Compound Preferred Name,EUbOPEN Compound ID,Compound SMILES,Compound InChi,Compound InChi Key,Protein Family,Target ID,NCBI Gene ID,UniProt ID,...,Selectivity Number of Off-targets,Selectivity Platform,Selectivity Platform Number of Targets,Selectivity Remarks,Selectivity Source Knowledge,CG-Set,Recommended Concentration,InChI (cleaned),Compound,Source
0,EUB0000329a_MYST3@Acetyltransferase,WM-1119,EUB0000329a,FC1=CC=CC=C1S(NNC(C2=CC(F)=CC(C3=NC=CC=C3)=C2)...,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,QLXULUNLCRKWRD-UHFFFAOYSA-N,HAT,MYST3@Acetyltransferase,7994.0,Q92794,...,0.0,"HAT panel, literature",6.0,> 200-fold selective on all other HATs tested;...,https://www.sgc-ffm.uni-frankfurt.de/#!specifi...,Epigenetic set,1 µM,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,"WM-1119, WM-1119","SGC, Chemical Probes"
1,EUB0000329a_MYST4@Acetyltransferase,WM-1119,EUB0000329a,FC1=CC=CC=C1S(NNC(C2=CC(F)=CC(C3=NC=CC=C3)=C2)...,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,QLXULUNLCRKWRD-UHFFFAOYSA-N,HAT,MYST4@Acetyltransferase,23522.0,Q8WYB5,...,0.0,"HAT panel, literature",6.0,> 200-fold selective on all other HATs tested;...,https://www.sgc-ffm.uni-frankfurt.de/#!specifi...,Epigenetic set,1 µM,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,"WM-1119, WM-1119","SGC, Chemical Probes"
2,EUB0000195c_CREBBP@BRD,I-CBP112,EUB0000195c,CCC(=O)N1CCOc2c(cc(-c3ccc(OC)c(OC)c3)cc2OC[C@H...,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,YKNAKDFZAWQEEO-IBGZPJMESA-N,Bromodomain,CREBBP@BRD,1387.0,Q92793,...,0.0,"Bromodomain panel, literature",42.0,Screened at 2 µM via biolayer interferometry (...,https://doi.org/10.1158/0008-5472.CAN-15-0236,Epigenetic set,1 µM,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,I-CBP112,Chemical Probes
3,EUB0000195c_EP300@BRD,I-CBP112,EUB0000195c,CCC(=O)N1CCOc2c(cc(-c3ccc(OC)c(OC)c3)cc2OC[C@H...,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,YKNAKDFZAWQEEO-IBGZPJMESA-N,Bromodomain,EP300@BRD,2033.0,Q09472,...,0.0,"Bromodomain panel, literature",42.0,Screened at 2 µM via biolayer interferometry (...,https://doi.org/10.1158/0008-5472.CAN-15-0236,Epigenetic set,1 µM,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,I-CBP112,Chemical Probes
4,EUB0000208c_CREBBP@BRD,SGC-CBP30,EUB0000208c,COc1ccc(CCc2nc3cc(-c4c(C)noc4C)ccc3n2C[C@H](C)...,InChI=1S/C28H33ClN4O3/c1-18(32-11-13-35-14-12-...,GEPYBHCJBORHCE-SFHVURJKSA-N,Bromodomain,CREBBP@BRD,1387.0,Q92793,...,0.0,"Bromodomain panel, literature",45.0,"Screened at 10 µM in DSF assay, dTM (CREBBPA/E...","https://doi.org/10.1073/pnas.1501956112, https...",Epigenetic set,1 µM,InChI=1S/C28H33ClN4O3/c1-18(32-11-13-35-14-12-...,SGC-CBP30,Chemical Probes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,EUB0001118a_SMARCA4@BRD,SGC-SMARCA-BRDVIII,EUB0001118a,OC1=CC=CC=C1C2=CC(N3CCN(C(OC(C)(C)C)=O)CC3)=C(...,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",AQTNUGRRZDRZIA-UHFFFAOYSA-N,Bromodomain,SMARCA4@BRD,6597.0,P51532,...,0.0,Bromodomain panel (DSF assays),25.0,"Screened at 20 µM, dTm(SMARCA2) = 7.7 K, dTm(S...",https://www.thesgc.org/chemical-probes/SGC-SMA...,Epigenetic set,1 µM,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",SGC-SMARCA-BRDVIII,Chemical Probes
364,EUB0001118a_PBRM1@BRD,SGC-SMARCA-BRDVIII,EUB0001118a,OC1=CC=CC=C1C2=CC(N3CCN(C(OC(C)(C)C)=O)CC3)=C(...,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",AQTNUGRRZDRZIA-UHFFFAOYSA-N,Bromodomain,PBRM1@BRD,55193.0,Q86U86,...,0.0,Bromodomain panel (DSF assays),25.0,"Screened at 20 µM, dTm(SMARCA2) = 7.7 K, dTm(S...",https://www.thesgc.org/chemical-probes/SGC-SMA...,Epigenetic set,1 µM,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",SGC-SMARCA-BRDVIII,Chemical Probes
365,EUB0001572a_OGA,TP-040,EUB0001572a,CC1CCN(c2ccnc(NCc3cn(C)cn3)n2)CC1,InChI=1S/C15H22N6/c1-12-4-7-21(8-5-12)14-3-6-1...,PWKAYICUBVNJAZ-UHFFFAOYSA-N,,OGA,10724.0,O60502,...,0.0,Kinase panel (literature),277.0,"Screened at 1 µM, clean selectivity profile wi...",https://pubs.acs.org/doi/10.1021/acs.jmedchem....,Other targets set,1 µM,InChI=1S/C15H22N6/c1-12-4-7-21(8-5-12)14-3-6-1...,"TP-040, TP-040","SGC, Chemical Probes"
366,EUB0001556a_GRIN2A,TP-050,EUB0001556a,Cc1cn([C@H](C)CC#N)c(=O)c2cc(Cn3nc(C(F)F)cc3Cl...,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,BEBDKSYNJJVZSO-SECBINFHSA-N,Ion Channel,GRIN2A,2903.0,Q12879,...,0.0,NMDA receptor panel,3.0,Screened in Ca2+ influx assay (using CHO cells...,https://www.sciencedirect.com/science/article/...,Ion Channel,1 µM,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,"TP-050, TP-050","SGC, Chemical Probes"


In [101]:
def get_similar_compounds(smiles, dataset, top_n=5):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)

    similarities = []
    for _, row in dataset.iterrows():
        mol_other = Chem.MolFromSmiles(row['Compound SMILES'])
        if mol_other:
            fp_other = AllChem.GetMorganFingerprintAsBitVect(mol_other, 2, nBits=1024)
            similarity = DataStructs.TanimotoSimilarity(fp, fp_other)
            similarities.append((row['Virtual Compound Preferred Name'], row['Compound SMILES'], similarity))

    similarities.sort(key=lambda x: x[2], reverse=True)
    top_similar = similarities[:top_n]

    return top_similar

# Example usage
smiles_input = "CCO"
similar_compounds = get_similar_compounds(smiles_input, eubopen_merge)
for name, smiles, similarity in similar_compounds:
    print(f"Name: {name}, SMILES: {smiles}, Similarity: {similarity}")


Name: ERKi, SMILES: C[C@@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccc1, Similarity: 0.10204081632653061
Name: ERKi, SMILES: C[C@@H](NC(=O)Nc1cc2[nH]ncc2c(CO)n1)c1ccccc1, Similarity: 0.10204081632653061
Name: PHENYLEPHRINE, SMILES: CNC[C@H](O)C1=CC(O)=CC=C1, Similarity: 0.1
Name: PHENYLEPHRINE, SMILES: CNC[C@H](O)C1=CC(O)=CC=C1, Similarity: 0.1
Name: FLUMAZENIL, SMILES: CCOC(=O)C1=C2CN(C)C(=O)C3=C(C=CC(F)=C3)N2C=N1, Similarity: 0.1


In [102]:
eubopen_merge

Unnamed: 0,Key,Virtual Compound Preferred Name,EUbOPEN Compound ID,Compound SMILES,Compound InChi,Compound InChi Key,Protein Family,Target ID,NCBI Gene ID,UniProt ID,...,Selectivity Platform,Selectivity Platform Number of Targets,Selectivity Remarks,Selectivity Source Knowledge,CG-Set,Recommended Concentration,InChI (cleaned),Compound,Source,Fingerprint
0,EUB0000329a_MYST3@Acetyltransferase,WM-1119,EUB0000329a,FC1=CC=CC=C1S(NNC(C2=CC(F)=CC(C3=NC=CC=C3)=C2)...,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,QLXULUNLCRKWRD-UHFFFAOYSA-N,HAT,MYST3@Acetyltransferase,7994.0,Q92794,...,"HAT panel, literature",6.0,> 200-fold selective on all other HATs tested;...,https://www.sgc-ffm.uni-frankfurt.de/#!specifi...,Epigenetic set,1 µM,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,"WM-1119, WM-1119","SGC, Chemical Probes","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,EUB0000329a_MYST4@Acetyltransferase,WM-1119,EUB0000329a,FC1=CC=CC=C1S(NNC(C2=CC(F)=CC(C3=NC=CC=C3)=C2)...,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,QLXULUNLCRKWRD-UHFFFAOYSA-N,HAT,MYST4@Acetyltransferase,23522.0,Q8WYB5,...,"HAT panel, literature",6.0,> 200-fold selective on all other HATs tested;...,https://www.sgc-ffm.uni-frankfurt.de/#!specifi...,Epigenetic set,1 µM,InChI=1S/C18H13F2N3O3S/c19-14-10-12(16-6-3-4-8...,"WM-1119, WM-1119","SGC, Chemical Probes","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,EUB0000195c_CREBBP@BRD,I-CBP112,EUB0000195c,CCC(=O)N1CCOc2c(cc(-c3ccc(OC)c(OC)c3)cc2OC[C@H...,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,YKNAKDFZAWQEEO-IBGZPJMESA-N,Bromodomain,CREBBP@BRD,1387.0,Q92793,...,"Bromodomain panel, literature",42.0,Screened at 2 µM via biolayer interferometry (...,https://doi.org/10.1158/0008-5472.CAN-15-0236,Epigenetic set,1 µM,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,I-CBP112,Chemical Probes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,EUB0000195c_EP300@BRD,I-CBP112,EUB0000195c,CCC(=O)N1CCOc2c(cc(-c3ccc(OC)c(OC)c3)cc2OC[C@H...,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,YKNAKDFZAWQEEO-IBGZPJMESA-N,Bromodomain,EP300@BRD,2033.0,Q09472,...,"Bromodomain panel, literature",42.0,Screened at 2 µM via biolayer interferometry (...,https://doi.org/10.1158/0008-5472.CAN-15-0236,Epigenetic set,1 µM,InChI=1S/C27H36N2O5/c1-5-26(30)29-11-12-33-27-...,I-CBP112,Chemical Probes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,EUB0000208c_CREBBP@BRD,SGC-CBP30,EUB0000208c,COc1ccc(CCc2nc3cc(-c4c(C)noc4C)ccc3n2C[C@H](C)...,InChI=1S/C28H33ClN4O3/c1-18(32-11-13-35-14-12-...,GEPYBHCJBORHCE-SFHVURJKSA-N,Bromodomain,CREBBP@BRD,1387.0,Q92793,...,"Bromodomain panel, literature",45.0,"Screened at 10 µM in DSF assay, dTM (CREBBPA/E...","https://doi.org/10.1073/pnas.1501956112, https...",Epigenetic set,1 µM,InChI=1S/C28H33ClN4O3/c1-18(32-11-13-35-14-12-...,SGC-CBP30,Chemical Probes,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,EUB0001118a_SMARCA4@BRD,SGC-SMARCA-BRDVIII,EUB0001118a,OC1=CC=CC=C1C2=CC(N3CCN(C(OC(C)(C)C)=O)CC3)=C(...,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",AQTNUGRRZDRZIA-UHFFFAOYSA-N,Bromodomain,SMARCA4@BRD,6597.0,P51532,...,Bromodomain panel (DSF assays),25.0,"Screened at 20 µM, dTm(SMARCA2) = 7.7 K, dTm(S...",https://www.thesgc.org/chemical-probes/SGC-SMA...,Epigenetic set,1 µM,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",SGC-SMARCA-BRDVIII,Chemical Probes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
364,EUB0001118a_PBRM1@BRD,SGC-SMARCA-BRDVIII,EUB0001118a,OC1=CC=CC=C1C2=CC(N3CCN(C(OC(C)(C)C)=O)CC3)=C(...,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",AQTNUGRRZDRZIA-UHFFFAOYSA-N,Bromodomain,PBRM1@BRD,55193.0,Q86U86,...,Bromodomain panel (DSF assays),25.0,"Screened at 20 µM, dTm(SMARCA2) = 7.7 K, dTm(S...",https://www.thesgc.org/chemical-probes/SGC-SMA...,Epigenetic set,1 µM,"InChI=1S/C19H25N5O3/c1-19(2,3)27-18(26)24-10-8...",SGC-SMARCA-BRDVIII,Chemical Probes,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
365,EUB0001572a_OGA,TP-040,EUB0001572a,CC1CCN(c2ccnc(NCc3cn(C)cn3)n2)CC1,InChI=1S/C15H22N6/c1-12-4-7-21(8-5-12)14-3-6-1...,PWKAYICUBVNJAZ-UHFFFAOYSA-N,,OGA,10724.0,O60502,...,Kinase panel (literature),277.0,"Screened at 1 µM, clean selectivity profile wi...",https://pubs.acs.org/doi/10.1021/acs.jmedchem....,Other targets set,1 µM,InChI=1S/C15H22N6/c1-12-4-7-21(8-5-12)14-3-6-1...,"TP-040, TP-040","SGC, Chemical Probes","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
366,EUB0001556a_GRIN2A,TP-050,EUB0001556a,Cc1cn([C@H](C)CC#N)c(=O)c2cc(Cn3nc(C(F)F)cc3Cl...,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,BEBDKSYNJJVZSO-SECBINFHSA-N,Ion Channel,GRIN2A,2903.0,Q12879,...,NMDA receptor panel,3.0,Screened in Ca2+ influx assay (using CHO cells...,https://www.sciencedirect.com/science/article/...,Ion Channel,1 µM,InChI=1S/C16H15ClF2N6O/c1-9(3-4-20)23-7-10(2)2...,"TP-050, TP-050","SGC, Chemical Probes","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
def import_data(csv, **kwargs):
    return pd.read_csv(csv, **kwargs)

def merge_df(data1, data2, col1, col2, new_col):
    data1.columns = [col1, col2]
    merged_df = pd.merge(data2, data1[[col1, col2]], on=col1, how='left')
    merged_df[new_col] = merged_df[col1].isin(data1[col1]).astype(int)
    return merged_df

def compare_df(data3, merged_df, merge_col1, merge_col2):
    return pd.merge(data3, merged_df, left_on=merge_col1, right_on=merge_col2, how='left')

def filter_df(data, column, value):
    return data[data[column] == value]

fda_approved = import_data('FDA_Approved.csv', header=None, names=['ID', 'Drug Name'])
dc_compounds = import_data("DC_Compounds.csv")
dc_drug_tar = import_data("Drug_Target.csv")

merged_df = merge_df(fda_approved, dc_compounds, 'ID', 'Drug Name', "Approved")
drug_central = compare_df(dc_drug_tar, merged_df, 'STRUCT_ID', 'ID').drop(columns=['Drug Name', 'INN'])

# Additional filtering and display
drug_central = filter_df(drug_central, 'TARGET_CLASS', 'GPCR') # input desired protein family
display(drug_central)

eubopen_compounds_cv = import_data("compounds_cv.csv", encoding='utf-8')
eubopen_compounds_cv = filter_df(eubopen_compounds_cv, 'Protein Family', 'GPCR') # input desired protein family
display(eubopen_compounds_cv)

sgc_compounds = import_data('SGC_Compounds.csv', skiprows=1).dropna(subset=['SMILES (unique cis trans)'])
sgc_compounds = filter_df(sgc_compounds, 'Targeted domain', 'GPCR') # input desired protein family
display(sgc_compounds)

chem_probes = import_data('ChemicalProbesPortal.csv')
chem_probes = filter_df(chem_probes, 'Target class', 'GPCR')
display(chem_probes)

In [None]:
# Function to import data from CSV files
def import_data(*file_paths, **kwargs):
    dfs = []
    for path in file_paths:
        df = pd.read_csv(path, **kwargs)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Function to convert SMILES to InChI
def smiles_to_inchi(smiles):
    if isinstance(smiles, float):
        return None

    mol = Chem.MolFromSmiles(str(smiles))
    if mol is not None:
        inchi_string = inchi.MolToInchi(mol)
        return inchi_string
    else:
        return None

# Function to process df and create 'InChI (cleaned)' column
def process_dataframe(df, smiles_column, id_column):
    df['InChI (cleaned)'] = df[smiles_column].apply(smiles_to_inchi)
    return df[['InChI (cleaned)', id_column]].copy()

# Function to remove duplicates based on 'InChI (cleaned)' column
def remove_duplicates(df):
    return df.drop_duplicates(subset=['InChI (cleaned)']).copy()

# Function to concatenate dataframes and rename columns
def concatenate_and_rename(drug_central, eubopen, chem_probes):
    df = pd.concat([drug_central, eubopen, chem_probes], ignore_index=True)
    df.rename(columns={'DRUG_NAME': 'Drug Central', 
                       'Virtual Compound Preferred Name': 'EUbOPEN', 
                       'Probe name': 'Chemical Probes'}, inplace=True)
    return df

# Function to melt df and export to CSV
def melt_and_export(df, filename):
    melted_df = pd.melt(df, id_vars=['InChI (cleaned)'], var_name='Source', value_name='Compound')
    melted_df.to_csv(filename, index=False) # in this script version, I did not export but feel free to manipulate code to streamline exportation
    return melted_df

# Function to get InChIKey from InChI string
def get_inchikey(inchi_string):
    try:
        mol = Chem.MolFromInchi(inchi_string)
        inchikey = Chem.InchiToInchiKey(inchi_string)
        return inchikey
    except (TypeError, ValueError):
        return None

# Function to add InChIKey column to df
def inchi_to_inchikey(inchi_dataframe):
    inchi_copy = inchi_dataframe.copy() 
    inchi_copy['InChIKey'] = inchi_copy['InChI (cleaned)'].apply(get_inchikey)
    return inchi_copy

# Function to process melted df to find overlapping sources
def process_and_find_overlap(melted_df):
    result_df = melted_df.groupby('InChI (cleaned)', as_index=False).agg({'Source': ', '.join, 'Compound': list})
    
    def remove_nan_sources(row):
        sources = row['Source'].split(', ')
        compounds = row['Compound']
        updated_sources = [source for source, compound in zip(sources, compounds) if not pd.isna(compound)]
        return ', '.join(updated_sources)

    result_df['Source'] = result_df.apply(remove_nan_sources, axis=1)
    result_df['Compound'] = result_df['Compound'].apply(lambda x: [comp for comp in x if not pd.isna(comp)])
    result_df['Compound'] = result_df['Compound'].apply(lambda x: ', '.join(x))
    
    result_df = result_df[['InChI (cleaned)', 'Compound', 'Source']]
    return result_df

def filter_selected_inchi(result_df):
    all_inchi = result_df[['InChI (cleaned)']]
    
    selected_inchi = result_df[(result_df['Source'] == 'Drug Central, Chemical Probes') |
                               (result_df['Source'] == 'EUbOPEN, Chemical Probes') |
                               (result_df['Source'] == 'Drug Central, EUbOPEN')]
    selected_inchi = selected_inchi[['InChI (cleaned)']]
    
    return all_inchi, selected_inchi

# Process each df
drug_central_inchi = process_dataframe(drug_central, 'SMILES', 'DRUG_NAME')
eubopen_inchi = process_dataframe(eubopen_compounds_cv, 'Compound SMILES', 'Virtual Compound Preferred Name')
chem_probes_inchi = process_dataframe(chem_probes, 'SMILES', 'Probe name')

# Remove duplicates
drug_central_inchi_no_dupes = remove_duplicates(drug_central_inchi)
eubopen_inchi_no_dupes = remove_duplicates(eubopen_inchi)
chem_probes_inchi_no_dupes = remove_duplicates(chem_probes_inchi)

# Concatenate and rename columns
df = concatenate_and_rename(drug_central_inchi_no_dupes, eubopen_inchi_no_dupes, chem_probes_inchi_no_dupes)

# Melt dataframe
melted_df = melt_and_export(df, 'melted_dataframe.csv')

# Process melted df to find overlaps
result_df = process_and_find_overlap(melted_df)

# Filter selected InChI based on overlapping sources
all_inchi, selected_inchi = filter_selected_inchi(result_df)

# Display results
print("Result dataframe:")
display(result_df)
print("\nAll InChI:")
display(all_inchi)
print("\nSelected InChI:")
display(selected_inchi)

# Convert InChI to InChIKey
all_inchikey = inchi_to_inchikey(all_inchi)
all_inchikey = all_inchikey[['InChIKey']]

selected_inchikey = inchi_to_inchikey(selected_inchi)
selected_inchikey = selected_inchikey[['InChIKey']]
adjusted_inchikey = selected_inchikey.copy()
adjusted_inchikey['InChIKey'] = 'InChIKey=' + adjusted_inchikey['InChIKey']

display(all_inchikey) 
display(selected_inchikey)
display(adjusted_inchikey)

In [None]:
# Calculate logarithm
result['converted_activity_value'] = pd.to_numeric(result['converted_activity_value'], errors='coerce')

# Drop rows with NaN values in the 'converted_activity_value' column
result = result.dropna(subset=['converted_activity_value'])

result['log10_activity'] = np.log10(result['converted_activity_value'])

# Display
display(result)

In [None]:
# Plot histograms with fitted density curve using sns

sns.set(style="whitegrid")

graph_count = 0

for inchi_key, group_df in result.groupby('inchi_key'):
    plt.figure(figsize=(10, 6))
    
    sns.histplot(group_df['log10_activity'], bins=20, kde=True, stat='count')
    
    plt.title(f'Distribution for InchiKey: {inchi_key}')
    plt.xlabel('Log10 Activity')
    plt.ylabel('Count')
    plt.show()
    
    graph_count += 1
    
print(f'Total number of graphs: {graph_count}')

# Confirm # of plots

In [None]:
# Plot overlay histogram

sns.set(style="whitegrid")

# Create a figure for overlay histogram
plt.figure(figsize=(12, 8))

# Group by 'inchi_key' and plot histograms for each group
for inchi_key, group_df in result.groupby('inchi_key'):
    sns.histplot(group_df['log10_activity'], bins=20, kde=True, stat='count')

# Add quartile lines
quartiles = result['log10_activity'].quantile([0.25, 0.5, 0.75])
for q in quartiles:
    plt.axvline(q, color='r', linestyle='dashed', linewidth=2)

plt.title('Overlay Histogram with Quartile Lines')
plt.xlabel('Log10 Activity')
plt.ylabel('Count') 

plt.show()
IQR = (-2.958607 - (0.146128)) # adjust accordingly

print(quartiles)
print(IQR)

In [None]:
# Create histograms for compound activity against targets

sns.set(style="whitegrid")

for inchi_key, group_df in result.groupby('inchi_key'):
    plt.figure(figsize=(10, 6))
    
    sns.histplot(data=group_df, x='log10_activity', bins=20, hue='common_name', multiple="stack")
    
    plt.title(f'Distribution for InchiKey: {inchi_key}')
    plt.xlabel('Log10 Activity')
    plt.ylabel('Count')
    plt.show()

In [None]:
# Calculate the count of entries for each compound-target set
count_series = merged_df.groupby(['inchi_key', 'common_name'])['converted_activity_value'].transform('count')

# Calculate the average activity only for compound-target sets with more than one entry
merged_df['avg_activity'] = merged_df.groupby(['inchi_key', 'common_name'])['converted_activity_value'].transform('mean')
merged_df['avg_activity'] = merged_df['avg_activity'].where(count_series > 1, merged_df['converted_activity_value'])

# Calculate the average log10_activity only for compound-target sets with more than one entry
merged_df['log10_avg_activity'] = merged_df.groupby(['inchi_key', 'common_name'])['log10_activity'].transform('mean')
merged_df['log10_avg_activity'] = merged_df['log10_avg_activity'].where(count_series > 1, merged_df['log10_activity'])

# Function to calculate the median for the same 'inchi_key' but different 'target'
def calc_selectivity(row, df):
    compound = row['inchi_key']
    target = row['common_name']
    activity = row['converted_activity_value']

    filter_condition = (df['inchi_key'] == compound) & (df['common_name'] != target)
    relevant_rows = df.loc[filter_condition, 'converted_activity_value']

    median_activity_uM = relevant_rows.median()

    return median_activity_uM
                                                                                                                                                                                                                            
# Apply the function to each row and create a new column 'median_activity'
merged_df['median_activity'] = merged_df.apply(
    lambda row: calc_selectivity(row, merged_df), axis=1)

# Calculate selectivity by dividing avg_activity by median_activity
merged_df['selectivity'] = merged_df['avg_activity'] / merged_df['median_activity']
merged_df['log10_selectivity'] = np.log10(merged_df['selectivity'])

# Display
display(merged_df)

In [None]:
candidates_copy = candidates.copy()

# Create lists to store MF
mfp_list = []

# Iterate over each row in the df
for index, row in candidates_copy.iterrows():
    inchi = row['InChI']
    
    mol = Chem.inchi.MolFromInchi(inchi)

    if mol is not None:
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        morgan_fp_str = ''.join(map(str, morgan_fp.ToBitString()))
        # append the fingerprint to the list
        mfp_list.append(morgan_fp_str)
    else:
        mfp_list.append(None)

# Add a col to store the MF
candidates_copy['MorganFingerprint'] = mfp_list

display(candidates_copy)

In [None]:
# df to store expanded MF columns
morgan_fp_df = pd.DataFrame()

# Iterate through the MF column
for index, row in candidates_copy.iterrows():
    morgan_fp_str = row['MorganFingerprint']
    if morgan_fp_str is not None:
        # Split the string into individual bits and create columns
        for i, bit in enumerate(morgan_fp_str):
            morgan_fp_df.at[index, f'MorganFingerprint_{i}'] = int(bit)

# Concatenate the original df with the new df
candidates_copy = pd.concat([candidates_copy, morgan_fp_df], axis=1)

display(candidates_copy)

In [None]:
# Let's compare MF of candidate compounds w/ MF of internal compounds
# Metadata EXCLUDED
# Script: Tanimoto-Similarity-Final

# Step 1: Extract the 'Synonyms' and 'InChI' columns
synonyms_inchi = candidates[['Synonyms', 'InChI']]

# Step 2: Drop duplicate rows
unique_synonyms_inchi = synonyms_inchi.drop_duplicates()

# Step 3: Create a new df from the unique rows
candidates_names = pd.DataFrame(unique_synonyms_inchi, columns=['Synonyms', 'InChI'])

# Create a list to store Morgan Fingerprints
mfp_list = []

# Iterate over each row in the df
for index, row in candidates_names.iterrows():
    inchi = row['InChI']
    
    # Convert InChI to RDKit molecule object
    mol = Chem.MolFromInchi(inchi)

    if mol is not None:
        # Generate Morgan Fingerprint
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        # Convert fingerprint to bit string
        morgan_fp_str = ''.join(map(str, morgan_fp.ToBitString()))
        # Append the fingerprint to the list
        mfp_list.append(morgan_fp_str)
    else:
        mfp_list.append(None)

# Add a column to store the Morgan Fingerprints
candidates_names['MorganFingerprint'] = mfp_list

display(candidates_names)

In [None]:
def find_optimal_compound(group):
    # Find the compound with the lowest avg_activity
    return group.loc[group['avg_activity'].idxmin()]

def find_optimal_compounds(df):
    # Group by 'common_name' (which represents the gene) and apply the find_optimal_compound function
    optimal_compounds = df.groupby('common_name').apply(find_optimal_compound)
    
    # Remove the group keys (optional, based on preference)
    optimal_compounds.reset_index(drop=True, inplace=True)
    
    return optimal_compounds

optimal_compounds_df = find_optimal_compounds(candidates)
display(optimal_compounds_df)

In [None]:
# List of synonyms
synonyms_list = optimal_compounds_df['Synonyms'].tolist()

# Filter out non-string values
filtered_synonyms_list = [synonym for synonym in synonyms_list if isinstance(synonym, str)]

# Count occurrences of each synonym
synonyms_count = Counter(filtered_synonyms_list)

# Sort the synonyms alphabetically
sorted_synonyms = sorted(synonyms_count.items())

# Print the sorted list of synonyms with their counts
for synonym, count in sorted_synonyms:
    print(f"{synonym}: {count}")
