Initial Preparation

In [None]:
# The requests module allows you to send HTTP requests using Python
pip install requests 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# to access and read zip file
import zipfile

In [1]:
import pandas as pd

ChemBL Data Preparation

In [7]:
def get_data(url):
    import requests
    import zipfile
    import io
    response = requests.get(url)
    if response.status_code == 200:
        # Reading zip file from requests response
        zip_file = zipfile.ZipFile(io.BytesIO(response.content))
        # Printing file content in zip
        list_of_files = zip_file.namelist()
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        # Extracting file from zip
        zip_file.extractall()
        print("Successfully extracted zip file.")
    else:
        print("Failed to download file. Response:", response.status_code)
    return file_name

In [None]:
# retrieve chembl data directly from the web
chembl_file = get_data("https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.zip")

ZIP file content:
DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.csv
Successfully extracted zip file.


In [None]:
chembl_data = pd.read_csv(chembl_file, sep=';')
chembl_data.head()

In [17]:
# In case you don't want to redownload the csv files online
# run this block to read chembl csv file names available in current directory
# if you ran previous block above, just skip this block

chembl_data = pd.read_csv('DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.csv', sep=';')
chembl_data.head()

Unnamed: 0,Parent Molecule,Name,Synonyms,Research Codes,Phase,Drug Applicants,USAN Stem,USAN Year,USAN Definition,USAN Stem - Substem,...,Passes Rule of Five,First In Class,Chirality,Prodrug,Oral,Parenteral,Topical,Black Box,Availability Type,Smiles
0,CHEMBL492491,FENVALERATE,BELMARK|FENVALERATE|PHENVALERATE|PYDRIN|S-5602...,S-5602|SD-43775|WL-43775,-1.0,,,,,,...,0,0,Racemic Mixture,0,0,0,0,0,Unknown,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,CHEMBL2103777,THIOPHANATE,NSC-170810|THIOPHANATE,NSC-170810,-1.0,,,,,,...,1,0,Achiral Molecule,0,0,0,0,0,Unknown,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,CHEMBL1868702,GESTRINONE,A 46 745|A-46-745|A-46745|DIMETRIOSE|GESTRINON...,A 46 745|A-46-745|A-46745|R 2323|R-2323|RU 232...,4.0,,'-estr-; -rinone',1978.0,estrogens; cardiotonics (amrinone type),'-estr-(-estr-); -rinone(-rinone)',...,1,0,Single Stereoisomer,0,0,0,0,0,Unknown,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,CHEMBL2104213,DESMENINOL,DESMENINOL|METHIONINE HYDROXY ANALOG,,-1.0,,,,,,...,1,0,Racemic Mixture,0,0,0,0,0,Unknown,CSCCC(O)C(=O)O
4,CHEMBL1904952,QUAZODINE,MJ 1988|MJ-1988|QUAZODINE,MJ 1988|MJ-1988,-1.0,,,1968.0,,,...,1,0,Achiral Molecule,0,0,0,0,0,Unknown,CCc1ncnc2cc(OC)c(OC)cc12


In [18]:
# get only the name and smiles column
chembl_data_smiles = chembl_data[['Name', 'Smiles']]
chembl_data_smiles.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [21]:
# drop duplicated data
chembl_smiles = chembl_data_smiles.drop_duplicates(subset='Smiles')
# inspect shape of the data after duplicates removed
chembl_smiles.shape

(10891, 2)

PubChem Data Preparation

In [14]:
def get_json_data(url, file_name):
    "Function to retrieve data from pubchem in json format"
    import requests
    res = requests.get(url)
    if res.status_code == 200:
        with open(f'{file_name}.json', 'wb') as json_file:
            json_file.write(res.content)
        print("Successfully downloaded json data")
    else:
        print(f"Failed to download data. Response: {res.status_code}")

In [15]:
# url from pubchem
json_url = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json&query={%22download%22:%22*%22,%22collection%22:%22compound%22,%22where%22:{%22ands%22:[{%22input%22:{%22type%22:%22netcachekey%22,%22idtype%22:%22cid%22,%22key%22:%22M3SV183lqFmfc6pqKBLjQadebD5DsiwkVgE3aE0QJWlNCRk%22}}]},%22order%22:[%22relevancescore,desc%22],%22start%22:1,%22limit%22:10000000,%22downloadfilename%22:%22PubChem_compound_list_M3SV183lqFmfc6pqKBLjQadebD5DsiwkVgE3aE0QJWlNCRk%22}"
# json file name
json_filename = "pubchem_data_json"

In [16]:
get_json_data(json_url, json_filename)
# Proceed to manually clean the data 

Successfully downloaded json data


In [20]:
# If not doing the manual cleaning, there will be error in the reading
# read json data using pandas
pubchem_json = pd.read_json('pubchem_data.json')
pubchem_json.to_csv('pubchem_data.csv')

In [22]:
pubchem_data = pd.read_csv('pubchem_data.csv')
pubchem_data.head()

Unnamed: 0.1,Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,...,gpfamilycnt,neighbortype,meshheadings,annothits,annothitcnt,cidcdate,sidsrcname,depcatg,annotation,aids
0,0,1,Acetylcarnitine,"['Acetyl-DL-carnitine', 'acetylcarnitine', 'DL...",203.24,C9H17NO4,66.4,214.0,0.4,14,...,1232,2D+3D,Acetylcarnitine,"['Classification', 'Drug and Medication Inform...",9,2005-06-23,"['3WAY PHARM INC', 'A2B Chem', 'AA BLOCKS', 'A...","['Chemical Vendors', 'Curation Efforts', 'Gove...",['D002491 - Central Nervous System Agents > D0...,
1,1,2,"1-Propanaminium, 2-(acetyloxy)-3-carboxy-N,N,N...","['14992-62-2', '[2-(ACETYLOXY)-3-CARBOXYPROPYL...",204.24,C9H18NO4+,63.6,219.0,-0.3,14,...,1359,2D+3D,,"['Biological Test Results', 'Chemical and Phys...",9,2005-06-23,"['001Chemical', '1st Scientific', '3B Scientif...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[781325, 1272365, 1671498]"
2,2,6,"1-Chloro-2,4-dinitrobenzene","['1-chloro-2,4-dinitrobenzene', '2,4-Dinitroch...",202.55,C6H3ClN2O4,91.6,224.0,2.3,13,...,4998,2D+3D,Dinitrochlorobenzene,"['Biological Test Results', 'Chemical and Phys...",14,2005-03-26,"['3B Scientific (Wuhan) Corp', '3WAY PHARM INC...","['Chemical Vendors', 'Curation Efforts', 'Gove...",['C308 - Immunotherapeutic Agent > C2139 - Imm...,"[155, 157, 161, 165, 167, 175, 179, 192, 220, ..."
3,3,11,"1,2-Dichloroethane","['1,2-dichloroethane', 'Ethylene dichloride', ...",98.96,C2H4Cl2,0.0,6.0,1.5,4,...,3450,2D+3D,,"['Agrochemical Information', 'Biological Test ...",17,2004-09-16,"['001Chemical', '1st Scientific', '3B Scientif...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[421, 426, 427, 433, 434, 435, 445, 530, 540, ..."
4,4,34,2-Chloroethanol,"['2-chloroethanol', 'Ethylene chlorohydrin', '...",80.51,C2H5ClO,20.2,10.0,-0.1,4,...,17847,2D+3D,Ethylene Chlorohydrin,"['Biological Test Results', 'Chemical and Phys...",14,2005-03-26,"['3B Scientific (Wuhan) Corp', 'A2B Chem', 'AA...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[256, 1188, 384212, 651631, 651632, 651633, 65..."


In [None]:
# get smiles data
pubchem_smiles = pubchem_data[['cmpdname', 'canonicalsmiles']]
pubchem_smiles.columns = ['Name', 'Smiles']

Unnamed: 0,Name,Smiles
0,Acetylcarnitine,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,"1-Propanaminium, 2-(acetyloxy)-3-carboxy-N,N,N...",CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,"1-Chloro-2,4-dinitrobenzene",C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
3,"1,2-Dichloroethane",C(CCl)Cl
4,2-Chloroethanol,C(CCl)O


Merging ChEMBL and PubChem Data

In [26]:
# join data from pubchem and chembl
smiles_data = pd.concat([chembl_data_smiles, pubchem_smiles], axis=0)
smiles_data.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [28]:
# check and delete duplicates
print(f"Number of duplicates: {sum(smiles_data.duplicated(subset='Smiles') == True)}")
smiles_data.drop_duplicates(subset='Smiles', inplace=True)
# inspect data's shape after dropping duplicates
smiles_data.shape

Number of duplicates: 8903


(25998, 2)

DrugCentral Data Preparation

In [33]:
def get_tsv_data(url, file_name):
    import requests
    res = requests.get(url)
    if res.status_code == 200:
        with open(f'{file_name}.tsv', 'wb') as tsv_file:
            tsv_file.write(res.content)
        print("Successfully downloaded tsv data.")
    else:
        print("Failed to download tsv data.")

In [34]:
get_tsv_data("https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/structures.smiles.tsv", "drugcentral_data")

Successfully downloaded tsv data.


In [30]:
drugcentral_data = pd.read_table('drugcentral_data.tsv')
drugcentral_data.head()

Unnamed: 0,SMILES,InChI,InChIKey,ID,INN,CAS_RN
0,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...,InChI=1S/C23H17FN6O/c1-25-22(31)18-6-5-16(11-1...,LIOLIMKSCNQPLV-UHFFFAOYSA-N,5392,capmatinib,1029712-80-8
1,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...,"InChI=1S/C29H31N7O3/c1-29(2,37)18-39-24-9-25(2...",XIIOFHFUYBLOLW-UHFFFAOYSA-N,5393,selpercatinib,2152628-33-4
2,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...,InChI=1S/C24H21BrFN5O2/c1-3-31-21-12-22(27-2)2...,CEFJVGZHQAGLHS-UHFFFAOYSA-N,5394,ripretinib,1442472-39-0
3,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...,InChI=1S/C18H23FO2/c1-18-7-6-13-12-5-3-11(20)8...,KDLLNMRYZGUVMA-ZYMZXAKXSA-N,5395,fluoroestradiol F 18,94153-53-4
4,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4,InChI=1S/C16H10FN3/c17-16-4-2-11(8-19-16)10-1-...,GETAAWDSFUCLBS-SJPDSGJFSA-N,5396,flortaucipir F 18,1522051-90-6


In [31]:
# get name and smiles column
drugcentral_smiles = drugcentral_data[['INN', 'SMILES']]
drugcentral_smiles.columns = ['Name', 'Smiles']
drugcentral_smiles.head()

Unnamed: 0,Name,Smiles
0,capmatinib,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...
1,selpercatinib,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...
2,ripretinib,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...
3,fluoroestradiol F 18,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...
4,flortaucipir F 18,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4


In [32]:
drugcentral_smiles.shape

(4099, 2)

Merging DrugCentral and Primary Data

In [33]:
# join data with our primary smiles data
smiles_data = pd.concat([smiles_data, drugcentral_smiles], axis=0)
smiles_data.shape

(30097, 2)

In [34]:
# drop duplicates
print(f"Number of duplicates: {sum(smiles_data.duplicated(subset='Smiles') == True)}")
smiles_data.drop_duplicates(subset='Smiles', inplace=True)

Number of duplicates: 323


In [35]:
# number of rows after duplicates removed
smiles_data.shape

(29774, 2)

DrugBank Data Preparation

In [36]:
drugbank_data = pd.read_csv('drugbank.csv')
drugbank_data.head()

Unnamed: 0,DrugBank ID,Name,CAS Number,Drug Groups,InChIKey,InChI,SMILES,Formula,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,ChEMBL ID,HET ID,ChemSpider ID,BindingDB ID
0,DB00006,Bivalirudin,128270-60-0,approved; investigational,OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,C98H138N24O33,,D03136,16129704.0,46507415.0,59173.0,CHEMBL2103749,,10482069.0,50248103.0
1,DB00007,Leuprolide,53714-56-0,approved; investigational,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,C59H84N16O12,C07612,D08113,,46507635.0,6427.0,CHEMBL1201199,,571356.0,50369395.0
2,DB00014,Goserelin,65807-02-5,approved,BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,C59H84N18O14,,D00573,5311128.0,46507336.0,5523.0,CHEMBL1201247,,4470656.0,
3,DB00027,Gramicidin D,1405-97-6,approved,NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,C96H135N19O16,,D04369,45267103.0,46507412.0,,CHEMBL557217,,24623445.0,
4,DB00035,Desmopressin,16679-58-6,approved,NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,C46H64N14O12S2,C06944,D00291,,,4450.0,CHEMBL1429,,4470602.0,50205308.0


In [37]:
drugbank_smiles = drugbank_data.loc[:, ['Name', 'SMILES']]
drugbank_smiles.columns = ['Name', 'Smiles']
drugbank_smiles.head()

Unnamed: 0,Name,Smiles
0,Bivalirudin,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,Leuprolide,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,Goserelin,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,Gramicidin D,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,Desmopressin,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...


In [38]:
drugbank_smiles.shape

(12695, 2)

Merging DrugBank and Primary Data

In [39]:
smiles_data = pd.concat([smiles_data, drugbank_smiles], axis=0)
smiles_data.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [40]:
# drop duplicates
print(f"Number of duplicates: {sum(smiles_data.duplicated(subset='Smiles') == True)}")
smiles_data.drop_duplicates(subset='Smiles', inplace=True)

Number of duplicates: 2596


In [41]:
# number of rows after duplicates removed
smiles_data.shape

(39873, 2)

In [42]:
# save data to csv
smiles_data.to_csv('smiles_data_pub_chembl_drugcent_drugbank.csv')

SMILES Refinement & Standardization

In [43]:
import pandas as pd
smiles_data = pd.read_csv('smiles_data_pub_chembl_drugcent_drugbank.csv')
smiles_dropna = smiles_data.dropna(subset=['Smiles'])

In [44]:
def data_cleaning(data:pd.DataFrame):
    """Drop rows containing null value"""
    print(f"Before dropping null values: {data.shape}")
    data = data.dropna()
    print(f"After dropping null values: {data.shape}")
    return data

In [45]:
def to_canonical_smiles(data: pd.DataFrame):
    """Function to transform smiles to canonical smiles"""
    from rdkit import Chem
    def transform_canonical(smile):
        try:
            return Chem.MolToSmiles(Chem.MolFromSmiles(smile))
        except:
            print(f"an error occurred: {smile}")
            return smile
    data = data_cleaning(data)
    data['Smiles'] = data['Smiles'].apply(transform_canonical)
    print(f'Before dropping duplicates: {data.shape}')
    data = data.drop_duplicates(subset=['Smiles'])
    print(f'After dropping duplicates: {data.shape}')
    return data

In [46]:
# remove multimolecule smiles
def data_selection(data:pd.DataFrame):
    def remove_multimolecule(smiles):
        import re
        import numpy as np
        pattern = r'^[^.]*$'
        if re.match(pattern, smiles):
            return smiles
        else:
            print(f'Removing molecule: {smiles}')
            return np.nan
    print(f"Before removing multimolecule smiles: {data.shape}")
    data.loc[:, ['Smiles']] = data['Smiles'].apply(remove_multimolecule)
    data = data.dropna(subset='Smiles')
    print(f"After removing multimolecule smiles: {data.shape}")
    return data

In [47]:
smiles_cleaned = data_cleaning(smiles_data)

Before dropping null values: (39873, 3)
After dropping null values: (39850, 3)


In [48]:
smiles_canonical = to_canonical_smiles(smiles_cleaned)

Before dropping null values: (39850, 3)
After dropping null values: (39850, 3)


[11:50:20] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:50:20] Explicit valence for atom # 1 Si, 8, is greater than permitted
[11:50:20] Explicit valence for atom # 3 Si, 8, is greater than permitted
[11:50:20] Explicit valence for atom # 1 Si, 8, is greater than permitted


an error occurred: F[Si-2](F)(F)(F)(F)F.[Zn+2]
an error occurred: F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]
an error occurred: [NH4+].[NH4+].F[Si-2](F)(F)(F)(F)F
an error occurred: F[Si-2](F)(F)(F)(F)F.[Ba+2]


[11:50:22] Explicit valence for atom # 1 Si, 8, is greater than permitted


an error occurred: F[Si-2](F)(F)(F)(F)F.[Ca+2]


[11:50:24] Explicit valence for atom # 28 Cl, 2, is greater than permitted


an error occurred: CCCOOCC.CNCCCCOC1CCCCC1CCC2CCCCC2ClC3CCCCC3


[11:50:28] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:50:28] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:50:28] Explicit valence for atom # 0 N, 4, is greater than permitted


an error occurred: [H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]
an error occurred: [H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[Pt]11OC(=O)C(=O)O1
an error occurred: [H][N]([H])([H])[Pt]1(OC(=O)C2(CCC2)C(=O)O1)[N]([H])([H])[H]


[11:50:28] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[11:50:28] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[11:50:28] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'
[11:50:28] Explicit valence for atom # 19 O, 3, is greater than permitted


an error occurred: NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O
an error occurred: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
an error occurred: COC1=CC=C2N(C=NC2=C1)[C@H]1O[C@H](CO)[C@@H](O[P@]([OH-])(=O)O[C@H](C)CNC(=O)CC[C@]2(C)[C@@H](CC(N)=O)C3=[N+]4C2=C(C)C2=[N+]5C(=CC6=[N+]7C(=C(C)C8=[N+]([C@]3(C)[C@@](C)(CC(N)=O)[C@@H]8CCC(N)=O)[Co@@]457)[C@@](C)(CC(N)=O)[C@@H]6CCC(N)=O)C(C)(C)[C@@H]2CCC(N)=O)[C@H]1O


[11:50:28] Explicit valence for atom # 6 N, 4, is greater than permitted
[11:50:28] Explicit valence for atom # 0 O, 3, is greater than permitted
[11:50:29] Explicit valence for atom # 3 N, 4, is greater than permitted
[11:50:29] Unusual charge on atom 0 number of radical electrons set to zero
[11:50:29] Explicit valence for atom # 4 F, 2, is greater than permitted


an error occurred: C1=CN(C=N1)[Os++]123[N]4=CC=CC=C4C4=[N]1C=CC=C4.C1=CC(C4=CC=CC=[N]24)=[N]3C=C1
an error occurred: [O]#C[Re+]1(C#[O])(C#[O])[N]2=CC=CC3=C2C2=C(C=CC=[N]12)C=C3
an error occurred: [OH2+][Cu-4]([OH2+])([N]1=CNC=C1)([N+]1=CNC=C1)([N+]1=CNC=C1)[N+]1=CNC=C1
an error occurred: F[Al](F)(F)[F-]


[11:50:29] Explicit valence for atom # 13 Be, 3, is greater than permitted
[11:50:30] Explicit valence for atom # 84 N, 4, is greater than permitted


an error occurred: CN(CCO[P@](O)(=O)O[P@@](O)(=O)O[Be-](F)(F)F)C1=CC=CC=C1[N+]([O-])=O
an error occurred: [57Co+3].[C-]#N.C[C@H](CNC(=O)CC[C@]1(C)[C@@H](CC(N)=O)C2[N-]\C1=C(C)/C1=N/C(=C\C3=N\C(=C(C)/C4=N[C@]2(C)[C@@](C)(CC(N)=O)[C@@H]4CCC(N)=O)\[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)/C(C)(C)[C@@H]1CCC(N)=O)OP([O-])(=O)O[C@@H]1[C@@H](CO)O[C@@H]([C@@H]1O)N1C=[NH]C2=C1C=C(C)C(C)=C2


[11:50:30] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[11:50:30] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
[11:50:30] Explicit valence for atom # 1 Cl, 4, is greater than permitted


an error occurred: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
an error occurred: O=[Cl]=O


[11:50:30] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:50:30] Explicit valence for atom # 5 K, 2, is greater than permitted


an error occurred: [H][N]([H])([H])[Pt]1(OCC(=O)O1)[N]([H])([H])[H]
an error occurred: [OH-].[OH-].[OH-].[Mg++].[Cl-].[K++].[K++].[Ca++].[O-]C([O-])=O


[11:50:31] Explicit valence for atom # 6 N, 4, is greater than permitted


an error occurred: [Na+].Cl[Ru](Cl)(Cl)(Cl)([N]1=CC2=CC=CC=C2N1)[N]1=CC2=CC=CC=C2N1
an error occurred: S[10B]1234[10B]567[10B]89%10[10B]%11%12%13[10B]585[10B]%118%11[10B]%12%12%14[10B]9%139[10B]16%10[10B]2%129[10B]38%14[10B]475%11
an error occurred: [NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[NH4+].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[O--].[Na++].[Na++].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[Sb+3].[W+6].[W+6].[W+6].[W+6].[W+6].[W+6].[W

[11:50:31] Explicit valence for atom # 1 B, 6, is greater than permitted
[11:50:31] Explicit valence for atom # 103 Na, 2, is greater than permitted


In [49]:
smiles_refined = data_selection(smiles_canonical)
smiles_refined.head()

Before removing multimolecule smiles: (29822, 3)
Removing molecule: O=C(O)c1ccccc1O.OCCN(CCO)CCO
Removing molecule: C[N+](C)(C)CCO.O=C([O-])[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO
Removing molecule: CC(O)C(=O)[O-].CC(O)C(=O)[O-].[Mg+2]
Removing molecule: O=C([O-])[O-].[Al+3].[Al+3].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[Mg+2].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-]
Removing molecule: C=CCCCCCCCCC(=O)[O-].C=CCCCCCCCCC(=O)[O-].[Ca+2]
Removing molecule: O=C([O-])c1ccccc1O.O=C([O-])c1ccccc1O.[Mg+2]
Removing molecule: CCCCC(=O)O.N
Removing molecule: O=P([O-])([O-])[O-].[Na+].[Na+].[Na+]
Removing molecule: NC(=S)NS(=O)(=O)c1ccc(N)cc1.NCc1ccc(S(N)(=O)=O)cc1
Removing molecule: O=S(=O)([O-])NC1CCCCC1.O=S(=O)([O-])NC1CCCCC1.[Ca+2]
Removing molecule: Cl.NCCCC[C@H](N)C(=O)O
Removing molecule: COCCOCCOCCOc1cccc(C2=N[C@@](C)(C(=O)[O-])CS2)c1O.O[Mg+]
Removing molecule: O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].[Al+3].[Al+3].[Al+3].[Al+3].[Al+3].[Mg+2].[Mg+2]

Unnamed: 0.1,Unnamed: 0,Name,Smiles
0,0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,3,DESMENINOL,CSCCC(O)C(=O)O
4,4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [50]:
smiles_refined.to_csv('d2_refined.csv')
# manual cleaning of problematic smiles

SMILES to SDF (for Structure-based virtual screening/molecular docking)

In [46]:
pip install rdkit





[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
def smiles_to_sdf(smiles_data):
    """Genrates sdf files from smiles data"""
    from rdkit.Chem import PandasTools

    filenumber = 1
    i = 0
    batch_size = 1000 

    while i < len(smiles_data):
        slc = smiles_data.iloc[i:i+batch_size, :]
        try:
            PandasTools.AddMoleculeColumnToFrame(slc, 'Smiles', 'Molecule', includeFingerprints=True)
            PandasTools.WriteSDF(slc, f'./sdf/smiles_data{filenumber}.sdf', molColName='Molecule', idName='Name')
            filenumber += 1
        except Exception as e:
            print(f"Failed to write batch-{filenumber}: {e.args[0]}")
            break
        finally:
            i += batch_size

In [None]:
import pandas as pd
d2_refined = pd.read_csv('d2_refined.csv')
d2_refined.head()

In [None]:
d2_refined.shape

In [None]:
# convert smiles to sdf
smiles_to_sdf(d2_refined)