Initial Preparation

In [1]:
# Put data we want to read in working directory
pip install pandas # software library for data manipulation and analysis: merge, drop, etc




[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import zipfile # to access and read zip file

In [3]:
import pandas as pd # import pandas and use pd as alias

In [4]:
pip install requests # The requests module allows you to send HTTP requests using Python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


ChemBL Data Preparation

In [5]:
# retrieve chembl data directly from the web
import requests
chembl_file = requests.get("https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.zip")

In [6]:
# check if the download link is expired (200 is okay, 404 is expired)
chembl_file

<Response [200]>

In [7]:
def get_data(url):
    import requests
    import zipfile
    import io
    response = requests.get(url)
    if response.status_code == 200:
        # Reading zip file from requests response
        zip_file = zipfile.ZipFile(io.BytesIO(response.content))
        # Printing file content in zip
        list_of_files = zip_file.namelist()
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        # Extracting file from zip
        zip_file.extractall()
        print("Successfully extracted zip file.")
    else:
        print("Failed to download file. Response:", response.status_code)
    return file_name

In [8]:
chembl_file = get_data("https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=/DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.zip")

ZIP file content:
DOWNLOAD-Mb6a6SKB66gYEYj_xcN99vizbpIwDbHy6TPIjg-ZkQA=.csv
Successfully extracted zip file.


In [9]:
chembl_data = pd.read_csv(chembl_file, sep=';')
# inspect few elements on the top of table to check the structure of the table
chembl_data.head()

Unnamed: 0,Parent Molecule,Name,Synonyms,Research Codes,Phase,Drug Applicants,USAN Stem,USAN Year,USAN Definition,USAN Stem - Substem,...,Passes Rule of Five,First In Class,Chirality,Prodrug,Oral,Parenteral,Topical,Black Box,Availability Type,Smiles
0,CHEMBL492491,FENVALERATE,BELMARK|FENVALERATE|PHENVALERATE|PYDRIN|S-5602...,S-5602|SD-43775|WL-43775,-1.0,,,,,,...,0,0,Racemic Mixture,0,0,0,0,0,Unknown,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,CHEMBL2103777,THIOPHANATE,NSC-170810|THIOPHANATE,NSC-170810,-1.0,,,,,,...,1,0,Achiral Molecule,0,0,0,0,0,Unknown,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,CHEMBL1868702,GESTRINONE,A 46 745|A-46-745|A-46745|DIMETRIOSE|GESTRINON...,A 46 745|A-46-745|A-46745|R 2323|R-2323|RU 232...,4.0,,'-estr-; -rinone',1978.0,estrogens; cardiotonics (amrinone type),'-estr-(-estr-); -rinone(-rinone)',...,1,0,Single Stereoisomer,0,0,0,0,0,Unknown,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,CHEMBL2104213,DESMENINOL,DESMENINOL|METHIONINE HYDROXY ANALOG,,-1.0,,,,,,...,1,0,Racemic Mixture,0,0,0,0,0,Unknown,CSCCC(O)C(=O)O
4,CHEMBL1904952,QUAZODINE,MJ 1988|MJ-1988|QUAZODINE,MJ 1988|MJ-1988,-1.0,,,1968.0,,,...,1,0,Achiral Molecule,0,0,0,0,0,Unknown,CCc1ncnc2cc(OC)c(OC)cc12


In [10]:
# get only the name and smiles column
chembl_data_smiles = chembl_data[['Name', 'Smiles']]
chembl_data_smiles.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [11]:
# inspect number of rows
chembl_data_smiles.shape

(14805, 2)

In [12]:
# check if there are any duplicated data
sum(chembl_data_smiles.duplicated(subset='Smiles') == True)

3914

In [13]:
# drop duplicated data
chembl_smiles = chembl_data_smiles.drop_duplicates(subset='Smiles')
# inspect shape of the data after duplicates removed
chembl_smiles.shape

(10891, 2)

PubChem Data Preparation

In [14]:
def get_json_data(url, file_name):
    "Function to retrieve data from pubchem in json format"
    import requests
    res = requests.get(url)
    if res.status_code == 200:
        with open(f'{file_name}.json', 'wb') as json_file:
            json_file.write(res.content)
        print("Successfully downloaded json data")
    else:
        print(f"Failed to download data. Response: {res.status_code}")

In [15]:
# url from pubchem
json_url = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json&query={%22download%22:%22*%22,%22collection%22:%22compound%22,%22where%22:{%22ands%22:[{%22input%22:{%22type%22:%22netcachekey%22,%22idtype%22:%22cid%22,%22key%22:%22M3SV183lqFmfc6pqKBLjQadebD5DsiwkVgE3aE0QJWlNCRk%22}}]},%22order%22:[%22relevancescore,desc%22],%22start%22:1,%22limit%22:10000000,%22downloadfilename%22:%22PubChem_compound_list_M3SV183lqFmfc6pqKBLjQadebD5DsiwkVgE3aE0QJWlNCRk%22}"
# json file name
json_filename = "pubchem_data_json"

In [16]:
get_json_data(json_url, json_filename)
# Proceed to manually clean the data 

Successfully downloaded json data


In [20]:
# If not doing the manual cleaning, there will be error in the reading
# read json data using pandas
pubchem_json = pd.read_json('pubchem_data.json')
pubchem_json.to_csv('pubchem_data.csv')

In [23]:
pubchem_data = pd.read_csv('pubchem_data.csv')
pubchem_data.head()

Unnamed: 0.1,Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,...,gpfamilycnt,neighbortype,meshheadings,annothits,annothitcnt,cidcdate,sidsrcname,depcatg,annotation,aids
0,0,1,Acetylcarnitine,"['Acetyl-DL-carnitine', 'acetylcarnitine', 'DL...",203.24,C9H17NO4,66.4,214.0,0.4,14,...,1232,2D+3D,Acetylcarnitine,"['Classification', 'Drug and Medication Inform...",9,2005-06-23,"['3WAY PHARM INC', 'A2B Chem', 'AA BLOCKS', 'A...","['Chemical Vendors', 'Curation Efforts', 'Gove...",['D002491 - Central Nervous System Agents > D0...,
1,1,2,"1-Propanaminium, 2-(acetyloxy)-3-carboxy-N,N,N...","['14992-62-2', '[2-(ACETYLOXY)-3-CARBOXYPROPYL...",204.24,C9H18NO4+,63.6,219.0,-0.3,14,...,1359,2D+3D,,"['Biological Test Results', 'Chemical and Phys...",9,2005-06-23,"['001Chemical', '1st Scientific', '3B Scientif...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[781325, 1272365, 1671498]"
2,2,6,"1-Chloro-2,4-dinitrobenzene","['1-chloro-2,4-dinitrobenzene', '2,4-Dinitroch...",202.55,C6H3ClN2O4,91.6,224.0,2.3,13,...,4998,2D+3D,Dinitrochlorobenzene,"['Biological Test Results', 'Chemical and Phys...",14,2005-03-26,"['3B Scientific (Wuhan) Corp', '3WAY PHARM INC...","['Chemical Vendors', 'Curation Efforts', 'Gove...",['C308 - Immunotherapeutic Agent > C2139 - Imm...,"[155, 157, 161, 165, 167, 175, 179, 192, 220, ..."
3,3,11,"1,2-Dichloroethane","['1,2-dichloroethane', 'Ethylene dichloride', ...",98.96,C2H4Cl2,0.0,6.0,1.5,4,...,3450,2D+3D,,"['Agrochemical Information', 'Biological Test ...",17,2004-09-16,"['001Chemical', '1st Scientific', '3B Scientif...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[421, 426, 427, 433, 434, 435, 445, 530, 540, ..."
4,4,34,2-Chloroethanol,"['2-chloroethanol', 'Ethylene chlorohydrin', '...",80.51,C2H5ClO,20.2,10.0,-0.1,4,...,17847,2D+3D,Ethylene Chlorohydrin,"['Biological Test Results', 'Chemical and Phys...",14,2005-03-26,"['3B Scientific (Wuhan) Corp', 'A2B Chem', 'AA...","['Chemical Vendors', 'Curation Efforts', 'Gove...",,"[256, 1188, 384212, 651631, 651632, 651633, 65..."


In [26]:
# get smiles data
pubchem_smiles = pubchem_data[['cmpdname', 'canonicalsmiles']]
pubchem_smiles.columns = ['Name', 'Smiles']
pubchem_smiles.head()

Unnamed: 0,Name,Smiles
0,Acetylcarnitine,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,"1-Propanaminium, 2-(acetyloxy)-3-carboxy-N,N,N...",CC(=O)OC(CC(=O)O)C[N+](C)(C)C
2,"1-Chloro-2,4-dinitrobenzene",C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
3,"1,2-Dichloroethane",C(CCl)Cl
4,2-Chloroethanol,C(CCl)O


In [29]:
pubchem_smiles.shape

(20096, 2)

Merging ChEMBL and PubChem Data

In [27]:
# join data from pubchem and chembl
smiles_data = pd.concat([chembl_data_smiles, pubchem_smiles], axis=0)
smiles_data.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [28]:
# inspect shape of the data after concatenation
smiles_data.shape

(34901, 2)

In [30]:
# check and delete duplicates
print(f"Number of duplicates: {sum(smiles_data.duplicated(subset='Smiles') == True)}")
smiles_data.drop_duplicates(subset='Smiles', inplace=True)
# inspect data's shape after dropping duplicates
smiles_data.shape

Number of duplicates: 8903


(25998, 2)

In [31]:
smiles_data.head()

Unnamed: 0,Name,Smiles
0,FENVALERATE,CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(C...
1,THIOPHANATE,CCOC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OCC
2,GESTRINONE,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...
3,DESMENINOL,CSCCC(O)C(=O)O
4,QUAZODINE,CCc1ncnc2cc(OC)c(OC)cc12


In [32]:
# save cleaned data
smiles_data.to_csv('smiles_pubchem_chembl.csv')

DrugCentral Data Preparation

In [33]:
def get_tsv_data(url, file_name):
    import requests
    res = requests.get(url)
    if res.status_code == 200:
        with open(f'{file_name}.tsv', 'wb') as tsv_file:
            tsv_file.write(res.content)
        print("Successfully downloaded tsv data.")
    else:
        print("Failed to download tsv data.")

In [34]:
get_tsv_data("https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/structures.smiles.tsv", "drugcentral_data")

Successfully downloaded tsv data.


In [36]:
drugcentral_data = pd.read_table('drugcentral_data.tsv')
drugcentral_data.head()

Unnamed: 0,SMILES,InChI,InChIKey,ID,INN,CAS_RN
0,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...,InChI=1S/C23H17FN6O/c1-25-22(31)18-6-5-16(11-1...,LIOLIMKSCNQPLV-UHFFFAOYSA-N,5392,capmatinib,1029712-80-8
1,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...,"InChI=1S/C29H31N7O3/c1-29(2,37)18-39-24-9-25(2...",XIIOFHFUYBLOLW-UHFFFAOYSA-N,5393,selpercatinib,2152628-33-4
2,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...,InChI=1S/C24H21BrFN5O2/c1-3-31-21-12-22(27-2)2...,CEFJVGZHQAGLHS-UHFFFAOYSA-N,5394,ripretinib,1442472-39-0
3,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...,InChI=1S/C18H23FO2/c1-18-7-6-13-12-5-3-11(20)8...,KDLLNMRYZGUVMA-ZYMZXAKXSA-N,5395,fluoroestradiol F 18,94153-53-4
4,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4,InChI=1S/C16H10FN3/c17-16-4-2-11(8-19-16)10-1-...,GETAAWDSFUCLBS-SJPDSGJFSA-N,5396,flortaucipir F 18,1522051-90-6


In [37]:
# get name and smiles column
drugcentral_smiles = drugcentral_data[['INN', 'SMILES']]
drugcentral_smiles.columns = ['Name', 'Smiles']
drugcentral_smiles.head()

Unnamed: 0,Name,Smiles
0,capmatinib,CNC(=O)C1=C(C=C(C=C1)C2=NN3C(=CN=C3N=C2)CC4=CC...
1,selpercatinib,CC(C)(COC1=CN2C(=C(C=N2)C#N)C(=C1)C3=CN=C(C=C3...
2,ripretinib,CCN1C2=CC(=NC=C2C=C(C1=O)C3=CC(=C(C=C3Br)F)NC(...
3,fluoroestradiol F 18,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@@H]2O)[1...
4,flortaucipir F 18,C1=CC2=C(C=C1C3=CN=C(C=C3)[18F])NC4=C2C=NC=C4


In [38]:
drugcentral_smiles.shape

(4099, 2)

Merging DrugCentral and Primary Data

In [39]:
# join data with our primary smiles data
smiles_data = pd.concat([smiles_data, drugcentral_smiles], axis=0)
smiles_data.shape

(30097, 2)

In [40]:
# drop duplicates
print(f"Number of duplicates: {sum(smiles_data.duplicated(subset='Smiles') == True)}")
smiles_data.drop_duplicates(subset='Smiles', inplace=True)

Number of duplicates: 323


In [60]:
# number of rows after duplicates removed
smiles_data.shape

(29774, 2)

In [41]:
# save data to csv
smiles_data.to_csv('smiles_data_pub_chembl_drugcent.csv')

In [44]:
import pandas as pd
smiles_data = pd.read_csv('smiles_data_pub_chembl_drugcent.csv')
smiles_dropna = smiles_data.dropna(subset=['Smiles'])

In [45]:
sum(smiles_dropna.isna()['Smiles'] == True)

0

In [46]:
pip install rdkit





[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
smiles_dropna.shape

(29766, 3)

In [48]:
from rdkit.Chem import PandasTools

In [49]:
smiles_data.describe()

Unnamed: 0.1,Unnamed: 0
count,29767.0
mean,7584.610777
std,5425.326286
min,0.0
25%,2795.0
50%,6598.0
75%,12035.5
max,20095.0


In [50]:
# TODO: for testing the error
from rdkit.Chem import PandasTools

filenumber = 1
i = 0
batch_size = 1000

while i < len(smiles_dropna):
    slc = smiles_dropna.iloc[i:i+batch_size, :]
    try:
        PandasTools.AddMoleculeColumnToFrame(slc, 'Smiles', 'Molecule', includeFingerprints=True)
        PandasTools.WriteSDF(slc, f'./sdf/smiles_data{filenumber}.sdf', molColName='Molecule', idName='Name')
        filenumber += 1
    except Exception as e:
        print(f"Failed to write batch-{filenumber}: {e.args[0]}")
        break
    finally:
        i += batch_size


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [1]:
from rdkit.Chem import PandasTools

filenumber = 1
i = 0
batch_size = 1000

while i < len(smiles_data):
    slc = smiles_data.iloc[i:i+batch_size, :]
    try:
        PandasTools.AddMoleculeColumnToFrame(slc, 'Smiles', 'Molecule', includeFingerprints=True)
        PandasTools.WriteSDF(slc, f'./sdf/smiles_data{filenumber}.sdf', molColName='Molecule', idName='Name')
        filenumber += 1
    except Exception as e:
        print(f"Failed to write batch-{filenumber}: {e.args[0]}")
    finally:
        i += batch_size


NameError: name 'smiles_data' is not defined

In [None]:
data_cleaning = pd.read_csv('smiles_data_pub_chembl_drugcent.csv')
data_cleaning.loc[18:30, :]