In [7]:
import pandas as pd

In [8]:
def get_data(url):
    import requests
    import zipfile
    import io
    response = requests.get(url)
    if response.status_code == 200:
        # Reading zip file from requests response
        zip_file = zipfile.ZipFile(io.BytesIO(response.content))
        # Printing file content in zip
        list_of_files = zip_file.namelist()
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        # Extracting file from zip
        zip_file.extractall()
        print("Successfully extracted zip file.")
    else:
        print("Failed to download file. Response:", response.status_code)
    return file_name

In [9]:
# copy url for csv download here
# TODO: the urls need to be updated
urls = [
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-TyK57H3Jm1mAnrOx9bQFst7sKvrpnaFiECEPv9KxldU=/DOWNLOAD-TyK57H3Jm1mAnrOx9bQFst7sKvrpnaFiECEPv9KxldU=.zip",
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-m3ajtVTYhqY3Lf7BZNMh8CBOrmJABV8KGN0k2SAwvdY=/DOWNLOAD-m3ajtVTYhqY3Lf7BZNMh8CBOrmJABV8KGN0k2SAwvdY=.zip",
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-_1WCrliOUA5F3JVss-saOqsaJ8-pwb9NLfgygYmLUsI=/DOWNLOAD-_1WCrliOUA5F3JVss-saOqsaJ8-pwb9NLfgygYmLUsI=.zip",
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=/DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=.zip",
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=/DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=.zip",
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-8Y8JMmUmdxLExJ8oxA-wt5atjTHH3HJ8uRKtlObEt0M=/DOWNLOAD-8Y8JMmUmdxLExJ8oxA-wt5atjTHH3HJ8uRKtlObEt0M=.zip"
]

In [10]:
file_names = []
for url in urls:
    # store file name in file_names
    file_names.append(get_data(url))

ZIP file content:
DOWNLOAD-TyK57H3Jm1mAnrOx9bQFst7sKvrpnaFiECEPv9KxldU=.csv
Successfully extracted zip file.
ZIP file content:
DOWNLOAD-m3ajtVTYhqY3Lf7BZNMh8CBOrmJABV8KGN0k2SAwvdY=.csv
Successfully extracted zip file.
ZIP file content:
DOWNLOAD-_1WCrliOUA5F3JVss-saOqsaJ8-pwb9NLfgygYmLUsI=.csv
Successfully extracted zip file.
ZIP file content:
DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=.csv
Successfully extracted zip file.
ZIP file content:
DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=.csv
Successfully extracted zip file.
ZIP file content:
DOWNLOAD-8Y8JMmUmdxLExJ8oxA-wt5atjTHH3HJ8uRKtlObEt0M=.csv
Successfully extracted zip file.


In [68]:
# In case you don't want to redownload the csv files online
# run this block to read csv file names available in current directory
# if you ran previous block above, just skip this block
from pathlib import Path
file_names = []
files = Path('./')
for file in files.iterdir():
    if 'DOWNLOAD' in file.name:
        file_names.append(file.name)

In [12]:
def process_data(files):
    import numpy as np
    # initiate data_main, main data frame that will be used to aggregate all csv
    # only use chembl ID, smile, and standard value columns
    data_main = pd.DataFrame({'Molecule ChEMBL ID':[], 'Smiles':[], 'Standard Value':[]})
    # for every csv downloaded, do these following tasks
    for file in files:
        # read data
        data = pd.read_csv(file, sep=';')
        # drop rows if Smiles or Standard Value is null
        removed_null = data.dropna(subset=['Smiles', 'Standard Value'])
        # slice data from initial data frame. only use columns in data_main
        sliced = removed_null[data_main.columns]
        # convert IC50 to pIC50
        # convert IC50 from nM to M
        sliced.loc[:, ['Standard Value']] = sliced[['Standard Value']].apply(lambda x: x*10**-9, axis=1)
        # transform to pIC50
        sliced.loc[:, ['Standard Value']] = -np.log10(sliced[['Standard Value']])
        # join data
        data_main = pd.concat([data_main, sliced])
    # drop duplicates according to Smiles column
    data_main = data_main.drop_duplicates(subset=['Smiles'])
    return data_main
        

In [13]:
# store result of data processing in df_main
df_main = process_data(file_names)

In [14]:
df_main

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3318761,C[C@]1(CC(=O)O)C[C@H](c2cccc(Cl)c2)[C@@H](c2cc...,10.229148
1,CHEMBL3318777,CN(C1CC1)S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(CC(=...,9.420216
2,CHEMBL3318779,C[C@@H]1CCCN1S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(...,9.823909
3,CHEMBL3318780,C[C@H]1CCCN1S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(C...,8.958607
4,CHEMBL3318781,CN1CCN(S(=O)(=O)C[C@@H](N2C(=O)[C@@](C)(CC(=O)...,10.130768
...,...,...,...
200,CHEMBL1256067,CC(C)COC(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc...,3.522879
202,CHEMBL3653163,COc1ccncc1-c1nc2c(n1C(C)C)C(c1ccc(Cl)cc1C)N(c1...,5.149354
206,CHEMBL1256053,CC(C)C[C@H](NC(=O)OC(C)(C)C)C(=O)N[C@@H](CCCNC...,3.716699
225,CHEMBL1256062,CC(C)(C)OC(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](...,3.512862


In [15]:
df_main.describe()

Unnamed: 0,Standard Value
count,3447.0
mean,7.28275
std,1.783399
min,2.992252
25%,5.869714
50%,7.522879
75%,8.744727
max,10.39794


In [16]:
df_main.isna().describe()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
count,3447,3447,3447
unique,1,1,1
top,False,False,False
freq,3447,3447,3447


In [17]:
# save df_main to a csv file
df_main.to_csv('data_train.csv')

In [18]:
data = pd.read_csv('data_train.csv', index_col=0)

In [19]:
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3318761,C[C@]1(CC(=O)O)C[C@H](c2cccc(Cl)c2)[C@@H](c2cc...,10.229148
1,CHEMBL3318777,CN(C1CC1)S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(CC(=...,9.420216
2,CHEMBL3318779,C[C@@H]1CCCN1S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(...,9.823909
3,CHEMBL3318780,C[C@H]1CCCN1S(=O)(=O)C[C@@H](N1C(=O)[C@@](C)(C...,8.958607
4,CHEMBL3318781,CN1CCN(S(=O)(=O)C[C@@H](N2C(=O)[C@@](C)(CC(=O)...,10.130768
