In [1]:
import pandas as pd

In [2]:
def get_data(url, timeout=60):
    """Download a ZIP archive and extract it into the current directory.

    The names of all archive members are printed before extraction.

    Parameters
    ----------
    url : str
        Address of the ZIP archive to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without a timeout a stalled connection would
        block the kernel indefinitely.

    Returns
    -------
    str
        Name of the last member listed in the archive. For an archive that
        holds a single file this is the name of the extracted file.

    Raises
    ------
    RuntimeError
        If the server does not answer with HTTP 200, or if the archive
        contains no members.
    """
    import requests
    import zipfile
    import io

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to download file. Response:", response.status_code)
        # Fail loudly here: there is no file name to hand back to the caller.
        raise RuntimeError(
            f"Failed to download {url!r}: HTTP status {response.status_code}"
        )

    # Reading zip file from requests response; the context manager closes it
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        # Printing file content in zip
        list_of_files = zip_file.namelist()
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        if not list_of_files:
            raise RuntimeError(f"ZIP archive downloaded from {url!r} is empty")
        # Extracting file from zip
        zip_file.extractall()
    print("Successfully extracted zip file.")
    return list_of_files[-1]

In [3]:
# Paste the CSV download link(s) here, one string per archive to fetch.
# TODO: the urls need to be updated
urls = [
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-IJvMxRB-MW88A6s5f5hbeZA8F_TNpbhszCzd5L0WXks=/DOWNLOAD-IJvMxRB-MW88A6s5f5hbeZA8F_TNpbhszCzd5L0WXks=.zip",
]

In [4]:
# Download and unpack every archive in `urls`, collecting the file name
# that get_data reports for each one.
file_names = list()
for url in urls:
    extracted_name = get_data(url)
    file_names.append(extracted_name)

ZIP file content:
DOWNLOAD-IJvMxRB-MW88A6s5f5hbeZA8F_TNpbhszCzd5L0WXks=.csv
Successfully extracted zip file.


In [68]:
# Alternative to downloading: reuse the files already present in the
# current directory. Skip this cell if the download cell above was run.
from pathlib import Path

files = Path('./')
# iterdir() yields entries in arbitrary order and also returns directories,
# so keep regular files only and sort the names to make the list (and the
# later duplicate removal, which keeps the first occurrence) reproducible.
file_names = sorted(
    file.name
    for file in files.iterdir()
    if file.is_file() and 'DOWNLOAD' in file.name
)

In [5]:
def process_data(files):
    """Aggregate semicolon-separated CSV files into one table of pIC50 values.

    For every file the rows with a missing ``Smiles`` or ``Standard Value``
    are dropped, only the three columns listed below are kept, and
    ``Standard Value`` is converted to pIC50 as ``-log10(value * 1e-9)``.
    The per-file tables are then stacked and de-duplicated on ``Smiles``
    (the first occurrence is kept).

    NOTE(review): the conversion treats ``Standard Value`` as an IC50 in nM;
    the function does not check any type/unit column - confirm the input
    files only contain such measurements.

    Parameters
    ----------
    files : iterable of str or path-like
        CSV files to read; each must contain the columns
        ``Molecule ChEMBL ID``, ``Smiles`` and ``Standard Value``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Molecule ChEMBL ID``, ``Smiles``, ``Standard Value``
        (now holding pIC50). Row labels are the original row labels of the
        source files. An empty frame with these columns is returned when
        ``files`` is empty.
    """
    import numpy as np

    # only use chembl ID, smile, and standard value columns
    columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value']
    frames = []
    # for every csv downloaded, do these following tasks
    for file in files:
        # read data
        data = pd.read_csv(file, sep=';')
        # drop rows if Smiles or Standard Value is null
        removed_null = data.dropna(subset=['Smiles', 'Standard Value'])
        # A non-positive value has no finite logarithm; drop such rows
        # instead of letting inf/NaN leak into the result.
        positive = removed_null['Standard Value'] > 0
        # Explicit copy so the assignment below writes to an independent
        # frame rather than to a slice of `removed_null`.
        sliced = removed_null.loc[positive, columns].copy()
        # convert IC50 from nM to M, then transform to pIC50 (vectorised)
        sliced['Standard Value'] = -np.log10(sliced['Standard Value'] * 10**-9)
        frames.append(sliced)

    if not frames:
        return pd.DataFrame({name: [] for name in columns})

    # join data once, after the loop, instead of re-concatenating per file
    data_main = pd.concat(frames)
    # drop duplicates according to Smiles column
    return data_main.drop_duplicates(subset=['Smiles'])
        

In [6]:
# Run the aggregation over every collected file and keep the combined
# table (ChEMBL ID, SMILES, pIC50) in df_main
df_main = process_data(file_names)

In [7]:
# Show the aggregated table returned by process_data
df_main

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3649851,Cn1ncnc1C1CCCN1c1nc2cc(-c3noc(=O)[nH]3)nc(-c3c...,7.677781
1,CHEMBL3649993,COCC(C)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3cncc(Cl)c...,8.045757
2,CHEMBL3650023,O=c1[nH]c(-c2cc3nc(N4CCO[C@@H]5CCC[C@H]54)n(Cc...,7.408935
3,CHEMBL3653333,CCC(CSC)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3cncc(Cl)...,8.698970
4,CHEMBL3653334,CCC(CS(C)(=O)=O)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3...,8.154902
...,...,...,...
349,CHEMBL3952519,C[C@H]1CCCN1c1nc2cc(-c3nc(=O)o[nH]3)nc(-c3cncc...,8.397940
350,CHEMBL3649854,Cc1noc(C)c1C1CCCN1c1nc2cc(-c3noc(=O)[nH]3)nc(-...,7.958607
351,CHEMBL4111957,CC1CCC(Cn2c(N3CCO[C@@H]4CNCC[C@H]43)nc3cc(-c4n...,6.290730
354,CHEMBL3649940,CC(C)OC(=O)N1C[C@@H](C)N(c2nc3cc(-c4noc(=O)[nH...,8.522879


In [8]:
# Summary statistics of the numeric column(s) of df_main
df_main.describe()

Unnamed: 0,Standard Value
count,263.0
mean,8.265027
std,0.631496
min,6.29073
25%,7.886057
50%,8.39794
75%,8.69897
max,9.0


In [9]:
# Sanity check for missing values: describe() on the boolean isna() mask;
# a column with a single unique value of False contains no nulls
df_main.isna().describe()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
count,263,263,263
unique,1,1,1
top,False,False,False
freq,263,263,263


In [11]:
# Save df_main to 'data_train.csv' in the current directory
# (to_csv also writes the row index as the first column by default)
df_main.to_csv('data_train.csv')

In [12]:
# Reload the saved table; index_col=0 uses the first CSV column as the index
data = pd.read_csv('data_train.csv', index_col=0)

In [13]:
# Preview the first rows of the reloaded table
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3649851,Cn1ncnc1C1CCCN1c1nc2cc(-c3noc(=O)[nH]3)nc(-c3c...,7.677781
1,CHEMBL3649993,COCC(C)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3cncc(Cl)c...,8.045757
2,CHEMBL3650023,O=c1[nH]c(-c2cc3nc(N4CCO[C@@H]5CCC[C@H]54)n(Cc...,7.408935
3,CHEMBL3653333,CCC(CSC)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3cncc(Cl)...,8.69897
4,CHEMBL3653334,CCC(CS(C)(=O)=O)c1nc2cc(-c3noc(=O)[nH]3)nc(-c3...,8.154902
