In [1]:
import pandas as pd

In [2]:
def get_data(url, timeout=60):
    """Download a ZIP archive and extract it into the current directory.

    Parameters
    ----------
    url : str
        Address of the ZIP archive to download.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        server cannot hang the notebook indefinitely.

    Returns
    -------
    str
        Name of the last member listed in the archive.

    Raises
    ------
    RuntimeError
        If the server answers with a status code other than 200.
    ValueError
        If the archive contains no members.
    """
    import io
    import zipfile

    import requests

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to download file. Response:", response.status_code)
        # Fail loudly here: there is no file name to hand back to the caller.
        raise RuntimeError(
            "Download of {} failed with HTTP status {}".format(url, response.status_code)
        )

    # Read the archive straight from memory; the context manager closes it.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        list_of_files = zip_file.namelist()
        if not list_of_files:
            raise ValueError("ZIP archive downloaded from {} is empty".format(url))
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        # Extract every member into the current working directory.
        zip_file.extractall()
    print("Successfully extracted zip file.")
    return list_of_files[-1]

In [3]:
# Paste the download URL of each zipped CSV export here, one string per entry.
# Every entry is fetched and extracted by get_data() in the next cell.
# TODO: the urls need to be updated
# NOTE(review): the link below points at a ChEMBL "delayed_jobs" output and
# presumably stops working after some time - confirm before re-running.
urls = [
    "https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/DOWNLOAD-C6Hr8avtZQsO1U7kXtCel5iuG7Z-TjKpkwU6SwmXM00=/DOWNLOAD-C6Hr8avtZQsO1U7kXtCel5iuG7Z-TjKpkwU6SwmXM00=.zip"
]

In [4]:
# Download and extract each archive in turn, keeping the file name that
# get_data reports for it.
file_names = [get_data(archive_url) for archive_url in urls]

ZIP file content:
DOWNLOAD-C6Hr8avtZQsO1U7kXtCel5iuG7Z-TjKpkwU6SwmXM00=.csv
Successfully extracted zip file.


In [68]:
# Optional: rebuild file_names from CSV files that were already extracted into
# the current directory, so nothing has to be downloaded again.
# Skip this cell if you just ran the download cell above.
from pathlib import Path
files = Path('./')
# Keep only regular .csv files whose name contains 'DOWNLOAD'; a directory or
# an archive with a matching name must not reach pd.read_csv.
# Sorting fixes the order, which matters because the later duplicate removal
# keeps the first occurrence of each Smiles string.
file_names = sorted(
    entry.name
    for entry in files.iterdir()
    if entry.is_file() and entry.suffix.lower() == '.csv' and 'DOWNLOAD' in entry.name
)

In [6]:
def process_data(files):
    """Combine semicolon-separated CSV exports into one frame of pIC50 values.

    For each file, rows without a ``Smiles`` or ``Standard Value`` entry are
    dropped, only the three columns listed below are kept, and
    ``Standard Value`` is replaced by ``-log10(value * 1e-9)``.
    The per-file results are stacked and de-duplicated on ``Smiles``, keeping
    the first occurrence.

    NOTE(review): the conversion assumes every ``Standard Value`` is an IC50
    expressed in nM - confirm against the export's units column.

    Parameters
    ----------
    files : iterable of str or path-like
        CSV files readable with ``pd.read_csv(file, sep=';')``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Molecule ChEMBL ID``, ``Smiles`` and ``Standard Value``
        (now holding pIC50). Row labels are those of the source files and are
        not reset. An empty frame with these columns is returned when
        ``files`` is empty.
    """
    import numpy as np

    columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value']
    frames = []
    for file in files:
        data = pd.read_csv(file, sep=';')
        # Drop incomplete rows, then keep only the columns of interest.
        kept = data.dropna(subset=['Smiles', 'Standard Value'])[columns]
        # log10 is undefined for zero or negative values; drop those rows
        # rather than carrying inf/NaN into the result.
        kept = kept[kept['Standard Value'] > 0]
        # nM -> M, then pIC50 = -log10(M). assign() returns a new frame, so
        # nothing is written through a slice of `data`.
        molar = kept['Standard Value'] * 10**-9
        frames.append(kept.assign(**{'Standard Value': -np.log10(molar)}))

    if not frames:
        # pd.concat refuses an empty list; return the empty schema instead.
        return pd.DataFrame({name: [] for name in columns})

    # Concatenate once instead of growing a frame inside the loop.
    data_main = pd.concat(frames)
    # Keep the first row seen for each Smiles string.
    return data_main.drop_duplicates(subset=['Smiles'])
        

In [7]:
# Run the processing step over every file in file_names and keep the
# aggregated frame in df_main.
df_main = process_data(file_names)

In [8]:
# Display the aggregated frame; the notebook truncates long tables.
df_main

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3657004,COc1ccc(C(=O)N(C)C)cc1-c1nc2c(n1C(C)C)[C@@H](c...,7.151811
1,CHEMBL3657113,COC[C@@H](C)n1c(-c2cnc(N)nc2OC)nc2c1C(c1ccc(Cl...,9.430626
2,CHEMBL3657092,COc1ncc(-c2nc3c(n2C(C)C)C(c2ccc(Cl)c(F)c2)N(c2...,9.598599
3,CHEMBL3653169,COc1ccc(C(=O)NC(C)C)cc1-c1nc2c(n1C(C)C)C(c1ccc...,9.823909
4,CHEMBL3657038,COc1ccc(C(=O)NC(C)C)cc1-c1nc2c(n1C(C)C)[C@H](c...,10.096910
...,...,...,...
308,CHEMBL3657086,COc1ncc(-c2nc3c(n2C(C)C)C(c2ccc(Cl)c(F)c2)N(c2...,9.279841
309,CHEMBL3657020,COc1nc(N)ncc1-c1nc2c(n1C(C)C)[C@H](c1ccc(Cl)cc...,9.958607
310,CHEMBL3657111,COc1nc(N2CCS(=O)(=O)CC2)ncc1-c1nc2c(n1C(C)C)C(...,9.199971
311,CHEMBL3653171,COc1ccc(C(=O)N2CC(O)C2)cc1-c1nc2c(n1C(C)C)C(c1...,9.744727


In [9]:
# Summary statistics of the numeric column (Standard Value after conversion).
df_main.describe()

Unnamed: 0,Standard Value
count,307.0
mean,9.172043
std,0.828374
min,6.296107
25%,8.881077
50%,9.443697
75%,9.744727
max,10.39794


In [10]:
# Missing-value check: isna() gives a boolean frame, and describe() on it
# reports the distinct values per column. A single unique value of False in
# every column means no entry is missing.
df_main.isna().describe()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
count,307,307,307
unique,1,1,1
top,False,False,False
freq,307,307,307


In [11]:
# Save df_main to a csv file. The row labels are written as the first column
# and are read back as the index in the next cell.
df_main.to_csv('data_train.csv')

In [12]:
# Reload the saved file, using its first column as the index.
data = pd.read_csv('data_train.csv', index_col=0)

In [13]:
# Preview the first rows of the reloaded frame.
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3657004,COc1ccc(C(=O)N(C)C)cc1-c1nc2c(n1C(C)C)[C@@H](c...,7.151811
1,CHEMBL3657113,COC[C@@H](C)n1c(-c2cnc(N)nc2OC)nc2c1C(c1ccc(Cl...,9.430626
2,CHEMBL3657092,COc1ncc(-c2nc3c(n2C(C)C)C(c2ccc(Cl)c(F)c2)N(c2...,9.598599
3,CHEMBL3653169,COc1ccc(C(=O)NC(C)C)cc1-c1nc2c(n1C(C)C)C(c1ccc...,9.823909
4,CHEMBL3657038,COc1ccc(C(=O)NC(C)C)cc1-c1nc2c(n1C(C)C)[C@H](c...,10.09691
