# Pathogen Box

In [20]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem

# Base data directory: the cells below read inputs from its "original"
# subfolder and write results to its "processed" subfolder.
DATAPATH = "../data"

In [45]:
# Open the Pathogen Box workbook once and list its sheets so the one to parse
# can be picked by name.
xls = pd.ExcelFile(
    os.path.join(
        DATAPATH,
        "original",
        "Pathogen_Box_Activity_Biological_Data_Smiles.xlsx",
    )
)
xls.sheet_names

['TUBERCULOSIS',
 'MALARIA',
 'KINETOPLASTIDS',
 'CRYPTOSPORIDIOSIS',
 'L. FILARIASIS-ONCHOCERCIASIS',
 'WOLBACHIA (LF)',
 'SCHISTOSOMIASIS',
 'HOOKWORM-TRICHURIASIS',
 'TOXOPLASMOSIS',
 'DENGUE-CHIKUNGUNYA',
 'REFERENCE COMPOUNDS',
 'MASTER SHEET',
 'in vitro DMPK',
 'in vivo DMPK',
 'ESRI_MAPINFO_SHEET']

In [46]:
# Parse only the "MASTER SHEET" tab of the already-opened workbook.
df = pd.read_excel(io=xls, sheet_name="MASTER SHEET")

In [47]:
# Display the column labels pandas produced for the master sheet; they are
# needed verbatim for the column selection in the next cell.
df.columns

Index(['*Please refer to the individual tabs fo each disease set for more details',
       'Note: empty cell = "not tested"', 'Unnamed: 2', 'Unnamed: 3',
       'Unnamed: 4', 'Anti - Mycobacterium tuberculosis activity',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Antimalarial activity: Asexual Blood Stage', 'Unnamed: 23',
       'Unnamed: 24', 'Antimalarial activity: Liver Stage', 'Unnamed: 26',
       'Antimalarial activity: Gametocytes', 'Unnamed: 28',
       'Anti - Trypanosoma cruzi activity',
       'Anti - Trypanosoma brucei activity', 'Unnamed: 31',
       'Anti - Leishmania activity', 'Unnamed: 33', 'Unnamed: 34',
       'Unnamed: 35', 'Unnamed: 36', 'Anti - Cryptosporidium parvum activity',
       'Unnamed: 38', 'Anti - Brugia pahangi activity', 'Unnam

In [48]:
# Raw master-sheet label -> short column name, in the order the columns are kept.
# NOTE(review): "Unnamed: 4" is a positional label generated by pandas; confirm
# it still maps to the indication column if the source workbook changes.
pathogenbox_columns = {
    '*Please refer to the individual tabs fo each disease set for more details': "SMILES",
    'Unnamed: 4': "indication",
    'Anti - Mycobacterium tuberculosis activity': "mtb_MIC90",
    'Antimalarial activity: Asexual Blood Stage': "pf_IC50",
    'Cytotoxicity data*': "hepg2_CC50",
}

# Build the trimmed table in one chain rather than mutating it in place:
# keep the five columns, rename them, remove rows labelled 0, 401 and 402
# (presumably sub-header / footer rows of the sheet - TODO confirm), discard
# rows with no SMILES, and renumber the index from 0.
df = (
    df[list(pathogenbox_columns)]
    .rename(columns=pathogenbox_columns)
    .drop(labels=[0, 401, 402], axis=0)
    .dropna(subset=["SMILES"])
    .reset_index(drop=True)
)

In [49]:
# Round-trip every SMILES through RDKit (parse, then re-serialise) to fill the
# CAN_SMILES column. All molecules are parsed first, then all are written out.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))

# Attach the new column and fix the final column order.
df = df.assign(CAN_SMILES=can_smi)[
    ["SMILES", "CAN_SMILES", "indication", "pf_IC50", "mtb_MIC90", "hepg2_CC50"]
]

In [50]:
# Activity columns that get cleaned here and cast to float further down.
num_cols = [
    "mtb_MIC90",
    "pf_IC50",
    "hepg2_CC50",
]

In [51]:
# Strip the "<" / ">" qualifiers (with or without a trailing space) from the
# activity columns so the remaining text can be parsed as a number.
# The patterns are applied one after another in this exact order.
# NOTE(review): this discards the censoring information (e.g. ">10" becomes 10).
for qualifier in ("< ", "<", "> ", ">"):
    df[num_cols] = df[num_cols].replace({qualifier: ""}, regex=True)

In [52]:
# Report the position and raw value of every pf_IC50 entry that cannot be
# parsed as a float, so it can be corrected by hand in the next cell.
vals = list(df["pf_IC50"])
for i, v in enumerate(vals):
    try:
        float(v)
    except (TypeError, ValueError):
        # Catch only conversion failures; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and hide unrelated errors.
        print(i, v)

221 0.1 (2009 data)
222 0.1 (2011 data)
223 0.007 (2012 data)
240 Inactive (please see the malaria disease sheet for more details) 


In [53]:
# Manual corrections for the four non-numeric pf_IC50 entries printed above.
# The index was reset to 0..n-1 earlier, so the printed positions are also
# the row labels used here.
#
# Each fix is a single .loc[row, column] assignment. The chained form
# df.loc[row]["pf_IC50"] = value assigns into a temporary row object and can
# leave df unchanged (always, under pandas Copy-on-Write), which would make
# the later cast to float fail on the leftover text.
df.loc[240, "pf_IC50"] = np.nan   # "Inactive (...)" -> missing
df.loc[221, "pf_IC50"] = 0.1      # "0.1 (2009 data)"
df.loc[222, "pf_IC50"] = 0.1      # "0.1 (2011 data)"
df.loc[223, "pf_IC50"] = 0.007    # "0.007 (2012 data)"


In [54]:
# Set explicit dtypes: text columns become pandas "string", activity columns
# become float.
df = df.astype(
    {
        **dict.fromkeys(("SMILES", "CAN_SMILES", "indication"), "string"),
        **dict.fromkeys(("mtb_MIC90", "pf_IC50", "hepg2_CC50"), "float"),
    }
)

In [55]:
# Display the column dtypes to confirm the cast above took effect.
df.dtypes

SMILES         string
CAN_SMILES     string
indication     string
pf_IC50       float64
mtb_MIC90     float64
hepg2_CC50    float64
dtype: object

In [56]:
# Save the cleaned Pathogen Box table; the row index is not written.
df.to_csv(
    os.path.join(DATAPATH, "processed", "mmv_pathogenbox.csv"),
    index=False,
)

In [57]:
# Write the CAN_SMILES column to a plain-text file, one entry per line.
with open(os.path.join(DATAPATH, "processed", "mmv_pathogenbox_smiles.txt"), "w") as f:
    f.writelines(f"{s}\n" for s in df["CAN_SMILES"].tolist())

# Malaria Box

In [58]:
# Open the Malaria Box workbook and list its sheets.
xls = pd.ExcelFile(
    os.path.join(
        DATAPATH,
        "original",
        "MalariaBox400compoundsDec2014.xls",
    )
)
xls.sheet_names

['vortex_sheet']

In [59]:
# Parse the "vortex_sheet" tab of the already-opened workbook.
df = pd.read_excel(io=xls, sheet_name="vortex_sheet")

In [60]:
# Display the available columns before selecting the ones to keep.
df.columns

Index(['HEOS_COMPOUND_ID', 'Batch_No_March2012', 'Batch_No_June2012',
       'Batch_No_April2013', 'Smiles', 'percent_inh @ 2 uM',
       'percent_inh @ 5 uM', 'EC50_nM', 'ChEMBL_NTD_ID', 'source',
       'CHEMBL EC50 in uM', 'Set', 'Ro5_ViolationCount', 'NplusO_Count',
       'Molecular_Weight', 'Num_H_Donors', 'ALogP', 'Comment'],
      dtype='object')

In [61]:
# Keep only the structure and potency columns and give them the short names
# used in this notebook. Written as one chain so no sliced frame is mutated
# in place.
df = df[["Smiles", "EC50_nM"]].rename(
    columns={"Smiles": "SMILES", "EC50_nM": "pf_IC50"}
)

In [62]:
# Round-trip every SMILES through RDKit (parse, then re-serialise) to fill the
# CAN_SMILES column. Parsing of all rows happens before any serialisation.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))

# Attach the new column and fix the final column order.
df = df.assign(CAN_SMILES=can_smi)[["SMILES", "CAN_SMILES", "pf_IC50"]]

In [63]:
# Print every pf_IC50 entry that cannot be parsed as a float and collect its
# position in `nd`, which the next cell uses to blank those entries.
vals = list(df["pf_IC50"])
nd = []
for i, v in enumerate(vals):
    try:
        float(v)
    except (TypeError, ValueError):
        # Catch only conversion failures; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and hide unrelated errors.
        print(i, v)
        nd.append(i)

83 ND
86 ND
88 ND
89 ND
93 ND
95 ND
99 ND
100 ND
103 ND
105 ND
106 ND
107 ND
109 ND
111 ND
112 ND
114 ND
116 ND
117 ND
119 ND
121 ND
122 ND
124 ND
125 ND
126 ND
131 ND
138 ND
139 ND
141 ND
143 ND
151 ND
152 ND
153 ND
157 ND
158 ND
174 ND
186 ND
187 ND
195 ND
204 ND
206 ND
207 ND
208 ND
209 ND
220 ND
224 ND
225 ND
228 ND
229 ND
233 ND
235 ND
237 ND
239 ND
240 ND
244 ND
248 ND
250 ND
253 ND
258 ND
259 ND
260 ND
261 ND
262 ND
267 ND
296 ND
299 ND
303 ND
305 ND
309 ND
310 ND
312 ND
315 ND
320 ND
326 ND
346 ND
358 ND
361 ND
365 ND
373 ND
397 ND
398 ND


In [64]:
# Replace every non-numeric ("ND") pf_IC50 entry recorded in `nd` with NaN.
# `nd` holds positions from enumerate(); the frame still carries the default
# 0..n-1 index from read_excel, so those positions are also the row labels.
#
# One .loc[rows, column] assignment replaces the per-row chained form
# df.loc[i]["pf_IC50"] = np.nan, which writes to a temporary row object and
# can leave df unchanged (always, under pandas Copy-on-Write).
df.loc[nd, "pf_IC50"] = np.nan

In [65]:
# Set explicit dtypes (text columns -> pandas "string", potency -> float),
# then display the result.
df = df.astype(
    {
        **dict.fromkeys(("SMILES", "CAN_SMILES"), "string"),
        "pf_IC50": "float",
    }
)
df.dtypes

SMILES         string
CAN_SMILES     string
pf_IC50       float64
dtype: object

In [66]:
# Scale pf_IC50 by 0.001. The values come from the "EC50_nM" column, so this
# presumably converts nM to uM - confirm against the other datasets' units.
# NOTE: re-running this cell scales the column again.
df["pf_IC50"] = df["pf_IC50"] * 0.001

In [67]:
# Save the cleaned Malaria Box table; the row index is not written.
df.to_csv(
    os.path.join(DATAPATH, "processed", "mmv_malariabox.csv"),
    index=False,
)

In [68]:
# Write the CAN_SMILES column to a plain-text file, one entry per line.
with open(os.path.join(DATAPATH, "processed", "mmv_malariabox_smiles.txt"), "w") as f:
    f.writelines(f"{s}\n" for s in df["CAN_SMILES"].tolist())

# OSM Data

In [73]:
# Load the OSM series 4 table from the "original" data folder.
df = pd.read_csv(
    os.path.join(DATAPATH, "original", "osm_series4.csv"),
)

In [74]:
# Align the OSM column names with the ones used for the other datasets.
df = df.rename(
    columns={
        "osm": "ID",
        "smiles": "SMILES",
        "activity": "pf_IC50",
    }
)

In [77]:
# Round-trip every SMILES through RDKit (parse, then re-serialise) to fill the
# CAN_SMILES column. Parsing of all rows happens before any serialisation.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))

# Attach the new column and fix the final column order.
df = df.assign(CAN_SMILES=can_smi)[["ID", "SMILES", "CAN_SMILES", "pf_IC50"]]

In [80]:
# Save the processed OSM table; the row index is not written.
df.to_csv(
    os.path.join(DATAPATH, "processed", "osm_series4.csv"),
    index=False,
)

In [81]:
# Write the CAN_SMILES column to a plain-text file, one entry per line.
with open(os.path.join(DATAPATH, "processed", "osm_series4_smiles.txt"), "w") as f:
    f.writelines(f"{s}\n" for s in df["CAN_SMILES"].tolist())