In [1]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem

# Directory (relative to the notebook's working directory) that holds the MMV
# box spreadsheets read below and receives the CSV/TXT files this notebook writes.
DATAPATH = "../data/mmv_boxes"

# Pathogen Box

In [None]:
# Open the Pathogen Box workbook once; the last line displays its sheet names
# so the right tab can be chosen for parsing.
pathogen_xlsx = os.path.join(DATAPATH, "Pathogen_Box_Activity_Biological_Data_Smiles.xlsx")
xls = pd.ExcelFile(pathogen_xlsx)
xls.sheet_names

In [None]:
# Load the "MASTER SHEET" tab of the workbook opened above into a DataFrame.
df = pd.read_excel(xls, sheet_name="MASTER SHEET")

In [None]:
# Show the column labels so the ones needed can be picked out in the next step.
df.columns

In [None]:
# Map the spreadsheet's column labels to short, code-friendly names.
# The keys are the labels exactly as pandas read them from the sheet (typo included).
column_names = {
    '*Please refer to the individual tabs fo each disease set for more details': "SMILES",
    'Unnamed: 4': "indication",
    'Anti - Mycobacterium tuberculosis activity': "mtb_MIC90",
    'Antimalarial activity: Asexual Blood Stage': "pf_IC50",
    'Cytotoxicity data*': "hepg2_CC50",
}

# Keep only those columns, rename them, drop rows 0, 401 and 402
# (presumably sub-header / footer rows of the sheet -- TODO confirm against the
# spreadsheet), discard rows without a SMILES and renumber the rows from 0.
df = (
    df[list(column_names)]
    .rename(columns=column_names)
    .drop(labels=[0, 401, 402], axis=0)
    .dropna(subset=["SMILES"])
    .reset_index(drop=True)
)

In [None]:
# Round-trip every SMILES through RDKit (parse, then write back out) and store
# the result next to the original string.
# NOTE(review): Chem.MolFromSmiles returns None for unparsable input, which
# makes the MolToSmiles step fail -- this cell assumes every SMILES parses.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))

column_order = ["SMILES", "CAN_SMILES", "indication", "pf_IC50", "mtb_MIC90", "hepg2_CC50"]
df = df.assign(CAN_SMILES=can_smi)[column_order]

In [None]:
# Activity columns that get their "<" / ">" qualifiers stripped in the next cell.
num_cols = ["mtb_MIC90", "pf_IC50", "hepg2_CC50"]

In [None]:
# Remove the "<" / ">" qualifiers (with or without a trailing space) from the
# activity columns so the remaining text can be parsed as a number. After this
# step a censored value can no longer be told apart from an exact measurement.
for qualifier in ("< ", "<", "> ", ">"):
    df[num_cols] = df[num_cols].replace({qualifier: ""}, regex=True)

In [None]:
# List the pf_IC50 entries that cannot be converted to a float, together with
# their row position, so they can be corrected by hand in the next cell.
# Only conversion failures are caught (ValueError for non-numeric text,
# TypeError for unsupported objects such as None); a bare `except:` would also
# swallow KeyboardInterrupt and hide unrelated errors.
for i, v in enumerate(df["pf_IC50"]):
    try:
        float(v)
    except (TypeError, ValueError):
        print(i, v)

In [None]:
# Manual corrections for the pf_IC50 entries that are not plain numbers.
# The row labels are hard-coded -- presumably taken from the output of the
# previous cell, with replacement values read from the spreadsheet (TODO confirm).
#
# A single .loc[row, column] call is required here: the chained form
# df.loc[row]["pf_IC50"] = value assigns into a temporary row object and does
# not reliably (and, with pandas Copy-on-Write, never) update `df` itself.
pf_ic50_fixes = {
    240: np.nan,  # no usable value
    221: 0.1,
    222: 0.1,
    223: 0.007,
}
for row_label, value in pf_ic50_fixes.items():
    df.loc[row_label, "pf_IC50"] = value


In [None]:
# Give every column an explicit dtype: pandas "string" for the text columns and
# float for the activity values (the cast fails if a non-numeric entry is left).
text_cols = ["SMILES", "CAN_SMILES", "indication"]
float_cols = ["mtb_MIC90", "pf_IC50", "hepg2_CC50"]
df = df.astype({
    **dict.fromkeys(text_cols, "string"),
    **dict.fromkeys(float_cols, "float"),
})

In [None]:
# Display the column dtypes to confirm the cast above took effect.
df.dtypes

In [None]:
# Save the cleaned Pathogen Box table; index=False keeps the row index out of the CSV.
df.to_csv(os.path.join(DATAPATH, "mmv_pathogenbox.csv"), index=False)

In [None]:
# Also export the CAN_SMILES column on its own, one SMILES per line.
pathogen_smiles_txt = os.path.join(DATAPATH, "mmv_pathogenbox_smiles.txt")
with open(pathogen_smiles_txt, "w") as f:
    f.writelines(f"{s}\n" for s in df["CAN_SMILES"].tolist())

# Malaria Box

In [None]:
# Open the Malaria Box workbook; the last line displays its sheet names.
malaria_xls = os.path.join(DATAPATH, "MalariaBox400compoundsDec2014.xls")
xls = pd.ExcelFile(malaria_xls)
xls.sheet_names

In [None]:
# Load the "vortex_sheet" tab of the Malaria Box workbook opened above.
df = pd.read_excel(xls, sheet_name="vortex_sheet")

In [None]:
# Show the column labels so the ones needed can be picked out in the next step.
df.columns

In [None]:
# Keep the structure and potency columns only, renamed to the same names used
# for the Pathogen Box table.
df = df[["Smiles", "EC50_nM"]].rename(
    columns={"Smiles": "SMILES", "EC50_nM": "pf_IC50"}
)

In [None]:
# Round-trip every SMILES through RDKit (parse, then write back out) and store
# the result in CAN_SMILES.
# NOTE(review): Chem.MolFromSmiles returns None for unparsable input, which
# makes the MolToSmiles step fail -- this cell assumes every SMILES parses.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))
df = df.assign(CAN_SMILES=can_smi)[["SMILES", "CAN_SMILES", "pf_IC50"]]

In [None]:
# Collect (and print) the row positions whose pf_IC50 cannot be converted to a
# float; `nd` is consumed by the next cell, which blanks those entries.
# Positions double as row labels as long as `df` still carries the default
# 0..n-1 index it got from read_excel.
# Only conversion failures are caught (ValueError for non-numeric text,
# TypeError for unsupported objects such as None) instead of a bare `except:`.
nd = []
for i, v in enumerate(df["pf_IC50"]):
    try:
        float(v)
    except (TypeError, ValueError):
        print(i, v)
        nd.append(i)

In [None]:
# Replace the non-numeric pf_IC50 entries found above (row labels in `nd`)
# with NaN. A single .loc[rows, column] assignment is used because the chained
# form df.loc[i]["pf_IC50"] = ... assigns into a temporary row object and does
# not reliably (and, with pandas Copy-on-Write, never) update `df` itself.
df.loc[nd, "pf_IC50"] = np.nan

In [None]:
# Cast the text columns to pandas "string" and the potency column to float,
# then display the resulting dtypes as a check.
malaria_dtypes = {"SMILES": "string", "CAN_SMILES": "string", "pf_IC50": "float"}
df = df.astype(malaria_dtypes)
df.dtypes

In [None]:
# The source column was EC50_nM; scaling by 0.001 converts nM to uM
# (presumably to match the units of the Pathogen Box table -- TODO confirm).
# Note: re-running this cell rescales the column again.
df["pf_IC50"] = df["pf_IC50"] * 0.001

In [None]:
# Save the cleaned Malaria Box table; index=False keeps the row index out of the CSV.
df.to_csv(os.path.join(DATAPATH, "mmv_malariabox.csv"), index=False)

In [None]:
# Also export the CAN_SMILES column on its own, one SMILES per line.
malaria_smiles_txt = os.path.join(DATAPATH, "mmv_malariabox_smiles.txt")
with open(malaria_smiles_txt, "w") as f:
    f.writelines(f"{s}\n" for s in df["CAN_SMILES"].tolist())

# Pandemic Response Box

In [36]:
# Open the Pandemic Response Box workbook; the last line displays its sheet names.
pandemic_xlsx = os.path.join(DATAPATH, "Pandemic_Response_Box_list_of compounds.xlsx")
xls = pd.ExcelFile(pandemic_xlsx)
xls.sheet_names

['Pandemic Response Box_list of c', 'Cytotox-PAMPA-mMS']

In [38]:
# Load the compound-list tab (the first sheet name shown above) and display its
# column labels.
df = pd.read_excel(xls, sheet_name="Pandemic Response Box_list of c")
df.columns

Index(['MMV ID', 'DISEASE AREA', 'SALT COEFF', 'SALT NAME', 'CHEM NAME',
       'TRIVIAL  NAME', 'STRUCTURE', 'SMILES', 'ChEMBL ID', 'LIT. REF',
       'COMMENTS', 'MW', 'FORMULA', 'PSA', 'ALOGP', 'RULEOF5'],
      dtype='object')

In [39]:
# Row/column count before rows without a SMILES are removed.
df.shape

(404, 16)

In [41]:
# Keep only the rows that actually have a SMILES, then show the new shape.
has_smiles = df["SMILES"].notna()
df = df[has_smiles]
df.shape

(400, 16)

In [43]:
# We will only keep SMILES at this moment: round-trip each one through RDKit
# (parse, then write back out), store the result in CAN_SMILES and save both
# columns as CSV.
# NOTE(review): Chem.MolFromSmiles returns None for unparsable input, which
# makes the MolToSmiles step fail -- this cell assumes every SMILES parses.
smiles_only = df[["SMILES"]]
mols = list(map(Chem.MolFromSmiles, smiles_only["SMILES"].tolist()))
can_smi = list(map(Chem.MolToSmiles, mols))
df = smiles_only.assign(CAN_SMILES=can_smi)
df.to_csv(os.path.join(DATAPATH, "mmv_pandemicresponsebox.csv"), index=False)

# Global Health Priority Box

In [45]:
# Open the GHPB workbook; the last line displays its sheet names.
ghpb_xlsx = os.path.join(DATAPATH, "GHPB_DETAILS.xlsx")
xls = pd.ExcelFile(ghpb_xlsx)
xls.sheet_names

['ZZGHPB CONTENT_d4o',
 'GHPB CONTENT',
 'ZZGHPB DATA_d4o',
 'ZZMB2 PLATE_d4o',
 'VEC PLATE',
 'ZZZND PLATE_d4o',
 'ZND PLATE',
 'MB2 PLATE',
 'ZZRESITOME POOLS_d4o',
 'RESITOME POOLS']

In [56]:
# Load the "GHPB CONTENT" tab, print its shape and display its column labels.
df = pd.read_excel(xls, sheet_name="GHPB CONTENT")
print(df.shape)
df.columns

(240, 16)


Index(['STRUCTURE', 'ENTITY_ID', 'AREA OF RESEARCH', 'CHEMICAL NAME', 'SMILES',
       'FORMULA', 'MW', 'SALT_MW', 'SALT_NAME', 'SALT_COEFF', 'TRIVIAL_NAME',
       'ENTITY TAG', 'ENTITY COMMENT', 'PSA', 'ALOGP', 'RULEOF5'],
      dtype='object')

In [57]:
# Keep only the rows that actually have a SMILES, then show the new shape.
has_smiles = df["SMILES"].notna()
df = df[has_smiles]
df.shape

(240, 16)

In [58]:
# Keep the raw SMILES as a plain list so individual entries can be looked up by
# position later (e.g. the ones RDKit fails to parse).
smi = df["SMILES"].tolist()

In [59]:
# Round-trip each SMILES through RDKit (parse, then write back out).
# Chem.MolFromSmiles returns None when it cannot parse a string; the positions
# of those entries are printed and they are left out of the result.
mols = list(map(Chem.MolFromSmiles, df["SMILES"].tolist()))
can_smi = []
for i, mol in enumerate(mols):
    if mol is None:
        print(i)
        continue
    can_smi.append(Chem.MolToSmiles(mol))

# Only the successfully converted SMILES are kept; the saved table has a single
# CAN_SMILES column.
df = pd.DataFrame({"CAN_SMILES": can_smi})
df.to_csv(os.path.join(DATAPATH, "mmv_globalhealthbox.csv"), index=False)

81
103


[13:16:58] SMILES Parse Error: syntax error while parsing: [R][C@@H](C)[C@@H]1O[C@@]2(C[C@H]3C[C@@H](C\C=C(/C)\[C@@H](O[C@H]4C[C@H](OC)[C@@H](O[C@@H]5C[C@@H](OC)[C@H](O)[C@@H](C)O5)[C@H](C)O4)[C@@H](C)\C=C\C=C\6/COC7[C@H](O)C(=C[C@@H](C(=O)O3)[C@@]67O)C)O2)C=C[C@H]1C
[13:16:58] SMILES Parse Error: Failed parsing SMILES '[R][C@@H](C)[C@@H]1O[C@@]2(C[C@H]3C[C@@H](C\C=C(/C)\[C@@H](O[C@H]4C[C@H](OC)[C@@H](O[C@@H]5C[C@@H](OC)[C@H](O)[C@@H](C)O5)[C@H](C)O4)[C@@H](C)\C=C\C=C\6/COC7[C@H](O)C(=C[C@@H](C(=O)O3)[C@@]67O)C)O2)C=C[C@H]1C' for input: '[R][C@@H](C)[C@@H]1O[C@@]2(C[C@H]3C[C@@H](C\C=C(/C)\[C@@H](O[C@H]4C[C@H](OC)[C@@H](O[C@@H]5C[C@@H](OC)[C@H](O)[C@@H](C)O5)[C@H](C)O4)[C@@H](C)\C=C\C=C\6/COC7[C@H](O)C(=C[C@@H](C(=O)O3)[C@@]67O)C)O2)C=C[C@H]1C'
[13:16:58] SMILES Parse Error: syntax error while parsing: [R][C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]3C[C@@H](C\C=C(/C)\C[C@@H](C)\C=C\C=C\4/CO[C@@H]5[C@H](O)C(=C[C@@H](C(=O)O3)[C@]45O)C)O2
[13:16:58] SMILES Parse Error: Failed parsing SMILES '[R][C@H]1O

In [62]:
# Inspect one of the SMILES that RDKit could not parse (position 103, printed
# above); the string begins with an "[R]" token.
smi[103]

'[R][C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]3C[C@@H](C\\C=C(/C)\\C[C@@H](C)\\C=C\\C=C\\4/CO[C@@H]5[C@H](O)C(=C[C@@H](C(=O)O3)[C@]45O)C)O2'

## OVERLAP

In [63]:
# How many compounds (by CAN_SMILES) do the boxes share with each other?
#
# The tables are re-read from the CSV files written earlier in this notebook.
# The Pandemic Response Box and Global Health box file names below match the
# names used by the cells that save them ("mmv_pandemicresponsebox.csv" and
# "mmv_globalhealthbox.csv"), so the notebook runs top to bottom on a fresh kernel.
malaria = pd.read_csv(os.path.join(DATAPATH, "mmv_malariabox.csv"))
pathogen = pd.read_csv(os.path.join(DATAPATH, "mmv_pathogenbox.csv"))
pandemic = pd.read_csv(os.path.join(DATAPATH, "mmv_pandemicresponsebox.csv"))
gh = pd.read_csv(os.path.join(DATAPATH, "mmv_globalhealthbox.csv"))

# One set of SMILES per box, built once and reused for every comparison.
box_smiles = {
    "malaria": set(malaria["CAN_SMILES"]),
    "pathogen": set(pathogen["CAN_SMILES"]),
    "pandemic": set(pandemic["CAN_SMILES"]),
    "gh": set(gh["CAN_SMILES"]),
}

# Pairs are listed in the order their counts are printed (one count per line).
box_pairs = [
    ("malaria", "pandemic"),
    ("pathogen", "pandemic"),
    ("gh", "pandemic"),
    ("pathogen", "malaria"),
    ("pathogen", "gh"),
    ("malaria", "gh"),
]
for first_box, second_box in box_pairs:
    print(len(box_smiles[first_box] & box_smiles[second_box]))

0
6
1
0
1
0
