# Pathogen Box

In [None]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem

# Relative path to the data directory. Raw inputs are read from its "original"
# subfolder and cleaned outputs are written to its "processed" subfolder.
DATAPATH = "../data"

In [None]:
# Open the Pathogen Box workbook once so its sheets can be listed here and
# parsed in the following cell.
pathogenbox_xlsx = os.path.join(
    DATAPATH, "original", "Pathogen_Box_Activity_Biological_Data_Smiles.xlsx"
)
xls = pd.ExcelFile(pathogenbox_xlsx)
xls.sheet_names

In [None]:
# Load the sheet named "MASTER SHEET" from the already-open workbook.
df = pd.read_excel(
    io=xls,
    sheet_name="MASTER SHEET",
)

In [None]:
# Display the raw column labels of the master sheet; the next cell selects
# columns by these exact labels.
df.columns

In [None]:
# Raw sheet label -> short column name. The keys must match the workbook labels
# character for character (including the "fo" typo in the first one).
pathogenbox_columns = {
    '*Please refer to the individual tabs fo each disease set for more details': "SMILES",
    'Unnamed: 4': "indication",
    'Anti - Mycobacterium tuberculosis activity': "mtb_MIC90",
    'Antimalarial activity: Asexual Blood Stage': "pf_IC50",
    'Cytotoxicity data*': "hepg2_CC50",
}

# Keep only the mapped columns, rename them, remove rows without a structure and
# renumber the index from 0. Written as one chain of non-mutating calls instead
# of `inplace=True` on a column slice, so no step modifies a frame that may
# still be tied to the raw sheet.
# NOTE(review): rows 0, 401 and 402 are dropped by label; presumably they are
# header/footer rows of the sheet -- confirm against the workbook.
df = (
    df[list(pathogenbox_columns)]
    .rename(columns=pathogenbox_columns)
    .drop(labels=[0, 401, 402], axis=0)
    .dropna(subset=["SMILES"])
    .reset_index(drop=True)
)

In [None]:
# Parse every SMILES with RDKit and store its canonical form as CAN_SMILES.
mols = [Chem.MolFromSmiles(smi) for smi in df["SMILES"].tolist()]

# MolFromSmiles returns None for a string it cannot parse. Report the affected
# rows explicitly instead of letting MolToSmiles fail on None without saying
# which structure was at fault.
unparsed_rows = [row for row, mol in zip(df.index, mols) if mol is None]
if unparsed_rows:
    raise ValueError(f"RDKit could not parse the SMILES in rows {unparsed_rows}")

can_smi = [Chem.MolToSmiles(mol) for mol in mols]

# assign() returns a new frame rather than adding a column in place; the final
# selection fixes the column order of the output.
df = df.assign(CAN_SMILES=can_smi)[
    ["SMILES", "CAN_SMILES", "indication", "pf_IC50", "mtb_MIC90", "hepg2_CC50"]
]

In [None]:
# Activity columns that are cleaned of "<" / ">" qualifiers in the next cell.
num_cols = ["mtb_MIC90", "pf_IC50", "hepg2_CC50"]

In [None]:
# Remove "<" and ">" qualifiers (with or without a trailing space) from the
# activity columns, leaving only the bound itself. The patterns are applied one
# after another, in this order.
for qualifier in ("< ", "<", "> ", ">"):
    df[num_cols] = df[num_cols].replace({qualifier: ""}, regex=True)

In [None]:
# Print the row label and raw value of every pf_IC50 entry that cannot be read
# as a number, so those rows can be corrected by hand. Row labels are used
# (rather than a positional counter) because the corrections are applied with
# label-based `.loc`.
for row, value in df["pf_IC50"].items():
    try:
        float(value)
    except (TypeError, ValueError):
        # Only conversion failures are of interest; a bare `except` would also
        # hide KeyboardInterrupt and unrelated errors.
        print(row, value)

In [None]:
# Manual corrections for pf_IC50 entries that are not plain numbers.
# NOTE(review): the row labels and replacement values were presumably read off
# the output of the preceding inspection cell -- confirm against the raw sheet.
#
# Each value is written with a single `.loc[row, column]` call. The chained form
# `df.loc[row]["pf_IC50"] = value` assigns into a temporary row object and is
# not guaranteed to change `df` (it never does under pandas Copy-on-Write).
# `np` comes from the import cell at the top of the notebook.
df.loc[240, "pf_IC50"] = np.nan
df.loc[221, "pf_IC50"] = 0.1
df.loc[222, "pf_IC50"] = 0.1
df.loc[223, "pf_IC50"] = 0.007



In [None]:
# Cast the text columns to pandas' "string" dtype and the activity columns to
# float, now that every activity value is numeric or NaN.
text_columns = ["SMILES", "CAN_SMILES", "indication"]
activity_columns = ["mtb_MIC90", "pf_IC50", "hepg2_CC50"]
pathogenbox_dtypes = {
    **dict.fromkeys(text_columns, "string"),
    **dict.fromkeys(activity_columns, "float"),
}
df = df.astype(pathogenbox_dtypes)

In [None]:
# Display the column dtypes to confirm the cast in the preceding cell.
df.dtypes

In [None]:
# Write the cleaned Pathogen Box table to the processed folder, without the index.
pathogenbox_csv = os.path.join(DATAPATH, "processed", "mmv_pathogenbox.csv")
df.to_csv(pathogenbox_csv, index=False)

In [None]:
# Write the canonical SMILES to a plain-text file, one structure per line.
pathogenbox_smiles_txt = os.path.join(DATAPATH, "processed", "mmv_pathogenbox_smiles.txt")
with open(pathogenbox_smiles_txt, "w") as out:
    out.writelines(f"{smiles}\n" for smiles in df["CAN_SMILES"].tolist())

# Malaria Box

In [None]:
# Open the Malaria Box workbook once so its sheets can be listed here and
# parsed in the following cell.
malariabox_xls = os.path.join(DATAPATH, "original", "MalariaBox400compoundsDec2014.xls")
xls = pd.ExcelFile(malariabox_xls)
xls.sheet_names

In [None]:
# Load the sheet named "vortex_sheet" from the already-open workbook.
df = pd.read_excel(
    io=xls,
    sheet_name="vortex_sheet",
)

In [None]:
# Display the raw column labels of the sheet; the next cell selects "Smiles"
# and "EC50_nM" by label.
df.columns

In [None]:
# Keep the structure and potency columns and rename them to the names used for
# the Pathogen Box table. rename() is chained onto the selection so a new frame
# is produced, instead of renaming a column slice in place.
df = df[["Smiles", "EC50_nM"]].rename(
    columns={"Smiles": "SMILES", "EC50_nM": "pf_IC50"}
)

In [None]:
# Parse every SMILES with RDKit and store its canonical form as CAN_SMILES.
mols = [Chem.MolFromSmiles(smi) for smi in df["SMILES"].tolist()]

# MolFromSmiles returns None for a string it cannot parse. Report the affected
# rows explicitly instead of letting MolToSmiles fail on None without saying
# which structure was at fault.
unparsed_rows = [row for row, mol in zip(df.index, mols) if mol is None]
if unparsed_rows:
    raise ValueError(f"RDKit could not parse the SMILES in rows {unparsed_rows}")

can_smi = [Chem.MolToSmiles(mol) for mol in mols]

# assign() returns a new frame rather than adding a column to a slice in place;
# the final selection fixes the column order of the output.
df = df.assign(CAN_SMILES=can_smi)[["SMILES", "CAN_SMILES", "pf_IC50"]]

In [None]:
# Collect in `nd` the row label of every pf_IC50 entry that cannot be read as a
# number, printing each label with its raw value. Row labels are used (rather
# than a positional counter) because the following cell addresses these rows
# with label-based `.loc`.
nd = []
for row, value in df["pf_IC50"].items():
    try:
        float(value)
    except (TypeError, ValueError):
        # Only conversion failures are of interest; a bare `except` would also
        # hide KeyboardInterrupt and unrelated errors.
        print(row, value)
        nd.append(row)

In [None]:
# Replace every non-numeric pf_IC50 entry (rows listed in `nd`) with NaN.
# A single `.loc[rows, column]` assignment is used because the chained form
# `df.loc[i]["pf_IC50"] = ...` assigns into a temporary row object and is not
# guaranteed to change `df` (it never does under pandas Copy-on-Write).
df.loc[nd, "pf_IC50"] = np.nan

In [None]:
# Cast the two SMILES columns to pandas' "string" dtype and the potency column
# to float, then display the resulting dtypes as a check.
malariabox_dtypes = {
    "SMILES": "string",
    "CAN_SMILES": "string",
    "pf_IC50": "float",
}
df = df.astype(malariabox_dtypes)
df.dtypes

In [None]:
# The values come from the sheet's "EC50_nM" column, so multiplying by 0.001
# expresses them in micromolar.
# NOTE(review): presumably done to match the units of the Pathogen Box table --
# confirm. Re-running this cell rescales the column again.
nm_to_um = 0.001
# Vectorised multiply instead of a per-element apply(lambda ...).
df["pf_IC50"] = df["pf_IC50"] * nm_to_um

In [None]:
# Write the cleaned Malaria Box table to the processed folder, without the index.
malariabox_csv = os.path.join(DATAPATH, "processed", "mmv_malariabox.csv")
df.to_csv(malariabox_csv, index=False)

In [None]:
# Write the canonical SMILES to a plain-text file, one structure per line.
malariabox_smiles_txt = os.path.join(DATAPATH, "processed", "mmv_malariabox_smiles.txt")
with open(malariabox_smiles_txt, "w") as out:
    out.writelines(f"{smiles}\n" for smiles in df["CAN_SMILES"].tolist())

# OSM Data

In [None]:
# Load the raw OSM series 4 table from the original-data folder.
osm_series4_csv = os.path.join(DATAPATH, "original", "osm_series4.csv")
df = pd.read_csv(osm_series4_csv)

In [None]:
# Rename the OSM columns to the names used for the other tables.
osm_column_names = {"osm": "ID", "smiles": "SMILES", "activity": "pf_IC50"}
df = df.rename(columns=osm_column_names)

In [None]:
# Parse every SMILES with RDKit and store its canonical form as CAN_SMILES.
mols = [Chem.MolFromSmiles(smi) for smi in df["SMILES"].tolist()]

# MolFromSmiles returns None for a string it cannot parse. Report the affected
# rows explicitly instead of letting MolToSmiles fail on None without saying
# which structure was at fault.
unparsed_rows = [row for row, mol in zip(df.index, mols) if mol is None]
if unparsed_rows:
    raise ValueError(f"RDKit could not parse the SMILES in rows {unparsed_rows}")

can_smi = [Chem.MolToSmiles(mol) for mol in mols]

# assign() returns a new frame rather than adding a column in place; the final
# selection fixes the column order of the output.
df = df.assign(CAN_SMILES=can_smi)[["ID", "SMILES", "CAN_SMILES", "pf_IC50"]]

In [None]:
# Write the cleaned OSM series 4 table to the processed folder, without the index.
osm_processed_csv = os.path.join(DATAPATH, "processed", "osm_series4.csv")
df.to_csv(osm_processed_csv, index=False)

In [None]:
# Write the canonical SMILES to a plain-text file, one structure per line.
osm_smiles_txt = os.path.join(DATAPATH, "processed", "osm_series4_smiles.txt")
with open(osm_smiles_txt, "w") as out:
    out.writelines(f"{smiles}\n" for smiles in df["CAN_SMILES"].tolist())