In [None]:
import pandas as pd
import numpy as np
import os

# Relative path to the data directory. The cells below read raw files from its
# "original" subfolder and write their results to its "processed" subfolder.
DATAPATH = "../data"

In [None]:
# Meaning of the numeric activity-outcome codes found in the "outcome" column.
# Kept under its own constant name: the notebook later defines a function called
# `pubchem_outcomes`, which would otherwise overwrite this lookup table.
PUBCHEM_OUTCOME_LABELS = {1: "inactive", 2: "active", 3: "inconclusive", 4: "unspecified", 5: "probe"}

In [None]:
def delete_duplicates(df):
    """Resolve repeated SMILES in ``df`` in place.

    For every SMILES that occurs more than once:

    * if all of its rows carry the same ``outcome``, only the first row is kept;
    * if its rows disagree on ``outcome``, every row for that SMILES is removed.

    Rows with a missing ``smiles`` value are never removed. The number of removed
    rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``smiles`` and ``outcome`` columns. Rows are dropped by index
        label, so the index is expected to be unique. Modified in place.

    Returns
    -------
    None
    """
    original_len = len(df)

    # Only rows whose SMILES is repeated can be affected. Missing SMILES are
    # excluded explicitly because duplicated() would treat them as equal.
    smiles = df["smiles"]
    dups = df.loc[smiles.notna() & smiles.duplicated(keep=False), ["smiles", "outcome"]]

    # One row per distinct (smiles, outcome) pair: a SMILES that still shows up
    # more than once here was recorded with conflicting outcomes.
    pairs = dups.drop_duplicates()
    conflicting_smiles = pairs.loc[pairs["smiles"].duplicated(keep=False), "smiles"]

    has_conflict = dups["smiles"].isin(conflicting_smiles)
    is_repeat = dups["smiles"].duplicated(keep="first")
    df.drop(index=dups.index[has_conflict | is_repeat], inplace=True)

    final_len = len(df)
    print("{} smiles removed".format(original_len-final_len))
    
def pubchem_outcomes(df):
    """Add a ``bin_activity`` column derived from the ``outcome`` column, in place.

    Outcome 1 becomes 0 and outcome 2 becomes 1. Outcomes 3, 4 and 5 become the
    strings "inconclusive", "unspecified" and "probe". Any other value is reported
    by printing "unspecified" and is labelled "unspecified" as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``outcome`` column. Modified in place.

    Returns
    -------
    None
    """
    labels = {1: 0, 2: 1, 3: "inconclusive", 4: "unspecified", 5: "probe"}

    def _translate(code):
        # Compare with == in the same order as the codes are listed above.
        for known, label in labels.items():
            if code == known:
                return label
        print("unspecified")
        return "unspecified"

    df["bin_activity"] = [_translate(code) for code in df["outcome"].tolist()]

def keep_actinact(df):
    """Keep only the rows whose ``outcome`` is 1 or 2, in place.

    The index is first replaced by a fresh 0..n-1 range (the old index is
    discarded); rows with any other outcome are then dropped, so the remaining
    index labels may contain gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``outcome`` column. Modified in place.

    Returns
    -------
    None
    """
    df.reset_index(drop=True, inplace=True)
    rejected = [
        position
        for position, code in enumerate(df["outcome"].tolist())
        if code not in [1, 2]
    ]
    df.drop(index=rejected, inplace=True)

### PUBCHEM AID1851

In [None]:
# Load the raw PUBCHEM1851.csv table from the "original" data folder.
# low_memory=False turns off pandas' chunk-wise parsing, which can otherwise
# infer mixed dtypes within a single column.
df = pd.read_csv(os.path.join(DATAPATH, "original","PUBCHEM1851.csv"), low_memory=False)

In [None]:
# Discard substances that have no SMILES string; the row count is printed
# before and after the filter.
print(len(df))
df = df.loc[df["smiles"].notna()]
print(len(df))

In [None]:
# Split the assay table into one DataFrame per CYP isoform, selected through
# the "Panel Name" column.
# .copy() detaches each subset from `df`: the subsets are modified in place by
# later cells, which on a plain boolean-mask slice raises pandas'
# SettingWithCopyWarning.
cyps_1851 = {
    isoform: df[df["Panel Name"] == "p450-" + isoform].copy()
    for isoform in ("cyp2c9", "cyp2c19", "cyp2d6", "cyp3a4")
}

In [None]:
# Clean every isoform table. All three helpers modify the DataFrame in place,
# so the dictionary entries do not need to be reassigned.
for cyp_table in cyps_1851.values():
    delete_duplicates(cyp_table)   # resolve repeated SMILES
    keep_actinact(cyp_table)       # keep outcomes 1 and 2 only
    pubchem_outcomes(cyp_table)    # add the bin_activity column

In [None]:
# Prefix the compound ids with "CID" and export one CSV per isoform.
for k, v in cyps_1851.items():
    # Keep only the text before the first "." so that ids rendered with a
    # decimal part (e.g. "123.0") end up as plain "CID123".
    v["cid"] = ("CID" + v["cid"].astype("str")).str.split(".").str[0]
    out_path = os.path.join(DATAPATH, "processed", "aid1851_{}.csv".format(k))
    v[["cid", "smiles", "bin_activity"]].to_csv(out_path, index=False)

### PUBCHEM AID883, 884, 891, 899

In [None]:
# Load the four single-isoform PubChem tables from the "original" data folder,
# keyed by "<assay id>_<isoform>".
aids = {
    name: pd.read_csv(os.path.join(DATAPATH, "original", filename))
    for name, filename in [
        ("aid883_cyp2c9", "PUBCHEM883.csv"),
        ("aid884_cyp3a4", "PUBCHEM884.csv"),
        ("aid891_cyp2d6", "PUBCHEM891.csv"),
        ("aid899_cyp2c19", "PUBCHEM899.csv"),
    ]
}

In [None]:
# Clean every single-isoform assay table and store the result back in `aids`.
for k, v in aids.items():
    # Drop rows without SMILES. .copy() yields an independent DataFrame, so the
    # in-place helpers below do not operate on a slice of the loaded table
    # (which would raise pandas' SettingWithCopyWarning).
    v = v[v["smiles"].notna()].copy()
    delete_duplicates(v)   # resolve repeated SMILES
    keep_actinact(v)       # keep outcomes 1 and 2 only
    pubchem_outcomes(v)    # add the bin_activity column
    aids[k] = v            # `v` was rebound above, so write it back

In [None]:
# Prefix the compound ids with "CID" and export one CSV per assay.
for k, v in aids.items():
    # Keep only the text before the first "." so that ids rendered with a
    # decimal part (e.g. "123.0") end up as plain "CID123".
    v["cid"] = ("CID" + v["cid"].astype("str")).str.split(".").str[0]
    out_path = os.path.join(DATAPATH, "processed", "{}.csv".format(k))
    v[["cid", "smiles", "bin_activity"]].to_csv(out_path, index=False)

### Merge data from PubChem

In [None]:
# Merge data from the different PubChem bioassays. If a molecule is duplicated,
# the record from AID 1851 is the one to keep.
# Both dictionaries are keyed by isoform so that matching tables share a key.
aid1851 = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "processed", filename))
    for isoform, filename in [
        ("cyp2c9", "aid1851_cyp2c9.csv"),
        ("cyp2c19", "aid1851_cyp2c19.csv"),
        ("cyp2d6", "aid1851_cyp2d6.csv"),
        ("cyp3a4", "aid1851_cyp3a4.csv"),
    ]
}
aids = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "processed", filename))
    for isoform, filename in [
        ("cyp2c9", "aid883_cyp2c9.csv"),
        ("cyp2c19", "aid899_cyp2c19.csv"),
        ("cyp2d6", "aid891_cyp2d6.csv"),
        ("cyp3a4", "aid884_cyp3a4.csv"),
    ]
}

In [None]:
# Combine the two PubChem sources isoform by isoform and write one CSV each.
for k, v in aid1851.items():
    if k not in aids:
        continue  # no matching table for this isoform
    # The AID 1851 rows come first, so keep="first" prefers them on repeated SMILES.
    df = pd.concat([v, aids[k]], ignore_index=True).drop_duplicates(
        subset=["smiles"], keep="first"
    )
    df.to_csv(os.path.join(DATAPATH, "processed", "pubchem_{}.csv".format(k)), index=False)

### CHEMBL DATA
ChEMBL database records include quantitative information on enzyme inhibition. Only entries with a “Standard Type” of “IC50” or “Ki” and “Standard Units” of “nM” were considered. Entries with a “Standard Value” of 10,000 or lower were defined as active if the “Standard Relation” was one of “=”, “<=” or “<”. Entries with a “Standard Value” of 20,000 or higher were defined as inactive if the “Standard Relation” was one of “=”, “>=” or “>”. Bioactivity data not matching any of these criteria were discarded.

In [None]:
# Load the raw ChEMBL tables, keyed by isoform. The IC50 files are read with an
# explicit ISO-8859-1 encoding; the Ki files use the pandas default.
chembl_ic50 = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "original", filename), encoding='ISO-8859-1')
    for isoform, filename in [
        ("cyp2c9", "chembl_cyp2c9_IC50.csv"),
        ("cyp2c19", "chembl_cyp2c19_IC50.csv"),
        ("cyp2d6", "chembl_cyp2d6_IC50.csv"),
        ("cyp3a4", "chembl_cyp3a4_IC50.csv"),
    ]
}
chembl_ki = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "original", filename))
    for isoform, filename in [
        ("cyp2c9", "chembl_cyp2c9_ki.csv"),
        ("cyp2c19", "chembl_cyp2c19_ki.csv"),
        ("cyp2d6", "chembl_cyp2d6_ki.csv"),
        ("cyp3a4", "chembl_cyp3a4_ki.csv"),
    ]
}

In [None]:
# Filter each IC50 table and collapse it to one row per SMILES.
for k, v in chembl_ic50.items():
    print(k)

    # keep only measurements reported in nM
    v = v[v["Standard Units"].isin(["nM"])]
    print(len(v))

    # keep only rows that satisfy the active or the inactive criterion
    value, relation = v["Standard Value"], v["Standard Relation"]
    meets_active = (value <= 10000) & relation.isin(["'<'", "'='", "'<='"])
    meets_inactive = (value >= 20000) & relation.isin(["'>'", "'='", "'>='"])
    v = v[meets_active | meets_inactive]

    # merge rows sharing a SMILES: first ChEMBL id, mean of the values
    v = (
        v[["Molecule ChEMBL ID", "Smiles", "Standard Value"]]
        .groupby("Smiles", as_index=False)
        .agg(
            chembl_id=pd.NamedAgg(column="Molecule ChEMBL ID", aggfunc="first"),
            value=pd.NamedAgg(column="Standard Value", aggfunc="mean"),
        )
        .rename(columns={"Smiles": "smiles"})
    )
    print(len(v))
    chembl_ic50[k] = v  # store the aggregated table back in the dictionary

In [None]:
# Filter each Ki table and collapse it to one row per SMILES.
for k, v in chembl_ki.items():
    print(k)

    # keep only measurements reported in nM
    v = v[v["Standard Units"].isin(["nM"])]
    print(len(v))

    # keep only rows that satisfy the active or the inactive criterion
    value, relation = v["Standard Value"], v["Standard Relation"]
    meets_active = (value <= 10000) & relation.isin(["'<'", "'='", "'<='"])
    meets_inactive = (value >= 20000) & relation.isin(["'>'", "'='", "'>='"])
    v = v[meets_active | meets_inactive]

    # merge rows sharing a SMILES: first ChEMBL id, mean of the values
    v = (
        v[["Molecule ChEMBL ID", "Smiles", "Standard Value"]]
        .groupby("Smiles", as_index=False)
        .agg(
            chembl_id=pd.NamedAgg(column="Molecule ChEMBL ID", aggfunc="first"),
            value=pd.NamedAgg(column="Standard Value", aggfunc="mean"),
        )
        .rename(columns={"Smiles": "smiles"})
    )
    print(len(v))
    chembl_ki[k] = v  # store the aggregated table back in the dictionary

In [None]:
# Merge IC50 and Ki data; when a molecule has an IC50 value, its Ki value is not used.
for k, v in chembl_ic50.items():
    if k not in chembl_ki:
        continue  # no Ki table for this isoform

    # IC50 rows are concatenated first, so keep="first" prefers them over Ki rows.
    df = pd.concat([v, chembl_ki[k]], ignore_index=True)
    df = df.drop_duplicates(subset=["smiles"], keep="first")

    # "value" is a mean over all measurements of a SMILES, so it can fall strictly
    # between the two thresholds even though every single measurement passed the
    # filter. Such molecules match neither criterion and are discarded; labelling
    # them inside an if/elif chain would silently reuse the previous row's label.
    is_active = df["value"] <= 10000
    is_inactive = df["value"] >= 20000
    df = df[is_active | is_inactive].copy()

    # 1 = active (value <= 10000), 0 = inactive (value >= 20000)
    df["bin_activity"] = (df["value"] <= 10000).astype(int)
    df.to_csv(os.path.join(DATAPATH, "processed", "chembl_{}.csv".format(k)), index=False)

### Join PubChem and ChEMBL datasets

In [None]:
# Reload the merged PubChem and ChEMBL tables from the "processed" folder,
# keyed by isoform.
pubchem = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "processed", filename))
    for isoform, filename in [
        ("cyp2c9", "pubchem_cyp2c9.csv"),
        ("cyp2c19", "pubchem_cyp2c19.csv"),
        ("cyp2d6", "pubchem_cyp2d6.csv"),
        ("cyp3a4", "pubchem_cyp3a4.csv"),
    ]
}
chembl = {
    isoform: pd.read_csv(os.path.join(DATAPATH, "processed", filename))
    for isoform, filename in [
        ("cyp2c9", "chembl_cyp2c9.csv"),
        ("cyp2c19", "chembl_cyp2c19.csv"),
        ("cyp2d6", "chembl_cyp2d6.csv"),
        ("cyp3a4", "chembl_cyp3a4.csv"),
    ]
}

In [None]:
# Give both sources the same column names: the id column becomes "compound_id"
# and the ChEMBL-only "value" column is removed.
# The tables are edited in place, so the dictionaries keep pointing at them.
for pubchem_table in pubchem.values():
    pubchem_table.rename(columns={"cid": "compound_id"}, inplace=True)

for chembl_table in chembl.values():
    chembl_table.rename(columns={"chembl_id": "compound_id"}, inplace=True)
    chembl_table.drop(columns=["value"], inplace=True)

In [None]:
from rdkit import Chem

# Merge the PubChem and ChEMBL tables of each isoform, drop rows whose SMILES
# RDKit cannot parse, add a canonical SMILES column and write one CSV per isoform.
# The row count is printed after concatenation, after de-duplication and after
# removing unparsable SMILES.
for k, v in pubchem.items():
    if k not in chembl:
        continue  # no ChEMBL table for this isoform

    df = pd.concat([v, chembl[k]], ignore_index=True)
    print(len(df))
    # PubChem rows come first, so keep="first" prefers them on repeated SMILES.
    df = df.drop_duplicates(subset=["smiles"], keep="first")
    print(len(df))

    # MolFromSmiles returns None for a SMILES it cannot parse.
    mols = [Chem.MolFromSmiles(smi) for smi in df["smiles"].tolist()]
    parsed = [mol is not None for mol in mols]

    # Filter with a positional boolean mask. After drop_duplicates the index
    # labels have gaps, so a list position is not a valid index label and
    # dropping by position would remove the wrong row or raise a KeyError.
    df = df[parsed].copy()
    df["CAN_SMILES"] = [Chem.MolToSmiles(mol) for mol in mols if mol is not None]
    print(len(df))

    df.to_csv(os.path.join(DATAPATH, "processed", "cyp_all_{}.csv".format(k)), index=False)
        