In [37]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from standardiser import standardise
from rdkit import Chem

# Build the UniProt accession -> ChEMBL target id lookup from the
# tab-separated mapping file (column 0 -> column 1).
uniprot2chembl = {}
with open("../raw/chembl_uniprot_mapping.txt", "r") as f:
    # The first line is always skipped (presumably a header -- confirm against the file).
    next(f, None)
    for line in f:
        # Strip the line terminator so it can never end up inside a stored id.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            # Blank or malformed line: nothing to map.
            continue
        # If an accession occurs on several lines, the last one wins.
        uniprot2chembl[fields[0]] = fields[1]

# Lookup from the first CSV column to the second one
# (presumably protein name -> UniProt accession, judging by the variable name).
df = pd.read_csv("../raw/MEP_Pathway_PfNF54_UniProt.csv")
pfname2uniprot = {record[0]: record[1] for record in df.values}


In [38]:
def get_blasted_target_chembl_ids(name):
    """Return the ChEMBL target ids reachable from one entry of ``pfname2uniprot``.

    The accession registered for ``name`` is combined with every value of the
    ``Accession`` column of ``../blast/blast_<accession>.tsv``; each accession
    that has an entry in ``uniprot2chembl`` contributes its ChEMBL id.

    Parameters
    ----------
    name : hashable
        Key of ``pfname2uniprot``.

    Returns
    -------
    list
        ChEMBL ids in first-seen order (query accession first), without
        repeats. An empty list when no accession is mapped.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``pfname2uniprot``.
    """
    uniprot_ac = pfname2uniprot[name]
    blast_hits = pd.read_csv("../blast/blast_{0}.tsv".format(uniprot_ac), sep="\t")
    candidates = [uniprot_ac] + blast_hits["Accession"].tolist()
    mapped = (uniprot2chembl[ac] for ac in candidates if ac in uniprot2chembl)
    # The query may also be listed among its own hits, and several accessions
    # can map to the same target; dict.fromkeys drops such repeats while
    # keeping order, so callers do not fetch the same target twice.
    return list(dict.fromkeys(mapped))
    

In [39]:
def get_activities_for_target(chembl_id):
    """Collect the pChEMBL-annotated activities recorded for one ChEMBL target.

    Queries ``new_client.activity`` filtered on ``target_chembl_id``, keeps the
    records whose ``pchembl_value`` is non-empty and standardises each
    ``canonical_smiles``. Records whose structure is missing or cannot be
    standardised are skipped.

    Parameters
    ----------
    chembl_id : str
        ChEMBL target identifier passed to the activity filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` (float) and
        ``smiles`` (standardised); one row per retained activity record.
    """
    records = new_client.activity.filter(target_chembl_id=chembl_id)
    rows = []
    for record in records:
        if not record["pchembl_value"]:
            continue
        mol_id = record["molecule_chembl_id"]
        pchembl = float(record["pchembl_value"])
        canonical_smiles = record["canonical_smiles"]
        if canonical_smiles is None:
            # No structure attached to this record.
            continue
        try:
            mol = Chem.MolFromSmiles(canonical_smiles)
            if mol is None:
                # RDKit could not parse the SMILES.
                continue
            std_smiles = Chem.MolToSmiles(standardise.run(mol))
        except Exception:
            # Best effort: skip molecules the standardiser rejects. Catching
            # Exception rather than using a bare except lets KeyboardInterrupt
            # and SystemExit stop this long-running loop.
            continue
        rows.append([chembl_id, mol_id, pchembl, std_smiles])
    return pd.DataFrame(rows, columns=["target_id", "molecule_id", "pchembl", "smiles"])


def dedupe(df, subset="molecule_id", by="pchembl"):
    """Keep, for every distinct key, the row with the highest score.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the ``subset`` and ``by`` columns. It is
        not modified.
    subset : str or list of str, default "molecule_id"
        Column(s) whose repeated values mark rows as duplicates. Pass
        ``"smiles"`` to collapse different molecule ids that share one
        standardised structure.
    by : str, default "pchembl"
        Column used for ranking, highest value first.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``by`` in descending order with only the first (i.e.
        best-scoring) row per key. The index is each row's position in the
        sorted table before duplicates were removed, so it can contain gaps.
    """
    ranked = df.sort_values(by, ascending=False).reset_index(drop=True)
    return ranked.drop_duplicates(subset=subset, keep="first")

def get_activities_for_mep_target(name):
    """Gather the de-duplicated activities of every ChEMBL target linked to ``name``.

    The target ids come from ``get_blasted_target_chembl_ids(name)``; one
    activity table is fetched per id, the tables are concatenated and passed
    through ``dedupe``.

    Returns ``None`` when no ChEMBL target is linked to ``name``.
    """
    frames = [
        get_activities_for_target(target_id)
        for target_id in get_blasted_target_chembl_ids(name)
    ]
    if not frames:
        return None
    combined = pd.concat(frames)
    return dedupe(combined)


def get_all_activities_for_all_mep_targets():
    """Merge the activity tables of every protein listed in ``pfname2uniprot``.

    Calls ``get_activities_for_mep_target`` for each key, ignores the proteins
    for which it returns ``None``, concatenates the remaining tables and
    de-duplicates the result with ``dedupe``.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` and ``smiles``.
        Empty (same columns) when no protein yields a table.
    """
    frames = []
    for name in pfname2uniprot:
        activities = get_activities_for_mep_target(name)
        if activities is not None:
            frames.append(activities)
    if not frames:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # table with the usual columns instead of crashing.
        return pd.DataFrame(columns=["target_id", "molecule_id", "pchembl", "smiles"])
    return dedupe(pd.concat(frames))


# Run the whole pipeline; as the last expression of the cell, the returned
# DataFrame is rendered as the cell output.
# NOTE(review): this queries the ChEMBL activity client once per mapped target,
# so it presumably needs network access and may be slow -- confirm.
get_all_activities_for_all_mep_targets()

Unnamed: 0,target_id,molecule_id,pchembl,smiles
0,CHEMBL4295619,CHEMBL4567904,7.96,O=C(CSC(c1cc(F)cc(F)c1)P(=O)(O)O)NO
1,CHEMBL4295619,CHEMBL4590343,7.89,CN(O)C(=O)CSC(c1cc(F)cc(F)c1)P(=O)(O)O
2,CHEMBL4295619,CHEMBL4592986,7.7,COc1cc(OC)cc(C(SCC(=O)N(C)O)P(=O)(O)O)c1
3,CHEMBL4094,CHEMBL203125,7.68,O=CN(O)CCCP(=O)(O)O
4,CHEMBL4295619,CHEMBL1922604,7.64,CC(=O)N(O)CCCP(=O)(O)O
5,CHEMBL4295619,CHEMBL4544750,7.62,COc1cc(OC)cc(C(SCC(=O)NO)P(=O)(O)O)c1
6,CHEMBL4295619,CHEMBL4474216,7.54,CSc1ccc(C(SCC(=O)NO)P(=O)(O)O)cc1
7,CHEMBL4295619,CHEMBL4447571,7.5,Cc1ccc(C(SCC(=O)NO)P(=O)(O)O)cc1
8,CHEMBL4295619,CHEMBL2164257,7.47,O=CN(O)CCCP(=O)(O)O
9,CHEMBL4295619,CHEMBL4207168,7.04,O=CN(O)C/C=C/P(=O)(O)O
