In [37]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from standardiser import standardise
from rdkit import Chem

# Build a lookup from UniProt accession (first tab-separated column) to
# ChEMBL target ID (second column) from the mapping file.
uniprot2chembl = {}
with open("../raw/chembl_uniprot_mapping.txt", "r") as f:
    # The first line is skipped unconditionally (treated as a header).
    next(f, None)
    # Stream the file line by line instead of loading it all with readlines().
    for line in f:
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            # Blank or malformed line: there is no second column to map to.
            continue
        # Strip so that a two-column file does not leave stray whitespace in
        # the stored IDs. If an accession occurs on several rows, the last
        # row wins.
        uniprot2chembl[fields[0].strip()] = fields[1].strip()

# Map each protein name (first CSV column) to its UniProt accession
# (second CSV column). Columns are addressed by position, not by header name.
df = pd.read_csv("../raw/MEP_Pathway_PfNF54_UniProt.csv")
pfname2uniprot = {row[0]: row[1] for row in df.values}


In [38]:
def get_blasted_target_chembl_ids(name):
    """Return the ChEMBL target IDs for a protein and its BLAST hits.

    The protein name is resolved to a UniProt accession through
    ``pfname2uniprot``. The accessions listed in the ``Accession`` column of
    ``../blast/blast_<accession>.tsv`` are then read, and every accession
    present in ``uniprot2chembl`` is translated to its ChEMBL target ID.

    Parameters
    ----------
    name : str
        Key of ``pfname2uniprot``.

    Returns
    -------
    list
        Unique ChEMBL target IDs in first-seen order, starting with the
        query protein itself when it has a mapping. Empty if nothing maps.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``pfname2uniprot``.
    """
    uniprot_ac = pfname2uniprot[name]
    df = pd.read_csv("../blast/blast_{0}.tsv".format(uniprot_ac), sep="\t")
    uniprot_acs = df["Accession"].tolist()
    chembl_ids = []
    seen = set()
    for p in [uniprot_ac] + uniprot_acs:
        if p not in uniprot2chembl:
            continue
        chembl_id = uniprot2chembl[p]
        # The query accession can show up again among its own hits, and
        # several accessions can map to the same target. Returning each ID
        # once avoids fetching the same target's activities repeatedly.
        if chembl_id in seen:
            continue
        seen.add(chembl_id)
        chembl_ids.append(chembl_id)
    return chembl_ids
    

In [43]:
def get_activities_for_target(chembl_id):
    """Collect pChEMBL-annotated activities for one ChEMBL target.

    Queries ``new_client.activity`` filtered by ``target_chembl_id`` and
    keeps the records that have a truthy ``pchembl_value`` and whose
    ``canonical_smiles`` can be parsed and standardised.

    Parameters
    ----------
    chembl_id : str
        ChEMBL target identifier passed to the activity filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` (float) and
        ``smiles`` (standardised SMILES). Empty when nothing qualifies.
    """
    activity = new_client.activity
    res = activity.filter(target_chembl_id=chembl_id)
    R = []
    for r in res:
        if not r["pchembl_value"]:
            continue
        mol_id = r["molecule_chembl_id"]
        pchembl = float(r["pchembl_value"])
        canonical_smiles = r["canonical_smiles"]
        if not canonical_smiles:
            # No structure on the record: nothing to standardise.
            continue
        try:
            mol = Chem.MolFromSmiles(canonical_smiles)
            if mol is None:
                # The SMILES could not be parsed into a molecule.
                continue
            std_smiles = Chem.MolToSmiles(standardise.run(mol))
        except Exception:
            # Best effort: molecules the standardiser rejects are skipped.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt/SystemExit stop this potentially long loop.
            continue
        R.append([chembl_id, mol_id, pchembl, std_smiles])
    df = pd.DataFrame(R, columns=["target_id", "molecule_id", "pchembl", "smiles"])
    return df


def dedupe(df):
    """Keep one row per ``molecule_id``: the one with the highest ``pchembl``.

    Rows are ordered by descending ``pchembl`` and renumbered from zero, then
    later occurrences of each ``molecule_id`` are dropped. The input frame is
    not modified; a new frame is returned.
    """
    return (
        df.sort_values(by="pchembl", ascending=False)
        .reset_index(drop=True)
        .drop_duplicates(subset=["molecule_id"], keep="first")
    )

def get_activities_for_mep_target(name):
    """Gather deduplicated activities for one protein and its BLAST hits.

    Resolves ``name`` to ChEMBL target IDs with
    ``get_blasted_target_chembl_ids``, fetches one activity table per target
    with ``get_activities_for_target``, and passes the concatenation through
    ``dedupe``.

    Returns ``None`` when no ChEMBL target ID is found for ``name``.
    """
    frames = [
        get_activities_for_target(target_id)
        for target_id in get_blasted_target_chembl_ids(name)
    ]
    if not frames:
        return None
    merged = pd.concat(frames)
    return dedupe(merged)


def get_all_activities_for_all_mep_targets():
    """Gather deduplicated activities across every protein in ``pfname2uniprot``.

    Calls ``get_activities_for_mep_target`` for each protein name, skips the
    ones that return ``None``, and runs ``dedupe`` on the concatenation, so
    each ``molecule_id`` appears once in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` and ``smiles``.
        An empty frame with those columns is returned when no protein
        yields a table.
    """
    frames = []
    for name in pfname2uniprot.keys():
        target_df = get_activities_for_mep_target(name)
        if target_df is not None:
            frames.append(target_df)
    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty
        # table with the expected columns instead.
        return pd.DataFrame(columns=["target_id", "molecule_id", "pchembl", "smiles"])
    return dedupe(pd.concat(frames))


# Build the combined activity table for every protein in pfname2uniprot.
# Note: this rebinds the module-level name `df`, which earlier held the table
# read from MEP_Pathway_PfNF54_UniProt.csv; from here on `df` has the columns
# target_id, molecule_id, pchembl and smiles.
df = get_all_activities_for_all_mep_targets()

## Test with LazyQSAR

In [None]:
# Positives: molecules from the activity table with pchembl >= 5.
pos_smiles = df[df["pchembl"] >= 5]["smiles"].tolist()
# Negatives: SMILES from the DrugBank file, minus any string that is also a
# positive.
# NOTE(review): this is an exact string comparison. The positives were passed
# through the standardiser while the DrugBank strings are used as read, so
# the same molecule written differently would not be filtered — confirm
# whether the negatives should be standardised too.
neg_smiles = pd.read_csv("../raw/drugbank_smiles.csv")["Smiles"].tolist()
# Set membership gives the same result as the list lookup without rescanning
# the whole positive list for every negative.
pos_smiles_lookup = set(pos_smiles)
neg_smiles = [x for x in neg_smiles if x not in pos_smiles_lookup]
smiles_list = pos_smiles + neg_smiles
# Labels aligned with smiles_list: 1 for positives, 0 for negatives.
y = [1] * len(pos_smiles) + [0] * len(neg_smiles)

In [51]:
# lazyqsar is imported in this cell rather than with the imports at the top.
import lazyqsar as lq

# NOTE(review): the saved output of this cell is an ImportError raised while
# loading rdkit's rdInfoTheory.so (a Boost filesystem symbol was not found).
# That looks like a problem with the rdkit installation in the environment
# rather than with this code — confirm before relying on the cells below.
model = lq.MorganBinaryClassifier()

ImportError: dlopen(/Users/mduranfrigola/miniconda3/envs/aiworkshop/lib/python3.10/site-packages/rdkit/ML/InfoTheory/rdInfoTheory.so, 0x0002): Symbol not found: __ZN5boost10filesystem11path_traits7convertEPKwS3_RNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEERKNS4_7codecvtIwc11__mbstate_tEE
  Referenced from: <4223EC42-E0FE-3E48-BEA3-6EBBBA894BBE> /Users/mduranfrigola/miniconda3/envs/aiworkshop/lib/python3.10/site-packages/rdkit/.dylibs/libboost_log_setup.dylib
  Expected in:     <632DB74F-44E3-3B0D-92F6-383B5A3A727A> /Users/mduranfrigola/miniconda3/envs/aiworkshop/lib/python3.10/site-packages/rdkit/.dylibs/libboost_filesystem.dylib