In [8]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from standardiser import standardise
from rdkit import Chem

# Build the accession -> ChEMBL ID lookup from the first two tab-separated
# columns of the mapping file; the first line of the file is skipped.
with open("../raw/chembl_uniprot_mapping.txt", "r") as mapping_file:
    mapping_rows = [line.split("\t") for line in mapping_file.readlines()[1:]]
uniprot2chembl = {fields[0]: fields[1] for fields in mapping_rows}

# Build the name -> accession lookup from the first two columns of the CSV.
df = pd.read_csv("../raw/MEP_Pathway_PfNF54_UniProt.csv")
pfname2uniprot = {record[0]: record[1] for record in df.values}


In [9]:
def get_blasted_target_chembl_ids(name):
    """Return the ChEMBL IDs mapped to a protein and to its BLAST hits.

    The accession for ``name`` is looked up in ``pfname2uniprot``; the file
    ``../blast/blast_<accession>.tsv`` is then read and the query accession
    plus every value of its ``Accession`` column is translated through
    ``uniprot2chembl``. Accessions missing from that mapping are skipped.

    Parameters
    ----------
    name : key of ``pfname2uniprot``.

    Returns
    -------
    list
        ChEMBL IDs in first-seen order, each listed once.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``pfname2uniprot``.
    """
    uniprot_ac = pfname2uniprot[name]
    df = pd.read_csv("../blast/blast_{0}.tsv".format(uniprot_ac), sep="\t")
    candidates = [uniprot_ac] + df["Accession"].tolist()
    chembl_ids = [uniprot2chembl[p] for p in candidates if p in uniprot2chembl]
    # The query accession may also appear among its own hits, and several
    # accessions may map to one ChEMBL ID; dict.fromkeys drops the repeats
    # while keeping order, so no target is processed twice by callers.
    return list(dict.fromkeys(chembl_ids))
    

In [10]:
def get_activities_for_target(chembl_id):
    """Collect activity records with a pChEMBL value for one target.

    Queries ``new_client.activity`` filtered by ``target_chembl_id`` and keeps
    only records whose ``pchembl_value`` is truthy. Each kept record's
    ``canonical_smiles`` is parsed with RDKit, passed through
    ``standardise.run`` and written back out as SMILES; records whose
    structure is missing, unparsable or fails standardisation are skipped.

    Parameters
    ----------
    chembl_id : target identifier passed to the activity filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` (float) and
        ``smiles``; one row per retained activity record.
    """
    activity = new_client.activity
    res = activity.filter(target_chembl_id=chembl_id)
    R = []
    for r in res:
        if not r["pchembl_value"]:
            continue
        mol_id = r["molecule_chembl_id"]
        pchembl = float(r["pchembl_value"])
        canonical_smiles = r["canonical_smiles"]
        if not canonical_smiles:
            # Record has no structure attached; nothing to standardise.
            continue
        try:
            mol = Chem.MolFromSmiles(canonical_smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                continue
            std_smiles = Chem.MolToSmiles(standardise.run(mol))
        except Exception:
            # Best effort: skip molecules that cannot be standardised.
            # Deliberately not a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) still stops a long download.
            continue
        R.append([chembl_id, mol_id, pchembl, std_smiles])
    df = pd.DataFrame(R, columns=["target_id", "molecule_id", "pchembl", "smiles"])
    return df


def dedupe(df):
    """Keep, for every ``molecule_id``, the row with the largest ``pchembl``.

    Rows are ordered by descending ``pchembl`` and renumbered from zero, then
    every repeat of a ``molecule_id`` after its first occurrence is dropped.
    The index is not renumbered again after dropping, so it can contain gaps.
    The input frame is left untouched.
    """
    return (
        df.sort_values(by="pchembl", ascending=False)
        .reset_index(drop=True)
        .drop_duplicates(subset="molecule_id", keep="first")
    )

def get_activities_for_mep_target(name):
    """Gather de-duplicated activities for one named protein.

    Every ChEMBL ID returned by ``get_blasted_target_chembl_ids(name)`` is
    passed to ``get_activities_for_target``; the resulting frames are
    concatenated and run through ``dedupe``.

    Returns ``None`` when no ChEMBL ID is found for ``name``.
    """
    frames = [
        get_activities_for_target(target_id)
        for target_id in get_blasted_target_chembl_ids(name)
    ]
    if not frames:
        return None
    return dedupe(pd.concat(frames))


def get_all_activities_for_all_mep_targets():
    """Gather de-duplicated activities across every key of ``pfname2uniprot``.

    Calls ``get_activities_for_mep_target`` for each name, ignores names for
    which it returns ``None``, concatenates the remaining frames and runs the
    result through ``dedupe``.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``molecule_id``, ``pchembl`` and ``smiles``.
        Empty (with those columns) when no name yields any data.
    """
    frames = []
    for name in pfname2uniprot.keys():
        df = get_activities_for_mep_target(name)
        if df is not None:
            frames.append(df)
    if not frames:
        # pd.concat([]) raises "No objects to concatenate"; hand back an empty
        # frame with the usual columns so callers can still save or filter it.
        return pd.DataFrame(columns=["target_id", "molecule_id", "pchembl", "smiles"])
    return dedupe(pd.concat(frames))


# Download and merge the activities for every protein, then cache them on disk
# so later cells can start from the file instead of querying again.
df = get_all_activities_for_all_mep_targets()
# index=False: the row labels left after de-duplication carry no meaning, and
# writing them would add an "Unnamed: 0" column when the file is read back.
df.to_csv("provisional.csv", index=False)

## Test with a very simple model

In [12]:
from rdkit import Chem
import pandas as pd
import random
from sklearn.ensemble import RandomForestClassifier
from rdkit.Chem import AllChem
import numpy as np

df = pd.read_csv("provisional.csv")

# Positives: molecules with pchembl >= 5, rewritten as RDKit canonical SMILES.
pos_smiles = df[df["pchembl"] >= 5]["smiles"].tolist()
pos_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(x)) for x in pos_smiles]
# Set for O(1) membership tests while filtering the negatives below.
pos_smiles_set = set(pos_smiles)

# Negatives: DrugBank molecules that are not among the positives.
# Each SMILES is canonicalised *before* the comparison; comparing the raw
# DrugBank strings with canonical SMILES would miss the same molecule written
# differently and let positives leak into the negative class.
neg_smiles_ = []
for smi in pd.read_csv("../raw/drugbank_smiles.csv")["Smiles"].tolist():
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Entries RDKit refuses outright (presumably non-string cells such as
        # NaN -- confirm against the file) are skipped.
        continue
    if mol is None:
        # RDKit returns None for SMILES it cannot parse.
        continue
    x = Chem.MolToSmiles(mol)
    if x not in pos_smiles_set:
        neg_smiles_ += [x]
print(len(neg_smiles_))
neg_smiles = neg_smiles_

# Assemble the labelled set (1 = positive, 0 = negative) and shuffle it with a
# fixed seed so that a re-run reproduces the same row order.
smiles_list = pos_smiles + neg_smiles
y = [1] * len(pos_smiles) + [0] * len(neg_smiles)
idxs = list(range(len(y)))
random.Random(42).shuffle(idxs)
smiles_list = [smiles_list[i] for i in idxs]
y = [y[i] for i in idxs]


def fingerprints(smiles_list, radius=2, n_bits=1024):
    """Turn SMILES strings into a matrix of Morgan bit-vector fingerprints.

    Parameters
    ----------
    smiles_list : iterable of SMILES strings.
    radius : Morgan radius passed to ``GetMorganFingerprintAsBitVect``
        (default 2).
    n_bits : fingerprint length, passed as ``nBits`` (default 1024).

    Returns
    -------
    numpy.ndarray
        One row per input SMILES, in input order, and ``n_bits`` columns.
        An empty input gives an array of shape ``(0, n_bits)``.

    Raises
    ------
    ValueError
        If a SMILES cannot be parsed. Rows are not dropped silently because
        callers pair the rows with labels or molecules by position.
    """
    bit_vectors = []
    for position, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(
                "Cannot parse SMILES at position {0}: {1!r}".format(position, smiles)
            )
        bit_vectors.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        )

    if not bit_vectors:
        # np.array([]) would be 1-D; keep the result two-dimensional.
        return np.zeros((0, n_bits), dtype=int)

    # Convert each bit vector to a numpy row and stack the rows.
    return np.array([np.array(bit_vector) for bit_vector in bit_vectors])

# Fit a random forest on the labelled fingerprints. The seed is fixed so that
# a re-run of the notebook reproduces the same model and scores.
X_train = fingerprints(smiles_list)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y)

# Score every molecule of the library; column 1 of predict_proba is the
# probability of the class labelled 1.
library_smiles = pd.read_csv("../processed/all_mols.csv")["smiles"].tolist()
X_library = fingerprints(library_smiles)
y_hat = model.predict_proba(X_library)[:,1]

# This cell used to finish with X bound to the library matrix; keep that name
# available for any later cell that still reads it.
X = X_library

print(y_hat)
print(np.max(y_hat))

[13:40:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:40:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:40:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:40:09] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[13:40:09] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[13:40:09] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'
[13:40:09] Explicit valence for atom # 19 O, 3, is greater than permitted
[13:

11906


[13:40:12] Unusual charge on atom 42 number of radical electrons set to zero
[13:40:19] Unusual charge on atom 42 number of radical electrons set to zero


[0. 0. 0. ... 0. 0. 0.]
0.96


In [23]:
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
import collections

# Define Morgan fingerprint parameters
radius = 2
nBits = 1024

# Calculate fingerprints for the library molecules ("input") and for the
# positive-set molecules ("output")
input_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), radius, nBits=nBits) for smi in library_smiles]
output_fingerprints = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), radius, nBits=nBits) for smi in pos_smiles]

# Similarity matrix: rows follow library_smiles, columns follow pos_smiles
similarity_matrix = np.zeros((len(input_fingerprints), len(output_fingerprints)))

# Calculate the Tanimoto similarity of each library molecule to every positive.
# BulkTanimotoSimilarity scores one fingerprint against the whole list in a
# single call; the row is stored in the matrix (previously allocated but never
# filled) and as (positive SMILES, similarity) pairs keyed by library SMILES.
similarity_hits = collections.defaultdict(list)
for i, input_fp in enumerate(input_fingerprints):
    row_similarities = DataStructs.BulkTanimotoSimilarity(input_fp, output_fingerprints)
    similarity_matrix[i, :] = row_similarities
    similarity_hits[library_smiles[i]] += list(zip(pos_smiles, row_similarities))

In [24]:
# Order each molecule's hit list from most to least similar.
similarity_hits = {
    library_smi: sorted(hits, key=lambda hit: -hit[1])
    for library_smi, hits in similarity_hits.items()
}

In [26]:
# Report every (library molecule, positive molecule) pair whose similarity
# is above 0.3.
for library_smi, scored_hits in similarity_hits.items():
    for reference_smi, score in scored_hits:
        if score > 0.3:
            print(library_smi, reference_smi, score)

COc1ccc(-c2oc3cc(O)c(OC)c(O)c3c(=O)c2OC)cc1 CCOC(=O)C1=C(C)N=c2s/c(=C\c3cc(Br)c(O)c(Br)c3)c(=O)n2C1c1ccc(OC)cc1 0.2125
NC(Cc1ccccc1)C(=O)O CC(=O)N(C/C=C/P(=O)(O)O)OC(C)c1ccccc1 0.25
NC(Cc1ccccc1)C(=O)O O=C1C(=O)c2ccccc2-c2ccccc21 0.25
NC(Cc1ccccc1)C(=O)O CN(O)C(=O)CS(=O)(=O)C(c1ccccc1)P(=O)(O)O 0.2391304347826087
NC(Cc1ccccc1)C(=O)O CC(=O)N(C/C=C/P(=O)(O)O)OCc1ccc(-c2ccccc2)cc1 0.23529411764705882
CCOC(=O)N(Nc1ccc(N)cc1N)c1ccccc1 CC(=O)N(C/C=C/P(=O)(O)O)OC(C)c1ccccc1 0.2222222222222222
CCOC(=O)N(Nc1ccc(N)cc1N)c1ccccc1 CN(O)C(=O)CS(=O)(=O)C(c1ccccc1)P(=O)(O)O 0.21311475409836064
CCOC(=O)N(Nc1ccc(N)cc1N)c1ccccc1 CC(=O)N(C/C=C/P(=O)(O)O)OCc1ccc(-c2ccccc2)cc1 0.21212121212121213
COc1cc(/C=C/C(=O)Oc2cc3oc(-c4ccc(O)cc4)cc(=O)c3c(O)c2C2OC(CO)C(O)C(O)C2O)cc(OC)c1O CCOC(=O)C1=C(C)N=c2s/c(=C\c3cc(Br)c(O)c(Br)c3)c(=O)n2C1c1ccc(OC)cc1 0.21153846153846154
COc1ccc(-c2cc(=O)c3c(O)cc(OC4OC(COC5OC(O)C(O)C(O)C5O)C(O)C(O)C4O)cc3o2)cc1 CCOC(=O)C1=C(C)N=c2s/c(=C\c3cc(Br)c(O)c(Br)c3)c(=O)n2C1c1ccc(OC)cc1 0.