In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_DIR = Path("../data")
SHARED_DIR = Path("/home/jovyan/workbench-shared-folder/bioblp")

## Load benchmark

In [4]:
dpi_benchmark_path = SHARED_DIR.joinpath('data/benchmarks/transductive/dpi_fda.tsv') 

In [5]:
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637



* [DB01079; Tegaserod](https://go.drugbank.com/drugs/DB01079)
Tegaserod is a serotonin-4 (5-HT4) receptor agonist indicated for the treatment of constipation predominant irritable bowel syndrome (IBS-C) specifically in women under the age of 65. There is currently no safety or efficacy data for use of tegaserol in men.

* https://www.uniprot.org/uniprotkb/Q13639/entry

In [6]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())
len(dpi_bm), len(dpi_drugs_unique), len(dpi_prots_unique)

(18678, 2167, 2573)

### Other embedderst

In [7]:
dpi_bm.head()

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637
3,DB01069,DPI,P18825
4,DB01186,DPI,P08684


In [8]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())

dpi_entities = dpi_drugs_unique + dpi_prots_unique

In [9]:
from bioblp.benchmarking.embeddings import LookupEmbedding

In [53]:
import torch
import json
import abc

from dataclasses import dataclass
from pathlib import Path
from typing import Union

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

import numpy as np
import pandas as pd


@dataclass
class LookupEmbedding():
    
    embeddings: torch.tensor
    entity_to_id: dict
    metadata: dict
    
    @classmethod 
    def from_pretrained(cls, model_path):
        _embedding_file = "embeddings.pt"
        _e2id_file = "entity_to_id.tsv"
        _metadata_file = "metadata.json"
        
        model_path = Path(model_path)
        
        required = [_embedding_file, _e2id_file]
        
        for file_i in required:
            assert model_path.joinpath(file_i).exists(), f"Missing required file {file_i} in dir {model_path}"
        
        embeddings = torch.load(model_path.joinpath(_embedding_file))
        entity_to_id = pd.read_csv(model_path.joinpath(_e2id_file), sep="\t", index_col = None, header=None)
        entity_to_id = {x[0]: x[1] for x in entity_to_id.values }
        
        metadata = None
        if model_path.joinpath(_metadata_file).exists():
             with open(model_path.joinpath(_metadata_file), "r") as f:
                metadata= json.load(f)
        
        return cls(embeddings=embeddings, entity_to_id=entity_to_id, metadata=metadata)
    
    
    def save(self, outdir: Union[str, Path], name: str):
        outdir = Path(outdir).joinpath(name)
        outdir.mkdir(parents=True, exist_ok=False)
        
        df = pd.DataFrame([[k, v] for k, v in self.entity_to_id.items()], columns=["e_", "id_"])
        df.to_csv(outdir.joinpath("entity_to_id.tsv"), sep="\t", index=False, header=False)
        
        torch.save(self.embeddings, outdir.joinpath("embeddings.pt"))
        
        if self.metadata is not None:
            with open(outdir.joinpath("metadata.json"), "w") as f:
                json.dump(self.metadata, f)
            
            
def create_noise_embeddings(entities, random_seed: int = 42, dim=128, emb_range=(-1, 1),
                            outdir = None, outname = None) -> LookupEmbedding:
    entity_set = set(entities)
    entity_to_id = {k: idx for idx, k in enumerate(sorted(list(entity_set)))}
    
    emb_shape = (len(entities), dim)

    r1 = emb_range[0]
    r2 = emb_range[1]

    g = torch.Generator()
    g.manual_seed(random_seed)

    embeddings = (r1 - r2) * torch.rand(emb_shape, generator=g) + r2
    
    metadata = {
        "random_seed": random_seed,
        "dim": dim,
        "emb_range": emb_range
    }
    
    lookup_embedding = LookupEmbedding(embeddings=embeddings,
                           entity_to_id=entity_to_id,
                           metadata=metadata)
    
    if outdir is not None:
        assert outname is not None, "need outout name"
        
        outdir = Path(outdir)
        
        lookup_embedding.save(outdir, outname)
    
    return lookup_embedding
            

def create_moltrans_lookup_embedding(data_dir, outdir = None, outname = None) -> LookupEmbedding:
    """Create lookup embedding files from raw Molecular Transformer files.
    """
    drug_id_file = "drug_ids.txt"
    drug_smiles_file = "drug_smiles.txt"
    smiles_emb_file = "drug_smiles_full.npz"
    
    data_dir = Path(data_dir)
    
    drug_df = pd.read_csv(data_dir.joinpath(drug_id_file), header=None)
    drug_df.columns = ["db_id"]
    drug_smiles_df = pd.read_csv(data_dir.joinpath(drug_smiles_file), header=None)

    drug_df["smiles"] = drug_smiles_df
    
    drug_ids_sorted = sorted(drug_df['db_id'].values.tolist())
    id2smiles = {x[0]: x[1] for x in drug_df.values}
    
    # e2id
    entity_to_id = {idx: k for idx, k in enumerate(drug_ids_sorted)}
    
    emb_records = []
    with np.load(data_dir.joinpath(smiles_emb_file)) as data:
        for drug_id in drug_ids_sorted:
            smile = id2smiles.get(drug_id)
            
            emb_tensor = torch.tensor(data[smile])
            emb_mean_pool = torch.mean(emb_tensor, dim=0) # is this best way?
            emb_records.append(emb_mean_pool)

    # embeddings
    embeddings = torch.stack(emb_records, dim=0)
    
    # meta
    metadata = {}
    
    lookup_embedding = LookupEmbedding(embeddings=embeddings,
                           entity_to_id=entity_to_id,
                           metadata=metadata)
    if outdir is not None:
        assert outname is not None, "need outout name"
        
        outdir = Path(outdir)
        
        lookup_embedding.save(outdir, outname)
    
    return lookup_embedding


def sanitize(rd_mol, raise_exception: bool = False):
    """
    Sanitizes the molecule if it has not been done before.
    :param
    raise_exception: if True will raise exception on failed sanitation
    :raises MoleculeException: if the molecule could not be sanitized
    """

    try:
        AllChem.SanitizeMol(rd_mol)
        # print("sanitized")

    except:  # noqa, there could be many reasons why the molecule cannot be sanitized
        # print("unsanitized")
        if raise_exception:
            raise ValueError(f"Unable to sanitize molecule")
        # rd_mol = Chem.MolFromSmiles(self.smiles, sanitize=False)
    return rd_mol


def compute_morgan_fingerprint(smiles, radius: int = 2, nbits: int or None = 1024) -> np.ndarray:
    """
    Returns the Morgan fingerprint of the molecule
`
    :param radius: the radius of the fingerprint
    :param nbits: the length of the fingerprint. If not given it will use RDKit default, defaults to None
    :return: the fingerprint
    """
    if nbits:
        key = (radius, nbits)
    else:
        key = (radius,)
    
    try:
        mol = Chem.MolFromSmiles(smiles)

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        array = np.zeros((0, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, array)
        return array
    except:
        return None


def create_morgan_fingerprint_embeddings(data_dir, outdir = None, outname = None) -> LookupEmbedding:
    """Take drugs and their smiles and turn into morgan fingerprints.
    """
        
    drug_id_file = "drug_ids.txt"
    drug_smiles_file = "drug_smiles.txt"
    
    data_dir = Path(data_dir)
    
    drug_df = pd.read_csv(data_dir.joinpath(drug_id_file), header=None)
    drug_df.columns = ["db_id"]
    drug_smiles_df = pd.read_csv(data_dir.joinpath(drug_smiles_file), header=None)

    drug_df["smiles"] = drug_smiles_df
    
   
    
    # compute feats
    
    drug_df["fp"] = drug_df["smiles"].apply(lambda x: compute_morgan_fingerprint(x, radius=2, nbits=1024))
    
    drug_df = drug_df.dropna()
    
    fp_array = drug_df["fp"].values.tolist()
    print(fp_array[:10])
    
    fingerprints = torch.tensor(np.array(fp_array))
    
    drug_ids_sorted = sorted(drug_df['db_id'].values.tolist())
    id2smiles = {x[0]: x[1] for x in drug_df.values}
    
    # e2id
    entity_to_id = {idx: k for idx, k in enumerate(drug_ids_sorted)}
    
    metadata = {
        "radius": 2,
        "nbits": 1024,
        "sanitize": False
    }
    
    lookup_embedding = LookupEmbedding(embeddings=fingerprints,
                           entity_to_id=entity_to_id,
                           metadata=metadata)
    
    if outdir is not None:
        assert outname is not None, "need outout name"
        
        outdir = Path(outdir)
        
        lookup_embedding.save(outdir, outname)
    
    return lookup_embedding


In [11]:
import numpy as np

if False:
    X = np.array(pos_neg_df_feat.joint_encoding.values.tolist())    
    y = pos_neg_df_feat.label.values
    print(f"Shape X: {X.shape}, y: {y.shape}")
          
    np.save("../data/features/kge-1baon0eg/X.npy", X)
    np.save("../data/features/kge-1baon0eg/y.npy", y)

### BioKG Drugs

In [12]:
drug_properties_df = pd.read_csv("../data/biokg/biokg.properties.drug.tsv", sep="\t", header=None)
drug_meta_df = pd.read_csv("../data/biokg/biokg.metadata.drug.tsv", sep="\t", header=None)



In [13]:
drug_properties_df.head()

Unnamed: 0,0,1,2
0,DB00441,DRUG_INDICATION_ASSOCIATION,C0019829
1,DB01013,DRUG_INDICATION_ASSOCIATION,C0033860
2,DB00997,DRUG_INDICATION_ASSOCIATION,C0005695
3,DB00215,DRUG_INDICATION_ASSOCIATION,C0233477
4,DB00603,DRUG_INDICATION_ASSOCIATION,C0600142


In [14]:
drug_meta_df.head()

Unnamed: 0,0,1,2
0,DB00006,SUBSTITUENT,1_hydroxy_2_unsubstituted_benzenoid
1,DB00006,SUBSTITUENT,Alpha_peptide
2,DB00006,SUBSTITUENT,Alpha_amino_acid_amide
3,DB00006,SUBSTITUENT,Alpha_amino_acid_or_derivatives
4,DB00006,SUBSTITUENT,Amine


In [15]:
unique_drugs_prop = drug_properties_df[0].unique()
unique_drugs_m = drug_meta_df[0].unique()

biokg_drugs = list(set(unique_drugs_m).union(set(unique_drugs_prop)))

In [16]:
len(biokg_drugs)

8308

### BioKG Proteins

Note these are from the original biokg data, here we care about generating features for all entities.

In [17]:
protein_properties_df = pd.read_csv("../data/biokg/biokg.properties.protein.tsv", sep="\t", header=None)
protein_meta_df = pd.read_csv("../data/biokg/biokg.metadata.protein.tsv", sep="\t", header=None)



In [18]:
unique_prot_prop = protein_properties_df[0].unique()
unique_prot_m = protein_meta_df[0].unique()

biokg_prots = list(set(unique_prot_prop).union(set(unique_prot_m)))

In [19]:
len(biokg_prots)

122902

### Noise embeddings

In [20]:
biokg_drugs_and_proteins = biokg_drugs + biokg_prots

In [21]:
noise_embedder = create_noise_embeddings(biokg_drugs_and_proteins, random_seed=42, dim=128,
                                         # outdir="/home/jovyan/workbench-shared-folder/bioblp/data/benchmarks/features/",
                                         # outname="random-noise-lookup-embeddings"
                                        )

print(noise_embedder.embeddings[[10]])
print(noise_embedder.metadata)

tensor([[ 7.2355e-01,  2.4987e-01,  9.5468e-02,  5.5649e-01,  7.3862e-01,
         -6.7262e-01, -6.7865e-01,  9.0819e-01, -3.1821e-01, -4.0680e-01,
         -9.4997e-01, -5.7855e-01, -9.1934e-01,  3.2739e-01, -7.0039e-01,
         -8.1340e-01,  9.4433e-01,  8.0271e-01, -2.0241e-01, -5.4604e-01,
          4.9681e-01, -1.1013e-01,  1.4517e-03, -2.5311e-01,  5.3735e-01,
         -5.6404e-01, -6.6501e-01,  6.9386e-01, -9.6874e-03, -2.7746e-03,
         -3.2112e-01, -9.3158e-01, -2.9326e-01,  2.2722e-01, -8.9813e-01,
          3.8053e-01,  2.9037e-01, -6.8275e-02,  7.6151e-01, -1.0882e-01,
          6.7848e-01, -1.0282e-01, -9.5808e-02, -1.3842e-01,  8.4330e-01,
          9.4978e-01, -4.6012e-01, -8.5760e-01,  8.8737e-01, -3.7039e-01,
          7.3612e-01, -6.2691e-02, -9.3036e-01, -7.5862e-01,  7.3114e-01,
         -6.1857e-01, -5.2232e-01,  1.5339e-03, -9.6873e-01,  3.9718e-01,
          2.3272e-01,  5.0541e-01, -1.4390e-01, -2.6479e-01, -7.9680e-02,
          5.6264e-01,  4.5017e-01, -3.

### Mol transformer

In [22]:
moltrans_emb = create_moltrans_lookup_embedding("/home/jovyan/workbench-shared-folder/bioblp/data/molecular_data/",
                                                # outdir="/home/jovyan/workbench-shared-folder/bioblp/data/benchmarks/features/",
                                                # outname="moltrans-lookup-embeddings"
                                               )

In [23]:
moltrans_emb.embeddings.shape

torch.Size([10815, 512])

In [24]:
moltrans_drugs = pd.read_csv("/home/jovyan/workbench-shared-folder/bioblp/data/molecular_data/drug_ids.txt", header=None)[0].values.tolist()


In [25]:
biokg_drugs_s = set(biokg_drugs)
moltrans_drugs_s = set(moltrans_drugs)

print(f"Num drugs in biokg: {len(biokg_drugs_s)}")
print(f"Num drugs in moltrans: {len(moltrans_drugs_s)}")

biokg_drugs_diff = biokg_drugs_s - moltrans_drugs_s

print(f"biok - moltrans = {len(biokg_drugs_diff)}")

moltrans_drugs_diff = moltrans_drugs_s - biokg_drugs_s

print(f"moltrans - biok   = {len(moltrans_drugs_diff)}")

Num drugs in biokg: 8308
Num drugs in moltrans: 10815
biok - moltrans = 370
moltrans - biok   = 2877


In [26]:
sorted(biokg_drugs)[:10]

['DB00001',
 'DB00002',
 'DB00003',
 'DB00004',
 'DB00005',
 'DB00006',
 'DB00007',
 'DB00008',
 'DB00009',
 'DB00010']

In [27]:
sorted(moltrans_drugs)[:10]

['DB00006',
 'DB00007',
 'DB00014',
 'DB00027',
 'DB00035',
 'DB00050',
 'DB00067',
 'DB00080',
 'DB00091',
 'DB00093']

### Morgan fingerprints

In [55]:
morgan_fp_embedding = create_morgan_fingerprint_embeddings("/home/jovyan/workbench-shared-folder/bioblp/data/molecular_data/",
                                                outdir="/home/jovyan/workbench-shared-folder/bioblp/data/benchmarks/features/",
                                                outname="morgan-fingerprint-lookup-embeddings")

[22:07:09] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:07:10] Explicit valence for atom # 3 N, 4, is greater than permitted
[22:07:10] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:07:10] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:07:10] Explicit valence for atom # 19 O, 3, is greater than permitted
[22:07:11] Explicit valence for atom # 13 Be, 3, is greater than permitted
[22:07:11] Explicit valence for atom # 4 F, 2, is greater than permitted
[22:07:11] Explicit valence for atom # 5 K, 2, is greater than permitted
[22:07:11] Unusual charge on atom 0 number of radical electrons set to zero
[22:07:11] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[22:07:11] Explicit valence for atom # 0 O, 3, is greater than permitted
[22:07:11] Explicit valence for atom # 1 Cl, 4, is greater than permitted
[22:07:12] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:07:12] Explicit valence for atom # 84 

[array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([1, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 1], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8)]


In [56]:
morgan_fp_embedding.embeddings[[0, 1, 2]]

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1]], dtype=torch.int8)