In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Repo-local data directory, relative to the notebook's working directory.
DATA_DIR = Path("../data")
# Shared workbench folder (absolute path) from which the benchmark file is read.
# NOTE(review): hard-coded absolute path; only valid on the workbench host.
SHARED_DIR = Path("/home/jovyan/workbench-shared-folder/bioblp")

## Load benchmark

In [4]:
dpi_benchmark_path = SHARED_DIR.joinpath('data/benchmarks/dpi_fda.tsv') 

In [5]:
# Read the tab-separated benchmark triples; column names are supplied explicitly
# via `names=` (source, edge, target) rather than taken from the file.
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
# Preview the first three triples.
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637



* [DB01079; Tegaserod](https://go.drugbank.com/drugs/DB01079)
Tegaserod is a serotonin-4 (5-HT4) receptor agonist indicated for the treatment of constipation-predominant irritable bowel syndrome (IBS-C), specifically in women under the age of 65. There is currently no safety or efficacy data for the use of tegaserod in men.

* [Q13639 (UniProt entry)](https://www.uniprot.org/uniprotkb/Q13639/entry)

In [6]:
# Distinct source (drug) and target (protein) identifiers present in the benchmark.
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())
# Display: number of triples, number of unique drugs, number of unique proteins.
len(dpi_bm), len(dpi_drugs_unique), len(dpi_prots_unique)

(19161, 2286, 2705)

## Prepare Data for ML

In [7]:
from bioblp.benchmarking.featurize import load_model_and_entity_to_id_maps, ENTITY_TO_ID_MAP, RELATION_TO_ID_MAP

In [8]:
# Directory of the trained KGE model run to load.
# NOTE(review): hard-coded absolute path; only valid on the workbench host.
model_dir = Path('/home/jovyan/workbench-shared-folder/bioblp/models/1baon0eg')

# Load the trained model together with its id maps. The result is accessed like a
# mapping: the two id maps are fetched here, and the model itself is read from the
# "model" key when featurizing.
kge_artifacts = load_model_and_entity_to_id_maps(model_dir)
entity_to_id_map = kge_artifacts.get(ENTITY_TO_ID_MAP)
relation_to_id_map = kge_artifacts.get(RELATION_TO_ID_MAP)

15:14:56 - INFO - bioblp.benchmarking.featurize - Loading trained model from /home/jovyan/workbench-shared-folder/bioblp/models/1baon0eg


### Generate Negative samples

In [9]:
from bioblp.benchmarking.preprocess import prepare_dpi_samples

In [10]:
# Build the labelled sample table from the benchmark triples using the KGE id maps.
# The resulting frame carries integer src/edg/tgt ids plus a `label` column.
# NOTE(review): presumably 10 negatives are sampled per positive triple — confirm
# against prepare_dpi_samples.
pos_neg_df = prepare_dpi_samples(dpi_bm, entity_to_id_map, relation_to_id_map, num_negs_per_pos=10)

In [11]:
pos_neg_df.head(10)

Unnamed: 0,src,edg,tgt,label
0,8879,5,44406,1
1,7928,5,30725,1
2,8957,5,29310,1
3,8869,5,30330,1
4,8985,5,27350,1
5,14208,5,20956,1
6,8025,5,33235,1
7,10379,5,22335,1
8,7915,5,42516,1
9,14366,5,26237,1


### Featurize by obtaining joint entity pair encoding

In [12]:
from bioblp.benchmarking.featurize import generate_entity_pair_joint_encoding
from bioblp.benchmarking.featurize import concatenate, average


In [13]:
# Pass the samples to the KGE model to featurize each entity pair. `transform_fn`
# is the function applied to obtain the joint encoding of a pair (here
# `concatenate`; `average` is imported as an alternative).
pos_neg_df_feat = generate_entity_pair_joint_encoding(pos_neg_df,
                                    model=kge_artifacts["model"],
                                    transform_fn=concatenate)
                            

15:14:58 - INFO - bioblp.benchmarking.featurize - Applying transformation function: f<function concatenate at 0x7fc4a23274c0>, to retrieve joint encoding for entity pair


In [14]:
# Drop the reference to the loaded KGE artifacts now that featurization is done.
# NOTE(review): cells that read `kge_artifacts` cannot be re-run after this
# without reloading the model first.
del kge_artifacts

### Other embedders

In [15]:
dpi_bm.head()

Unnamed: 0,src,edg,tgt,label
0,8879,5,44406,1
1,7928,5,30725,1
2,8957,5,29310,1
3,8869,5,30330,1
4,8985,5,27350,1


In [16]:
# NOTE(review): these two lists are computed identically in an earlier cell;
# they are recomputed here from `dpi_bm`.
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())

# Full entity vocabulary for the lookup embedders: all drugs followed by all proteins.
dpi_entities = dpi_drugs_unique + dpi_prots_unique

In [17]:
from bioblp.benchmarking.embeddings import NoiseEmbedding

In [35]:
import torch
import json

from pathlib import Path
from typing import Union


class LookupEmbedding():
    """Base class for embedders backed by a per-entity lookup table.

    An instance holds an ``entity_to_id`` mapping and an ``embeddings`` table
    (``None`` until the embedder has been fit or loaded). Subclasses provide
    ``_init_embeddings`` to populate ``self.embeddings``.

    Args:
        random_seed: Seed stored on the instance for use by subclasses.
        dim: Embedding dimensionality stored on the instance.
        emb_range: ``(low, high)`` pair stored on the instance.
    """

    def __init__(self, random_seed: int = 42, dim=128, emb_range=(-1, 1)):

        self.random_seed = random_seed

        self.embeddings = None
        self.dim = dim
        self.emb_range = emb_range
        self.entity_to_id = {}

    @classmethod
    def from_pretrained(cls, model_path):
        """Restore an embedder previously written by :meth:`save`.

        Args:
            model_path: Directory containing ``config.json``, ``embeddings.pt``
                and ``entity_to_id.tsv``.

        Returns:
            A new instance of ``cls`` with the stored config, embeddings and
            entity-to-id mapping.
        """
        model_path = Path(model_path)
        with open(model_path.joinpath("config.json"), "r") as f:
            cfg = json.load(f)

        embeddings = torch.load(model_path.joinpath("embeddings.pt"))
        entity_to_id = pd.read_csv(model_path.joinpath("entity_to_id.tsv"), sep="\t", index_col=None, header=None)
        entity_to_id = {x[0]: x[1] for x in entity_to_id.values}

        # JSON has no tuple type, so the range comes back as a list; convert it
        # so a reloaded embedder carries the same type as a freshly built one.
        obj = cls(random_seed=cfg["random_seed"],
                  dim=cfg["dim"],
                  emb_range=tuple(cfg["emb_range"]))

        obj.embeddings = embeddings
        obj.entity_to_id = entity_to_id
        return obj

    def save(self, outdir: Union[str, Path], name="noise_embeddings"):
        """Write config, entity map and embeddings to ``outdir/name``.

        Args:
            outdir: Parent directory (created if missing).
            name: Name of the sub-directory that receives the files.

        Raises:
            FileExistsError: If ``outdir/name`` already exists; existing
                exports are never overwritten.
        """
        outdir = Path(outdir).joinpath(name)
        outdir.mkdir(parents=True, exist_ok=False)

        config = {
            "random_seed": self.random_seed,
            "dim": self.dim,
            "emb_range": self.emb_range
        }

        with open(outdir.joinpath("config.json"), "w") as f:
            json.dump(config, f)

        # Headerless two-column TSV: entity, id (read back by from_pretrained).
        df = pd.DataFrame([[k, v] for k, v in self.entity_to_id.items()], columns=["e_", "id_"])
        df.to_csv(outdir.joinpath("entity_to_id.tsv"), sep="\t", index=False, header=False)

        torch.save(self.embeddings, outdir.joinpath("embeddings.pt"))

    def _init_embeddings(self):
        """Populate ``self.embeddings``; must be implemented by subclasses."""
        raise NotImplementedError(
            "Child class needs implementation of _init_embeddings.")

    def _build_entity_to_id(self, entities):
        """Assign consecutive ids to ``entities`` in sorted order."""
        # Use the argument, not a notebook-level global, so the embedder works
        # for any entity list it is given.
        entities = sorted(entities)
        self.entity_to_id = {k: k_idx for k_idx, k in enumerate(entities)}

    def __call__(self, entities: list):
        """Fit the embedder on ``entities``.

        Raises:
            RuntimeError: If embeddings are already present.
        """
        if self.embeddings is None:
            self._build_entity_to_id(entities)
            self._init_embeddings()
        else:
            raise RuntimeError("Embedder already fit")


class NoiseEmbedding(LookupEmbedding):
    """Lookup embedder whose table is filled with seeded uniform random noise.

    Calling the instance with a list of entities assigns each entity an id
    (in sorted order) and draws one ``dim``-sized random vector per entity.
    Calling it again rebuilds the mapping and regenerates the table.
    """

    def __init__(self, random_seed: int = 42, dim=128, emb_range=(-1, 1)):
        # The parent constructor stores the seed, dimensionality and range, and
        # starts with no embeddings and an empty entity map.
        super().__init__(random_seed=random_seed, dim=dim, emb_range=emb_range)

    def _init_embeddings(self):
        """Draw the embedding table from a generator seeded with ``random_seed``."""
        first = self.emb_range[0]
        second = self.emb_range[1]

        generator = torch.Generator().manual_seed(self.random_seed)
        unit_noise = torch.rand((len(self.entity_to_id), self.dim), generator=generator)

        # Affine map of the [0, 1) samples: 0 lands on emb_range[1] and values
        # approaching 1 approach emb_range[0].
        self.embeddings = (first - second) * unit_noise + second

    def _build_entity_to_id(self, entities):
        """Number the given entities consecutively in sorted order."""
        self.entity_to_id = {
            entity: position for position, entity in enumerate(sorted(entities))
        }

    def __call__(self, entities: list):
        """Build the entity map for ``entities`` and generate their noise vectors."""
        self._build_entity_to_id(entities)
        self._init_embeddings()


In [26]:
noise_embedder = NoiseEmbedding(random_seed=42, dim=128)

In [27]:
noise_embedder(dpi_entities)

In [39]:
noise_embedder.embeddings[[0,12,3]]

tensor([[-0.7645, -0.8300,  0.2343, -0.9186,  0.2191, -0.2018,  0.4869, -0.5873,
         -0.8815,  0.7336, -0.8692, -0.1872, -0.7388, -0.1354, -0.4822,  0.1412,
         -0.7709, -0.1478,  0.4668, -0.2549,  0.4607,  0.1173,  0.4062, -0.6634,
          0.7894,  0.4610,  0.2824,  0.6013, -0.0944,  0.9877, -0.9031,  0.8495,
         -0.7720, -0.1664,  0.3247, -0.6179, -0.1559, -0.8080, -0.1093,  0.3154,
         -0.2687,  0.2712, -0.4209, -0.8928, -0.5781,  0.4372, -0.5773, -0.1789,
         -0.5078,  0.6095,  0.9899,  0.3864,  0.7670, -0.8205, -0.2880, -0.4142,
         -0.3163,  0.0174, -0.7826,  0.7105, -0.0630,  0.6825, -0.3084,  0.3444,
         -0.3064,  0.2083, -0.8294,  0.5927,  0.5964,  0.5964, -0.8994, -0.3333,
         -0.9623,  0.8253,  0.9919,  0.7824,  0.6727, -0.4050, -0.3581, -0.8309,
          0.5164,  0.6817, -0.5306,  0.4042, -0.6069,  0.2373, -0.5720,  0.7770,
          0.5046, -0.3049, -0.2114,  0.2550, -0.5961, -0.6798,  0.7252,  0.5339,
         -0.9157,  0.3374,  

In [30]:
noise_embedder.save("../data/features/", name="noisy-e1")

In [36]:
# Reload from the directory the embedder was saved to ("noisy-e1"), so the
# comparison with `noise_embedder` is a true save/load round trip.
noise_embedder_2 = NoiseEmbedding.from_pretrained("../data/features/noisy-e1")

In [38]:
noise_embedder_2.embeddings[[0,12,3]]

tensor([[-0.7645, -0.8300,  0.2343, -0.9186,  0.2191, -0.2018,  0.4869, -0.5873,
         -0.8815,  0.7336, -0.8692, -0.1872, -0.7388, -0.1354, -0.4822,  0.1412,
         -0.7709, -0.1478,  0.4668, -0.2549,  0.4607,  0.1173,  0.4062, -0.6634,
          0.7894,  0.4610,  0.2824,  0.6013, -0.0944,  0.9877, -0.9031,  0.8495,
         -0.7720, -0.1664,  0.3247, -0.6179, -0.1559, -0.8080, -0.1093,  0.3154,
         -0.2687,  0.2712, -0.4209, -0.8928, -0.5781,  0.4372, -0.5773, -0.1789,
         -0.5078,  0.6095,  0.9899,  0.3864,  0.7670, -0.8205, -0.2880, -0.4142,
         -0.3163,  0.0174, -0.7826,  0.7105, -0.0630,  0.6825, -0.3084,  0.3444,
         -0.3064,  0.2083, -0.8294,  0.5927,  0.5964,  0.5964, -0.8994, -0.3333,
         -0.9623,  0.8253,  0.9919,  0.7824,  0.6727, -0.4050, -0.3581, -0.8309,
          0.5164,  0.6817, -0.5306,  0.4042, -0.6069,  0.2373, -0.5720,  0.7770,
          0.5046, -0.3049, -0.2114,  0.2550, -0.5961, -0.6798,  0.7252,  0.5339,
         -0.9157,  0.3374,  

In [None]:
import numpy as np

# Export of the KGE joint encodings and labels to .npy files.
# Disabled by default; set to True to write X/y to disk.
SAVE_KGE_FEATURES = False

if SAVE_KGE_FEATURES:
    # Stack the per-row joint encodings into one 2-D array; labels become a 1-D array.
    X = np.array(pos_neg_df_feat.joint_encoding.values.tolist())
    y = pos_neg_df_feat.label.values
    print(f"Shape X: {X.shape}, y: {y.shape}")

    np.save("../data/features/kge-1baon0eg/X.npy", X)
    np.save("../data/features/kge-1baon0eg/y.npy", y)