In [9]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

In [11]:
DATA_DIR = Path("../data")
SHARED_DIR = Path("/home/jovyan/workbench-shared-folder/bioblp")

## Load benchmark

In [12]:
dpi_benchmark_path = SHARED_DIR.joinpath('data/benchmarks/dpi_fda.tsv') 

In [13]:
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637



* [DB01079; Tegaserod](https://go.drugbank.com/drugs/DB01079)
Tegaserod is a serotonin-4 (5-HT4) receptor agonist indicated for the treatment of constipation predominant irritable bowel syndrome (IBS-C) specifically in women under the age of 65. There is currently no safety or efficacy data for use of tegaserol in men.

* https://www.uniprot.org/uniprotkb/Q13639/entry

In [14]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())
len(dpi_bm), len(dpi_drugs_unique), len(dpi_prots_unique)

(19161, 2286, 2705)

## Prepare Data for ML

In [18]:
from bioblp.benchmarking.featurize import load_model_and_entity_to_id_maps, ENTITY_TO_ID_MAP, RELATION_TO_ID_MAP

In [8]:
model_dir = Path('/home/jovyan/workbench-shared-folder/bioblp/models/1baon0eg')

kge_artifacts = load_model_and_entity_to_id_maps(model_dir)
entity_to_id_map = kge_artifacts.get(ENTITY_TO_ID_MAP)
relation_to_id_map = kge_artifacts.get(RELATION_TO_ID_MAP)

15:14:56 - INFO - bioblp.benchmarking.featurize - Loading trained model from /home/jovyan/workbench-shared-folder/bioblp/models/1baon0eg


### Generate Negative samples

In [15]:
from bioblp.benchmarking.preprocess import prepare_dpi_samples

In [16]:
pos_neg_df = prepare_dpi_samples(dpi_bm, entity_to_id_map, relation_to_id_map, num_negs_per_pos=10)

NameError: name 'entity_to_id_map' is not defined

In [17]:
pos_neg_df.head(10)

NameError: name 'pos_neg_df' is not defined

### Featurize by obtaining joint entity pair encoding

In [12]:
from bioblp.benchmarking.featurize import generate_entity_pair_joint_encoding
from bioblp.benchmarking.featurize import concatenate, average


In [13]:
# pass on to KGEM encoder to featurize each entity pair
pos_neg_df_feat = generate_entity_pair_joint_encoding(pos_neg_df,
                                    model=kge_artifacts["model"],
                                    transform_fn=concatenate)
                            

15:14:58 - INFO - bioblp.benchmarking.featurize - Applying transformation function: f<function concatenate at 0x7fc4a23274c0>, to retrieve joint encoding for entity pair


In [14]:
del kge_artifacts

### Other embedderst

In [19]:
dpi_bm.head()

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637
3,DB01069,DPI,P18825
4,DB01186,DPI,P08684


In [20]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())

dpi_entities = dpi_drugs_unique + dpi_prots_unique

In [21]:
from bioblp.benchmarking.embeddings import NoiseEmbedding

In [121]:
import torch
import json
import abc

from dataclasses import dataclass
from pathlib import Path
from typing import Union


@dataclass
class LookupEmbedding():
    
    embeddings: torch.tensor
    entity_to_id: dict
    metadata: dict
    
    # def __init__(self, embeddings, entity_to_id, metadata=None):
    #     self.embeddings = embeddings
    #     self.entity_to_id = entity_to_id
    #     self.metadata = metadata
        
    @classmethod 
    def from_pretrained(cls, model_path):
        _embedding_file = "embeddings.pt"
        _e2id_file = "entity_to_id.tsv"
        _metadata_file = "metadata.json"
        
        model_path = Path(model_path)
        
        required = [_embedding_file, _e2id_file]
        
        for file_i in required:
            assert model_path.joinpath(file_i).exists(), f"Missing required file {file_i} in dir {model_path}"
        
        embeddings = torch.load(model_path.joinpath(_embedding_file))
        entity_to_id = pd.read_csv(model_path.joinpath(_e2id_file), sep="\t", index_col = None, header=None)
        entity_to_id = {x[0]: x[1] for x in entity_to_id.values }
        
        metadata = None
        if model_path.joinpath(_metadata_file).exists():
             with open(model_path.joinpath(_metadata_file), "r") as f:
                metadata= json.load(f)
        
        return cls(embeddings=embeddings, entity_to_id=entity_to_id, metadata=metadata)
    
    
    def save(self, outdir: Union[str, Path], name: str):
        outdir = Path(outdir).joinpath(name)
        outdir.mkdir(parents=True, exist_ok=False)
        
        df = pd.DataFrame([[k, v] for k, v in self.entity_to_id.items()], columns=["e_", "id_"])
        df.to_csv(outdir.joinpath("entity_to_id.tsv"), sep="\t", index=False, header=False)
        
        torch.save(self.embeddings, outdir.joinpath("embeddings.pt"))
        
        if self.metadata is not None:
            with open(outdir.joinpath("metadata.json"), "w") as f:
                json.dump(self.metadata, f)



# class LookupEmbedding(abc.ABC):
#     def __init__(self, lookup_embedding: PretrainedLookupEmbedding = None, **kwargs):
        
#         self._lookup = lookup_embedding
#         self._embeddings = None
#         self._entity_to_id = None
        
#         self._config = kwargs

#     @classmethod
#     def from_pretrained(cls, model_path):
#         lookup = PretrainedLookupEmbedding.from_path(model_path)
        
#         return cls(lookup)
    
#     @property
#     def embeddings(self):
#         if self._lookup is not None:
#             return self._lookup.embeddings
#         else:
#             return None
    
#     @property
#     def entity_to_id(self):
#         if self._lookup is not None:
#             return self._lookup.entity_to_id
#         else:
#             return None
    
#     @property
#     def metadata(self):
#         if self._lookup is not None:
#             return self._lookup.metadata
#         else:
#             return None

#     def save(self, outdir: Union[str, Path], name="noise_embeddings"):
#         self._lookup.save(outdir, name)

#     def _build_embeddings(self) -> torch.tensor:
#         raise NotImplementedError(
#             "Child class needs implementation of _build_embeddings.")

#     def _build_entity_to_id(self, entities) -> dict:
#         entities = sorted(dpi_entities)
#         entity_to_id = {k: k_idx for k_idx, k in enumerate(entities)}
        
#         return entity_to_id

#     def __call__(self, entities: list):
#         if self.embeddings is None:
#             entity_to_id = self._build_entity_to_id(entities)
#             embeddings = self._build_embeddings(entities)
#             metadata = self._config
            
#             self._lookup = PretrainedLookupEmbedding(embeddings=embeddings,
#                                                      entity_to_id=entity_to_id,
#                                                      metadata=metadata)
#         else:
#             raise Error("Embedder already fit")
            

# def build_entity_to_id(entities) -> dict:
#     entities = sorted(dpi_entities)
#     entity_to_id = {k: k_idx for k_idx, k in enumerate(entities)}

#     return entity_to_id
            
            
def NoiseEmbedding(entities, random_seed: int = 42, dim=128, emb_range=(-1, 1)) -> LookupEmbedding:
    entity_to_id = build_entity_to_id(entities)
    
    emb_shape = (len(entities), dim)

    r1 = emb_range[0]
    r2 = emb_range[1]

    g = torch.Generator()
    g.manual_seed(random_seed)

    embeddings = (r1 - r2) * torch.rand(emb_shape, generator=g) + r2
    
    metadata = {
        "random_seed": random_seed,
        "dim": dim,
        "emb_range": emb_range
    }
    
    return LookupEmbedding(embeddings=embeddings,
                           entity_to_id=entity_to_id,
                           metadata=metadata)
            
# class NoiseEmbedding(LookupEmbedding):
#     def __init__(self, lookup_embedding: PretrainedLookupEmbedding = None, random_seed: int = 42, dim=128, emb_range=(-1, 1)):
#         super().__init__(lookup_embedding=lookup_embedding, random_seed=random_seed, dim=dim, emb_range=emb_range)
        
#         self.other_prop = 12

#     def _build_embeddings(self, entities):
        
#         dim = self._config["dim"]
#         seed = self._config["random_seed"]
#         emb_range = self._config["emb_range"]
        
#         emb_shape = (len(entities), dim)

#         r1 = emb_range[0]
#         r2 = emb_range[1]

#         g = torch.Generator()
#         g.manual_seed(seed)

#         embeddings = (r1 - r2) * torch.rand(emb_shape, generator=g) + r2
#         return embeddings


def MorganFingerpintEmbedding(PretrainedLookupEmbedding):
    def __init__(self, embeddings, entity_to_id):
        super().__init__(embeddings, entity_to_id)


In [122]:
# noise_embedder = NoiseEmbedding(random_seed=42, dim=128)

In [123]:
noise_embedder = NoiseEmbedding(dpi_entities, random_seed=42, dim=128)

In [124]:
noise_embedder.embeddings[[10,12,3]]

tensor([[ 7.2355e-01,  2.4987e-01,  9.5468e-02,  5.5649e-01,  7.3862e-01,
         -6.7262e-01, -6.7865e-01,  9.0819e-01, -3.1821e-01, -4.0680e-01,
         -9.4997e-01, -5.7855e-01, -9.1934e-01,  3.2739e-01, -7.0039e-01,
         -8.1340e-01,  9.4433e-01,  8.0271e-01, -2.0241e-01, -5.4604e-01,
          4.9681e-01, -1.1013e-01,  1.4517e-03, -2.5311e-01,  5.3735e-01,
         -5.6404e-01, -6.6501e-01,  6.9386e-01, -9.6874e-03, -2.7746e-03,
         -3.2112e-01, -9.3158e-01, -2.9326e-01,  2.2722e-01, -8.9813e-01,
          3.8053e-01,  2.9037e-01, -6.8275e-02,  7.6151e-01, -1.0882e-01,
          6.7848e-01, -1.0282e-01, -9.5808e-02, -1.3842e-01,  8.4330e-01,
          9.4978e-01, -4.6012e-01, -8.5760e-01,  8.8737e-01, -3.7039e-01,
          7.3612e-01, -6.2691e-02, -9.3036e-01, -7.5862e-01,  7.3114e-01,
         -6.1857e-01, -5.2232e-01,  1.5339e-03, -9.6873e-01,  3.9718e-01,
          2.3272e-01,  5.0541e-01, -1.4390e-01, -2.6479e-01, -7.9680e-02,
          5.6264e-01,  4.5017e-01, -3.

In [125]:
noise_embedder.metadata

{'random_seed': 42, 'dim': 128, 'emb_range': (-1, 1)}

In [126]:
noise_embedder.save("../data/features/", name="noisy-e5")

FileExistsError: [Errno 17] File exists: '../data/features/noisy-e5'

In [127]:
noise_embedder_2 = LookupEmbedding.from_pretrained("../data/features/noisy-e5/")

In [128]:
noise_embedder_2.embeddings[[0,12,3]]

tensor([[-0.7645, -0.8300,  0.2343, -0.9186,  0.2191, -0.2018,  0.4869, -0.5873,
         -0.8815,  0.7336, -0.8692, -0.1872, -0.7388, -0.1354, -0.4822,  0.1412,
         -0.7709, -0.1478,  0.4668, -0.2549,  0.4607,  0.1173,  0.4062, -0.6634,
          0.7894,  0.4610,  0.2824,  0.6013, -0.0944,  0.9877, -0.9031,  0.8495,
         -0.7720, -0.1664,  0.3247, -0.6179, -0.1559, -0.8080, -0.1093,  0.3154,
         -0.2687,  0.2712, -0.4209, -0.8928, -0.5781,  0.4372, -0.5773, -0.1789,
         -0.5078,  0.6095,  0.9899,  0.3864,  0.7670, -0.8205, -0.2880, -0.4142,
         -0.3163,  0.0174, -0.7826,  0.7105, -0.0630,  0.6825, -0.3084,  0.3444,
         -0.3064,  0.2083, -0.8294,  0.5927,  0.5964,  0.5964, -0.8994, -0.3333,
         -0.9623,  0.8253,  0.9919,  0.7824,  0.6727, -0.4050, -0.3581, -0.8309,
          0.5164,  0.6817, -0.5306,  0.4042, -0.6069,  0.2373, -0.5720,  0.7770,
          0.5046, -0.3049, -0.2114,  0.2550, -0.5961, -0.6798,  0.7252,  0.5339,
         -0.9157,  0.3374,  

In [84]:
import numpy as np

if False:
    X = np.array(pos_neg_df_feat.joint_encoding.values.tolist())    
    y = pos_neg_df_feat.label.values
    print(f"Shape X: {X.shape}, y: {y.shape}"
          
    np.save("../data/features/kge-1baon0eg/X.npy", X)
    np.save("../data/features/kge-1baon0eg/y.npy", y)

SyntaxError: invalid syntax (704593400.py, line 8)