In [9]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

In [11]:
# Repository-local data directory; the relative path is resolved against the
# notebook's current working directory.
DATA_DIR = Path("../data")
# Shared workbench folder holding benchmark and molecular data.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
SHARED_DIR = Path("/home/jovyan/workbench-shared-folder/bioblp")

## Load benchmark

In [12]:
dpi_benchmark_path = SHARED_DIR.joinpath('data/benchmarks/dpi_fda.tsv') 

In [13]:
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637



* [DB01079; Tegaserod](https://go.drugbank.com/drugs/DB01079)
Tegaserod is a serotonin-4 (5-HT4) receptor agonist indicated for the treatment of constipation predominant irritable bowel syndrome (IBS-C) specifically in women under the age of 65. There is currently no safety or efficacy data for use of tegaserod in men.

* https://www.uniprot.org/uniprotkb/Q13639/entry

In [14]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())
len(dpi_bm), len(dpi_drugs_unique), len(dpi_prots_unique)

(19161, 2286, 2705)

### Other embedders

In [19]:
dpi_bm.head()

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637
3,DB01069,DPI,P18825
4,DB01186,DPI,P08684


In [20]:
# Unique identifiers from the benchmark's source (drug) and target (protein) columns.
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())

# All benchmark entities in one list: drugs first, then proteins.
dpi_entities = dpi_drugs_unique + dpi_prots_unique

In [21]:
from bioblp.benchmarking.embeddings import LookupEmbedding

In [129]:
import torch
import json
import abc

from dataclasses import dataclass
from pathlib import Path
from typing import Union


@dataclass
class LookupEmbedding():
    """Pretrained entity embeddings plus the bookkeeping needed to look them up.

    An instance can be written to a directory with `save` and restored with
    `from_pretrained`; both use the same three file names.

    Attributes:
        embeddings: tensor holding the embedding vectors.
        entity_to_id: mapping from entity identifier to integer id
            (presumably the row index into `embeddings` -- not enforced here).
        metadata: optional JSON-serialisable dict describing how the
            embeddings were produced; ``None`` when no metadata is available.
    """

    embeddings: torch.Tensor
    entity_to_id: dict
    metadata: Union[dict, None]

    # File layout of a saved embedding directory. These are deliberately
    # un-annotated so that `dataclass` does not treat them as fields.
    _EMBEDDING_FILE = "embeddings.pt"
    _E2ID_FILE = "entity_to_id.tsv"
    _METADATA_FILE = "metadata.json"

    @classmethod
    def from_pretrained(cls, model_path):
        """Load a `LookupEmbedding` from a directory written by `save`.

        Args:
            model_path: directory containing ``embeddings.pt`` and
                ``entity_to_id.tsv``; ``metadata.json`` is optional.

        Returns:
            A new instance; `metadata` is ``None`` if no metadata file exists.

        Raises:
            AssertionError: if a required file is missing.
        """
        model_path = Path(model_path)

        for required_file in (cls._EMBEDDING_FILE, cls._E2ID_FILE):
            assert model_path.joinpath(required_file).exists(), \
                f"Missing required file {required_file} in dir {model_path}"

        embeddings = torch.load(model_path.joinpath(cls._EMBEDDING_FILE))

        try:
            # keep_default_na=False: entity names such as "NA" or "null" are
            # legitimate identifiers and must not be parsed as missing values.
            e2id_df = pd.read_csv(model_path.joinpath(cls._E2ID_FILE), sep="\t",
                                  index_col=None, header=None,
                                  keep_default_na=False)
        except pd.errors.EmptyDataError:
            # `save` writes an empty file when the mapping is empty.
            entity_to_id = {}
        else:
            entity_to_id = dict(zip(e2id_df[0].tolist(), e2id_df[1].tolist()))

        metadata = None
        metadata_path = model_path.joinpath(cls._METADATA_FILE)
        if metadata_path.exists():
            with open(metadata_path, "r") as f:
                metadata = json.load(f)

        return cls(embeddings=embeddings, entity_to_id=entity_to_id, metadata=metadata)

    def save(self, outdir: Union[str, Path], name: str):
        """Write the embeddings, id mapping and metadata to ``outdir/name``.

        Args:
            outdir: parent directory; created (with parents) if needed.
            name: name of the sub-directory that will hold the files.

        Raises:
            FileExistsError: if ``outdir/name`` already exists, so a previous
                export is never silently overwritten.
        """
        outdir = Path(outdir).joinpath(name)
        outdir.mkdir(parents=True, exist_ok=False)

        # Two-column TSV without header: entity, id.
        df = pd.DataFrame(list(self.entity_to_id.items()), columns=["e_", "id_"])
        df.to_csv(outdir.joinpath(self._E2ID_FILE), sep="\t", index=False, header=False)

        torch.save(self.embeddings, outdir.joinpath(self._EMBEDDING_FILE))

        if self.metadata is not None:
            with open(outdir.joinpath(self._METADATA_FILE), "w") as f:
                json.dump(self.metadata, f)
            
            
def create_noise_embeddings(entities, random_seed: int = 42, dim=128, emb_range=(-1, 1),
                            outdir = None, outname = None) -> LookupEmbedding:
    """Create seeded uniform-noise embeddings, one row per unique entity.

    Args:
        entities: iterable of entity identifiers; duplicates are collapsed and
            ids are assigned in sorted order.
        random_seed: seed for the torch generator, so repeated calls with the
            same arguments give the same embeddings.
        dim: embedding dimensionality.
        emb_range: ``(low, high)`` bounds of the uniform noise.
        outdir: if given, the result is also saved under ``outdir/outname``.
        outname: sub-directory name; required when `outdir` is given.

    Returns:
        A `LookupEmbedding` whose metadata records seed, dim and range.

    Raises:
        AssertionError: if `outdir` is given without `outname`.
    """
    entity_to_id = {entity: idx for idx, entity in enumerate(sorted(set(entities)))}

    # Size the matrix from the de-duplicated mapping so every id has exactly
    # one row, even when `entities` contains repeats.
    emb_shape = (len(entity_to_id), dim)

    r1 = emb_range[0]
    r2 = emb_range[1]

    g = torch.Generator()
    g.manual_seed(random_seed)

    # torch.rand draws from [0, 1); this affine map sends 0 -> r2 and 1 -> r1.
    embeddings = (r1 - r2) * torch.rand(emb_shape, generator=g) + r2

    metadata = {
        "random_seed": random_seed,
        "dim": dim,
        "emb_range": emb_range
    }

    lookup = LookupEmbedding(embeddings=embeddings,
                             entity_to_id=entity_to_id,
                             metadata=metadata)

    if outdir is not None:
        assert outname is not None, "need outout name"

        lookup.save(Path(outdir), outname)

    return lookup
            

def create_moltrans_lookup_embedding(data_dir, outdir = None, outname = None) -> LookupEmbedding:
    """Create lookup embedding files from raw Molecular Transformer files.

    Reads three files from `data_dir`:
      * ``drug_ids.txt``         -- one drug id per line,
      * ``drug_smiles.txt``      -- one SMILES string per line, matched to the
                                    ids by line position,
      * ``drug_smiles_full.npz`` -- archive indexed by SMILES string.

    Each drug's array is mean-pooled over its first axis to obtain a single
    vector; the vectors are stacked in sorted drug-id order.

    Args:
        data_dir: directory containing the three input files.
        outdir: if given, the result is also saved under ``outdir/outname``.
        outname: sub-directory name; required when `outdir` is given.

    Returns:
        A `LookupEmbedding` mapping drug id -> row index.

    Raises:
        ValueError: if the id and SMILES files have different lengths.
        AssertionError: if `outdir` is given without `outname`.
    """
    # Local import: the notebook only imports numpy in a later cell.
    import numpy as np

    drug_id_file = "drug_ids.txt"
    drug_smiles_file = "drug_smiles.txt"
    smiles_emb_file = "drug_smiles_full.npz"

    data_dir = Path(data_dir)

    drug_ids = pd.read_csv(data_dir.joinpath(drug_id_file), header=None)[0].tolist()
    drug_smiles = pd.read_csv(data_dir.joinpath(drug_smiles_file), header=None)[0].tolist()

    # The two files are joined purely by position, so a length mismatch means
    # ids and SMILES would be silently paired up wrongly.
    if len(drug_ids) != len(drug_smiles):
        raise ValueError(
            f"{drug_id_file} has {len(drug_ids)} rows but "
            f"{drug_smiles_file} has {len(drug_smiles)} rows")

    id2smiles = dict(zip(drug_ids, drug_smiles))

    # Iterate over unique ids so the embedding rows and the mapping stay aligned.
    drug_ids_sorted = sorted(id2smiles)

    # e2id: entity -> row index (same orientation as the LookupEmbedding field).
    entity_to_id = {drug_id: idx for idx, drug_id in enumerate(drug_ids_sorted)}

    emb_records = []
    with np.load(data_dir.joinpath(smiles_emb_file)) as data:
        for drug_id in drug_ids_sorted:
            emb_tensor = torch.tensor(data[id2smiles[drug_id]])
            # TODO: mean pooling over the first axis -- is this the best way?
            emb_records.append(torch.mean(emb_tensor, dim=0))

    # embeddings
    embeddings = torch.stack(emb_records, dim=0)

    # meta
    metadata = {}

    lookup_embedding = LookupEmbedding(embeddings=embeddings,
                                       entity_to_id=entity_to_id,
                                       metadata=metadata)
    if outdir is not None:
        assert outname is not None, "need outout name"

        lookup_embedding.save(Path(outdir), outname)

    return lookup_embedding


In [122]:
# noise_embedder = NoiseEmbedding(random_seed=42, dim=128)

In [123]:
noise_embedder = create_noise_embeddings(dpi_entities, random_seed=42, dim=128)

In [131]:
noise_embedder.embeddings[[10,12,3]]

tensor([[ 7.2355e-01,  2.4987e-01,  9.5468e-02,  5.5649e-01,  7.3862e-01,
         -6.7262e-01, -6.7865e-01,  9.0819e-01, -3.1821e-01, -4.0680e-01,
         -9.4997e-01, -5.7855e-01, -9.1934e-01,  3.2739e-01, -7.0039e-01,
         -8.1340e-01,  9.4433e-01,  8.0271e-01, -2.0241e-01, -5.4604e-01,
          4.9681e-01, -1.1013e-01,  1.4517e-03, -2.5311e-01,  5.3735e-01,
         -5.6404e-01, -6.6501e-01,  6.9386e-01, -9.6874e-03, -2.7746e-03,
         -3.2112e-01, -9.3158e-01, -2.9326e-01,  2.2722e-01, -8.9813e-01,
          3.8053e-01,  2.9037e-01, -6.8275e-02,  7.6151e-01, -1.0882e-01,
          6.7848e-01, -1.0282e-01, -9.5808e-02, -1.3842e-01,  8.4330e-01,
          9.4978e-01, -4.6012e-01, -8.5760e-01,  8.8737e-01, -3.7039e-01,
          7.3612e-01, -6.2691e-02, -9.3036e-01, -7.5862e-01,  7.3114e-01,
         -6.1857e-01, -5.2232e-01,  1.5339e-03, -9.6873e-01,  3.9718e-01,
          2.3272e-01,  5.0541e-01, -1.4390e-01, -2.6479e-01, -7.9680e-02,
          5.6264e-01,  4.5017e-01, -3.

In [132]:
noise_embedder.metadata

{'random_seed': 42, 'dim': 128, 'emb_range': (-1, 1)}

In [133]:
noise_embedder.save("../data/features/", name="noisy-emb")

In [134]:
noise_embedder_2 = LookupEmbedding.from_pretrained("../data/features/noisy-emb/")

In [136]:
import numpy as np

# Disabled export of the feature matrix and labels to .npy files.
# NOTE(review): `pos_neg_df_feat` is not defined in the visible part of this
# notebook, so this branch cannot simply be re-enabled as-is.
if False:
    X = np.array(pos_neg_df_feat.joint_encoding.values.tolist())    
    y = pos_neg_df_feat.label.values
    print(f"Shape X: {X.shape}, y: {y.shape}")
          
    np.save("../data/features/kge-1baon0eg/X.npy", X)
    np.save("../data/features/kge-1baon0eg/y.npy", y)

### Drug properties

In [141]:
drug_properties_df = pd.read_csv("../data/biokg/biokg.properties.drug.tsv", sep="\t", header=None)

In [146]:
drug_properties_df.head()

Unnamed: 0,0,1,2
0,DB00441,DRUG_INDICATION_ASSOCIATION,C0019829
1,DB01013,DRUG_INDICATION_ASSOCIATION,C0033860
2,DB00997,DRUG_INDICATION_ASSOCIATION,C0005695
3,DB00215,DRUG_INDICATION_ASSOCIATION,C0233477
4,DB00603,DRUG_INDICATION_ASSOCIATION,C0600142


In [153]:
unique_drugs_prop = drug_properties_df[0].unique()

In [150]:
len(all_drugs)

3247

In [144]:
drug_properties_df[1].unique()

array(['DRUG_INDICATION_ASSOCIATION', 'DRUG_ATC_CODE',
       'DRUG_SIDEEFFECT_ASSOCIATION'], dtype=object)

In [145]:
drug_meta_df = pd.read_csv("../data/biokg/biokg.metadata.drug.tsv", sep="\t", header=None)

In [147]:
drug_meta_df.head()

Unnamed: 0,0,1,2
0,DB00006,SUBSTITUENT,1_hydroxy_2_unsubstituted_benzenoid
1,DB00006,SUBSTITUENT,Alpha_peptide
2,DB00006,SUBSTITUENT,Alpha_amino_acid_amide
3,DB00006,SUBSTITUENT,Alpha_amino_acid_or_derivatives
4,DB00006,SUBSTITUENT,Amine


In [155]:
unique_drugs_m = drug_meta_df[0].unique()

In [151]:
len(drug_meta_df[0].unique())

6627

In [272]:
biokg_drugs = list(set(unique_drugs_m).union(set(unique_drugs_prop)))

In [273]:
len(biokg_drugs)

8308

In [165]:
# all_drugs

### Load data from the Molecular Transformer



In [167]:
mol_data_id_df = pd.read_csv(SHARED_DIR.joinpath("data/molecular_data/drug_ids_full.txt"), header=None)



In [185]:
mol_data_id_df

Unnamed: 0,0
0,DB07460
1,DB08869
2,DB11154
3,DB03231
4,DB08873
...,...
10810,DB09295
10811,DB11880
10812,DB03505
10813,DB14736


In [184]:
mol_data_id_df + mol_data_id_df

Unnamed: 0,0
0,DB07460DB07460
1,DB08869DB08869
2,DB11154DB11154
3,DB03231DB03231
4,DB08873DB08873
...,...
10810,DB09295DB09295
10811,DB11880DB11880
10812,DB03505DB03505
10813,DB14736DB14736


In [181]:
struct_embedding_array['CNC(=O)C1=C(NC2=NC(NC3=CC=C(C=C3OC)N3CCOCC3)=NC=C2Cl)C=CC=C1']

AttributeError: 'NoneType' object has no attribute 'open'

In [243]:
# Build the Molecular Transformer lookup embeddings and persist them.
# Paths are derived from SHARED_DIR instead of repeating the absolute prefix.
moltrans_emb = create_moltrans_lookup_embedding(SHARED_DIR.joinpath("data/molecular_data"),
                                                outdir=SHARED_DIR.joinpath("data/processed"),
                                                outname="moltrans-lookup-embeddings")

In [238]:
moltrans_emb.embeddings.shape

torch.Size([10815, 512])

In [260]:
# Drug ids covered by the Molecular Transformer data (path derived from SHARED_DIR).
moltrans_drugs = pd.read_csv(SHARED_DIR.joinpath("data/molecular_data/drug_ids.txt"), header=None)[0].values.tolist()


In [274]:
# Compare drug coverage between BioKG and the Molecular Transformer data.
biokg_drugs_s = set(biokg_drugs)
moltrans_drugs_s = set(moltrans_drugs)

print(f"Num drugs in biokg: {len(biokg_drugs_s)}")
print(f"Num drugs in moltrans: {len(moltrans_drugs_s)}")

# Drugs present in BioKG but absent from the Molecular Transformer ids.
biokg_drugs_diff = biokg_drugs_s - moltrans_drugs_s

print(f"biok - moltrans = {len(biokg_drugs_diff)}")

# Drugs present in the Molecular Transformer ids but absent from BioKG.
moltrans_drugs_diff = moltrans_drugs_s - biokg_drugs_s

print(f"moltrans - biok   = {len(moltrans_drugs_diff)}")

Num drugs in biokg: 8308
Num drugs in moltrans: 10815
biok - moltrans = 370
moltrans - biok   = 2877


In [278]:
sorted(biokg_drugs)[:10]

['DB00001',
 'DB00002',
 'DB00003',
 'DB00004',
 'DB00005',
 'DB00006',
 'DB00007',
 'DB00008',
 'DB00009',
 'DB00010']

In [279]:
sorted(moltrans_drugs)[:10]

['DB00006',
 'DB00007',
 'DB00014',
 'DB00027',
 'DB00035',
 'DB00050',
 'DB00067',
 'DB00080',
 'DB00091',
 'DB00093']