In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
#import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

In [4]:
DATA_DIR = Path("../data")

## Load benchmark

In [11]:
dpi_benchmark_path = DATA_DIR.joinpath('benchmarks/dpi_fda.tsv')

In [12]:
dpi_bm = pd.read_csv(dpi_benchmark_path, sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET])
dpi_bm.head(3)

Unnamed: 0,src,edg,tgt
0,DB01079,DPI,Q13639
1,DB00114,DPI,P20711
2,DB01158,DPI,P13637



* [DB01079; Tegaserod](https://go.drugbank.com/drugs/DB01079)
Tegaserod is a serotonin-4 (5-HT4) receptor agonist indicated for the treatment of constipation predominant irritable bowel syndrome (IBS-C) specifically in women under the age of 65. There is currently no safety or efficacy data for use of tegaserol in men.

* https://www.uniprot.org/uniprotkb/Q13639/entry

In [13]:
dpi_drugs_unique = list(dpi_bm.src.unique())
dpi_prots_unique = list(dpi_bm.tgt.unique())
len(dpi_bm), len(dpi_drugs_unique), len(dpi_prots_unique)

(19161, 2286, 2705)

## Prepare Data for ML

In [14]:
from bioblp.benchmarking.featurize import load_model_and_entity_to_id_maps, ENTITY_TO_ID_MAP, RELATION_TO_ID_MAP

In [15]:
model_dir = Path('/home/jovyan/BioBLP/models/1baon0eg')

kge_artifacts = load_model_and_entity_to_id_maps(model_dir)
entity_to_id_map = kge_artifacts.get(ENTITY_TO_ID_MAP)
relation_to_id_map = kge_artifacts.get(RELATION_TO_ID_MAP)

18:19:07 - INFO - bioblp.benchmarking.featurize - Loading trained model from /home/jovyan/BioBLP/models/1baon0eg


### Generate Negative samples

In [24]:
from bioblp.benchmarking.preprocess import prepare_dpi_samples

In [25]:
pos_neg_df = prepare_dpi_samples(dpi_bm[:4], entity_to_id_map, relation_to_id_map)

In [26]:
pos_neg_df.head(10)

Unnamed: 0,src,edg,tgt,label
0,8879,5,44406,1
1,7928,5,30725,1
2,8957,5,29310,1
3,8869,5,30330,1
4,22444,5,30725,0
5,12889,5,30330,0
6,8879,5,5562,0
7,8957,5,18343,0


### Featurize by obtaining joint entity pair encoding

In [29]:
from bioblp.benchmarking.featurize import generate_dp_pair_joint_encoding
from bioblp.benchmarking.featurize import concatenate, average


In [30]:
# pass on to KGEM encoder to featurize each entity pair
generate_dp_pair_joint_encoding(true_pos_df,
                                model=kge_artifacts["model"],
                                transform_fn=concatenate)
                            

18:22:21 - INFO - bioblp.benchmarking.featurize - Applying transformation function: f<function concatenate at 0x7f51d7aecf70>, to retrieve joint encoding for entity pair


Unnamed: 0,index,src,edg,tgt,label,joint_encoding
0,0,8879,5,44406,1,"[0.24256302416324615, 0.4320533871650696, -0.4..."
1,1,7928,5,30725,1,"[0.34729743003845215, -0.003592705586925149, -..."
2,2,8957,5,29310,1,"[-1.564800500869751, 0.018311385065317154, -0...."
3,3,8869,5,30330,1,"[0.03526313602924347, -2.60638427734375, -1.02..."
4,0,8095,5,30725,0,"[-0.19735385477542877, 0.06663068383932114, -0..."
5,1,40463,5,30330,0,"[-0.022010549902915955, 1.4183855056762695, 1...."
6,2,8879,5,15044,0,"[0.24256302416324615, 0.4320533871650696, -0.4..."
7,3,8957,5,33535,0,"[-1.564800500869751, 0.018311385065317154, -0...."
