In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
import pandas as pd 
from pathlib import Path
from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_DIR = Path("../data")
SHARED_DIR = Path("/home/jovyan/workbench-shared-folder/bioblp")

In [4]:
!ls {SHARED_DIR}/models

1baon0eg  iwy9z7m9


## Load benchmark

#### Set paths

In [59]:
DDR = 'bm_ddr'
BIOKG = 'bm_biokg'
BIOKG_TRANSD = 'bm_biokg_transductive'
ALL = 'all'
TRAIN  = 'train'
TEST = 'test'
VALID = 'valid'
data_splits = [ALL, TRAIN, TEST, VALID]
BM_TYPES = [BIOKG, DDR, BIOKG_TRANSD]

In [60]:
dpi_biokg_bm_path = SHARED_DIR.joinpath('data/benchmarks/dpi_fda.tsv') 
dpi_biokg_transductive_bm_path = SHARED_DIR.joinpath('data/benchmarks/transductive/dpi_fda.tsv') 
dpi_ddr_bm_path = SHARED_DIR.joinpath('data/benchmarks/ddr_dpi_fda.txt') 
BIOKG_GRAPH_PATHS = {ALL: DATA_DIR.joinpath('raw/biokg.links.tsv'),
                     TRAIN: DATA_DIR.joinpath('raw/biokg_bm_splits/biokg.links-train.csv'),
                     TEST: DATA_DIR.joinpath('raw/biokg_bm_splits/biokg.links-test.csv'),
                     VALID: DATA_DIR.joinpath('raw/biokg_bm_splits/biokg.links-valid.csv'),
}
FDA_DPI_PATHS = {BIOKG: dpi_biokg_bm_path,
                 DDR: dpi_ddr_bm_path,
                 BIOKG_TRANSD: dpi_biokg_transductive_bm_path }

In [61]:
def describe_dpi_dataset(df):
    dpi_drugs_unique = list(df[COL_SOURCE].unique())
    dpi_prots_unique = list(df[COL_TARGET].unique())
    print(f'# Unique dpi instances (TP): {len(df)}')
    print(f'# Unique Drugs: {len(dpi_drugs_unique)}')
    print(f'# Unique Proteins: {len(dpi_prots_unique)}\n')

#### Load benchmarks and graphs (as DFs)

Let's load them graphs

In [62]:
col_headers = [COL_SOURCE, COL_EDGE, COL_TARGET]
biokg_graphs = {}

for split in data_splits:
    biokg_graphs[split] = pd.read_csv(BIOKG_GRAPH_PATHS[split], sep="\t", names=col_headers)

In [63]:
biokg_graphs_dpi = {}
for split in data_splits:
    biokg_graphs_dpi[split] = biokg_graphs[split][biokg_graphs[split][COL_EDGE]=='DPI']
    biokg_graphs_dpi[split]['key'] = list(zip(biokg_graphs_dpi[split].src, biokg_graphs_dpi[split].tgt))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biokg_graphs_dpi[split]['key'] = list(zip(biokg_graphs_dpi[split].src, biokg_graphs_dpi[split].tgt))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biokg_graphs_dpi[split]['key'] = list(zip(biokg_graphs_dpi[split].src, biokg_graphs_dpi[split].tgt))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bio

In [64]:
for split in data_splits:
    print(f'DPIs in split `{split}` of biokg graph')
    describe_dpi_dataset(biokg_graphs_dpi[split])


DPIs in split `all` of biokg graph
# Unique dpi instances (TP): 28061
# Unique Drugs: 6842
# Unique Proteins: 3672

DPIs in split `train` of biokg graph
# Unique dpi instances (TP): 7860
# Unique Drugs: 4483
# Unique Proteins: 2085

DPIs in split `test` of biokg graph
# Unique dpi instances (TP): 527
# Unique Drugs: 381
# Unique Proteins: 378

DPIs in split `valid` of biokg graph
# Unique dpi instances (TP): 513
# Unique Drugs: 370
# Unique Proteins: 366



Now, there are 2 FDA DPI benchmark datasets:
1. Biokg's FDA_DPI benchmark dataset
2. DDR's version of FDA_DPI: https://academic.oup.com/bioinformatics/article/34/7/1164/4657065

In [65]:
# biokg FDA_DPI benchmark dataset
def load_dpi_benchmark(bm_types):
    DPI_BENCHMARKS = {}
    for bm_type in bm_types:
        if bm_type == DDR:
            df = pd.read_csv(FDA_DPI_PATHS[bm_type], sep='\t', names=[COL_SOURCE, COL_TARGET]).drop_duplicates()
            df[COL_EDGE] = 'DPI'
        else:
            df = pd.read_csv(FDA_DPI_PATHS[bm_type], sep='\t', names=[COL_SOURCE, COL_EDGE, COL_TARGET]).drop_duplicates()
        df['key'] = list(zip(df.src, df.tgt))
        DPI_BENCHMARKS[bm_type] = df
    return DPI_BENCHMARKS

dpi_benchmarks = load_dpi_benchmark(BM_TYPES)
dpi_benchmarks[BIOKG].head(3)

Unnamed: 0,src,edg,tgt,key
0,DB01079,DPI,Q13639,"(DB01079, Q13639)"
1,DB00114,DPI,P20711,"(DB00114, P20711)"
2,DB01158,DPI,P13637,"(DB01158, P13637)"


In [66]:
for bm_type in BM_TYPES:
    describe_dpi_dataset(dpi_benchmarks[bm_type])

# Unique dpi instances (TP): 19161
# Unique Drugs: 2286
# Unique Proteins: 2705

# Unique dpi instances (TP): 9881
# Unique Drugs: 1482
# Unique Proteins: 1408

# Unique dpi instances (TP): 18678
# Unique Drugs: 2167
# Unique Proteins: 2573



sanity check our biokg dpi_fda set

#### What's missing and what's not
Check coverage and distribution of benchmark DTI triples, and entities, across the biokg graphs, both pre and post benchmark separation

In [67]:
dpi_drugs = {BIOKG: set(dpi_benchmarks[BIOKG].src.values),
             BIOKG_TRANSD: set(dpi_benchmarks[BIOKG_TRANSD].src.values),
            DDR: set(dpi_benchmarks[DDR].src.values),
            f'{TRAIN}_dpi': set(biokg_graphs_dpi[TRAIN].src.values),
            f'{ALL}_dpi': set(biokg_graphs_dpi[ALL].src.values),
}

dpi_prots = {BIOKG: set(dpi_benchmarks[BIOKG].tgt.values),
             BIOKG_TRANSD: set(dpi_benchmarks[BIOKG_TRANSD].tgt.values),
            DDR: set(dpi_benchmarks[DDR].tgt.values),
            f'{TRAIN}_dpi': set(biokg_graphs_dpi[TRAIN].tgt.values),
            f'{ALL}_dpi': set(biokg_graphs_dpi[ALL].tgt.values),
}

all_ents = {TRAIN: set(biokg_graphs[TRAIN].src.unique()).union(set(biokg_graphs[TRAIN].tgt.unique())),
            ALL: set(biokg_graphs[ALL].src.unique()).union(set(biokg_graphs[ALL].tgt.unique())),
}

In [68]:
[(k,len(v)) for k,v in dpi_drugs.items()]

[('bm_biokg', 2286),
 ('bm_biokg_transductive', 2167),
 ('bm_ddr', 1482),
 ('train_dpi', 4483),
 ('all_dpi', 6842)]

In [70]:
[(k,len(v)) for k,v in all_ents.items()]

[('train', 106047), ('all', 106337)]

1. Assert that biokg_full_graph drug/prot entities form a superset of entities in train dpis, and bm dpis

In [73]:
def assert_tallies(given_slice = BIOKG):
    print(f'From Amongst entities from {given_slice} benchmark:')
    for data_slice in dpi_prots.keys():
        if data_slice == given_slice:
            next
        else:
            print(f'# Unseen drugs in {data_slice}: {len(dpi_drugs[given_slice].difference(dpi_drugs[data_slice]))}')
            print(f'# Unseen proteins in {data_slice}: {len(dpi_prots[given_slice].difference(dpi_prots[data_slice]))}')
    for graph_type, unique_ent_set in all_ents.items():
        print(f'# Unseen drugs in {graph_type}: {len(dpi_drugs[given_slice].difference(unique_ent_set))}')
        print(f'# Unseen proteins in {graph_type}: {len(dpi_prots[given_slice].difference(unique_ent_set))}')



In [74]:
assert_tallies(given_slice=DDR)

From Amongst entities from bm_ddr benchmark:
# Unseen drugs in bm_biokg: 100
# Unseen proteins in bm_biokg: 67
# Unseen drugs in bm_biokg_transductive: 113
# Unseen proteins in bm_biokg_transductive: 81
# Unseen drugs in train_dpi: 1394
# Unseen proteins in train_dpi: 565
# Unseen drugs in all_dpi: 9
# Unseen proteins in all_dpi: 2
# Unseen drugs in train: 16
# Unseen proteins in train: 13
# Unseen drugs in all: 3
# Unseen proteins in all: 1


In [75]:
assert_tallies(given_slice=BIOKG)

From Amongst entities from bm_biokg benchmark:
# Unseen drugs in bm_biokg_transductive: 119
# Unseen proteins in bm_biokg_transductive: 132
# Unseen drugs in bm_ddr: 904
# Unseen proteins in bm_ddr: 1364
# Unseen drugs in train_dpi: 2286
# Unseen proteins in train_dpi: 1520
# Unseen drugs in all_dpi: 0
# Unseen proteins in all_dpi: 0
# Unseen drugs in train: 110
# Unseen proteins in train: 96
# Unseen drugs in all: 0
# Unseen proteins in all: 0


In [76]:
assert_tallies(given_slice=BIOKG_TRANSD)

From Amongst entities from bm_biokg_transductive benchmark:
# Unseen drugs in bm_biokg: 0
# Unseen proteins in bm_biokg: 0
# Unseen drugs in bm_ddr: 798
# Unseen proteins in bm_ddr: 1246
# Unseen drugs in train_dpi: 2167
# Unseen proteins in train_dpi: 1401
# Unseen drugs in all_dpi: 0
# Unseen proteins in all_dpi: 0
# Unseen drugs in train: 0
# Unseen proteins in train: 0
# Unseen drugs in all: 0
# Unseen proteins in all: 0


In [78]:
inductive_biokg_dpi_drugs = list(dpi_drugs[BIOKG].difference(all_ents[TRAIN]))
inductive_biokg_dpi_prots = list(dpi_prots[BIOKG].difference(all_ents[TRAIN]))
len(inductive_biokg_dpi_drugs), len(inductive_biokg_dpi_prots)

(110, 96)

In [79]:
inductive_dpi_drugs_path = DATA_DIR.joinpath('benchmarks/inductive_fda_dpi_drugs.json')
inductive_dpi_prots_path = DATA_DIR.joinpath('benchmarks/inductive_fda_dpi_prots.json')

with open(inductive_dpi_drugs_path, 'w+') as f:
    json.dump(inductive_biokg_dpi_drugs, f)
    
with open(inductive_dpi_prots_path, 'w+') as f:
    json.dump(inductive_biokg_dpi_prots, f)

Let's check the overlap/spread of the benchmark triples across the graphs

In [80]:
biokg_graphs[ALL]['key'] = list(zip(biokg_graphs[ALL].src.values, biokg_graphs[ALL].tgt.values))
bm_biokg_dpi_pairs = set(dpi_benchmarks[BIOKG]['key'].values)
bm_biokg_transd_dpi_pairs = set(dpi_benchmarks[BIOKG_TRANSD]['key'].values)
bm_ddr_dpi_pairs = set(dpi_benchmarks[DDR]['key'].values)
biokg_all_dpi_pairs = set(biokg_graphs_dpi[ALL]['key'].values)
biokg_train_dpi_pairs = set(biokg_graphs_dpi[TRAIN]['key'].values)
len(bm_biokg_dpi_pairs), len(bm_ddr_dpi_pairs), len(biokg_all_dpi_pairs)

(19161, 9881, 28061)

In [143]:
num_ddr_bm_pairs = len(bm_ddr_dpi_pairs)
unseen_ddr_in_biokg_bm = len(bm_ddr_dpi_pairs.difference(bm_biokg_dpi_pairs))
unseen_ddr_in_train_set = len(bm_ddr_dpi_pairs.difference(biokg_train_dpi_pairs))
unseen_ddr_in_all_biokg = len(bm_ddr_dpi_pairs.difference(biokg_all_dpi_pairs))

In [185]:
print(f'# DPI triples in DDR benchmark: {len(bm_ddr_dpi_pairs)}')
print('Proportion of DDR DPI triples unseen in biokg bm DPI set: {} Triples OR {:.2f}%'.format(unseen_ddr_in_biokg_bm, unseen_ddr_in_biokg_bm/num_ddr_bm_pairs*100))
print('Proportion of DDR DPI triples unseen in train biokg: {} Triples OR {:.2f}%'.format(unseen_ddr_in_train_set, unseen_ddr_in_train_set/num_ddr_bm_pairs*100))
print('Proportion of DDR DPI pairs unseen in superset biokg: {} Triples OR {:.2f}%'.format(unseen_ddr_in_all_biokg, unseen_ddr_in_all_biokg/num_ddr_bm_pairs*100))

# DPI triples in DDR benchmark: 9881
Proportion of DDR DPI triples unseen in biokg bm DPI set: 1199 Triples OR 12.13%
Proportion of DDR DPI triples unseen in train biokg: 9523 Triples OR 96.38%
Proportion of DDR DPI pairs unseen in superset biokg: 751 Triples OR 7.60%


**From the above we see that DDR DPI benchmark is NOT a complete subset of BIOKG DPI benchmark**

In [81]:
num_biokg_bm_pairs = len(bm_biokg_dpi_pairs)
unseen_biokg_bm_in_ddr_bm = len(bm_biokg_dpi_pairs.difference(bm_ddr_dpi_pairs))
unseen_biokg_bm_in_train_set = len(bm_biokg_dpi_pairs.difference(biokg_train_dpi_pairs))
unseen_biokg_bm_in_all_biokg = len(bm_biokg_dpi_pairs.difference(biokg_all_dpi_pairs))

In [87]:
print(f'# DPI triples in Biokg DPI benchmark: {num_biokg_bm_pairs}')
print('Proportion of Biokg Benchmark DPI pairs unseen in DDR bm DPI set: {} Triples OR {:.2f}%'.format(unseen_biokg_bm_in_ddr_bm, unseen_biokg_bm_in_ddr_bm/num_biokg_bm_pairs*100))
print('Proportion of Biokg Benchmark DPI pairs unseen in train biokg: {} Triples OR {:.2f}%'.format(unseen_biokg_bm_in_train_set, unseen_biokg_bm_in_train_set/num_biokg_bm_pairs*100))
print('Proportion of Biokg Benchmark DPI pairs unseen in superset biokg: {} Triples OR {:.2f}%'.format(unseen_biokg_bm_in_all_biokg, unseen_biokg_bm_in_all_biokg/num_biokg_bm_pairs*100))


# DPI triples in Biokg DPI benchmark: 19161
Proportion of Biokg Benchmark DPI pairs unseen in DDR bm DPI set: 10479 Triples OR 54.69%
Proportion of Biokg Benchmark DPI pairs unseen in train biokg: 19161 Triples OR 100.00%
Proportion of Biokg Benchmark DPI pairs unseen in superset biokg: 0 Triples OR 0.00%


`Note`: We want 100% of benchmark test triples to be unseen by train set (i.e., null intersection). However, we commonly want all the test entities to be seen at the time if training. This is so that trained models such as Complex, RotatE, etc can learn embeddings for these entties without which they would not be able to predict links concering these entities at test time. This is `transductive` setting. 


However, it would be cool to be able to test all trained Link prediction models on their ability to predict links on new 'unseeen-during-training' entities. This is the indictive setting. From an above analysis we see that not all drugs/proteins from the test set (dpi benchmarks) are present in the training sets. These can form an inductive test set.

### Tranductive DPI benchmark 
(`Transductive to BioKG-sans-benchmarks`)

In [None]:
num_biokg_transd_bm_pairs = len(bm_biokg_transd_dpi_pairs)
unseen_biokg_transd_bm_in_ddr_bm = len(bm_biokg_transd_dpi_pairs.difference(bm_ddr_dpi_pairs))
unseen_biokg_transd_bm_in_train_set = len(bm_biokg_transd_dpi_pairs.difference(biokg_train_dpi_pairs))
unseen_biokg_transd_bm_in_all_biokg = len(bm_biokg_transd_dpi_pairs.difference(biokg_all_dpi_pairs))

In [86]:
print(f'# DPI triples in biokg transductive DPI benchmark: {num_biokg_transd_bm_pairs}')
print('Proportion of Biokg-Transductive Benchmark DPI pairs unseen in DDR bm DPI set: {} Triples OR {:.2f}%'.format(unseen_biokg_transd_bm_in_ddr_bm, unseen_biokg_transd_bm_in_ddr_bm/num_biokg_transd_bm_pairs*100))
print('Proportion of Biokg-Transductive Benchmark DPI pairs unseen in train biokg: {} Triples OR {:.2f}%'.format(unseen_biokg_transd_bm_in_train_set, unseen_biokg_transd_bm_in_train_set/num_biokg_transd_bm_pairs*100))
print('Proportion of Biokg-Transductive Benchmark DPI pairs unseen in superset biokg: {} Triples OR {:.2f}%'.format(unseen_biokg_transd_bm_in_all_biokg, unseen_biokg_transd_bm_in_all_biokg/num_biokg_transd_bm_pairs*100))


# DPI triples in biokg transductive DPI benchmark: 18678
Proportion of Biokg-Transductive Benchmark DPI pairs unseen in DDR bm DPI set: 10031 Triples OR 53.70%
Proportion of Biokg-Transductive Benchmark DPI pairs unseen in train biokg: 18678 Triples OR 100.00%
Proportion of Biokg-Transductive Benchmark DPI pairs unseen in superset biokg: 0 Triples OR 0.00%


### What about within the other benchmark datasets?


In [187]:
benchmark_path =SHARED_DIR.joinpath('data/benchmarks/')
benchmark_files = [#'dpi_fda.tsv',
                   'dep_fda_exp.tsv',
                   'ddi_efficacy.tsv',
                   'ddi_minerals.tsv',
                   'phosphorylation.tsv']

def load_benchmark_file(path):
    return pd.read_csv(path, sep='\t', header=None, usecols=[0, 1, 2], names=['src', 'edg', 'tgt'])

other_benchmarks = pd.concat([load_benchmark_file(benchmark_path.joinpath(file)) for file in benchmark_files])
print(f'There are {len(other_benchmarks):,} pairs in the merged benchmarks.')
other_benchmarks.head()

There are 1,129,439 pairs in the merged benchmarks.


Unnamed: 0,src,edg,tgt
0,DB00977,inc_expr,Q8R2H1
1,DB00996,inc_expr,Q5R4C1
2,DB01109,inc_expr,P25106
3,DB00499,inc_expr,Q7JZM8
4,DB13323,inc_expr,Q9R1R0


In [189]:
other_benchmarks.edg.unique()

array(['inc_expr', 'dec_expr', 'decrease_therapeutic_efficacy',
       'increase_therapeutic_efficacy', 'increase_hyperkalemia',
       'increase_hypoglycemia', 'increase_hyperglycemia',
       'increase_hypokalemia', 'decrease_hypoglycemia',
       'increase_hyponatremia', 'increase_hypercalcemia',
       'increase_hypocalcemia', 'phosphorylates'], dtype=object)

In [190]:
def combine_and_sort_entities(df):
    """Given a dataset containing pairs in the columns `left` and `right`,
    creates a new column called `combined` containing the pairs as a
    list (in string format) with the pairs sorted in lexicographical order."""
    df['combined'] = df.apply(lambda x: str(sorted([x['src'], x['tgt']])), axis=1)
    return df

combine_and_sort_entities(other_benchmarks)
other_benchmarks.head()

Unnamed: 0,src,edg,tgt,combined
0,DB00977,inc_expr,Q8R2H1,"['DB00977', 'Q8R2H1']"
1,DB00996,inc_expr,Q5R4C1,"['DB00996', 'Q5R4C1']"
2,DB01109,inc_expr,P25106,"['DB01109', 'P25106']"
3,DB00499,inc_expr,Q7JZM8,"['DB00499', 'Q7JZM8']"
4,DB13323,inc_expr,Q9R1R0,"['DB13323', 'Q9R1R0']"


In [180]:
combine_and_sort_entities(dpi_benchmarks[DDR])
dpi_benchmarks[DDR].head()

Unnamed: 0,src,tgt,edg,key,combined
0,DB00672,Q16348,DPI,"(DB00672, Q16348)","['DB00672', 'Q16348']"
1,DB00116,P13995,DPI,"(DB00116, P13995)","['DB00116', 'P13995']"
2,DB06663,P30872,DPI,"(DB06663, P30872)","['DB06663', 'P30872']"
3,DB06663,P30874,DPI,"(DB06663, P30874)","['DB06663', 'P30874']"
4,DB00586,O43526,DPI,"(DB00586, O43526)","['DB00586', 'O43526']"


can we find some of these DPI Pairs in the other 4 benchmarks?

In [191]:
merged = pd.merge(other_benchmarks, dpi_benchmarks[DDR], how='right', on=["combined"], indicator=True)

In [182]:
merged.head()

Unnamed: 0,src_x,edg_x,tgt_x,combined,src_y,tgt_y,edg_y,key,_merge
0,,,,"['DB00672', 'Q16348']",DB00672,Q16348,DPI,"(DB00672, Q16348)",right_only
1,,,,"['DB00116', 'P13995']",DB00116,P13995,DPI,"(DB00116, P13995)",right_only
2,,,,"['DB06663', 'P30872']",DB06663,P30872,DPI,"(DB06663, P30872)",right_only
3,,,,"['DB06663', 'P30874']",DB06663,P30874,DPI,"(DB06663, P30874)",right_only
4,,,,"['DB00586', 'O43526']",DB00586,O43526,DPI,"(DB00586, O43526)",right_only
