In [None]:
# 1. Determine whether the "treats" and drug-disease relationships were removed when preparing the topological embeddings.
# 2. Determine whether the node-type subgraphs created disconnected components (non-connected subgraphs) in the graph.
# 3. Identify the ground-truth pairs not retained by the ROBOKOP KG and find out why most of them were dropped.



In [2]:
import pandas as pd
import gcsfs

In [3]:
# Location of the ground_truth_pairs dataset under the run's
# modelling/ground_truth folder (run id taken from the path: main-0b4f8ab2).
gt_retained_uri = "gs://mtrx-us-central1-hub-dev-storage/kedro/data/tests/base_robo_num_walks_50_walk_length_30/runs/main-0b4f8ab2/datasets/modelling/ground_truth/translator/v2.10.0_validated/ground_truth_pairs"

# Load it as a DataFrame and show (rows, columns) as the cell output.
df = pd.read_parquet(path=gt_retained_uri)
df.shape

(46065, 13)

In [4]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,original_object,original_subject,indication,contraindication,y,drug|disease,disease,drug,predicate,source,subject_normalization_success,target,object_normalization_success
0,MONDO:0002406,CHEBI:31199,True,False,1,CHEBI:31199|MONDO:0002406,MONDO:0002406,CHEBI:31199,clinical_trails,CHEBI:31199,True,MONDO:0002406,True
1,MONDO:0002406,CHEBI:5108,True,False,1,CHEBI:5108|MONDO:0002406,MONDO:0002406,CHEBI:5108,clinical_trails,CHEBI:5108,True,MONDO:0002406,True
2,MONDO:0002406,CHEBI:2948,True,False,1,CHEBI:2948|MONDO:0002406,MONDO:0002406,CHEBI:2948,clinical_trails,CHEBI:2948,True,MONDO:0002406,True
3,MONDO:0002406,CHEBI:422,True,False,1,CHEBI:422|MONDO:0002406,MONDO:0002406,CHEBI:422,clinical_trails,CHEBI:422,True,MONDO:0002406,True
4,MONDO:0002406,CHEBI:31414,True,False,1,CHEBI:31414|MONDO:0002406,MONDO:0002406,CHEBI:31414,clinical_trails,CHEBI:31414,True,MONDO:0002406,True


In [5]:
# Save a local tab-separated copy of `df`; the index is written as the first
# column because `index` is left at its default.
gt_retained_tsv = "/Users/jchung/Documents/RENCI/everycure/matrix/data/base_robo_50_30/gt_retained.tsv"
df.to_csv(gt_retained_tsv, sep="\t")

In [6]:
# Count how many rows have each value of the subject normalization flag.
df.loc[:, "subject_normalization_success"].value_counts()

subject_normalization_success
True    46065
Name: count, dtype: int64

In [None]:
# List the entries stored under the ingested ground-truth location.
fs = gcsfs.GCSFileSystem()
files = fs.ls('mtrx-us-central1-hub-dev-storage/kedro/data/tests/base_robo_num_walks_50_walk_length_30/datasets/ingestion/ground_truth/translator/v2.10.0_validated/gt_pairs.tsv')

# Keep only the entries whose name ends in ".csv"; they are parsed as
# tab-separated text below.
# NOTE(review): this comment used to say ".tsv" while the filter matches ".csv".
# Presumably `gt_pairs.tsv` is a directory of part files carrying a .csv
# suffix -- confirm against the bucket before changing the filter.
tsv_files = ['gs://' + f for f in files if f.endswith('.csv')]

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the filter matches nothing.
if not tsv_files:
    raise ValueError(
        f"No '.csv' entries found among the {len(files)} listed object(s): {files}"
    )

# Read every matching file and stack them into a single frame with a fresh index.
gt_ori = pd.concat([pd.read_csv(f, sep='\t') for f in tsv_files], ignore_index=True)

In [8]:
gt_ori.head()

Unnamed: 0,source,target,indication,contraindication,y,drug|disease
0,UMLS:C0450442,MONDO:0005429,True,False,1,UMLS:C0450442|MONDO:0005429
1,CHEBI:8711,MONDO:0005429,True,False,1,CHEBI:8711|MONDO:0005429
2,CHEBI:59826,HP:0100608,True,False,1,CHEBI:59826|HP:0100608
3,UMLS:C0243192,MONDO:0000088,True,False,1,UMLS:C0243192|MONDO:0000088
4,PUBCHEM.COMPOUND:36523,MONDO:0000088,True,False,1,PUBCHEM.COMPOUND:36523|MONDO:0000088


In [9]:
# Flag each original pair whose "drug|disease" key also occurs in `df`.
retained_keys = df["drug|disease"]
gt_ori["in_ROBOKOP"] = gt_ori["drug|disease"].isin(retained_keys)

In [10]:
# Show the original pairs whose key was not found in `df`.
gt_ori.loc[~gt_ori["in_ROBOKOP"]]

Unnamed: 0,source,target,indication,contraindication,y,drug|disease,in_ROBOKOP
0,UMLS:C0450442,MONDO:0005429,True,False,1,UMLS:C0450442|MONDO:0005429,False
2,CHEBI:59826,HP:0100608,True,False,1,CHEBI:59826|HP:0100608,False
3,UMLS:C0243192,MONDO:0000088,True,False,1,UMLS:C0243192|MONDO:0000088,False
5,MESH:D006062,MONDO:0000088,True,False,1,MESH:D006062|MONDO:0000088,False
9,CHEBI:46345,UMLS:C1271404,True,False,1,CHEBI:46345|UMLS:C1271404,False
...,...,...,...,...,...,...,...
68964,CHEBI:4027,MONDO:0008383,False,True,0,CHEBI:4027|MONDO:0008383,False
68975,CHEBI:47381,MONDO:0004795,False,True,0,CHEBI:47381|MONDO:0004795,False
68979,PUBCHEM.COMPOUND:60852,HP:0012531,False,True,0,PUBCHEM.COMPOUND:60852|HP:0012531,False
68991,PUBCHEM.COMPOUND:5460033,HP:0012531,False,True,0,PUBCHEM.COMPOUND:5460033|HP:0012531,False


In [11]:
# Count pairs per label `y` (rows) and retention flag `in_ROBOKOP` (columns).
table = gt_ori.pivot_table(index="y", columns="in_ROBOKOP", aggfunc="size", fill_value=0)
print(table)

# Turn each row of counts into percentages of that row's total.
row_totals = table.sum(axis=1)
percentage_table = table.div(row_totals, axis=0) * 100
print(percentage_table)

in_ROBOKOP  False  True 
y                       
0           11462  28977
1           11474  17088
in_ROBOKOP      False      True 
y                               
0           28.343925  71.656075
1           40.172257  59.827743


In [25]:
# Write `gt_ori` as it currently stands to a local tab-separated file; the
# index is written as the first column because `index` is left at its default.
gt_ori_tsv = "/Users/jchung/Documents/RENCI/everycure/matrix/data/base_robo_50_30/ground_truth_rtxv2.10.0_validated_inROBOKOP.tsv"
gt_ori.to_csv(gt_ori_tsv, sep="\t")

In [7]:
# Write `df` (index included) to a local tab-separated file.
# NOTE(review): the file name mirrors the ingestion `gt_pairs.tsv` location,
# yet the frame written is `df` (loaded from the modelling ground_truth_pairs
# dataset), not `gt_ori`. Confirm which frame was intended -- a later cell
# re-reads this file and filters it against `df["drug|disease"]`.
gt_pairs_local_tsv = "/Users/jchung/Documents/RENCI/everycure/matrix/data/kedro_data_tests_base_robo_num_walks_50_walk_length_30_datasets_ingestion_ground_truth_translator_v2.10.0_validated_gt_pairs.tsv"
df.to_csv(gt_pairs_local_tsv, sep="\t")

In [14]:
# Input: local tp_pairs file (presumably the true-positive pairs, going by the
# file name). Output: the subset whose key also occurs in `df`.
tp_pairs_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/data_01_RAW_ground_truth_kg2_v2.10.0_validated_tp_pairs.txt"
tp_retained_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/data_01_RAW_ground_truth_kg2_v2.10.0_validated_tp_pairs_retained.txt"

tp = pd.read_csv(tp_pairs_path, sep="\t")

# Build a "source|target" key so rows can be matched against df["drug|disease"].
tp_source = tp["source"].astype(str)
tp_target = tp["target"].astype(str)
tp["drug_disease"] = tp_source + "|" + tp_target
tp["in_ROBOKOP"] = tp["drug_disease"].isin(df["drug|disease"])

# Export only the source/target columns of the matched rows, without the index.
tp_retained = tp.loc[tp["in_ROBOKOP"], ["source", "target"]]
tp_retained.to_csv(tp_retained_path, sep="\t", index=False, header=True)

In [None]:
# Display `tp` as the cell output for a visual check.
tp

In [15]:
# Input: local tn_pairs file (presumably the true-negative pairs, going by the
# file name). Output: the subset whose key also occurs in `df`.
tn_pairs_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/data_01_RAW_ground_truth_kg2_v2.10.0_validated_tn_pairs.txt"
tn_retained_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/data_01_RAW_ground_truth_kg2_v2.10.0_validated_tn_pairs_retained.txt"

tn = pd.read_csv(tn_pairs_path, sep="\t")

# Build a "source|target" key so rows can be matched against df["drug|disease"].
tn_source = tn["source"].astype(str)
tn_target = tn["target"].astype(str)
tn["drug_disease"] = tn_source + "|" + tn_target
tn["in_ROBOKOP"] = tn["drug_disease"].isin(df["drug|disease"])

# Export only the source/target columns of the matched rows, without the index.
tn_retained = tn.loc[tn["in_ROBOKOP"], ["source", "target"]]
tn_retained.to_csv(tn_retained_path, sep="\t", index=False, header=True)

In [16]:
# Count how many tn rows were matched (True) versus not matched (False).
tn.loc[:, "in_ROBOKOP"].value_counts()

in_ROBOKOP
True     28977
False    11462
Name: count, dtype: int64

In [22]:
# Re-read the local gt_pairs copy, treating its first column as the index.
gt_pairs_in_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/kedro_data_tests_base_robo_num_walks_50_walk_length_30_datasets_ingestion_ground_truth_translator_v2.10.0_validated_gt_pairs.tsv"
gt_pairs_out_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/retained_gt_set/v2.10.0_validated_gt_pairs.tsv"

gt_pairs = pd.read_csv(gt_pairs_in_path, sep="\t", index_col=0)

# Keep the rows whose key occurs in `df`, then renumber them from 0.
gt_pairs_keep = gt_pairs["drug|disease"].isin(df["drug|disease"])
gt_pairs = gt_pairs.loc[gt_pairs_keep].reset_index(drop=True)

# The fresh 0..n-1 index is written out as the first column.
gt_pairs.to_csv(gt_pairs_out_path, sep="\t", index=True, header=True)

In [23]:
# Read the local gt_pairs_raw copy (no index column is parsed from the file).
gt_pairs_raw_in_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/kedro_data_tests_base_robo_num_walks_50_walk_length_30_datasets_ingestion_ground_truth_translator_v2.10.0_validated_gt_pairs_raw.tsv"
gt_pairs_raw_out_path = "/Users/jchung/Documents/RENCI/everycure/matrix/data/retained_gt_set/v2.10.0_validated_gt_pairs_raw.tsv"

gt_pairs_raw = pd.read_csv(gt_pairs_raw_in_path, sep="\t")

# Keep the rows whose key occurs in `df`, then renumber them from 0.
gt_pairs_raw_keep = gt_pairs_raw["drug|disease"].isin(df["drug|disease"])
gt_pairs_raw = gt_pairs_raw.loc[gt_pairs_raw_keep].reset_index(drop=True)

# Written without the index, unlike the gt_pairs export.
gt_pairs_raw.to_csv(gt_pairs_raw_out_path, sep="\t", index=False, header=True)