This notebook has 3 purposes:
* Understanding the structure of the data
* Constructing the knowledge graph (KG)
* Producing the positive and negative samples for training

# Data Structure

In [111]:
import pandas as pd
import os

In [112]:
path = os.path.dirname(os.getcwd())

Nodes

In [113]:
nodes = pd.read_csv(path + r'\data\nodes.csv')
print(nodes.shape)
nodes.head()

(129375, 5)


Unnamed: 0,node_index,node_id,node_type,node_name,node_source
0,0,9796,gene/protein,PHYHIP,NCBI
1,1,7918,gene/protein,GPANK1,NCBI
2,2,8233,gene/protein,ZRSR2,NCBI
3,3,4899,gene/protein,NRF1,NCBI
4,4,5297,gene/protein,PI4KA,NCBI


Edges

In [114]:
edges = pd.read_csv(path + r'\data\edges.csv')
print(edges.shape)
edges.head()

(8100498, 4)


Unnamed: 0,relation,display_relation,x_index,y_index
0,protein_protein,ppi,0,8889
1,protein_protein,ppi,1,2798
2,protein_protein,ppi,2,5646
3,protein_protein,ppi,3,11592
4,protein_protein,ppi,4,2122


Knowledge Graph

In [115]:
kg = pd.read_csv(path + r'\data\kg.csv')
print(kg.shape)
kg.head()

  kg = pd.read_csv(path + r'\data\kg.csv')


(8100498, 12)


Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


# KG Construction

In [139]:
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
import torch

Create HeteroData object

In [130]:
data = HeteroData()

display(data)

HeteroData()

Get the amount of each type of node

In [131]:
node_dict = {}

for node in nodes['node_type']:
    node_dict[node] = node_dict.get(node, 0) + 1

display(node_dict)

{'gene/protein': 27671,
 'drug': 7957,
 'effect/phenotype': 15311,
 'disease': 17080,
 'biological_process': 28642,
 'molecular_function': 11169,
 'cellular_component': 4176,
 'exposure': 818,
 'pathway': 2516,
 'anatomy': 14035}

Randomize a feature tensor for each type of node and add to data object

In [133]:
for node_type in node_dict.keys():
    data[node_type] = torch.randn(node_dict[node_type], 16)
data['num_nodes'] = nodes.shape[0]

display(data)

HeteroData(
  gene/protein=[27671, 16],
  drug=[7957, 16],
  effect/phenotype=[15311, 16],
  disease=[17080, 16],
  biological_process=[28642, 16],
  molecular_function=[11169, 16],
  cellular_component=[4176, 16],
  exposure=[818, 16],
  pathway=[2516, 16],
  anatomy=[14035, 16],
  num_nodes=129375
)

Get the edge connections and add to data object

In [134]:
for edge_type in kg['relation'].unique():
    relation_df = kg.loc[kg['relation'] == edge_type]
    t1 = relation_df['x_type'].unique()[0]
    t2 = relation_df['y_type'].unique()[0]
    edge_list = [[],[]]
    for edge_subtype in relation_df['display_relation'].unique():
        sub_relation_df = relation_df.loc[relation_df['display_relation'] == edge_subtype]
        if sub_relation_df['x_type'].unique()[0] == t1:
            edge_list[0].extend(sub_relation_df['x_index'].to_list())
            edge_list[1].extend(sub_relation_df['y_index'].to_list())
        else:
            edge_list[0].extend(sub_relation_df['y_index'].to_list())
            edge_list[1].extend(sub_relation_df['x_index'].to_list())
    edge_list = torch.Tensor(edge_list)
    data[t1, edge_type, t2] = edge_list
data['num_edges'] = edges.shape[0]

display(data)

HeteroData(
  gene/protein=[27671, 16],
  drug=[7957, 16],
  effect/phenotype=[15311, 16],
  disease=[17080, 16],
  biological_process=[28642, 16],
  molecular_function=[11169, 16],
  cellular_component=[4176, 16],
  exposure=[818, 16],
  pathway=[2516, 16],
  anatomy=[14035, 16],
  num_nodes=129375,
  (gene/protein, protein_protein, gene/protein)=[2, 642150],
  (drug, drug_protein, gene/protein)=[2, 51306],
  (drug, contraindication, disease)=[2, 61350],
  (drug, indication, disease)=[2, 18776],
  (drug, off-label use, disease)=[2, 5136],
  (drug, drug_drug, drug)=[2, 2672628],
  (gene/protein, phenotype_protein, effect/phenotype)=[2, 6660],
  (effect/phenotype, phenotype_phenotype, effect/phenotype)=[2, 37472],
  (disease, disease_phenotype_negative, effect/phenotype)=[2, 2386],
  (disease, disease_phenotype_positive, effect/phenotype)=[2, 300634],
  (gene/protein, disease_protein, disease)=[2, 160822],
  (disease, disease_disease, disease)=[2, 64388],
  (drug, drug_effect, effect/ph

Set graph to be undirected (Not required here)

In [136]:
data = T.ToUndirected()(data)
display(data)

HeteroData(
  gene/protein=[27671, 16],
  drug=[7957, 16],
  effect/phenotype=[15311, 16],
  disease=[17080, 16],
  biological_process=[28642, 16],
  molecular_function=[11169, 16],
  cellular_component=[4176, 16],
  exposure=[818, 16],
  pathway=[2516, 16],
  anatomy=[14035, 16],
  num_nodes=129375,
  (gene/protein, protein_protein, gene/protein)=[2, 642150],
  (drug, drug_protein, gene/protein)=[2, 51306],
  (drug, contraindication, disease)=[2, 61350],
  (drug, indication, disease)=[2, 18776],
  (drug, off-label use, disease)=[2, 5136],
  (drug, drug_drug, drug)=[2, 2672628],
  (gene/protein, phenotype_protein, effect/phenotype)=[2, 6660],
  (effect/phenotype, phenotype_phenotype, effect/phenotype)=[2, 37472],
  (disease, disease_phenotype_negative, effect/phenotype)=[2, 2386],
  (disease, disease_phenotype_positive, effect/phenotype)=[2, 300634],
  (gene/protein, disease_protein, disease)=[2, 160822],
  (disease, disease_disease, disease)=[2, 64388],
  (drug, drug_effect, effect/ph

# Training Data

Will be done after setting up and testing the model