This notebook has 3 purposes:
* Understanding the structure of the data
* Constructing the knowledge graph (KG)
* Producing the positive and negative samples for training

# Data Structure

In [40]:
import pandas as pd
import os

In [41]:
path = os.path.dirname(os.getcwd())

Nodes

In [42]:
nodes = pd.read_csv(path + r'\data\nodes.csv')
print(nodes.shape)
nodes.head()

(129375, 5)


Unnamed: 0,node_index,node_id,node_type,node_name,node_source
0,0,9796,gene/protein,PHYHIP,NCBI
1,1,7918,gene/protein,GPANK1,NCBI
2,2,8233,gene/protein,ZRSR2,NCBI
3,3,4899,gene/protein,NRF1,NCBI
4,4,5297,gene/protein,PI4KA,NCBI


Edges

In [43]:
edges = pd.read_csv(path + r'\data\edges.csv')
print(edges.shape)
edges.head()

(8100498, 4)


Unnamed: 0,relation,display_relation,x_index,y_index
0,protein_protein,ppi,0,8889
1,protein_protein,ppi,1,2798
2,protein_protein,ppi,2,5646
3,protein_protein,ppi,3,11592
4,protein_protein,ppi,4,2122


Knowledge Graph

In [44]:
kg = pd.read_csv(path + r'\data\kg.csv')
print(kg.shape)
kg.head()

  kg = pd.read_csv(path + r'\data\kg.csv')


(8100498, 12)


Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


# KG Construction

In [45]:
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
import torch

Create HeteroData object

In [46]:
data = HeteroData()

display(data)

HeteroData()

Get the amount of each type of node

In [47]:
node_dict = {}

for node in nodes['node_type']:
    node_dict[node] = node_dict.get(node, 0) + 1

display(node_dict)

{'gene/protein': 27671,
 'drug': 7957,
 'effect/phenotype': 15311,
 'disease': 17080,
 'biological_process': 28642,
 'molecular_function': 11169,
 'cellular_component': 4176,
 'exposure': 818,
 'pathway': 2516,
 'anatomy': 14035}

Randomize a feature tensor for each type of node and add to data object

In [48]:
for node_type in node_dict.keys():
    data[node_type].x = torch.randn(node_dict[node_type], 16)
data['num_nodes'] = nodes.shape[0]

display(data)

HeteroData(
  num_nodes=129375,
  gene/protein={ x=[27671, 16] },
  drug={ x=[7957, 16] },
  effect/phenotype={ x=[15311, 16] },
  disease={ x=[17080, 16] },
  biological_process={ x=[28642, 16] },
  molecular_function={ x=[11169, 16] },
  cellular_component={ x=[4176, 16] },
  exposure={ x=[818, 16] },
  pathway={ x=[2516, 16] },
  anatomy={ x=[14035, 16] }
)

Get the edge connections and add to data object

In [49]:
edges.shape
kg.shape

(8100498, 12)

In [50]:
for edge_type in kg['relation'].unique():
    relation_df = kg.loc[kg['relation'] == edge_type]
    t1 = relation_df['x_type'].unique()[0]
    t2 = relation_df['y_type'].unique()[0]
    edge_list = [[],[]]
    for edge_subtype in relation_df['display_relation'].unique():
        sub_relation_df = relation_df.loc[relation_df['display_relation'] == edge_subtype]
        if sub_relation_df['x_type'].unique()[0] == t1:
            edge_list[0].extend(sub_relation_df['x_index'].to_list())
            edge_list[1].extend(sub_relation_df['y_index'].to_list())
        else:
            edge_list[0].extend(sub_relation_df['y_index'].to_list())
            edge_list[1].extend(sub_relation_df['x_index'].to_list())
    edge_list = torch.Tensor(edge_list)
    data[t1, edge_type, t2].edge_index = edge_list
data['num_edges'] = edges.shape[0]

# data = T.ToUndirected()(data)
display(data)

HeteroData(
  num_nodes=129375,
  num_edges=8100498,
  gene/protein={ x=[27671, 16] },
  drug={ x=[7957, 16] },
  effect/phenotype={ x=[15311, 16] },
  disease={ x=[17080, 16] },
  biological_process={ x=[28642, 16] },
  molecular_function={ x=[11169, 16] },
  cellular_component={ x=[4176, 16] },
  exposure={ x=[818, 16] },
  pathway={ x=[2516, 16] },
  anatomy={ x=[14035, 16] },
  (gene/protein, protein_protein, gene/protein)={ edge_index=[2, 642150] },
  (drug, drug_protein, gene/protein)={ edge_index=[2, 51306] },
  (drug, contraindication, disease)={ edge_index=[2, 61350] },
  (drug, indication, disease)={ edge_index=[2, 18776] },
  (drug, off-label use, disease)={ edge_index=[2, 5136] },
  (drug, drug_drug, drug)={ edge_index=[2, 2672628] },
  (gene/protein, phenotype_protein, effect/phenotype)={ edge_index=[2, 6660] },
  (effect/phenotype, phenotype_phenotype, effect/phenotype)={ edge_index=[2, 37472] },
  (disease, disease_phenotype_negative, effect/phenotype)={ edge_index=[2, 2

# Training Data

We organize the data into two main datasets, one with all edges and their relationships and the other with just drugs and diseases

In [51]:
pretrain_head_idx = []
pretrain_relation = []
pretrain_tail_idx = []

finetine_head_idx = []
finetine_relation = []
finetine_tail_idx = []

for i,edge_type in enumerate(data.edge_types):
    pretrain_head_idx.extend(data[edge_type].edge_index[0].tolist())
    pretrain_relation.extend([i]*data[edge_type].edge_index.shape[1])
    pretrain_tail_idx.extend(data[edge_type].edge_index[1].tolist())
    
    if edge_type[1] == 'contraindication':
        finetine_head_idx.extend(data[edge_type].edge_index[0].tolist())
        finetine_relation.extend([0]*data[edge_type].edge_index.shape[1])
        finetine_tail_idx.extend(data[edge_type].edge_index[1].tolist())
    elif edge_type[1] == 'indication':
        finetine_head_idx.extend(data[edge_type].edge_index[0].tolist())
        finetine_relation.extend([1]*data[edge_type].edge_index.shape[1])
        finetine_tail_idx.extend(data[edge_type].edge_index[1].tolist())

pretrain_head_idx = torch.tensor(pretrain_head_idx)
pretrain_relation = torch.tensor(pretrain_relation)
pretrain_tail_idx = torch.tensor(pretrain_tail_idx)

finetine_head_idx = torch.tensor(finetine_head_idx)
finetine_relation = torch.tensor(finetine_relation)
finetine_tail_idx = torch.tensor(finetine_tail_idx)

display(pretrain_head_idx.shape, pretrain_relation.shape, pretrain_tail_idx.shape)
display(finetine_head_idx.shape, finetine_relation.shape, finetine_tail_idx.shape)

torch.Size([8100498])

torch.Size([8100498])

torch.Size([8100498])

torch.Size([80126])

torch.Size([80126])

torch.Size([80126])

Next we divide these edges into dataloaders

In [52]:
pdata = torch.stack([pretrain_head_idx, pretrain_relation, pretrain_tail_idx], dim=1)
fdata = torch.stack([finetine_head_idx, finetine_relation, finetine_tail_idx], dim=1)

psplit = torch.utils.data.random_split(pdata, [0.8,0.1,0.1])
fsplit = torch.utils.data.random_split(fdata, [0.8,0.1,0.1])

ptrain_loader = DataLoader(psplit[0], batch_size=128, shuffle=True)
pval_loader = DataLoader(psplit[1], batch_size=128, shuffle=True)
ptest_loader = DataLoader(psplit[2], batch_size=128, shuffle=True)

ftrain_loader = DataLoader(fsplit[0], batch_size=128, shuffle=True)
fval_loader = DataLoader(fsplit[1], batch_size=128, shuffle=True)
ftest_loader = DataLoader(fsplit[2], batch_size=128, shuffle=True)

This should be all the data preparation that is necessary for simple pretraining and fine tuning (I hope), a .py file will be created for the simple creation of these dataloaders