This notebook has 3 purposes:
* Understanding the structure of the data
* Constructing the knowledge graph (KG)
* Producing the positive and negative samples for training

# Data Structure

In [2]:
import pandas as pd
import os

In [3]:
path = os.path.dirname(os.getcwd())

Nodes

In [3]:
nodes = pd.read_csv(path + '/data/nodes.csv')
nodes['node_type']= nodes['node_type'].apply(lambda x: x.replace("/","_"))
print(nodes.shape)
nodes.head()

(129375, 5)


Unnamed: 0,node_index,node_id,node_type,node_name,node_source
0,0,9796,gene_protein,PHYHIP,NCBI
1,1,7918,gene_protein,GPANK1,NCBI
2,2,8233,gene_protein,ZRSR2,NCBI
3,3,4899,gene_protein,NRF1,NCBI
4,4,5297,gene_protein,PI4KA,NCBI


Edges

In [4]:
edges = pd.read_csv(path + '/data/edges.csv')
print(edges.shape)
edges.head()

(8100498, 4)


Unnamed: 0,relation,display_relation,x_index,y_index
0,protein_protein,ppi,0,8889
1,protein_protein,ppi,1,2798
2,protein_protein,ppi,2,5646
3,protein_protein,ppi,3,11592
4,protein_protein,ppi,4,2122


Knowledge Graph

In [5]:
kg = pd.read_csv(r'/opt/scratch/labs/wuc/Drug-Repurposing/data/totalkg.csv')
kg['x_type']= kg['x_type'].apply(lambda x: x.replace("/","_"))
kg['y_type']= kg['y_type'].apply(lambda x: x.replace("/","_"))
kg['relation']= kg['relation'].apply(lambda x: x.replace("-","_"))
kg['relation']= kg['relation'].apply(lambda x: x.replace(" ","_"))
print(kg.shape)
kg.head()

(4296359, 7)


Unnamed: 0,relation,x_index,x_type,x_name,y_index,y_type,y_name
0,protein_protein,0,gene_protein,PHYHIP,8889.0,gene_protein,KIF15
1,protein_protein,1,gene_protein,GPANK1,2798.0,gene_protein,PNMA1
2,protein_protein,2,gene_protein,ZRSR2,5646.0,gene_protein,TTC33
3,protein_protein,3,gene_protein,NRF1,11592.0,gene_protein,MAN1B1
4,protein_protein,4,gene_protein,PI4KA,2122.0,gene_protein,RGS20


In [20]:
len(kg[kg['x_type'] == 'gene_protein']['x_name'].unique())

19137

# KG Construction

In [6]:
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
import torch
from copy import deepcopy

Create HeteroData object

In [7]:
data = HeteroData()

display(data)

HeteroData()

Proving that both x and y indices span the entire range

In [8]:
for i in range(max(kg['x_index'])+1):
    if i not in kg['x_index'] or i not in kg['y_index']:
        display(i)

print(max(kg['x_index']) == max(kg['y_index']))

True


Get the amount of each type of node

In [9]:
nodes = deepcopy(kg)
nodes.drop_duplicates(subset=['x_index'], keep='first', inplace=True)
node_dict = {}

for node_type in nodes['x_type'].unique():
    split = nodes.loc[kg['x_type'] == node_type]
    node_dict[node_type] = len(split['x_index'].unique())

display(node_dict)

{'gene_protein': 27671,
 'drug': 7957,
 'effect_phenotype': 15311,
 'disease': 17080,
 'biological_process': 28642,
 'molecular_function': 11169,
 'cellular_component': 4176,
 'exposure': 818,
 'pathway': 2516,
 'anatomy': 14035}

Randomize a feature tensor for each type of node and add to data object

In [10]:
for node_type in node_dict.keys():
    data[node_type].x = torch.empty(node_dict[node_type], 16, requires_grad=True)
    torch.nn.init.xavier_uniform_(data[node_type].x)

display(data)

HeteroData(
  gene_protein={ x=[27671, 16] },
  drug={ x=[7957, 16] },
  effect_phenotype={ x=[15311, 16] },
  disease={ x=[17080, 16] },
  biological_process={ x=[28642, 16] },
  molecular_function={ x=[11169, 16] },
  cellular_component={ x=[4176, 16] },
  exposure={ x=[818, 16] },
  pathway={ x=[2516, 16] },
  anatomy={ x=[14035, 16] }
)

Create edge dict to keep track of within-group indices

In [11]:
temp = deepcopy(kg)
temp.drop_duplicates(subset=['x_index'], keep='first', inplace=True)
temp['group_idx'] = temp.groupby('x_type').cumcount()
idx_to_group = dict(zip(temp['x_index'], temp['group_idx']))

Get the edge connections and add to data object

In [None]:
edges = deepcopy(kg)

# Apply edge dictionary
edges['group_x'] = edges['x_index'].map(idx_to_group)
edges['group_y'] = edges['y_index'].map(idx_to_group)

# Group by relation
groups = edges.groupby('relation')

for relation, group in groups:
    
    # Get the main group
    subgroups = group.groupby('x_type')
    group_name = list(subgroups.groups)[0]
    group = subgroups.get_group(group_name)
    
    # Get edge indices and create edge list
    x_indices = torch.tensor(group['group_x'].values, dtype=torch.long)
    y_indices = torch.tensor(group['group_y'].values, dtype=torch.long)
    edge_list = torch.stack([x_indices, y_indices], dim=0)
        
    # Store in data
    data[group['x_type'].values[0], relation, group['y_type'].values[0]].edge_index = edge_list

display(data)

pretrain_indices = kg[['x_index','relation','y_index']]
name_to_num = dict(zip(pretrain_indices['relation'], pd.factorize(pretrain_indices['relation'])[0]))
pretrain_indices['relation'] = pretrain_indices['relation'].map(name_to_num)

finetune_indices = kg[['x_index','relation','y_index']]
finetune_indices = finetune_indices.loc[(finetune_indices['relation'] == 'contraindication') | 
                                        (finetune_indices['relation'] == 'indication') | 
                                        (finetune_indices['relation'] == 'off_label_use')]
finetune_indices['relation'] = finetune_indices['relation'].map(name_to_num)

display(pretrain_indices.shape)
display(finetune_indices.shape)

Finally, we use PyG's validation function to see if our dataset is valid

In [13]:
data.validate()

True

# Training Data

We organize the data into two main datasets, one with all edges and their relationships and the other with just drugs and diseases. In our model, we will be using global reference indices rather than local reference

In [65]:
# Get indices and process relation
raw_pretrain_indices = kg[['x_index','x_type','relation','y_index']]
name_to_num = dict(zip(raw_pretrain_indices['relation'], pd.factorize(raw_pretrain_indices['relation'])[0]))
raw_pretrain_indices['relation'] = raw_pretrain_indices['relation'].map(name_to_num)

# Group indices by relation
groups = raw_pretrain_indices.groupby('relation')
pretrain_indices = pd.DataFrame(columns=['x_index','relation','y_index'])

for relation, group in groups:
    
    # Get the main group
    subgroups = group.groupby('x_type')
    group_name = list(subgroups.groups)[0]
    group = subgroups.get_group(group_name)
    
    # Add only the main group
    pretrain_indices = pd.concat([pretrain_indices,group[['x_index','relation','y_index']]])

# Get indices and process relation
raw_finetune_indices = kg[['x_index','x_type','relation','y_index']]
raw_finetune_indices = raw_finetune_indices.loc[(raw_finetune_indices['relation'] == 'contraindication') | 
                                                (raw_finetune_indices['relation'] == 'indication') | 
                                                (raw_finetune_indices['relation'] == 'off_label_use')]
raw_finetune_indices['relation'] = raw_finetune_indices['relation'].map(name_to_num)

# Group indices by relation
groups = raw_finetune_indices.groupby('relation')
finetune_indices = pd.DataFrame(columns=['x_index','relation','y_index'])

for relation, group in groups:
    
    # Get the main group
    subgroups = group.groupby('x_type')
    group_name = list(subgroups.groups)[0]
    group = subgroups.get_group(group_name)
    
    # Add only the main group
    finetune_indices = pd.concat([finetune_indices,group[['x_index','relation','y_index']]])

# Make sure indices are a compatible datatype
pretrain_indices = pretrain_indices.astype({'x_index': 'int32', 'relation': 'int32', 'y_index': 'int32', 'local_x': 'int32'})
finetune_indices = finetune_indices.astype({'x_index': 'int32', 'relation': 'int32', 'y_index': 'int32', 'local_x': 'int32'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_pretrain_indices['relation'] = raw_pretrain_indices['relation'].map(name_to_num)


Next we divide these edges into dataloaders

In [67]:
pdata = torch.tensor(pretrain_indices.values,dtype=torch.long)
fdata = torch.tensor(finetune_indices.values,dtype=torch.long)

psplit = torch.utils.data.random_split(pdata, [0.8,0.1,0.1])
fsplit = torch.utils.data.random_split(fdata, [0.8,0.1,0.1])

ptrain_loader = DataLoader(psplit[0], batch_size=128, shuffle=True)
pval_loader = DataLoader(psplit[1], batch_size=128, shuffle=True)
ptest_loader = DataLoader(psplit[2], batch_size=128, shuffle=True)

ftrain_loader = DataLoader(fsplit[0], batch_size=128, shuffle=True)
fval_loader = DataLoader(fsplit[1], batch_size=128, shuffle=True)
ftest_loader = DataLoader(fsplit[2], batch_size=128, shuffle=True)

We must also make sure that the data object is undirected for proper message passing

In [68]:
data = T.ToUndirected()(data)

This should be all the data preparation that is necessary for simple pretraining and fine tuning (I hope), a .py file will be created for the simple creation of these dataloaders