In [None]:
# Leftover sanity-check cell: prints a greeting and defines no names used by later cells.
print("hola")

In [None]:
import csv
import random
from pathlib import Path

import pandas as pd
import numpy as np
import dgl
import torch as th
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the raw PubMed files (node.dat, link.dat, link.dat.test).
load_path = Path('data/pubmed/raw')

# Node table: one row per node with its id, name, integer type and a
# comma-separated attribute string.
# quoting=csv.QUOTE_NONE is required here (presumably because some fields
# contain quote characters that must be kept as literal text - confirm).
df_node = pd.read_csv(
    load_path / 'node.dat',
    sep='\t',
    names=['node_id', 'node_name', 'node_type', 'node_attributes'],
    quoting=csv.QUOTE_NONE,
)

# Training links: (source id, destination id, link type, weight).
df_link = pd.read_csv(
    load_path / 'link.dat',
    sep='\t',
    names=['src_id', 'dst_id', 'link_type', 'link_weight'],
)

# Test links: (source id, destination id, status); status 1 marks a positive
# pair and 0 a negative pair, as used by the later cells.
df_link_test = pd.read_csv(
    load_path / 'link.dat.test',
    sep='\t',
    names=['src_id', 'dst_id', 'link_status'],
)

# Node type names; a type's position in this list is its integer id in node.dat.
ntypes = ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES']
ntype_ids = {name: type_id for type_id, name in enumerate(ntypes)}

# Relation names in the form '<source type>-<verb>-<destination type>'.
etypes = [
    'GENE-and-GENE',
    'GENE-causing-DISEASE',
    'DISEASE-and-DISEASE',
    'CHEMICAL-in-GENE',
    'CHEMICAL-in-DISEASE',
    'CHEMICAL-and-CHEMICAL',
    'CHEMICAL-in-SPECIES',
    'SPECIES-with-GENE',
    'SPECIES-with-DISEASE',
    'SPECIES-and-SPECIES',
]

In [None]:
# type_mask[global_id] holds the integer node type of that node, or -1 when the
# id does not appear in node.dat.
# The array length is derived from the largest node id instead of a hard-coded
# node count, so the cell keeps working if the raw dump changes size.
num_nodes_total = int(df_node['node_id'].max()) + 1
type_mask = np.full(num_nodes_total, -1, dtype=int)
type_mask[df_node['node_id'].to_numpy()] = df_node['node_type'].to_numpy()

# Number of nodes of each type, as plain Python ints keyed by type name.
num_nodes_dict = {ntype: int((type_mask == ntype_ids[ntype]).sum()) for ntype in ntypes}

In [None]:
# Dense adjacency matrix over global node ids.
# Cell values: 0 = no edge, 1 = edge from link.dat, 2 = positive edge from
# link.dat.test. int8 is enough for these three values and is 8x smaller than
# the default integer dtype for a matrix of this size. The side length is read
# from type_mask so it cannot drift from the node table.
num_nodes_total = type_mask.shape[0]
adjM = np.zeros((num_nodes_total, num_nodes_total), dtype=np.int8)

# edges from link.dat
links = df_link[['src_id', 'dst_id']].to_numpy()
adjM[links[:, 0], links[:, 1]] = 1

# positive edges from link.dat.test (written second, so they override a 1
# wherever the same pair also appears in link.dat)
links_test = df_link_test[df_link_test['link_status'] == 1].to_numpy()
adjM[links_test[:, 0], links_test[:, 1]] = 2

# DISEASE-DISEASE matrix, indexed by local disease ids (ascending global id).
# np.ix_ selects rows and columns in one step, avoiding the large intermediate
# row-slice copy that chained indexing would create.
disease_mask = type_mask == ntype_ids['DISEASE']
DD_adjM = adjM[np.ix_(disease_mask, disease_mask)]

# DD_edges is a (rows, cols) pair listing every DISEASE-DISEASE edge; the index
# arrays below are positions into that edge list.
DD_edges = DD_adjM.nonzero()
train_val_idx = (DD_adjM[DD_edges] == 1).nonzero()[0]
test_idx = (DD_adjM[DD_edges] == 2).nonzero()[0]

In [None]:
# Hold out validation edges.
# 12.5% of the train+val DISEASE-DISEASE edge positions go to validation and
# the rest to training; the fixed random_state keeps the split repeatable.
train_idx, val_idx = train_test_split(
    train_val_idx,
    test_size=0.125,
    random_state=1024,
)

# Keep both index arrays in ascending order.
train_idx = np.sort(train_idx)
val_idx = np.sort(val_idx)

In [None]:
# Canonical edge type of the relation whose edges differ between the splits.
DD_CANONICAL_ETYPE = ('DISEASE', 'DISEASE-and-DISEASE', 'DISEASE')


def _typed_adjacency(srctype, dsttype):
    """Return the block of adjM with srctype nodes as rows and dsttype nodes as columns."""
    src_mask = type_mask == ntype_ids[srctype]
    dst_mask = type_mask == ntype_ids[dsttype]
    return adjM[np.ix_(src_mask, dst_mask)]


def _shared_edge_template():
    """Build the edge dict entries that are identical for train, val and test.

    Every relation in `etypes` is visited in order. Same-type relations get one
    entry; cross-type relations get a forward entry and a reverse entry whose
    relation name is prefixed with '^'. The DISEASE-and-DISEASE entry is stored
    as a None placeholder so that it keeps its position in the key order and
    can be filled in per split.
    """
    template = {}
    for etype in etypes:
        srctype, _, dsttype = etype.split('-')
        if etype == 'DISEASE-and-DISEASE':
            template[DD_CANONICAL_ETYPE] = None
            continue
        block = _typed_adjacency(srctype, dsttype)
        template[(srctype, etype, dsttype)] = block.nonzero()
        if srctype != dsttype:
            # Reverse relation: the transposed block gives (dst, src) pairs.
            template[(dsttype, '^' + etype, srctype)] = block.transpose().nonzero()
    return template


def _edge_dict_for_split(template, dd_edges):
    """Return a copy of `template` with the DISEASE-and-DISEASE edges set to `dd_edges`."""
    data_dict = dict(template)
    data_dict[DD_CANONICAL_ETYPE] = dd_edges
    return data_dict


# The adjacency slicing is the expensive part and does not depend on the split,
# so it is done once and reused for all three graphs.
_shared_edges = _shared_edge_template()

# g_train: only the training DISEASE-DISEASE edges
data_dict_train = _edge_dict_for_split(
    _shared_edges, (DD_edges[0][train_idx], DD_edges[1][train_idx]))

# g_val: training + validation DISEASE-DISEASE edges
data_dict_val = _edge_dict_for_split(
    _shared_edges, (DD_edges[0][train_val_idx], DD_edges[1][train_val_idx]))

# g_test: every DISEASE-DISEASE edge, including the test positives
data_dict_test = _edge_dict_for_split(_shared_edges, DD_edges)

g_train = dgl.heterograph(data_dict_train, num_nodes_dict, idtype=th.int64)
g_val = dgl.heterograph(data_dict_val, num_nodes_dict, idtype=th.int64)
g_test = dgl.heterograph(data_dict_test, num_nodes_dict, idtype=th.int64)

# Per-type feature matrices parsed from the comma-separated attribute strings.
# NOTE(review): feature rows follow node.dat row order, while local node ids
# follow ascending global id (via type_mask); the two agree only if node.dat is
# sorted by node_id - confirm.
x_dict = {
    ntype: np.genfromtxt(
        df_node.loc[df_node['node_type'] == ntype_ids[ntype], 'node_attributes'].tolist(),
        delimiter=',',
    )
    for ntype in ntypes
}

# Attach the same float32 feature tensor object to all three graphs under key 'x'.
for ntype in ntypes:
    temp_tensor = th.from_numpy(x_dict[ntype]).float()
    g_train.nodes[ntype].data['x'] = temp_tensor
    g_val.nodes[ntype].data['x'] = temp_tensor
    g_test.nodes[ntype].data['x'] = temp_tensor

# Input feature dimension per node type.
in_dim_dict = {ntype: x_dict[ntype].shape[1] for ntype in ntypes}

In [None]:
# sample hard negatives for validation
# for a positive pair (d1, d2), sample (d1, d') as a negative pair
# where d' is reachable from d1 via the D-D-D metapath (i.e., a 2-hop neighbor),
# d' != d1, and (d1, d') is not an edge of g_val
# if there are not enough such nodes, top up with random diseases that are
# neither d1, nor a neighbor of d1 in g_val, nor already chosen

# Dedicated seeded generator so the sampled negatives are reproducible without
# touching the global `random` state.
val_rng = random.Random(1024)

# Source diseases of the validation positives and how many positives each has.
unique, counts = np.unique(DD_edges[0][val_idx], return_counts=True)
g_val_DDD = dgl.metapath_reachable_graph(g_val, ['DISEASE-and-DISEASE', 'DISEASE-and-DISEASE'])
val_neg_edges = []
for d, count in zip(unique, counts):
    src = int(d)
    needed = int(count)
    neighbors = set(g_val.out_edges(d, etype='DISEASE-and-DISEASE')[1].tolist())
    two_hop = set(g_val_DDD.out_edges(d)[1].tolist())
    # Sorted so the candidate order (and therefore the seeded sample) is stable;
    # the source itself is removed so no (d, d) pair is produced.
    neg_diseases = sorted(two_hop - neighbors - {src})
    if needed <= len(neg_diseases):
        neg_ds = val_rng.sample(neg_diseases, k=needed)
    else:
        # Not enough 2-hop candidates: keep all of them and draw the remainder
        # at random from the diseases that are still eligible.
        shortfall = needed - len(neg_diseases)
        to_sample = np.ones(num_nodes_dict['DISEASE'], dtype=bool)
        to_sample[sorted(neighbors)] = False
        to_sample[src] = False
        to_sample[neg_diseases] = False  # already selected, avoid duplicate pairs
        neg_ds = neg_diseases + val_rng.sample(np.flatnonzero(to_sample).tolist(), k=shortfall)
    val_neg_edges.extend([[src, neg_d] for neg_d in neg_ds])

# One row per negative pair (source, destination), sorted lexicographically.
val_neg_edges.sort()
val_neg_edges = np.array(val_neg_edges)

In [None]:
# fix testing negative pairs
# start from the negatives listed in link.dat.test, then, for every source
# disease that has more positives than negatives, randomly sample additional
# destinations that are not edges of g_test

# Dedicated seeded generator so the added negatives are reproducible without
# touching the global `random` state.
test_rng = random.Random(1024)

# Map global node ids of DISEASE nodes to local ids (enumerated in node.dat row order).
DISEASE_id_map = {global_id: local_id for local_id, global_id in enumerate(df_node[df_node['node_type'] == ntype_ids['DISEASE']]['node_id'])}
df_link_test_mapped = df_link_test[['src_id', 'dst_id']].replace(DISEASE_id_map)
df_link_test_mapped_pos = df_link_test_mapped[df_link_test['link_status'] == 1]
df_link_test_mapped_neg = df_link_test_mapped[df_link_test['link_status'] == 0]

# Per-source counts of test positives / negatives, indexed by local disease id.
test_pos_nums = df_link_test_mapped_pos.groupby('src_id').size().reindex(list(range(num_nodes_dict['DISEASE'])), fill_value=0).to_numpy()
test_neg_nums = df_link_test_mapped_neg.groupby('src_id').size().reindex(list(range(num_nodes_dict['DISEASE'])), fill_value=0).to_numpy()
neg_to_add = test_pos_nums - test_neg_nums

test_neg_edges = df_link_test_mapped_neg.values.tolist()

# Destinations already used as a negative for each source, so that the sampler
# never emits a pair that is already in the list.
known_negatives = {}
for src, dst in test_neg_edges:
    known_negatives.setdefault(src, set()).add(dst)

# Only sources with a positive deficit need extra negatives; a source that
# already has more negatives than positives is left untouched (asking
# random.sample for a negative number of items would raise).
for d in np.flatnonzero(neg_to_add > 0):
    src = int(d)
    to_sample = np.ones(num_nodes_dict['DISEASE'], dtype=bool)
    to_sample[g_test.out_edges(d, etype='DISEASE-and-DISEASE')[1].tolist()] = False
    to_sample[src] = False  # never pair a disease with itself
    already_negative = known_negatives.get(src, set())
    candidates = [c for c in np.flatnonzero(to_sample).tolist() if c not in already_negative]
    neg_ds = test_rng.sample(candidates, k=int(neg_to_add[d]))
    test_neg_edges.extend([[src, neg_d] for neg_d in neg_ds])

# One row per negative pair (source, destination), sorted lexicographically.
test_neg_edges.sort()
test_neg_edges = np.array(test_neg_edges)

In [None]:
# Persist every artifact into the parent of the raw-data directory.
save_path = load_path.parent

# The three graphs are stored in one file, in the order train, val, test.
graph_file = save_path / 'graph.bin'
dgl.save_graphs(str(graph_file), [g_train, g_val, g_test])

# Positions into the DISEASE-DISEASE edge list for each split.
split_indices = {
    'train_idx': train_idx,
    'val_idx': val_idx,
    'test_idx': test_idx,
}
np.savez(save_path / 'train_val_test_idx.npz', **split_indices)

# Fixed negative pairs for validation and testing.
for file_name, neg_edges in (('val_neg_edges.npy', val_neg_edges),
                             ('test_neg_edges.npy', test_neg_edges)):
    np.save(save_path / file_name, neg_edges)