# Build heterogeneous dataset

## Setting up environment

---



### Loading libraries

In [1]:
import sys
import os

# Put the parent directory at the front of the module search path. This must run
# before the `util.*` imports below (presumably `util` lives in the parent
# directory of this notebook — TODO confirm).
sys.path.insert(0, os.path.abspath(".."))

import pickle

from torch_geometric.data import Data
from box import Box

from util.postgres import create_sqlalchemy_engine
from util.heterogeneous.dataset import DatasetEuCoHT

### Global variables

In [2]:
# -------------------- GLOBAL VARIABLES --------------------
PATH_TO_CONFIG_FILE = '../config.yaml'

# Dataset settings (consumed by the data-preparation and save cells)
target_node_type = 'author'
target_edge_type = ('author', 'co_authors', 'author')
num_train = 0.8             # Share of the data used for training, given as a fraction (0.8 = 80%)
dataset_save_filepath = '../data/dataset_heterogeneous.pkl'

# -------------------- LOAD CONFIGURATION --------------------
# Parse the YAML configuration file into an attribute-accessible Box
config = Box.from_yaml(filename=PATH_TO_CONFIG_FILE)

# -------------------- DATABASE ENGINE --------------------
# Every connection parameter is read from the POSTGRES section of the configuration
pg_engine = create_sqlalchemy_engine(
    username=config.POSTGRES.USERNAME,
    password=config.POSTGRES.PASSWORD,
    host=config.POSTGRES.HOST,
    port=config.POSTGRES.PORT,
    database=config.POSTGRES.DATABASE,
    schema=config.POSTGRES.SCHEMA,
)

## Data preparation

---



In [3]:
# Create the dataset wrapper from the Postgres engine and the split/target settings
dataset: DatasetEuCoHT = DatasetEuCoHT(
    pg_engine=pg_engine,
    num_train=num_train,
    target_edge_type=target_edge_type,
    target_node_type=target_node_type,
)

# Declare the types of the three values unpacked below
data: Data
author_node_id_map: dict
author_id_map: dict

# Build the graph and the two author ID mappings.
# NOTE(review): the notebook is titled "heterogeneous" but the method called here is
# `build_homogeneous_graph` — confirm this is the intended builder.
data, author_node_id_map, author_id_map = dataset.build_homogeneous_graph()

Querying co-authorship edge data...
Querying author nodes...
Querying publishing edge data...
Querying article nodes...
Rows fetched 10000 for batch 0
Rows fetched 10000 for batch 1
Rows fetched 10000 for batch 2
Rows fetched 10000 for batch 3
Rows fetched 10000 for batch 4
Rows fetched 10000 for batch 5
Rows fetched 10000 for batch 6
Rows fetched 10000 for batch 7
Rows fetched 10000 for batch 8
Rows fetched 10000 for batch 9
Rows fetched 10000 for batch 10
Rows fetched 10000 for batch 11
Rows fetched 10000 for batch 12
Rows fetched 10000 for batch 13
Rows fetched 10000 for batch 14
Rows fetched 10000 for batch 15
Rows fetched 10000 for batch 16
Rows fetched 10000 for batch 17
Rows fetched 10000 for batch 18
Rows fetched 10000 for batch 19
Rows fetched 10000 for batch 20
Rows fetched 10000 for batch 21
Rows fetched 10000 for batch 22
Rows fetched 10000 for batch 23
Rows fetched 10000 for batch 24
Rows fetched 10000 for batch 25
Rows fetched 10000 for batch 26
Rows fetched 10000 for bat

### Unit tests

In [4]:
# NOTE(review): every check in this cell is currently disabled (commented out), and
# `assert_bidirectional_edges` is not imported in the visible cells — confirm the
# checks still apply to `data` before re-enabling them.

# # Test: the positive and the negative test edge index hold the same number of elements
# assert data.test_pos_edge_index.numel() == data.test_neg_edge_index.numel()
# # Test: the number of columns of the positive training edge index equals int(<num_train> * number of columns of edge_index), i.e. presumably a share of the edges, not of the nodes
# assert data.train_pos_edge_index.shape[1] == int(num_train * data.edge_index.shape[1])
# # Test: the positive test edge index holds the remaining columns of edge_index after the training split
# assert data.test_pos_edge_index.shape[1] == data.edge_index.shape[1] - int(num_train * data.edge_index.shape[1])

# # Test: check that all edges are bidirectional
# assert_bidirectional_edges(data)

### Save dataset to local file

In [7]:
# The engine connected to the Postgres DB has to be closed before the dataset is saved.
dataset.close_engine()

# Create the target directory if it is missing; otherwise open() below raises
# FileNotFoundError (e.g. on a fresh checkout without a data folder).
save_dir = os.path.dirname(dataset_save_filepath)
if save_dir:  # an empty dirname means "current directory", which needs no creation
    os.makedirs(save_dir, exist_ok=True)

# Serialize the whole dataset object with the newest pickle protocol available.
# NOTE: only load the resulting file from trusted sources, since unpickling can
# execute arbitrary code.
with open(dataset_save_filepath, 'wb') as output:
    pickle.dump(dataset, output, protocol=pickle.HIGHEST_PROTOCOL)