# Build homogeneous dataset

## Setting up environment

---



### Loading libraries

In [1]:
import sys
import os

sys.path.insert(0, os.path.abspath(".."))

import pickle

from torch_geometric.data import Data
from box import Box

from util.postgres import create_sqlalchemy_engine
from util.homogeneous.dataset import DatasetEuCoHM, assert_bidirectional_edges

### Global variables

In [2]:
# -------------------- GLOBAL VARIABLES --------------------
# Location of the YAML configuration file, relative to this notebook
PATH_TO_CONFIG_FILE = '../config.yaml'
# Share of the data used for training, expressed as a fraction (0.8 = 80 %)
num_train = 0.8
# Destination file for the pickled dataset
dataset_save_filepath = '../data/dataset_homogeneous.pkl'

# -------------------- LOAD CONFIGURATION --------------------
# Parse the YAML configuration into a Box so its entries can be read as attributes
config = Box.from_yaml(filename=PATH_TO_CONFIG_FILE)

# -------------------- DATABASE ENGINE --------------------
# Every engine argument is read from the upper-cased entry of the same name
# in the POSTGRES section of the configuration.
pg_engine = create_sqlalchemy_engine(
    **{
        option: getattr(config.POSTGRES, option.upper())
        for option in ('username', 'password', 'host', 'port', 'database', 'schema')
    }
)

## Data preparation

---



In [4]:
# Instantiate the dataset wrapper on top of the Postgres engine
dataset: DatasetEuCoHM = DatasetEuCoHM(pg_engine=pg_engine)

# Declare the types of the three objects produced by the graph builder
data: Data
author_node_id_map: dict
author_id_map: dict

# Build the homogeneous graph and unpack the graph together with the two author maps
(
    data,
    author_node_id_map,
    author_id_map,
) = dataset.build_homogeneous_graph()

Querying co-authorship edge data...
Querying author nodes...


### Unit tests

In [5]:
# Test: the positive and the negative test edge index hold the same number of elements
assert data.test_pos_edge_index.numel() == data.test_neg_edge_index.numel(), (
    f"test_pos_edge_index has {data.test_pos_edge_index.numel()} elements, "
    f"but test_neg_edge_index has {data.test_neg_edge_index.numel()}"
)

# Expected split sizes, derived once from the total number of edges in the graph.
# The test split receives whatever is left after truncating the training share.
num_edges = data.edge_index.shape[1]
expected_num_train_edges = int(num_train * num_edges)
expected_num_test_edges = num_edges - expected_num_train_edges

# Test: the number of positive training edges equals <num_train> times the number of edges
assert data.train_pos_edge_index.shape[1] == expected_num_train_edges, (
    f"expected {expected_num_train_edges} positive training edges, "
    f"got {data.train_pos_edge_index.shape[1]}"
)
# Test: the number of positive test edges equals the remaining <1 - num_train> share of the edges
assert data.test_pos_edge_index.shape[1] == expected_num_test_edges, (
    f"expected {expected_num_test_edges} positive test edges, "
    f"got {data.test_pos_edge_index.shape[1]}"
)

# Test: check that all edges are bidirectional
assert_bidirectional_edges(data)

### Save dataset to local file

In [6]:
# The engine connected to the Postgres DB must be closed before the dataset is saved
# (presumably because a live engine cannot be pickled -- confirm against DatasetEuCoHM).
dataset.close_engine()

# Create the target directory if it is missing, so the save does not fail
# with FileNotFoundError when the data folder has not been created yet.
save_dir = os.path.dirname(dataset_save_filepath)
if save_dir:  # a bare filename has no directory part; os.makedirs('') would raise
    os.makedirs(save_dir, exist_ok=True)

# Serialize the dataset with the most recent pickle protocol available
with open(dataset_save_filepath, 'wb') as output:
    pickle.dump(dataset, output, protocol=pickle.HIGHEST_PROTOCOL)