# Build homogeneous dataset

## Setting up environment

---



### Loading libraries

In [1]:
import sys
import os

sys.path.insert(0, os.path.abspath(".."))

import pickle

from torch_geometric.data import Data
from box import Box

from util.postgres import create_sqlalchemy_engine
from util.homogeneous.dataset import DatasetEuCoHM, assert_bidirectional_edges

### Global variables

In [2]:
# -------------------- GLOBAL VARIABLES --------------------
PATH_TO_CONFIG_FILE = '../config.yaml'

# -------------------- LOAD CONFIGURATION --------------------
# Load the configuration file
config = Box.from_yaml(filename=PATH_TO_CONFIG_FILE)

num_train = 0.7             # Percentage of data used for training
num_bootstraping = 10
pg_engine = create_sqlalchemy_engine(
    username=config.POSTGRES.USERNAME,
    password=config.POSTGRES.PASSWORD,
    host=config.POSTGRES.HOST,
    port=config.POSTGRES.PORT,
    database=config.POSTGRES.DATABASE,
    schema=config.POSTGRES.SCHEMA
)

## Data preparation

---



In [3]:
def unit_testing(data):
    # Test: check that the number of elements in the positive edge index equals to the number of elements in the negative edge index
    assert data.test_pos_edge_index.numel() == data.test_neg_edge_index.numel()
    
    # Test: check that all positive edges are bidirectional
    assert_bidirectional_edges(edges=data.train_pos_edge_index)
    assert_bidirectional_edges(edges=data.test_pos_edge_index)
    print('All tests passed')

def save_dataset(dataset):
    dataset_save_filepath = f'../data/{dataset.get_dataset_name()}.pkl'
    # Before saving the dataset, we need to close the engine to connect to Postgres DB.
    dataset.close_engine()
    # Save the dataset
    with open(dataset_save_filepath, 'wb') as output:
        pickle.dump(dataset, output, pickle.HIGHEST_PROTOCOL)
        print(f'Dataset saved to {dataset_save_filepath}')

def build_dataset(use_periodical_embedding_decay: bool,
                  use_top_keywords: bool,
                  num_train: float, 
                  bootstrap_id: int):
    # Build the homogeneous graph
    data: Data
    author_node_id_map: dict
    author_id_map: dict
    dataset: DatasetEuCoHM = DatasetEuCoHM(
        pg_engine=pg_engine,
        num_train=num_train,
        use_periodical_embedding_decay=use_periodical_embedding_decay,
        use_top_keywords=use_top_keywords,
        bootstrap_id=bootstrap_id
    )
    data, author_node_id_map, author_id_map = dataset.build_homogeneous_graph()
    return dataset, data, author_node_id_map, author_id_map

In [4]:
# Uncomment the ones you need to rebuild
dataset_configurations = [
    {'use_periodical_embedding_decay': False, 'use_top_keywords': False, 'num_train': num_train}
]

for conf in dataset_configurations:
    print(f'Processing dataset configuration {conf}...')
    # Building dataset
    dataset, data, author_node_id_map, author_id_map = build_dataset(
        use_periodical_embedding_decay=conf['use_periodical_embedding_decay'],
        use_top_keywords=conf['use_top_keywords'],
        num_train=conf['num_train']
    )
    # Unit testing
    unit_testing(data)

    # Save dataset
    save_dataset(dataset)

Processing dataset configuration {'use_periodical_embedding_decay': False, 'use_top_keywords': False, 'num_train': 0.7}...


TypeError: build_dataset() missing 1 required positional argument: 'bootstrap_id'

In [None]:
# Uncomment the ones you need to rebuild
dataset_configurations = [
    {'use_periodical_embedding_decay': False, 'use_top_keywords': False, 'num_train': num_train},
    # {'use_periodical_embedding_decay': True, 'use_top_keywords': False, 'num_train': num_train},
    # {'use_periodical_embedding_decay': False, 'use_top_keywords': True, 'num_train': num_train},
    # {'use_periodical_embedding_decay': True, 'use_top_keywords': True, 'num_train': num_train},
    # {'use_periodical_embedding_decay': True, 'use_top_keywords': False, 'num_train': 1.0},
]

for conf in dataset_configurations:
    print(f'Processing dataset configuration {conf}...')
    # Building dataset
    for bootstrap_id in range(num_bootstraping):
        dataset, data, author_node_id_map, author_id_map = build_dataset(
            use_periodical_embedding_decay=conf['use_periodical_embedding_decay'],
            use_top_keywords=conf['use_top_keywords'],
            num_train=conf['num_train'],
            bootstrap_id=bootstrap_id
        )
        # Unit testing
        unit_testing(data)
    
        # Save dataset
        save_dataset(dataset)