# Some utilities

These functions live here because they are meant to be used by the main code and/or by several graph-specific modules.

In [1]:
import torch
import random

from torch_geometric.data import Data
from torch_geometric.utils import subgraph
import torch_geometric.utils as utils

import networkx as nx
from networkx.generators import random_graphs

import time
import calendar

import os

import pandas as pd
from openpyxl import load_workbook

import sys

import import_ipynb

import ogbl_citation2

importing Jupyter notebook from ogbl_citation2.ipynb




importing Jupyter notebook from my_utils.ipynb


In [2]:
def get_path_of_reduced_graph(dir,module_name,n_nodes):
    """Build the path under which a reduced graph is stored.

    The file name is "<graph name>_reduced_<n_nodes>", where the graph name
    comes from calling get_graph_name() on the module registered in
    sys.modules under module_name (a KeyError is raised if that module has
    not been imported).  The file name is then joined to dir.
    """
    graph_module = sys.modules[module_name]
    file_name = f"{graph_module.get_graph_name()}_reduced_{n_nodes}"
    return os.path.join(dir, file_name)

def get_most_frequent(my_tensor,n_max):
    """Return the n_max most frequent items of a tensor.

    my_tensor is handed to torch.bincount, so it has to be a 1-D tensor of
    non-negative integers.  The values are ranked by decreasing number of
    occurrences and the first n_max of them are returned.  Values that never
    occur (count 0) are part of the ranking too, so they can show up when
    n_max exceeds the number of distinct values present.
    """
    counts = torch.bincount(my_tensor)
    # Position i of the bincount is the value i, so sorting the counts and
    # keeping the indices yields the values ordered by frequency.
    ranking = torch.argsort(counts, descending=True)
    return ranking[:n_max]

def reduce_graph(graph,n_nodes,dense=True,reverse=False,clip=False,relabel_nodes=False):
    """Return a node mask and a reduced version of graph with only n_nodes nodes.

    Args:
        graph: graph object providing edge_index, num_nodes and x.
        n_nodes: number of nodes to keep.
        dense: if True, the nodes with the highest in-degree (the most
            frequent targets in edge_index[1]) are kept; otherwise the nodes
            are sampled at random without replacement.
        reverse: if True, the edges of the reduced graph are reversed.
        clip: if True, only the features of the kept nodes are stored in the
            reduced graph; otherwise the whole graph.x is stored.
        relabel_nodes: if True, node ids in the returned edges are relabeled
            from 0 to n_nodes-1.

    Returns:
        A tuple (mask, data). mask is a boolean tensor of length
        graph.num_nodes that is True for the kept nodes; data is a Data
        object holding the selected features and edges. graph.y is not
        copied into data.
    """
    if dense:
        # selects the nodes with a higher in-degree
        nodes = get_most_frequent(graph.edge_index[1],n_nodes)
    else:
        # selects nodes randomly
        nodes = random.sample(range(graph.num_nodes), n_nodes)

    # Boolean mask filled with a single vectorized assignment: indexing with
    # uint8 masks is deprecated in PyTorch, and a per-node Python loop is slow
    # on large graphs.
    mask_n = torch.zeros(graph.num_nodes, dtype=torch.bool)
    node_ids = torch.as_tensor(nodes, dtype=torch.long, device=mask_n.device)
    mask_n[node_ids] = True

    # The mask (rather than the list of ids) is given to subgraph so that:
    # - the number of nodes is graph.num_nodes instead of being inferred from
    #   edge_index, which matters when the highest node ids have no edges;
    # - relabeled ids follow ascending node id, i.e. the same order as the
    #   rows of graph.x[mask_n] used when clip is set.
    edge_index = subgraph(subset=mask_n.to(graph.edge_index.device),
                          edge_index=graph.edge_index,
                          relabel_nodes=relabel_nodes)[0]
    if reverse:
        # swaps the source and target rows
        edge_index = edge_index.flip(0)

    x = graph.x[mask_n] if clip else graph.x

    return mask_n, Data(x=x,edge_index=edge_index)

def write_line_to_xlsx(dir,output):
    """Append one result row to the "tmp_results.xlsx" workbook located in dir.

    For each experiment, a line is written to a local Excel file that contains
    the main figures: type of architecture, number of nodes, hyper-parameters,
    training time, accuracy, hits@k, MRR...

    Args:
        dir: directory that contains "tmp_results.xlsx"; the workbook must
            already exist.
        output: mapping whose values, in iteration order, become the cells
            that follow the timestamp.

    The row is appended to the first worksheet and the workbook is saved back
    to the same path.
    """
    # Current UTC time, formatted directly with the time module (the datetime
    # class is not imported in this file).
    time_stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
    record = tuple([time_stamp] + list(output.values()))

    # os.path.join works whether or not dir ends with a path separator.
    path = os.path.join(dir, "tmp_results.xlsx")

    wb = load_workbook(path)
    # Select First Worksheet
    ws = wb.worksheets[0]
    # Append Row Values
    ws.append(record)

    wb.save(path)

def print_time(topic,get_times,start_time):
    """Print the seconds elapsed since start_time, labelled with topic.

    Nothing is printed when get_times is falsy.  start_time is expected to be
    a value previously obtained from time.time().
    """
    if not get_times:
        return
    elapsed = time.time() - start_time
    print(f"    *** {topic} time: {elapsed}s")

### Other types of graphs

Instead of the graph loaded from data, we can also load
- A Barabási-Albert graph
- An Erdős–Rényi randomly generated graph
- An egocentric network (based on the loaded data)
- etc.

The idea is that these graphs have (if possible) the same number of nodes, and a similar number of edges.

In [3]:
# TO-DO: this is repeated in the ogbl-citation2 module, since it is also needed there.
# We should move it to a utils module if possible
def get_random_features(n_nodes,num_node_features):
    """Return a tensor of random node features with values between -1 and 1.

    The tensor has shape (n_nodes, num_node_features).  Values are drawn with
    torch.rand (unit interval), moved to the module-level device, and then
    shifted and scaled so that the unit interval maps onto [-1,1].
    """
    uniform = torch.rand(n_nodes, num_node_features).to(device)
    # centre around zero, then stretch to a width of 2
    return (uniform - 0.5) * 2

def get_dummy_features(n_nodes,num_node_features=3):
    """Return an all-ones feature tensor of shape (n_nodes, num_node_features).

    The tensor is moved to the module-level device before being returned.
    """
    all_ones = torch.ones(n_nodes, num_node_features)
    return all_ones.to(device)

# We could use directly the PyG function instead of NetworkX
def get_barabasi_albert_graph(n_nodes,n_edges,num_node_features):
    """Build a Barabási-Albert graph with random node features.

    The graph is generated with NetworkX, converted with from_networkx and
    moved to the module-level device.  The second argument given to the
    generator is n_edges/n_nodes/2, rounded to the nearest integer.  Node
    features come from get_random_features and are stored in graph.x.
    """
    features = get_random_features(n_nodes, num_node_features)

    edges_per_new_node = round(n_edges / n_nodes / 2)
    nx_graph = random_graphs.barabasi_albert_graph(n_nodes, edges_per_new_node)

    pyg_graph = utils.convert.from_networkx(nx_graph).to(device)
    pyg_graph.x = features
    return pyg_graph

def get_erdos_renyi_graph(n_nodes,n_edges,num_node_features):
    """Build a randomly generated G(n, p) graph with random node features.

    The edge probability is n_edges / n_nodes**2.  The graph is generated with
    NetworkX, converted with from_networkx and moved to the module-level
    device.  Node features come from get_random_features and are stored in
    graph.x.
    """
    features = get_random_features(n_nodes, num_node_features)

    edge_probability = n_edges / (n_nodes * n_nodes)
    # According to the documentation, fast_gnp_random_graph is faster than
    # erdos_renyi_graph for sparse graphs
    nx_graph = random_graphs.fast_gnp_random_graph(n_nodes, edge_probability)

    pyg_graph = utils.convert.from_networkx(nx_graph).to(device)
    pyg_graph.x = features
    return pyg_graph

### Building data for training / test

We are splitting nodes into **training** and **test**.

Due to the nature of the problem, the training graph has a subset of nodes and the corresponding edges. Moreover, edges are **reversed** in order to be able to build proper **computation graphs**.

In [4]:
def get_training_graph(graph,train_ratio,device):
    """Split the nodes of graph into training and test nodes.

    Args:
        graph: full graph; it is passed to reduce_graph and its num_nodes
            attribute is read here.
        train_ratio: fraction of the nodes used for training; the number of
            training nodes is round(graph.num_nodes * train_ratio).
        device: device on which the returned node-id tensors are created.

    Returns:
        A tuple (train_graph, train_nodes, test_nodes). train_graph is the
        graph returned by reduce_graph for a random subset of nodes, with
        reversed edges, all node features kept and node ids not relabeled.
        train_nodes and test_nodes are the ids of the nodes inside and
        outside that subset, respectively.
    """
    # Computes the graph with only a percentage of nodes for training
    train_mask, train_graph = reduce_graph(graph=graph,
                                           n_nodes=round(graph.num_nodes*train_ratio),
                                           dense=False,
                                           reverse=True,
                                           clip=False,
                                           relabel_nodes=False)

    # arange creates the ids directly on the target device, without going
    # through a Python range of graph.num_nodes elements.
    nodes = torch.arange(graph.num_nodes, device=device)

    # Boolean mask on the same device as the ids: indexing with uint8 masks
    # is deprecated in PyTorch.
    train_mask = train_mask.to(device=nodes.device, dtype=torch.bool)

    train_nodes = nodes[train_mask]
    test_nodes = nodes[torch.logical_not(train_mask)]

    return train_graph, train_nodes, test_nodes