# The ogbl-citation2 graph

## Dataset

This is the original description of the dataset. Clearly, **our goal is to set up a completely different prediction task**.

**Graph**: The ogbl-citation2 dataset is a directed graph, representing the citation network between a subset of papers extracted from MAG. Each node is a paper with **128-dimensional word2vec features** that summarize its title and abstract, and each directed edge indicates that **one paper cites another**. All nodes also come with meta-information indicating the year the corresponding paper was published.

**Prediction task**: The task is to predict missing citations given existing citations. Specifically, for each source paper, two of its references are randomly dropped, and we would like the model to rank the missing two references higher than 1,000 negative reference candidates. The negative references are randomly-sampled from all the previous papers that are not referenced by the source paper. The evaluation metric is Mean Reciprocal Rank (MRR), where the reciprocal rank of the true reference among the negative candidates is calculated for each source paper, and then the average is taken over all source papers.

**Dataset splitting**: We split the edges according to time, in order to simulate a realistic application in citation recommendation (e.g., a user is writing a new paper and has already cited several existing papers, but wants to be recommended additional references). To this end, we use the most recent papers (those published in 2019) as the source papers for which we want to recommend the references. For each source paper, we drop two papers from its references—the resulting two dropped edges (pointing from the source paper to the dropped papers) are used respectively for validation and testing. All the rest of the edges are used for training.

In [1]:
import torch
import ogb
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator

import random

from torch_geometric.data import Data
from torch_geometric.utils import subgraph
import torch_geometric.utils as utils

import networkx as nx
from networkx.generators import random_graphs

import pandas as pd

import import_ipynb

import os



In [2]:
import my_utils

importing Jupyter notebook from my_utils.ipynb
importing Jupyter notebook from ogbl_citation2.ipynb


In [9]:
def get_graph_name():
    """Return the current value of the module-level ``graph_name`` variable."""
    current_name = graph_name
    return current_name

# TO-DO: compute it from graph_name by replacing '-' with '_'
def get_module_name():
    """Return the current value of the module-level ``module_name`` variable."""
    current_module = module_name
    return current_module

def get_n_nodes():
    """Return the current value of the module-level ``n_nodes`` variable."""
    node_count = n_nodes
    return node_count

### Installation and Exploration

In [4]:
################################################################################
################################################################################
##### EXECUTE ONLY IF THE ORIGINAL GRAPH HAS TO BE DOWNLOADED (it takes time) ##
def download_graph(graph_name):
    """Build the OGB link-prediction dataset called ``graph_name`` and return its first graph.

    Before returning, prints the node count, the edge count, the edges-per-node
    ratio and the length of the first node's feature vector.

    Args:
        graph_name: dataset name forwarded to ``PygLinkPropPredDataset``.

    Returns:
        The element at index 0 of the dataset.
    """
    graph_0 = PygLinkPropPredDataset(name=graph_name)[0]

    size_summary = f'{graph_name} has {graph_0.num_nodes} nodes and {graph_0.num_edges} edges, with an average (incoming) node degree of {graph_0.num_edges / graph_0.num_nodes}'
    feature_summary = f'Each node has {len(graph_0.x[0])} features'
    print(size_summary)
    print(feature_summary)

    # The graph is too big to be drawn (e.g. via networkx), so no plot is produced here.
    return graph_0


################################################################################
################################################################################
##### EXECUTE ONLY IF THE ORIGINAL GRAPH HAS TO BE REDUCED #####################
def compute_reduced_graph(dir,num_nodes_to_sample):
    """Download the full graph, reduce it, and save the reduced graph to disk.

    The reduction is delegated to ``my_utils.reduce_graph``; the output location
    is the one returned by ``my_utils.get_path_of_reduced_graph`` for the given
    directory, this module's name and the requested node count.

    Args:
        dir: directory argument forwarded to ``my_utils.get_path_of_reduced_graph``.
        num_nodes_to_sample: node count requested from ``my_utils.reduce_graph``
            (also used to build the output path).
    """
    graph0 = download_graph(get_graph_name())
    # The first returned value (a mask) is not needed here.
    _mask, graph = my_utils.reduce_graph(
                            graph=graph0,
                            n_nodes=num_nodes_to_sample,
                            dense=True,
                            reverse=False,
                            clip=True,
                            relabel_nodes=True)

    print(f'The reduced version of {get_graph_name()} has {graph.num_nodes} nodes and {graph.num_edges} edges, with an average (incoming) node degree of {graph.num_edges / graph.num_nodes}')
    print(f'Each node has {len(graph.x[0])} features')

    file = my_utils.get_path_of_reduced_graph(dir,get_module_name(),num_nodes_to_sample)
    # The graph is being written here, so the message must say "saving", not "loading".
    print(f"saving to {file}")
    torch.save(graph, file)

def get_reduced_graph(dir, sampled_nodes, node_features='original'):
    """Load a previously saved reduced graph, optionally replacing its node features.

    Args:
        dir: directory argument forwarded to ``my_utils.get_path_of_reduced_graph``.
        sampled_nodes: node count used to build the path of the saved graph.
        node_features: ``'original'`` keeps the stored features; ``'stats'``
            replaces them with five structural values per node (degree,
            pagerank, betweenness centrality, clustering coefficient and a
            constant 1.0); ``'dummy'`` replaces them with three constant ones.
            Any other value behaves like ``'original'``.

    Returns:
        The loaded graph, or a new ``Data`` object holding the replacement
        features and the loaded ``edge_index``.
    """
    file = my_utils.get_path_of_reduced_graph(dir,get_module_name(),sampled_nodes)
    print(f"loading from {file}")
    # NOTE(review): recent PyTorch releases may default torch.load to
    # weights_only=True, which could refuse to unpickle a full graph object -- confirm.
    graph = torch.load(file) #, map_location=torch.device(device)).to(device)

    # Optionally, we can use newly-computed node features instead of the original 128-dimensional vector
    # This choice leads to a clear information LEAKAGE: in some sense, features for test nodes would
    # inherit information about the graph connectivity (that is, we are giving information that we are supposed to predict)
    if node_features == 'stats':
        print("Taking newly-generated node features instead of the original ones")
        G = utils.convert.to_networkx(graph, to_undirected=False)
        # compute node stats using existing algorithms
        pagerank = nx.algorithms.link_analysis.pagerank_alg.pagerank(G)
        clustering_coef = nx.algorithms.cluster.clustering(G)
        # Betweenness is approximated from at most 50 sampled nodes; the sample
        # size is capped so that graphs with fewer than 50 nodes do not fail.
        betweenness_centrality = nx.betweenness_centrality(G, k=min(50, G.number_of_nodes()))
        degree = G.degree()
        # create initial node features from that
        aug_emb = torch.ones(graph.num_nodes, 5, dtype=torch.float64) # .to(device)
        for i in range(graph.num_nodes):
            aug_emb[i][0] = degree[i]
            aug_emb[i][1] = pagerank[i]
            aug_emb[i][2] = betweenness_centrality[i]
            # Previously this slot repeated pagerank and the clustering
            # coefficient was computed but never used.
            aug_emb[i][3] = clustering_coef[i]
            aug_emb[i][4] = 1.0
        # Convert to float32 once, after all rows have been filled.
        aug_emb = aug_emb.float()
        graph = Data(x=aug_emb, edge_index=graph.edge_index)

    elif node_features == 'dummy': # TO-DO: to run tests with this choice gives some error; double-check
        aug_emb = torch.ones(graph.num_nodes, 3, dtype=torch.float64) # .to(device)
        aug_emb = aug_emb.float()
        graph = Data(x=aug_emb, edge_index=graph.edge_index) # .to(device)

    return graph

In [5]:
# Generating other kinds of graphs starting from the citation graph

# Starting from the undirected version of the (big) citation graph, and one
# randomly-chosen node, builds a connected graph with ~n_nodes
# nodes, together with the original connections between them
#
# Needs the original graph 'graph_0' to have been previously loaded
def get_ego_network_un(graph_0, n_nodes):
    """Extract a connected ego network of roughly ``n_nodes`` nodes from ``graph_0``.

    Works on the undirected version of ``graph_0.edge_index``. A seed node is
    picked at random and its k-hop neighbourhood is grown (k = 1, 2, ...) until
    it holds at least ``2/3 * n_nodes`` nodes. If the result exceeds
    ``3/2 * n_nodes`` nodes, or the seed's connected component is exhausted
    before reaching the minimum, a new seed is drawn.

    Args:
        graph_0: graph exposing ``edge_index``, ``x`` and ``num_nodes``.
        n_nodes: target number of nodes.

    Returns:
        A ``Data`` object with relabelled ``edge_index`` and the matching rows of
        ``graph_0.x``.

    Raises:
        ValueError: if ``graph_0`` has fewer nodes than the minimum accepted size.

    Note:
        The search keeps retrying until a seed yields an acceptable size, so it
        may not terminate if no neighbourhood of the graph falls in the
        accepted range.
    """
    # First, the undirected version of the original graph is computed
    edge_index_un = utils.to_undirected(graph_0.edge_index)
    # Minimum and maximum number of nodes allowed, as ratios of the target
    min_ratio, max_ratio = 2/3, 3/2
    min_nodes = n_nodes * min_ratio
    max_nodes = n_nodes * max_ratio
    if graph_0.num_nodes < min_nodes:
        raise ValueError(
            f"graph has {graph_0.num_nodes} nodes, fewer than the minimum of {min_nodes} required")

    # Keep generating ego networks randomly until a reasonable size is obtained
    # (between min_ratio and max_ratio times the expected number of nodes)
    while True:
        # First gets a randomly-chosen node
        initial_node = random.randint(0,graph_0.num_nodes-1)
        print(f"Starting from node {initial_node}...")
        nodes = torch.tensor([initial_node])
        # Defined up front so a seed-only result still has a valid (empty) edge set
        edge_index = edge_index_un.new_empty((2, 0))
        k = 1
        # Generates k_hop subgraph until the node number is enough
        while len(nodes) < min_nodes:
            previous_size = len(nodes)
            nodes, edge_index, _mapping, _edge_mask = utils.k_hop_subgraph(
                [initial_node],
                k,
                edge_index_un,
                relabel_nodes=True,
                # explicit size so that isolated, high-index seeds are valid
                num_nodes=graph_0.num_nodes)
            print(f"   k: {k} -> {len(nodes)} nodes")
            if len(nodes) == previous_size:
                # One more hop added nothing: the seed's component is exhausted
                break
            k = k+1
        if min_nodes <= len(nodes) <= max_nodes:
            break

    # Relabelled node j corresponds to original node nodes[j], so indexing the
    # feature matrix with `nodes` keeps features aligned with edge_index
    x = graph_0.x[nodes]
    g = Data(x=x,edge_index=edge_index)
    print(f"Ego network generated with {g.num_nodes} nodes and {g.num_edges} edges")
    return g

def get_random_features(n_nodes,num_node_features,device):
    """Return an ``(n_nodes, num_node_features)`` tensor of random values on ``device``.

    Values are drawn with ``torch.rand`` (range [0, 1)) and then shifted and
    scaled by subtracting 0.5 and multiplying by 2.
    """
    uniform = torch.rand(n_nodes, num_node_features).to(device)
    centred = uniform - 0.5
    return centred * 2

# This takes a reduced graph with N*train_ratio nodes, and adds the remaining
# N*(1-train_ratio) nodes by using Barabási-Albert. The idea is that new nodes
# are the test nodes
def get_sampled_plus_barabasi_albert_graph(n_nodes,train_ratio,num_node_features,device,dir='.'):
    """Extend a saved reduced graph to ``n_nodes`` nodes with Barabási-Albert growth.

    The reduced graph with ``round(n_nodes * train_ratio)`` nodes is loaded and
    used as the initial graph of ``barabasi_albert_graph``, which adds the
    remaining nodes. Node features of the result are random (see
    ``get_random_features``), not the original ones.

    Args:
        n_nodes: total number of nodes of the resulting graph.
        train_ratio: fraction of ``n_nodes`` taken from the reduced graph.
        num_node_features: number of random features generated per node.
        device: device the resulting graph and features are moved to.
        dir: directory argument forwarded to ``get_reduced_graph``. This
            parameter is new: the previous call omitted the required directory
            and therefore always raised ``TypeError``.
            NOTE(review): the ``'.'`` default is an assumption -- confirm it
            matches where the reduced graphs are stored.

    Returns:
        The extended graph with ``x`` set to the random features.
    """
    n_train_nodes = round(n_nodes*train_ratio)

    graph0 = get_reduced_graph(dir, sampled_nodes=n_train_nodes, node_features='original')
    G0 = utils.convert.to_networkx(graph0)
    # the attachment count is meant to produce a similar number of edges as the dense subgraph
    edges_per_new_node = round(1.5*graph0.num_edges/graph0.num_nodes)
    G = random_graphs.barabasi_albert_graph(n_nodes,edges_per_new_node,initial_graph=G0)

    x = get_random_features(n_nodes,num_node_features,device)

    graph = utils.convert.from_networkx(G).to(device)
    graph.x = x

    return graph

In [6]:
def get_graph(dir,n_nodes,node_features_type):
    """Return the graph with ``n_nodes`` nodes, downloading or reducing as needed.

    If ``n_nodes`` is at least ``get_n_nodes()``, the full graph is downloaded
    and returned as is; ``node_features_type`` is not used in that case.
    Otherwise the reduced graph is loaded from disk, being computed and saved
    first when its file does not exist yet.

    Args:
        dir: directory argument forwarded to the reduced-graph helpers.
        n_nodes: requested number of nodes.
        node_features_type: forwarded to ``get_reduced_graph`` as its
            ``node_features`` argument.

    Returns:
        The full or the reduced graph.
    """
    if n_nodes >= get_n_nodes():
        # No need to reduce the graph
        return download_graph(get_graph_name())

    # Reduction of the graph is needed; build it only if it is not cached on disk
    path = my_utils.get_path_of_reduced_graph(dir,get_module_name(),n_nodes)
    if not os.path.exists(path):
        compute_reduced_graph(dir,n_nodes)
    return get_reduced_graph(dir,n_nodes,node_features_type)

In [7]:
# Dataset identifier, and the matching module name ('-' replaced with '_')
graph_name = 'ogbl-citation2'
module_name = graph_name.replace('-', '_')

# Value returned by get_n_nodes(); get_graph() skips the reduction when the
# requested node count reaches it
n_nodes = 2_927_963