#### Copyright (c) 2025 Graphcore Ltd. All rights reserved.

In [29]:
import pandas as pd
import numpy as np
import os.path as osp

from ogb.linkproppred import LinkPropPredDataset

## Load ogbl-wikikg2

In [30]:
# Folder handed to OGB as the dataset root; later cells also read mapping files
# from it and write the final arrays below it.
root = r"../data/"

# Load the ogbl-wikikg2 link-prediction dataset from that root.
dataset = LinkPropPredDataset(root=root, name="ogbl-wikikg2")

  self.graph = torch.load(pre_processed_file_path, 'rb')


In [31]:
# Retrieve the dataset's edge splits. The following cell iterates over
# `split_edge.values()` and reads the "head", "tail" and "relation" arrays of each split.
split_edge = dataset.get_edge_split()

  train = torch.load(osp.join(path, 'train.pt'))
  valid = torch.load(osp.join(path, 'valid.pt'))
  test = torch.load(osp.join(path, 'test.pt'))


In [32]:
# Concatenate the triples of every split into single flat arrays.
all_splits = list(split_edge.values())

heads = np.concatenate([part["head"] for part in all_splits])
tails = np.concatenate([part["tail"] for part in all_splits])
relations = np.concatenate([part["relation"] for part in all_splits])

# Shape (2, num_edges): row 0 holds the head of each edge, row 1 its tail.
edge_ids = np.column_stack((heads, tails)).T

## Get node and relation labels / descriptions from WikiKG90Mv2

In [28]:
# download wikiKG90M text data

!wget http://snap.stanford.edu/ogb/data/lsc/mapping/wikikg90mv2_mapping.zip -P {root}
!unzip ../data/wikikg90mv2_mapping.zip -d {root}

--2025-10-07 10:25:31--  http://snap.stanford.edu/ogb/data/lsc/mapping/wikikg90mv2_mapping.zip
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2536451815 (2.4G) [application/zip]
Saving to: ‘../data_bis/wikikg90mv2_mapping.zip’

wikikg90mv2_mapping   6%[>                   ] 148.83M  15.8MB/s    eta 2m 49s ^C
unzip:  cannot find or open ../data/wikikg90mv2_mapping.zip, ../data/wikikg90mv2_mapping.zip.zip or ../data/wikikg90mv2_mapping.zip.ZIP.


In [33]:
# determine mapping for ogbl-wikikg2
#
# Each ogbl-wikikg2 node / relation type is matched to its row in the WikiKG90Mv2 text
# tables through the shared identifier column, giving access to "title" and "desc".

ogbl_mapping_dir = osp.join(root, "ogbl_wikikg2", "mapping")
wikikg90_mapping_dir = osp.join(root, "wikikg90mv2_mapping")

df_nodes2 = pd.read_csv(osp.join(ogbl_mapping_dir, "nodeidx2entityid.csv.gz"))
df_relations2 = pd.read_csv(osp.join(ogbl_mapping_dir, "reltype2relid.csv.gz"))
df_nodes90 = pd.read_csv(osp.join(wikikg90_mapping_dir, "entity.csv"))
df_relations90 = pd.read_csv(osp.join(wikikg90_mapping_dir, "relation.csv"))

# how='left' keeps one output row per ogbl-wikikg2 row, in the original order, only as
# long as every key matches at most one row of the text table. The later cells use the
# row position of these frames as node / relation index, so a duplicated match would
# silently shift every following index. validate='many_to_one' makes pandas raise a
# MergeError instead if the text table contains a key more than once.
df_node2_label = pd.merge(
    df_nodes2,
    df_nodes90,
    left_on='entity id',
    right_on='entity',
    how='left',
    validate='many_to_one',
)
df_relations2_label = pd.merge(
    df_relations2,
    df_relations90,
    left_on='rel id',
    right_on='relation',
    how='left',
    validate='many_to_one',
)

### Filter out nodes without label or description

In [34]:
# Flag (1) every node row whose title / description is not a string, i.e. is missing
# after the left merge; rows with a usable text get 0.
node_titles = df_node2_label["title"].values
node_descs = df_node2_label["desc"].values

rm_label = np.array([int(not isinstance(title, str)) for title in node_titles])
rm_desc = np.array([int(not isinstance(desc, str)) for desc in node_descs])

# Row positions of the nodes that lack a title, a description, or both.
no_label_or_desription = np.flatnonzero(rm_label | rm_desc)

In [35]:
# One candidate node index per row of the node label table.
subgraph_nodes = np.arange(len(df_node2_label))

# Keep only the indices that were not flagged as lacking a title or description.
keep_node = np.ones(subgraph_nodes.shape, dtype=bool)
keep_node[no_label_or_desription] = False
filtered_subgraph_nodes = subgraph_nodes[keep_node]

### Filter out relations without label

In [36]:
# Flag (1) every relation row whose title is not a string, i.e. is missing after the
# left merge; rows with a usable title get 0.
relation_titles = df_relations2_label["title"].values
rm_rel = np.array([int(not isinstance(title, str)) for title in relation_titles])

# Row positions of the relations without a label.
no_rel_label = np.flatnonzero(rm_rel)

In [37]:
# Relation type ids that occur in at least one edge (np.unique returns them sorted).
subgraph_relations = np.unique(relations)

# Drop the relation types that have no label. `no_rel_label` holds row positions of the
# relation label table, and the later cells index that table by relation type id, so
# its entries are relation type ids. They are therefore removed by VALUE: deleting by
# position (np.delete) is only equivalent when every relation type occurs in the edges
# and would otherwise drop the wrong relation types.
filtered_subgraph_relations = np.setdiff1d(subgraph_relations, no_rel_label)

## Build subgraph

In [43]:
# final filtered edges, labels, descriptions

# An edge is kept only if its head, its tail and its relation type all survived the
# filtering above. np.isin is used because np.in1d is deprecated since NumPy 2.0; for
# these 1-D inputs both return the same boolean mask.
keep_edge = (
    np.isin(edge_ids[0, :], filtered_subgraph_nodes)
    & np.isin(edge_ids[1, :], filtered_subgraph_nodes)
    & np.isin(relations, filtered_subgraph_relations)
)
edge_idx = np.nonzero(keep_edge)[0]

# Text and identifiers of the kept nodes / relation types. The index arrays hold row
# positions of the label tables, so the selection is made explicitly positional (.iloc).
node_labels = df_node2_label["title"].iloc[filtered_subgraph_nodes].values
node_descriptions = df_node2_label["desc"].iloc[filtered_subgraph_nodes].values
node_ids = df_node2_label["entity"].iloc[filtered_subgraph_nodes].values
relation_labels = df_relations2_label["title"].iloc[filtered_subgraph_relations].values
relation_ids = df_relations2_label["relation"].iloc[filtered_subgraph_relations].values

In [39]:
# Restrict the edge list and the relation types to the kept edges.
filtered_edge_ids = edge_ids[:, edge_idx]
filtered_relation_types = relations[edge_idx]

# Lookup table from original node index to compact subgraph node index.
# It is filled with -1 rather than 0 so that an endpoint that was filtered out cannot
# be silently mistaken for subgraph node 0.
global_id_to_subgraph_id = np.full(len(subgraph_nodes), -1, dtype=np.int64)
global_id_to_subgraph_id[filtered_subgraph_nodes] = np.arange(len(filtered_subgraph_nodes))

# Re-express both endpoints of every kept edge in subgraph node indices.
filtered_edge_ids = global_id_to_subgraph_id[filtered_edge_ids]
assert (filtered_edge_ids >= 0).all(), "an edge endpoint refers to a node that was filtered out"

In [40]:
# Lookup table from original relation type id to compact subgraph relation id.
# It is sized by the largest id (not by the number of distinct ids) so that ids with
# gaps still index in range, and filled with -1 so that a filtered-out relation type
# cannot be silently mistaken for subgraph relation 0.
num_relation_ids = int(subgraph_relations.max()) + 1 if subgraph_relations.size else 0
rel_global_id_to_subgraph_id = np.full(num_relation_ids, -1, dtype=np.int64)
rel_global_id_to_subgraph_id[filtered_subgraph_relations] = np.arange(len(filtered_subgraph_relations))

# Remap from the original ids in `relations` instead of from `filtered_relation_types`:
# the result is the same on the first run, but re-running this cell no longer applies
# the remapping a second time to already remapped ids.
filtered_relation_types = rel_global_id_to_subgraph_id[relations[edge_idx]]
assert (filtered_relation_types >= 0).all(), "an edge uses a relation type that was filtered out"

### Sanity check

In [46]:
# Print a few kept triples as "head (description; id) - relation (id) - tail (description; id)"
# so the remapped indices can be checked by eye against the label arrays.
# A seeded generator makes the printed sample reproducible across runs; replace=False
# avoids printing the same edge twice, and the sample size is capped by the edge count.
sanity_rng = np.random.default_rng(0)
num_filtered_edges = filtered_edge_ids.shape[1]

for idx in sanity_rng.choice(num_filtered_edges, size=min(10, num_filtered_edges), replace=False):
    head, tail = filtered_edge_ids[:, idx]
    rel = filtered_relation_types[idx]
    print(f"{node_labels[head]} ({node_descriptions[head]}; {node_ids[head]}) - "
          f"{relation_labels[rel]} ({relation_ids[rel]}) - "
          f"{node_labels[tail]} ({node_descriptions[tail]}; {node_ids[tail]})")


Svetlana Kharitonova (Soviet actress; Q4495926) - occupation (P106) - actor (person who acts in a dramatic or comic production and works in film, television, theatre, or radio; Q33999)
Konstantina Mpornivelli (Greek sportsperson; Q12879615) - ethnic group (P172) - Greeks (ethnic group native to Greece, Cyprus, Albania, Italy, Turkey, Egypt and, to a lesser extent, other countries surrounding the Mediterranean Sea; Q539051)
Bassiano (Italian comune; Q128057) - located in time zone (P421) - UTC+01:00 (identifier for a time offset from UTC of +1; Q6655)
Gor Malakyan (Armenian association football player; Q4276527) - place of birth (P19) - Yerevan (capital of Armenia; Q1953)
Giuseppe Buzzanca (Italian politician; Q3770186) - sex or gender (P21) - male (to be used in "sex or gender" (P21) to indicate that the human subject is a male; Q6581097)
Richard Rosenfeld (American sociologist; Q15436212) - given name (P735) - Richard (male given name; Q1249148)
Sir Andrew Clark, 1st Baronet (British 

## Save final data

In [47]:
# Write every final array next to the dataset's own files, one .npy file per array.
output_dir = osp.join(root, "ogbl_wikikg2")

arrays_to_save = {
    "edge_ids.npy": filtered_edge_ids,
    "relation_types.npy": filtered_relation_types,
    "node_labels.npy": node_labels,
    "node_ids.npy": node_ids,
    "node_descriptions.npy": node_descriptions,
    "relation_labels.npy": relation_labels,
    "relation_ids.npy": relation_ids,
}

for file_name, array in arrays_to_save.items():
    np.save(osp.join(output_dir, file_name), array)