In [7]:
from pathlib import Path
import random

import numpy as np
import scipy.sparse
import dgl
import torch as th

In [8]:
# Directory holding the preprocessed artifacts read below.
load_path = Path('data/lastfm/magnn')

# Dense adjacency matrix (indexed with `type_mask` downstream), the per-node
# type codes, and the table of (user, artist) pairs.
adjM = scipy.sparse.load_npz(load_path / 'adjM.npz').toarray()
type_mask = np.load(load_path / 'node_types.npy')
user_artist = np.load(load_path / 'user_artist.npy')

# Split indices; downstream they are used to select rows of `user_artist`.
train_val_test_idx = np.load(load_path / 'train_val_test_idx.npz')
train_idx, val_idx, test_idx = (
    train_val_test_idx[split_key]
    for split_key in ('train_idx', 'val_idx', 'test_idx')
)
# Sorted union of the training and validation indices.
train_val_idx = np.sort(np.concatenate((train_idx, val_idx)))

# Node type names mapped to the integer codes stored in `type_mask`.
ntype_ids = {'user': 0, 'artist': 1, 'tag': 2}
ntypes = list(ntype_ids)
# Number of nodes of each type, counted from the type codes.
num_nodes_dict = {name: (type_mask == code).sum() for name, code in ntype_ids.items()}

In [9]:
# Build g_train, g_val, g_test.
# The three graphs share every relation except user-artist, whose edges are
# limited to the pairs selected for each graph:
#   g_train -> user_artist[train_idx]
#   g_val   -> user_artist[train_val_idx]
#   g_test  -> every row of user_artist


def _typed_block(src_ntype, dst_ntype):
    """Return the nonzero (row, col) positions of adjM between two node types.

    Rows are restricted to nodes of `src_ntype` and columns to nodes of
    `dst_ntype`, so the returned positions are relative to each selection.
    """
    src_mask = type_mask == ntype_ids[src_ntype]
    dst_mask = type_mask == ntype_ids[dst_ntype]
    return adjM[src_mask][:, dst_mask].nonzero()


# Relations that are identical in all three graphs: slice the dense adjacency
# once instead of repeating the same extraction for every graph.
_static_edges = {
    ('user', 'user-user', 'user'): _typed_block('user', 'user'),
    ('artist', 'artist-tag', 'tag'): _typed_block('artist', 'tag'),
    ('tag', 'tag-artist', 'artist'): _typed_block('tag', 'artist'),
}


def _build_data_dict(pairs):
    """Assemble the edge dict for dgl.heterograph with `pairs` as user-artist edges.

    pairs: 2-D array; column 0 is used as the user endpoint and column 1 as the
        artist endpoint. The relation is added in both directions.
    Returns a dict keyed by (src_type, edge_type, dst_type).
    """
    users, artists = pairs[:, 0], pairs[:, 1]
    uu_key = ('user', 'user-user', 'user')
    at_key = ('artist', 'artist-tag', 'tag')
    ta_key = ('tag', 'tag-artist', 'artist')
    # Key order is kept the same for every graph.
    return {
        uu_key: _static_edges[uu_key],
        ('user', 'user-artist', 'artist'): (users, artists),
        ('artist', 'artist-user', 'user'): (artists, users),
        at_key: _static_edges[at_key],
        ta_key: _static_edges[ta_key],
    }


data_dict_train = _build_data_dict(user_artist[train_idx])
data_dict_val = _build_data_dict(user_artist[train_val_idx])
data_dict_test = _build_data_dict(user_artist)

g_train = dgl.heterograph(data_dict_train, num_nodes_dict, idtype=th.int64)
g_val = dgl.heterograph(data_dict_val, num_nodes_dict, idtype=th.int64)
g_test = dgl.heterograph(data_dict_test, num_nodes_dict, idtype=th.int64)

In [10]:
# Sample hard negatives.
# For a positive pair (u, a), sample (u, a') as a negative pair, where a' is
# reachable from a via the A-U-A metapath (i.e., a 2-hop neighbor) and (u, a')
# is not a user-artist edge of the graph used for that split.
# NOTE(review): validation negatives are filtered against g_val only, so a
# sampled pair may still be a user-artist edge that exists only in g_test.

# Fixed seed for a dedicated RNG: the saved negatives are reproducible across
# kernel restarts and the global `random` state is left untouched.
NEG_SAMPLING_SEED = 0


def _sample_hard_negatives(g, g_AUA, pos_pairs, rng):
    """Draw one hard negative artist for every positive (user, artist) pair.

    g: heterograph whose 'user-artist' edges define the artists already linked
        to each user.
    g_AUA: artist-to-artist graph produced from `g` by the A-U-A metapath.
    pos_pairs: iterable of (user, artist) pairs.
    rng: random.Random instance used for the draw.

    Returns an array with one [user, negative_artist] row per input pair.
    Raises ValueError when a pair has no eligible negative (every A-U-A
    neighbor of the artist is already linked to the user).
    """
    neg_pairs = []
    for u, a in pos_pairs:
        two_hop_artists = set(g_AUA.out_edges(a)[1].tolist())
        linked_artists = set(g.out_edges(u, etype='user-artist')[1].tolist())
        # Sorted so the draw depends only on the seed, not on set iteration order.
        candidates = sorted(two_hop_artists - linked_artists)
        if not candidates:
            raise ValueError(
                f'no hard negative available for pair (user={u}, artist={a}): '
                'every A-U-A neighbor of the artist is already linked to the user'
            )
        neg_pairs.append([u, rng.choice(candidates)])
    return np.array(neg_pairs)


_neg_rng = random.Random(NEG_SAMPLING_SEED)

# validation
g_val_AUA = dgl.metapath_reachable_graph(g_val, ['artist-user', 'user-artist'])
val_neg_user_artist = _sample_hard_negatives(g_val, g_val_AUA, user_artist[val_idx], _neg_rng)

# testing
g_test_AUA = dgl.metapath_reachable_graph(g_test, ['artist-user', 'user-artist'])
test_neg_user_artist = _sample_hard_negatives(g_test, g_test_AUA, user_artist[test_idx], _neg_rng)

In [11]:
# Write the three graphs (one file) and both hard-negative arrays into the
# parent directory of `load_path`.
save_path = load_path.parent

# Stored in the order [g_train, g_val, g_test].
dgl.save_graphs(str(save_path.joinpath('graph.bin')), [g_train, g_val, g_test])

np.save(save_path.joinpath('val_neg_user_artist.npy'), val_neg_user_artist)
np.save(save_path.joinpath('test_neg_user_artist.npy'), test_neg_user_artist)