In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import dgl
from dgl import DGLGraph

# Select PyTorch as the tensor backend DGL should use; this runs right after
# `import dgl` and before any graph or model is created in this notebook.
dgl.load_backend('pytorch')

In [None]:
import numpy as np
from scipy import sparse as spsp

# GNN models

In [None]:
from dgl.nn.pytorch import conv as dgl_conv

class GraphSAGEModel(nn.Module):
    """Stack of DGL ``SAGEConv`` layers applied one after another.

    The stack always contains an input layer and an output layer, with
    ``n_layers - 1`` hidden layers in between.

    Parameters
    ----------
    in_feats : int
        Input size of the first layer.
    n_hidden : int
        Output size of the input layer and of every hidden layer.
    out_dim : int
        Output size of the last layer.
    n_layers : int
        One more than the number of hidden layers.
    activation : callable or None
        Passed as ``activation`` to every layer except the last one.
    dropout : float
        Passed as ``feat_drop`` to every layer.
    aggregator_type : str
        Aggregator name forwarded unchanged to ``SAGEConv``.
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super().__init__()
        # Consecutive entries are the (input, output) sizes of one layer:
        # in_feats -> n_hidden [-> n_hidden ...] -> out_dim.
        sizes = [in_feats, n_hidden] + [n_hidden] * (n_layers - 1) + [out_dim]
        last_idx = len(sizes) - 2

        self.layers = nn.ModuleList()
        for idx, (size_in, size_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            # The output layer gets no activation; all other layers do.
            layer_activation = None if idx == last_idx else activation
            self.layers.append(dgl_conv.SAGEConv(size_in, size_out, aggregator_type,
                                                 feat_drop=dropout,
                                                 activation=layer_activation))

    def forward(self, g, features):
        """Feed ``features`` through every layer on graph ``g`` and return the result."""
        hidden = features
        for conv in self.layers:
            hidden = conv(g, hidden)
        return hidden

# Dataset

In [None]:
import stanfordnlp

# Download the English StanfordNLP resources. No visible cell calls stanfordnlp
# directly — presumably the MovieLens loader imported below needs them
# (NOTE(review): confirm against movielens.py).
stanfordnlp.download('en')

In [None]:
from movielens import MovieLens
# Build the dataset object used by the following cells, which read
# `data.ratings` and `data.movie_data`. The '.' argument is presumably the
# directory holding the MovieLens files — confirm against movielens.py.
data = MovieLens('.')

In [None]:
# Build the user x movie interaction matrix: one entry of 1.0 per rating row.
ratings = data.ratings
user_id = np.array(ratings['user_idx'])
movie_id = np.array(ratings['movie_idx'])
print('#user-movie:', len(movie_id))
interaction_values = np.ones((len(user_id),))
spm = spsp.coo_matrix((interaction_values, (user_id, movie_id)))
print(spm.shape)

In [None]:
# Sub-sample the ratings: a rating is kept with a probability that decreases
# with the share of all ratings its movie receives.
SUBSAMPLE_THRESHOLD = 1e-6  # hyperparameter chosen for this dataset

num_users = len(np.unique(user_id))
# Remember the full matrix shape so the sub-sampled matrix keeps the same
# number of rows/columns even if the highest-id user or movie is dropped.
full_shape = spm.shape
spm_t = spm.transpose()
# Number of ratings per movie (row sums of the transposed matrix). The vector
# length comes from the matrix itself so it always matches.
movie_deg = spm_t.dot(np.ones((full_shape[0],)))
movie_ratio = movie_deg / np.sum(movie_deg)
# Equivalent to min(1, sqrt(threshold / ratio)). Movies with no rating divide
# by zero and end up with probability 1, but they are never indexed below.
with np.errstate(divide='ignore'):
    movie_sample_prob = 1 - np.maximum(1 - np.sqrt(SUBSAMPLE_THRESHOLD / movie_ratio), 0)
sample_prob = movie_sample_prob[movie_id]
sample = np.random.uniform(size=(len(movie_id),))
# Compute the keep-mask once and apply it to both index arrays.
keep = sample_prob > sample
user_id = user_id[keep]
movie_id = movie_id[keep]
print('#samples:', len(user_id))
spm = spsp.coo_matrix((np.ones((len(user_id),)), (user_id, movie_id)), shape=full_shape)
print(spm.shape)
# Recompute the per-movie degree from the sub-sampled matrix (the previous
# `spm_t` still described the matrix before sampling) and report how many
# movies are left without any rating.
spm_t = spm.transpose()
movie_deg = spm_t.dot(np.ones((full_shape[0],)))
print(np.sum(movie_deg == 0))

In [None]:
# Movie x movie co-occurrence matrix: entry (i, j) sums, over users, the
# product of their interaction entries for movies i and j.
TOP_K = 50  # per-row rank used as the edge threshold

# Use the sparse matrix product; np.dot is not aware of SciPy sparse matrices.
movie_spm = spm.transpose().dot(spm)
# np.sort orders every row ascending, so column -TOP_K holds each row's
# TOP_K-th largest co-occurrence value (one threshold per movie).
dense_movie = np.sort(movie_spm.todense())
topk_movie = dense_movie[:, -TOP_K]
# Keep the entries that reach their row's threshold.
# NOTE(review): a row whose threshold is 0 passes `>= 0` everywhere, so such a
# movie is linked to every movie; ties and the diagonal are kept as well.
# Confirm this is intended.
topk_movie_spm = movie_spm >= topk_movie

In [None]:
# Build the movie graph from the thresholded co-occurrence matrix.
g = dgl.DGLGraph(topk_movie_spm, readonly=True)

movie_data = data.movie_data
# `year` is prepared here but is not part of the feature matrix below.
year = np.expand_dims(movie_data['year'], axis=1)
genre = movie_data['genre']
print('#genre:', genre.shape[1])
title = movie_data['title']
print('title vocabulary:', title.shape[1])

# Node features: the genre columns followed by the title columns.
node_feature_array = np.concatenate((genre, title), axis=1)
features = torch.tensor(node_feature_array, dtype=torch.float32)
print('#movies:', g.number_of_nodes())
print('#edges:', g.number_of_edges())
in_feats = features.shape[1]
print('#features:', in_feats)

# Link prediction model

In [None]:
class EncodeLayer(nn.Module):
    """Single linear projection from ``in_feats`` inputs to ``num_hidden`` outputs."""

    def __init__(self, in_feats, num_hidden):
        super().__init__()
        self.proj = nn.Linear(in_features=in_feats, out_features=num_hidden)

    def forward(self, feats):
        """Return ``feats`` passed through the linear projection."""
        projected = self.proj(feats)
        return projected

In [None]:
# Model hyperparameters
n_hidden = 64
n_layers = 1
dropout = 0.3
aggregator_type = 'gcn'

# GraphSAGE encoder. Its input size is n_hidden rather than the raw feature
# size because LinkPrediction projects the raw features with EncodeLayer
# before calling it.
gconv_model = GraphSAGEModel(in_feats=n_hidden,
                             n_hidden=n_hidden,
                             out_dim=n_hidden,
                             n_layers=n_layers,
                             activation=F.relu,
                             dropout=dropout,
                             aggregator_type=aggregator_type)

In [None]:
# NCE loss
def NCE_loss(pos_score, neg_score, neg_sample_size):
    """Negative-sampling loss, one value per positive score.

    Parameters
    ----------
    pos_score : torch.Tensor
        Scores of the positive edges.
    neg_score : torch.Tensor
        Scores of the negative edges; reshaped to ``(-1, neg_sample_size)`` so
        that each row holds the negatives paired with one positive score.
    neg_sample_size : int
        Number of negative scores per row.

    Returns
    -------
    torch.Tensor
        ``-logsigmoid(pos) - sum_row(logsigmoid(-neg))``.
    """
    log_prob_pos = F.logsigmoid(pos_score)
    log_prob_neg = F.logsigmoid(neg_score.neg()).reshape(-1, neg_sample_size)
    neg_term = log_prob_neg.sum(dim=1)
    return -log_prob_pos - neg_term

class LinkPrediction(nn.Module):
    """Link-prediction model: linear feature encoder followed by a graph model.

    ``forward`` returns the mean NCE loss over one sampled batch of edges.

    Parameters
    ----------
    gconv_model : nn.Module
        Graph model called as ``gconv_model(g, emb)`` on the encoded features.
    """

    def __init__(self, gconv_model):
        super().__init__()
        # The encoder sizes come from the notebook-level `in_feats` and
        # `n_hidden` variables, which must exist before instantiation.
        self.encode = EncodeLayer(in_feats, n_hidden)
        self.gconv_model = gconv_model

    def forward(self, g, features, neg_sample_size):
        """Compute the mean NCE loss for one edge batch sampled from ``g``.

        Node embeddings are computed for the whole graph, then one batch of
        positive edges and ``neg_sample_size`` negatives per positive is drawn
        with ``edge_sampler`` and scored with ``score_func``.
        """
        node_emb = self.gconv_model(g, self.encode(features))
        pos_g, neg_g = edge_sampler(g, neg_sample_size, return_false_neg=False)
        pos_score = score_func(pos_g, node_emb)
        neg_score = score_func(neg_g, node_emb)
        per_edge_loss = NCE_loss(pos_score, neg_score, neg_sample_size)
        return per_edge_loss.mean()

In [None]:
def edge_sampler(g, neg_sample_size, edges=None, return_false_neg=True):
    """Return the first batch produced by a DGL ``EdgeSampler`` over ``g``.

    Parameters
    ----------
    g : graph
        Graph to sample from; the batch size is one tenth of its edge count
        (truncated to an int), whether or not ``edges`` is given.
    neg_sample_size : int
        Forwarded to the sampler as ``neg_sample_size``.
    edges : optional
        Forwarded to the sampler as ``seed_edges``.
    return_false_neg : bool
        Forwarded to the sampler as ``return_false_neg``.

    Returns
    -------
    The first item yielded by the sampler; the other functions in this
    notebook unpack it as ``(pos_g, neg_g)``.
    """
    sampler_options = dict(
        batch_size=int(g.number_of_edges() / 10),
        seed_edges=edges,
        neg_sample_size=neg_sample_size,
        negative_mode='tail',
        shuffle=True,
        return_false_neg=return_false_neg,
    )
    batches = dgl.contrib.sampling.EdgeSampler(g, **sampler_options)
    return next(iter(batches))

In [None]:
def score_func(g, emb):
    """Score every edge of ``g`` by the dot product of its endpoint embeddings.

    Parameters
    ----------
    g : graph
        Sampled (sub)graph exposing ``all_edges(order='eid')`` and
        ``parent_nid``, which maps its node ids to ids in the parent graph.
    emb : torch.Tensor
        Node embeddings indexed by parent-graph node id.

    Returns
    -------
    torch.Tensor
        One score per edge, in edge-id order. The score is the plain
        (un-normalised) dot product, not a cosine similarity.
    """
    local_src, local_dst = g.all_edges(order='eid')
    # Translate the subgraph-local node ids into parent-graph ids before
    # looking up the embeddings.
    head_emb = emb[g.parent_nid[local_src]]
    tail_emb = emb[g.parent_nid[local_dst]]
    return (head_emb * tail_emb).sum(dim=1)

In [None]:
def LPEvaluate(model, g, features, eval_eids, neg_sample_size):
    """Return the mean reciprocal rank (MRR) of one batch of evaluation edges.

    Parameters
    ----------
    model : nn.Module
        Model exposing ``encode`` and ``gconv_model``. It is switched to eval
        mode here and left in eval mode on return.
    g : graph
        Graph used both to compute embeddings and to sample edges.
    features : torch.Tensor
        Node features fed to ``model.encode``.
    eval_eids : array-like
        Edge ids passed to ``edge_sampler`` as the seed edges.
    neg_sample_size : int
        Number of negative edges ranked against each positive edge.

    Returns
    -------
    float
        Mean of ``1 / rank`` over the positive edges in the sampled batch,
        where rank is 1 + the number of negatives scoring at least as high.
        Only the single batch returned by ``edge_sampler`` is evaluated.
    """
    # Put the model that is actually being evaluated (and all its
    # sub-modules) in eval mode, rather than a notebook-level global, so
    # dropout is disabled for the embeddings computed below.
    model.eval()
    with torch.no_grad():
        emb = model.encode(features)
        emb = model.gconv_model(g, emb)

        pos_g, neg_g = edge_sampler(g, neg_sample_size, eval_eids, return_false_neg=True)
        pos_score = score_func(pos_g, emb)
        neg_score = score_func(neg_g, emb).reshape(-1, neg_sample_size)
        filter_bias = neg_g.edata['false_neg'].reshape(-1, neg_sample_size)

        pos_score = F.logsigmoid(pos_score)
        neg_score = F.logsigmoid(neg_score)
        # Penalise sampled negatives flagged as false negatives.
        # NOTE(review): subtracting 1 in log-sigmoid space lowers their score
        # but does not guarantee they rank below the positive — confirm
        # whether a hard mask is wanted.
        neg_score -= filter_bias.float()
        # Column vector so each positive is compared against its own row.
        pos_score = pos_score.unsqueeze(1)
        rankings = torch.sum(neg_score >= pos_score, dim=1) + 1
        return np.mean(1.0/rankings.cpu().numpy())

In [None]:
# Randomly split the edge ids of `g` into 80% train / 10% validation / 10% test.
eids = np.random.permutation(g.number_of_edges())
train_end = int(len(eids) * 0.8)
valid_end = int(len(eids) * 0.9)
train_eids = eids[:train_end]
valid_eids = eids[train_end:valid_end]
test_eids = eids[valid_end:]
# Training graph: the sub-graph induced by the training edges only.
train_g = g.edge_subgraph(train_eids, preserve_nodes=True)

In [None]:
# Training hyperparameters
weight_decay = 5e-4
n_epochs = 200
lr = 1e-3
neg_sample_size = 10

# Link-prediction model wrapping the GraphSAGE encoder created earlier
model = LinkPrediction(gconv_model)

# Adam over all model parameters (feature encoder and graph model)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

dur = []  # not filled by the loop below
epoch_log = "Epoch {:05d} | Loss {:.4f} | MRR {:.4f}"
for epoch in range(n_epochs):
    # Back to training mode each epoch, because LPEvaluate calls .eval().
    model.train()
    optimizer.zero_grad()
    loss = model(train_g, features, neg_sample_size)
    loss.backward()
    optimizer.step()
    # `acc` holds the validation MRR computed on the full graph `g`.
    acc = LPEvaluate(model, g, features, valid_eids, neg_sample_size)
    print(epoch_log.format(epoch, loss.item(), acc))

print()
# Final evaluation on the held-out test edges.
acc = LPEvaluate(model, g, features, test_eids, neg_sample_size)
print("Test MRR {:.4f}".format(acc))