In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import dgl
from dgl import DGLGraph

# Tell DGL to use PyTorch as its backend framework.
dgl.load_backend('pytorch')

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy import sparse as spsp

# GNN models from scratch

In [None]:
from sageconv import SAGEConv

class GraphSAGEModel(nn.Module):
    """A stack of ``SAGEConv`` layers applied one after another.

    The stack always contains an input layer (``in_feats -> n_hidden``) and
    an output layer (``n_hidden -> out_dim``, built with ``activation=None``),
    with ``n_layers - 1`` hidden layers (``n_hidden -> n_hidden``) in between.

    Parameters
    ----------
    in_feats : int
        Size of the features fed to the first layer.
    n_hidden : int
        Size of the intermediate representations.
    out_dim : int
        Size of the representation produced by the last layer.
    n_layers : int
        One more than the number of hidden layers.
    activation : callable or None
        Passed as ``activation`` to every layer except the output layer.
    dropout : float
        Passed as ``feat_drop`` to every layer.
    aggregator_type : str
        Passed through unchanged to every ``SAGEConv``.
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super().__init__()
        # Consecutive entries give the (input, output) size of each layer.
        # At least one hidden width is always present, so the input and
        # output layers exist even when n_layers <= 1.
        widths = [in_feats] + [n_hidden] * max(n_layers, 1) + [out_dim]
        output_index = len(widths) - 2
        stack = []
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            # Only the final (output) layer is built without an activation.
            layer_activation = None if index == output_index else activation
            stack.append(SAGEConv(fan_in, fan_out, aggregator_type,
                                  feat_drop=dropout, activation=layer_activation))
        self.layers = nn.ModuleList(stack)

    def forward(self, g, features):
        """Run ``features`` through every layer on graph ``g``.

        Each layer is called with ``g.edata['similarity']`` as its third
        argument (presumably a per-edge weight; confirm in sageconv.py).
        """
        edge_similarity = g.edata['similarity']
        hidden = features
        for conv in self.layers:
            hidden = conv(g, hidden, edge_similarity)
        return hidden

# Dataset

In [None]:
import stanfordnlp

#stanfordnlp.download('en')

In [None]:
from movielens import MovieLens
# Build the MovieLens dataset object from '.' (presumably the directory that
# holds the raw files; confirm in movielens.py). Later cells read
# data.ratings, data.movie_data and data.neg_valid from it.
data = MovieLens('.')

Calculate some statistics of the dataset.

In [None]:
# All user-movie interactions, including the validation and test rows.
ratings = data.ratings

# Coordinates of every observed (user, movie) pair.
user_id, movie_id = (np.array(ratings[column])
                     for column in ('user_idx', 'movie_idx'))
print('#user-movie:', len(movie_id))

# Binary interaction matrix: a 1 for each observed pair. No shape is given,
# so scipy infers it from the largest user and movie index.
interaction_flags = np.ones(len(user_id))
user_movie_spm = spsp.coo_matrix((interaction_flags, (user_id, movie_id)))
num_users = user_movie_spm.shape[0]
num_movies = user_movie_spm.shape[1]
print('#users:', num_users)
print('#movies:', num_movies)

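We select the training interactions (every rating that is flagged for neither validation nor testing) and record each user's most recent training item, which is used later as the query item during evaluation.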

In [None]:
# Training rows are those flagged for neither validation nor testing.
held_out_mask = ratings['valid_mask'] | ratings['test_mask']
ratings_train = ratings[~held_out_mask]

# For every training row, the newest timestamp among that user's training rows.
# NOTE(review): grouping uses the 'user_id' column while the rest of the
# notebook indexes users by 'user_idx'; confirm both identify the same users.
newest_timestamp = ratings_train.groupby('user_id')['timestamp'].transform(pd.Series.max)

# Boolean mask selecting each user's most recent training interaction(s).
user_latest_item_indices = newest_timestamp == ratings_train['timestamp']
latest_rows = ratings_train[user_latest_item_indices]

# user_idx -> movie_idx of the latest training item. If several rows of one
# user share the newest timestamp, the last such row wins.
user_latest_item = dict(
        zip(latest_rows['user_idx'].values, latest_rows['movie_idx'].values))

Construct the training dataset

In [None]:
# Coordinates of the training interactions only.
user_id = np.array(ratings_train['user_idx'])
movie_id = np.array(ratings_train['movie_idx'])

# Pass the shape explicitly. Without it scipy infers the shape from the
# largest index present, so the matrix would silently shrink (and the asserts
# below would fail) whenever the highest-indexed user or movie appears only
# in the validation/test rows.
user_movie_spm = spsp.coo_matrix(
    (np.ones((len(user_id),)), (user_id, movie_id)),
    shape=(num_users, num_movies))
assert num_users == user_movie_spm.shape[0]
assert num_movies == user_movie_spm.shape[1]
train_size = len(user_id)
print('#training size:', train_size)

Construct the validation and testing dataset

In [None]:
# Filter each held-out split once, then pull out the index columns.
ratings_valid = ratings[ratings['valid_mask']]
ratings_test = ratings[ratings['test_mask']]

users_valid, movies_valid = (ratings_valid['user_idx'].values,
                             ratings_valid['movie_idx'].values)
users_test, movies_test = (ratings_test['user_idx'].values,
                           ratings_test['movie_idx'].values)

# Number of held-out interactions in each split.
valid_size, test_size = len(users_valid), len(users_test)

## Construct the movie graph with SLIM
Here is another way of constructing a movie matrix: take advantage of SLIM. SLIM is an item-based recommendation model. When SLIM is trained on a user-movie dataset, it learns a movie similarity matrix. This similarity matrix is the movie matrix we want to construct.

In [None]:
#from SLIM import SLIM, SLIMatrix
#model = SLIM()
#params = {'algo': 'cd', 'nthreads': 2, 'l1r': 1.0, 'l2r': 1.0}
#trainmat = SLIMatrix(user_movie_spm.tocsr())
#model.train(params, trainmat)

In [None]:
#model.save_model(modelfname='slim_model.csr', mapfname='slim_map.csr')

In [None]:
def read_csr(filename, shape=None):
    """Read a text file of per-row ``column value`` pairs into a COO matrix.

    Line ``i`` of the file describes row ``i`` of the matrix. Each line is
    split on single spaces; the first token is skipped (presumably empty
    because lines start with a space; confirm against the SLIM output
    format), and the remaining tokens are read as alternating column indices
    and values.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    shape : tuple of (int, int), optional
        Shape of the returned matrix. When omitted, scipy infers it from the
        largest row and column index that actually occurs, so trailing empty
        rows or columns are dropped.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with ``float32`` values.

    Raises
    ------
    ValueError
        If a line has a column index without a matching value, or a token
        cannot be parsed as a number.
    """
    all_rows = []
    all_cols = []
    all_vals = []
    # The context manager closes the file even if parsing fails. Iterating
    # the handle streams the file instead of loading every line up front.
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            strs = line.split(' ')
            cols = [int(s) for s in strs[1::2]]
            vals = [float(s) for s in strs[2::2]]
            if len(cols) != len(vals):
                raise ValueError(
                    '{}: line {} has {} column indices but {} values'.format(
                        filename, i + 1, len(cols), len(vals)))
            all_cols.extend(cols)
            all_vals.extend(vals)
            all_rows.extend([i] * len(cols))
    all_rows = np.array(all_rows, dtype=np.int64)
    all_cols = np.array(all_cols, dtype=np.int64)
    all_vals = np.array(all_vals, dtype=np.float32)
    return spsp.coo_matrix((all_vals, (all_rows, all_cols)), shape=shape)

# Movie-movie similarity matrix saved by SLIM.
movie_spm = read_csr('slim_model.csr')

# The similarity matrix must be square, with one row/column per movie.
expected_movies = user_movie_spm.shape[1]
assert movie_spm.shape[0] == expected_movies
assert movie_spm.shape[1] == expected_movies

# Quick summary of the stored similarity values.
similarity_values = movie_spm.data
print('#edges:', movie_spm.nnz)
print('most similar:', similarity_values.max())
print('most unsimilar:', similarity_values.min())
print('#similar:', np.sum(movie_spm > 1e-2))

In [None]:
# Movie graph: one edge per stored entry of the SLIM similarity matrix.
g = dgl.DGLGraph(movie_spm, readonly=True)
# Attach the similarity values as an edge feature.
# NOTE(review): assumes DGL keeps edges in the order of movie_spm's stored
# entries, so that movie_spm.data lines up with the edge ids. TODO confirm.
g.edata['similarity'] = torch.tensor(movie_spm.data, dtype=torch.float32)

# Per-movie attributes supplied by the dataset object.
movie_attributes = data.movie_data
# `year` is reshaped into a column here but is not part of `features` below.
year = np.expand_dims(movie_attributes['year'], axis=1)
genre = movie_attributes['genre']
title = movie_attributes['title']
print('#genre:', genre.shape[1])
print('title vocabulary:', title.shape[1])

# Node features: genre columns followed by title columns.
# (Alternative kept from the original notebook: features = genre)
node_feature_array = np.concatenate((genre, title), axis=1)
features = torch.tensor(node_feature_array, dtype=torch.float32)
in_feats = features.shape[1]

print('#movies:', g.number_of_nodes())
print('#edges:', g.number_of_edges())
print('#features:', in_feats)

# Item-based recommendation model

In [None]:
class EncodeLayer(nn.Module):
    """Single linear projection from ``in_feats`` to ``num_hidden`` dimensions.

    Parameters
    ----------
    in_feats : int
        Size of each input feature vector.
    num_hidden : int
        Size of each projected vector.
    """

    def __init__(self, in_feats, num_hidden):
        super().__init__()
        self.proj = nn.Linear(in_features=in_feats, out_features=num_hidden)

    def forward(self, feats):
        """Return ``feats`` mapped through the linear projection."""
        projected = self.proj(feats)
        return projected

In [None]:
# Model hyperparameters
n_hidden = 64
n_layers = 1
dropout = 0.5
aggregator_type = 'sum'

# Create the GraphSAGE model. Its input width is n_hidden rather than the raw
# feature width, because LinkPrediction runs the raw features through
# EncodeLayer before calling this model.
gconv_model = GraphSAGEModel(in_feats=n_hidden,
                             n_hidden=n_hidden,
                             out_dim=n_hidden,
                             n_layers=n_layers,
                             activation=F.relu,
                             dropout=dropout,
                             aggregator_type=aggregator_type)

In [None]:
# NCE loss
def NCE_loss(pos_score, neg_score, neg_sample_size):
    """Noise-contrastive loss, one value per positive example.

    ``neg_score`` is reshaped to ``(-1, neg_sample_size)`` so that each row
    holds the negatives paired with one positive score. The result is
    ``-logsigmoid(pos) - sum_k logsigmoid(-neg_k)``.
    """
    positive_term = F.logsigmoid(pos_score)
    negative_log_probs = F.logsigmoid(torch.neg(neg_score))
    negative_term = negative_log_probs.reshape(-1, neg_sample_size).sum(dim=1)
    return torch.neg(positive_term) - negative_term

class LinkPrediction(nn.Module):
    """Link-prediction model: encoder, then graph convolutions, then NCE loss.

    ``forward`` returns a scalar training loss, not node embeddings.
    """

    def __init__(self, gconv_model):
        super().__init__()
        # The encoder sizes come from the notebook-level globals `in_feats`
        # and `n_hidden`, not from constructor arguments.
        self.encode = EncodeLayer(in_feats, n_hidden)
        self.gconv_model = gconv_model

    def forward(self, g, features, neg_sample_size):
        """Return the mean NCE loss over one sampled batch of edges of ``g``."""
        # Encode the raw features, then refine them with the graph model.
        emb = self.gconv_model(g, self.encode(features))
        # One batch of positive edges plus its negative samples.
        pos_g, neg_g = edge_sampler(g, neg_sample_size, return_false_neg=False)
        pos_score, neg_score = (score_func(subgraph, emb)
                                for subgraph in (pos_g, neg_g))
        per_edge_loss = NCE_loss(pos_score, neg_score, neg_sample_size)
        return torch.mean(per_edge_loss)

In [None]:
def edge_sampler(g, neg_sample_size, edges=None, return_false_neg=True):
    """Draw one shuffled batch of edges from ``g`` with negative samples.

    A fresh ``dgl.contrib.sampling.EdgeSampler`` is built on every call and
    only its first batch is returned.

    Parameters
    ----------
    g : DGLGraph
        Graph to sample edges from.
    neg_sample_size : int
        Forwarded to the sampler as ``neg_sample_size``.
    edges : optional
        Forwarded to the sampler as ``seed_edges``.
    return_false_neg : bool
        Forwarded to the sampler as ``return_false_neg``.

    Returns
    -------
    The first item yielded by the sampler. ``LinkPrediction.forward`` unpacks
    it as ``(pos_g, neg_g)``.
    """
    # A batch covers about a tenth of the graph's edges. Clamp to at least 1
    # so graphs with fewer than 10 edges do not request an empty batch.
    batch_size = max(1, g.number_of_edges() // 10)
    sampler = dgl.contrib.sampling.EdgeSampler(g, batch_size=batch_size,
                                               seed_edges=edges,
                                               neg_sample_size=neg_sample_size,
                                               negative_mode='tail',
                                               shuffle=True,
                                               return_false_neg=return_false_neg)
    return next(iter(sampler))

In [None]:
def score_func(g, emb):
    """Score every edge of subgraph ``g`` by the dot product of its endpoints.

    The scores are plain (unnormalised) dot products of the endpoint
    embeddings, returned in edge-id order.

    Parameters
    ----------
    g : sampled subgraph
        Must provide ``all_edges(order='eid')`` and ``parent_nid``.
    emb : tensor
        Node embeddings, indexed by node id in the parent graph.
    """
    local_src, local_dst = g.all_edges(order='eid')
    # Translate subgraph-local node ids into parent-graph node ids, because
    # `emb` is indexed by parent id.
    parent_ids = g.parent_nid
    head_emb = emb[parent_ids[local_src]]
    tail_emb = emb[parent_ids[local_dst]]
    # Row-wise dot product between source and destination embeddings.
    return torch.sum(head_emb * tail_emb, dim=1)

In [None]:
def LPEvaluate(model, g, features, users_eval, movies_eval, neg_sample_size):
    """Print and return HITS@10 for the given user/movie evaluation pairs.

    For every ``(user, movie)`` pair, the held-out movie is ranked against
    the user's negative items by the dot product between each candidate's
    embedding and the embedding of the user's latest training item (looked up
    in the notebook-level ``user_latest_item``). A pair counts as a hit when
    the held-out movie ranks in the top 10.

    Parameters
    ----------
    model : LinkPrediction
        Must expose ``encode`` and ``gconv_model``; calling it returns a loss.
    g : DGLGraph
        Movie graph passed to the model.
    features : tensor
        Raw node features passed to the model.
    users_eval, movies_eval : sequences of equal length
        User and movie indices of the held-out interactions.
    neg_sample_size : int
        Only used for the loss that is printed alongside the metric.

    Returns
    -------
    Mean of the per-pair hit indicators (``nan`` when there are no pairs).
    """
    # Put the model that was passed in, and all of its sub-modules, into eval
    # mode. The previous code used the notebook-global `gconv_model`, which
    # ignored the `model` argument.
    model.eval()
    with torch.no_grad():
        emb = model.gconv_model(g, model.encode(features))
        # Loss on one freshly sampled batch of edges; reported only.
        loss = model(g, features, neg_sample_size)
        hits_10s = []
        # evaluate one user-item interaction at a time
        for u, i in zip(users_eval, movies_eval):
            I_q = user_latest_item[u]
            # Candidate list: the held-out movie first, then the negatives.
            # NOTE(review): negatives always come from data.neg_valid, also
            # when this is called with the test split. Confirm that is
            # intended.
            I = torch.cat([torch.LongTensor([i]), torch.LongTensor(data.neg_valid[u])])
            Z_q = emb[I_q]
            Z = emb[I]
            score = (Z_q[None, :] * Z).sum(1).cpu().numpy()
            # Rank 1 is the highest score; ties share the smallest rank.
            rank = stats.rankdata(-score, 'min')
            hits_10s.append(rank[0] <= 10)
        hits_at_10 = np.mean(hits_10s)
        print('HITS@10:{:.4f}, loss:{:.4f}'.format(hits_at_10, loss.item()))
        return hits_at_10

In [None]:
# Wrap the GraphSAGE model in the link-prediction model (adds the encoder).
model = LinkPrediction(gconv_model)

# Training hyperparameters
weight_decay = 5e-4
n_epochs = 20
lr = 1e-3
neg_sample_size = 40

# Adam over all parameters of the link-prediction model.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

dur = []
prev_acc = 0
for epoch in range(n_epochs):
    # LPEvaluate leaves modules in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    optimizer.zero_grad()
    loss = model(g, features, neg_sample_size)
    loss.backward()
    optimizer.step()
    print("Epoch {:05d} | Loss {:.4f}".format(epoch, loss.item()))

    # Validation HITS@10 after each update.
    acc = LPEvaluate(model, g, features, users_valid, movies_valid, neg_sample_size)
    # Early stopping: once past epoch 5, stop as soon as the validation
    # metric fails to improve on the previous epoch.
    no_improvement = acc <= prev_acc
    if epoch > 5 and no_improvement:
        break
    prev_acc = acc

print()
# Final evaluation on the test interactions.
LPEvaluate(model, g, features, users_test, movies_test, neg_sample_size)