# Dataset

We use the MovieLens dataset for demonstration because it is commonly used for recommendation models. In this dataset, there are two types of nodes: users and movies. The movie nodes have three attributes: year, title and genre. There are ratings between user nodes and movie nodes, and each rating has a timestamp. In our recommendation model, we don't consider the rating values or the timestamps.

**Note**: It is not necessarily the best dataset to demonstrate the power of GNN for recommendation. We have prepared the dataset to simplify the demonstration.

To run the data preprocessing script, a user needs to download the English dictionary of the stanfordnlp package first. However, the following command only needs to run once.

In [None]:
# Please uncomment the two commands when the tutorial is run for the first time.
#import stanfordnlp
#stanfordnlp.download('en')

Load the MovieLens dataset.

In [None]:
# MovieLens is imported from the `movielens` module that accompanies this tutorial.
from movielens import MovieLens
# '.' is the only constructor argument (presumably the data directory -- TODO confirm).
data = MovieLens('.')

Calculate some statistics of the dataset.

In [None]:
# Build a user x movie interaction matrix from the rating table.
# Only the existence of a rating is recorded (every stored value is 1).
ratings = data.ratings
user_id = np.array(ratings['user_idx'])
movie_id = np.array(ratings['movie_idx'])
# No explicit shape is given, so it is inferred from the largest user/movie index.
user_movie_spm = spsp.coo_matrix((np.ones((len(user_id),)), (user_id, movie_id)))
num_users, num_movies = user_movie_spm.shape
# Label fixed: these are user-movie interactions (was misspelled "iterations").
print('#user-movie interactions:', len(movie_id))
print('#users:', num_users)
print('#movies:', num_movies)

Split the dataset into training, validation and testing sets. In each of the validation and testing sets, every user has exactly one held-out item to predict.

In [None]:
def pick_test(user_movie_spm):
    """Hold out one validation movie and one test movie for every user.

    Parameters
    ----------
    user_movie_spm : scipy sparse matrix
        User x movie interaction matrix; every stored entry counts as an
        interaction.

    Returns
    -------
    train_spm : scipy.sparse.coo_matrix
        The remaining interactions (all values 1), with the same shape as
        the input.
    valid_set, test_set : numpy.ndarray
        Float arrays of length ``#users``; entry ``i`` holds the movie index
        held out for user ``i``.

    Raises
    ------
    ValueError
        If a user has fewer than two interactions, so two distinct movies
        cannot be held out.
    """
    csr = user_movie_spm.tocsr()
    n_users = csr.shape[0]
    indptr = csr.indptr
    # Take the (user, movie) pairs from the CSR arrays: positions derived from
    # `indptr` are only meaningful for this layout, whereas the COO row/col
    # arrays are not guaranteed to be grouped by user.
    movies = csr.indices
    users = np.repeat(np.arange(n_users), np.diff(indptr))
    picks = np.zeros(shape=(len(movies)))
    valid_set = np.zeros(shape=(n_users))
    test_set = np.zeros(shape=(n_users))
    for i in range(n_users):
        start_idx = indptr[i]
        end_idx = indptr[i + 1]
        if end_idx - start_idx < 2:
            raise ValueError(
                'user %d has fewer than 2 interactions; cannot hold out '
                'a validation and a test movie' % i)
        # Two distinct positions inside this user's slice of the CSR arrays.
        idx = np.random.choice(np.arange(start_idx, end_idx), 2, replace=False)
        valid_set[i] = movies[idx[0]]
        picks[idx[0]] = 1
        test_set[i] = movies[idx[1]]
        picks[idx[1]] = 1
    keep = picks == 0
    users = users[keep]
    movies = movies[keep]
    # Pass the shape explicitly so the training matrix does not shrink when
    # the highest-index movie (or user) only appears in held-out entries.
    train_spm = spsp.coo_matrix((np.ones((len(users),)), (users, movies)),
                                shape=csr.shape)
    return train_spm, valid_set, test_set

# Keep the full interaction matrix (in CSR form) before the split replaces
# `user_movie_spm` with the training-only matrix.
orig_user_movie_spm = user_movie_spm.tocsr()
user_movie_spm, valid_set, test_set = pick_test(user_movie_spm)
print('#training size:', user_movie_spm.nnz)

# One held-out movie per user in each evaluation split, so the user ids of
# both splits are simply 0 .. num_users - 1.
users_valid, movies_valid = np.arange(num_users), valid_set
users_test, movies_test = np.arange(num_users), test_set
valid_size, test_size = len(users_valid), len(users_test)
print('valid set:', valid_size)
print('test set:', test_size)

Save the data splits to text files.

In [None]:
coo = user_movie_spm.tocoo()
# Lookup tables keyed by (user, movie); they are used later to check that
# sampled negatives do not collide with any known interaction.
train_map = {}
valid_map = {}
test_map = {}

# Each file holds one "user, movie" pair per line. The `with` statement
# closes the file on exit, so no explicit close() is needed.
with open("train.txt", "w") as file:
    for row, col in zip(coo.row, coo.col):
        train_map[(row, col)] = 1
        file.write(f"{row}, {col}\n")

with open('valid.txt', 'w') as file:
    # valid_set[i] is the movie held out for user i (stored as a float).
    for row, col in enumerate(valid_set):
        valid_map[(row, col)] = 1
        file.write(f"{row}, {int(col)}\n")

with open('test.txt', 'w') as file:
    # test_set[i] is the movie held out for user i (stored as a float).
    for row, col in enumerate(test_set):
        test_map[(row, col)] = 1
        file.write(f"{row}, {int(col)}\n")

Some randomly sampled negative items may actually be positives (movies the user has interacted with). Here we remove all of the positive ones from the negative set.

In [None]:
def gen_neg_set(user_movie_spm, neg_sample_size):
    """Sample, for every user, distinct movies the user has not interacted with.

    Parameters
    ----------
    user_movie_spm : scipy sparse matrix
        User x movie interaction matrix; a non-zero entry marks an
        interaction that must not be sampled as a negative.
    neg_sample_size : int
        Number of negative movies to draw per user.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(#users, neg_sample_size)``; row ``u`` holds
        the sampled negative movie indices of user ``u``.

    Raises
    ------
    ValueError
        If some user has fewer than ``neg_sample_size`` non-interacted
        movies (rejection sampling could never finish in that case).
    """
    csr = user_movie_spm.tocsr()
    num_users, num_movies = csr.shape
    indptr, indices, data = csr.indptr, csr.indices, csr.data
    neg_mat = np.zeros(shape=(num_users, neg_sample_size))
    for user in range(num_users):
        start, end = indptr[user], indptr[user + 1]
        # Movies with a non-zero entry for this user; a set makes the
        # membership test cheap compared with indexing the sparse matrix.
        positives = set(indices[start:end][data[start:end] != 0].tolist())
        if num_movies - len(positives) < neg_sample_size:
            raise ValueError(
                'user %d has only %d non-interacted movies, cannot sample %d '
                'negatives' % (user, num_movies - len(positives), neg_sample_size))
        movie_set = set()
        # Rejection sampling: draw candidate batches until enough distinct
        # non-interacted movies have been collected.
        while len(movie_set) < neg_sample_size:
            movies = np.random.choice(num_movies, neg_sample_size, replace=False)
            for movie in movies:
                if movie not in positives:
                    movie_set.add(movie)
                if len(movie_set) == neg_sample_size:
                    break
        # Sanity check on integer ids; neg_mat itself stores floats, which
        # are not valid sparse-matrix indices.
        assert positives.isdisjoint(movie_set)
        neg_mat[user] = np.array(list(movie_set))

    return neg_mat

# Draw two independent sets of 99 negatives per user from the full (pre-split)
# interaction matrix: the first for validation, the second for testing.
neg_valid, neg_test = (gen_neg_set(orig_user_movie_spm.tocsr(), 99) for _ in range(2))

Save the negative sets.

In [None]:
# Write each negative set as one "user, movie" pair per line. Both files use
# the same format, so they share one loop; the `with` statement closes each
# file on exit, so no explicit close() is needed.
for path, neg_mat in (('neg_valid.txt', neg_valid), ('neg_test.txt', neg_test)):
    with open(path, 'w') as file:
        for row, cols in enumerate(neg_mat):
            for col in cols:
                # A sampled negative must not be a known interaction in any split.
                assert (row, col) not in train_map
                assert (row, col) not in valid_map
                assert (row, col) not in test_map
                file.write(f"{row}, {int(col)}\n")

Construct the item features.

In [None]:
movie_data = data.movie_data
title = movie_data['title']
genre = movie_data['genre']
# NOTE(review): `year` is computed here but is not part of `features` below --
# confirm whether that is intentional.
year = np.expand_dims(movie_data['year'], axis=1)

# Item features are the genre and title columns joined along axis 1.
item_feats = np.concatenate((genre, title), axis=1)
features = torch.tensor(item_feats, dtype=torch.float32)
in_feats = features.shape[1]
print('#features:', in_feats)

Save everything as pickle files.

In [None]:
import pickle


def _save_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file even if dumping fails."""
    with open(path, 'wb') as fout:
        pickle.dump(obj, fout)


# Training interaction matrix (held-out validation/test entries removed).
_save_pickle(user_movie_spm, 'movielens_orig_train.pkl')
# NOTE(review): `g` is not defined in the cells shown above -- confirm it is
# created elsewhere in the notebook before this cell runs.
_save_pickle(g, 'movielens_graph.pkl')
_save_pickle(features, 'movielens_features.pkl')
_save_pickle((valid_set, test_set), 'movielens_eval.pkl')
_save_pickle((neg_valid, neg_test), 'movielens_neg.pkl')