From 9737b2700f82937834823111043a73e83e9dc2e2 Mon Sep 17 00:00:00 2001
From: Minjie Wang
Date: Thu, 26 Sep 2019 17:50:38 -0400
Subject: [PATCH] [Model][Hetero] GCMC using new hetero APIs (#860)

* init
* rm data
* README
* fix rating values that are not decimal
* rm stale codes
* small fix
* upd
* rewrite decoder
* fix many
* many fix; performance matched
* upd; handle sparse input
* upd
* address comments
* more docstring; download data automatically
* shared param mode
---
 examples/mxnet/gcmc/.gitignore |   1 +
 examples/mxnet/gcmc/README.md  |  51 ++++
 examples/mxnet/gcmc/data.py    | 529 +++++++++++++++++++++++++++++++++
 examples/mxnet/gcmc/model.py   | 233 +++++++++++++++
 examples/mxnet/gcmc/train.py   | 242 +++++++++++++++
 examples/mxnet/gcmc/utils.py   |  79 +++++
 6 files changed, 1135 insertions(+)
 create mode 100644 examples/mxnet/gcmc/.gitignore
 create mode 100644 examples/mxnet/gcmc/README.md
 create mode 100644 examples/mxnet/gcmc/data.py
 create mode 100644 examples/mxnet/gcmc/model.py
 create mode 100644 examples/mxnet/gcmc/train.py
 create mode 100644 examples/mxnet/gcmc/utils.py

diff --git a/examples/mxnet/gcmc/.gitignore b/examples/mxnet/gcmc/.gitignore
new file mode 100644
index 000000000000..6bfe6b19e375
--- /dev/null
+++ b/examples/mxnet/gcmc/.gitignore
@@ -0,0 +1 @@
+log
diff --git a/examples/mxnet/gcmc/README.md b/examples/mxnet/gcmc/README.md
new file mode 100644
index 000000000000..aea0eb08e73a
--- /dev/null
+++ b/examples/mxnet/gcmc/README.md
@@ -0,0 +1,51 @@
+# Graph Convolutional Matrix Completion
+
+Paper link: [https://arxiv.org/abs/1706.02263](https://arxiv.org/abs/1706.02263)
+Author's code: [https://github.com/riannevdberg/gc-mc](https://github.com/riannevdberg/gc-mc)
+
+The implementation does not handle side-channel features and mini-batching, and thus achieves
+slightly worse performance when using node features.
+
+Credit: Jiani Zhang ([@jennyzhang0215](https://github.com/jennyzhang0215))
+
+## Dependencies
+* MXNet 1.5.0+
+* pandas
+* gluonnlp
+
+## Data
+
+Supported datasets: ml-100k, ml-1m, ml-10m
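+
+As a quick sanity check, the dataset class in `data.py` can also be used on its
+own; the raw files are downloaded automatically on first use. A minimal sketch
+(assuming the dependencies above are installed):
+
+```python
+import mxnet as mx
+from data import MovieLens
+
+# Build the ml-100k graphs on CPU with one-hot node features.
+data = MovieLens("ml-100k", ctx=mx.cpu(), use_one_hot_fea=True)
+print(data.possible_rating_values)             # rating levels, e.g. [1. 2. 3. 4. 5.]
+print(data.train_enc_graph)                    # heterograph with one edge type per rating
+print(data.train_dec_graph.number_of_edges())  # number of training user-movie pairs
+```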
+
+## How to run
+
+ml-100k, no feature
+```bash
+DGLBACKEND=mxnet python train.py --data_name=ml-100k --use_one_hot_fea --gcn_agg_accum=stack
+```
+Results: RMSE=0.9077 (0.910 reported)
+Speed: 0.0246s/epoch (vanilla implementation: 0.1008s/epoch)
+
+ml-100k, with feature
+```bash
+DGLBACKEND=mxnet python train.py --data_name=ml-100k --gcn_agg_accum=stack
+```
+Results: RMSE=0.9495 (0.905 reported)
+
+ml-1m, no feature
+```bash
+DGLBACKEND=mxnet python train.py --data_name=ml-1m --gcn_agg_accum=sum --use_one_hot_fea
+```
+Results: RMSE=0.8377 (0.832 reported)
+Speed: 0.0695s/epoch (vanilla implementation: 1.538s/epoch)
+
+ml-10m, no feature
+```bash
+DGLBACKEND=mxnet python train.py --data_name=ml-10m --gcn_agg_accum=stack --gcn_dropout=0.3 \
+    --train_lr=0.001 --train_min_lr=0.0001 --train_max_iter=15000 \
+    --use_one_hot_fea --gen_r_num_basis_func=4
+```
+Results: RMSE=0.7875 (0.777 reported)
+Speed: 0.6480s/epoch (vanilla implementation: OOM)
+
+Testbed: EC2 p3.2xlarge
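+
+## How ratings are predicted
+
+The decoder produces one logit per rating level for every user-movie pair, and
+`evaluate` in `train.py` converts these into a real-valued prediction by taking
+the softmax-weighted average of the possible rating values. A minimal NumPy
+sketch of that conversion (illustrative numbers only):
+
+```python
+import numpy as np
+
+rating_values = np.array([1., 2., 3., 4., 5.])
+logits = np.array([[0.1, 0.2, 2.0, 0.3, 0.1]])  # one user-movie pair, 5 classes
+probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
+expected_rating = (probs * rating_values).sum(axis=1)
+print(expected_rating)  # close to 3.0, the highest-scoring class
+```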
diff --git a/examples/mxnet/gcmc/data.py b/examples/mxnet/gcmc/data.py
new file mode 100644
index 000000000000..dc32c74727af
--- /dev/null
+++ b/examples/mxnet/gcmc/data.py
@@ -0,0 +1,529 @@
+"""MovieLens dataset"""
+import numpy as np
+import os
+import re
+import pandas as pd
+import scipy.sparse as sp
+import gluonnlp as nlp
+import mxnet as mx
+
+import dgl
+from dgl.data.utils import download, extract_archive, get_download_dir
+
+_urls = {
+    'ml-100k' : 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
+    'ml-1m' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
+    'ml-10m' : 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
+}
+
+READ_DATASET_PATH = get_download_dir()
+GENRES_ML_100K =\
+    ['unknown', 'Action', 'Adventure', 'Animation',
+     'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
+     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
+     'Thriller', 'War', 'Western']
+GENRES_ML_1M = GENRES_ML_100K[1:]
+GENRES_ML_10M = GENRES_ML_100K + ['IMAX']
+
+class MovieLens(object):
+    """MovieLens dataset used by GCMC model
+
+    TODO(minjie): make this dataset more general
+
+    The dataset stores MovieLens ratings in two types of graphs. The encoder graph
+    contains rating value information in the form of edge types. The decoder graph
+    stores plain user-movie pairs in the form of a bipartite graph with no rating
+    information. All graphs have two types of nodes: "user" and "movie".
+
+    The training, validation and test sets can be summarized as follows:
+
+    training_enc_graph : training user-movie pairs + rating info
+    training_dec_graph : training user-movie pairs
+    valid_enc_graph : training user-movie pairs + rating info
+    valid_dec_graph : validation user-movie pairs
+    test_enc_graph : training user-movie pairs + validation user-movie pairs + rating info
+    test_dec_graph : test user-movie pairs
+
+    Attributes
+    ----------
+    train_enc_graph : dgl.DGLHeteroGraph
+        Encoder graph for training.
+    train_dec_graph : dgl.DGLHeteroGraph
+        Decoder graph for training.
+    train_labels : mx.nd.NDArray
+        The categorical label of each user-movie pair
+    train_truths : mx.nd.NDArray
+        The actual rating values of each user-movie pair
+    valid_enc_graph : dgl.DGLHeteroGraph
+        Encoder graph for validation.
+    valid_dec_graph : dgl.DGLHeteroGraph
+        Decoder graph for validation.
+    valid_labels : mx.nd.NDArray
+        The categorical label of each user-movie pair
+    valid_truths : mx.nd.NDArray
+        The actual rating values of each user-movie pair
+    test_enc_graph : dgl.DGLHeteroGraph
+        Encoder graph for test.
+    test_dec_graph : dgl.DGLHeteroGraph
+        Decoder graph for test.
+    test_labels : mx.nd.NDArray
+        The categorical label of each user-movie pair
+    test_truths : mx.nd.NDArray
+        The actual rating values of each user-movie pair
+    user_feature : mx.nd.NDArray
+        User feature tensor. If None, an identity matrix is used.
+    movie_feature : mx.nd.NDArray
+        Movie feature tensor. If None, an identity matrix is used.
+    possible_rating_values : np.ndarray
+        Available rating values in the dataset
+
+    Parameters
+    ----------
+    name : str
+        Dataset name. Could be "ml-100k", "ml-1m", "ml-10m"
+    ctx : mx.context.Context
+        Device context
+    use_one_hot_fea : bool, optional
+        If true, the ``user_feature`` attribute is None, representing a one-hot identity
+        matrix. (Default: False)
+    symm : bool, optional
+        If true, use the symmetric normalization constant. Otherwise, use the left
+        normalization constant. (Default: True)
+    test_ratio : float, optional
+        Ratio of test data
+    valid_ratio : float, optional
+        Ratio of validation data
+    """
+    def __init__(self, name, ctx, use_one_hot_fea=False, symm=True,
+                 test_ratio=0.1, valid_ratio=0.1):
+        self._name = name
+        self._ctx = ctx
+        self._symm = symm
+        self._test_ratio = test_ratio
+        self._valid_ratio = valid_ratio
+        # download and extract
+        download_dir = get_download_dir()
+        zip_file_path = '{}/{}.zip'.format(download_dir, name)
+        download(_urls[name], path=zip_file_path)
+        extract_archive(zip_file_path, '{}/{}'.format(download_dir, name))
+        if name == 'ml-10m':
+            root_folder = 'ml-10M100K'
+        else:
+            root_folder = name
+        self._dir = os.path.join(download_dir, name, root_folder)
+        print("Starting processing {} ...".format(self._name))
+        self._load_raw_user_info()
+        self._load_raw_movie_info()
+        print('......')
+        if self._name == 'ml-100k':
+            self.all_train_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t')
+            self.test_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t')
+            self.all_rating_info = pd.concat([self.all_train_rating_info, self.test_rating_info])
+        elif self._name == 'ml-1m' or self._name == 'ml-10m':
+            self.all_rating_info = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::')
+            num_test = int(np.ceil(self.all_rating_info.shape[0] * self._test_ratio))
+            shuffled_idx = np.random.permutation(self.all_rating_info.shape[0])
+            self.test_rating_info = self.all_rating_info.iloc[shuffled_idx[: num_test]]
+            self.all_train_rating_info = self.all_rating_info.iloc[shuffled_idx[num_test: ]]
+        else:
+            raise NotImplementedError
+        print('......')
+        num_valid = int(np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio))
+        shuffled_idx = np.random.permutation(self.all_train_rating_info.shape[0])
+        self.valid_rating_info = self.all_train_rating_info.iloc[shuffled_idx[: num_valid]]
+        self.train_rating_info = self.all_train_rating_info.iloc[shuffled_idx[num_valid: ]]
+        self.possible_rating_values = np.unique(self.train_rating_info["rating"].values)
+
+        print("All rating pairs : {}".format(self.all_rating_info.shape[0]))
+        print("\tAll train rating pairs : {}".format(self.all_train_rating_info.shape[0]))
+        print("\t\tTrain rating pairs : {}".format(self.train_rating_info.shape[0]))
+        print("\t\tValid rating pairs : {}".format(self.valid_rating_info.shape[0]))
+        print("\tTest rating pairs : {}".format(self.test_rating_info.shape[0]))
pairs : {}".format(self.test_rating_info.shape[0])) + + self.user_info = self._drop_unseen_nodes(orign_info=self.user_info, + cmp_col_name="id", + reserved_ids_set=set(self.all_rating_info["user_id"].values), + label="user") + self.movie_info = self._drop_unseen_nodes(orign_info=self.movie_info, + cmp_col_name="id", + reserved_ids_set=set(self.all_rating_info["movie_id"].values), + label="movie") + + # Map user/movie to the global id + self.global_user_id_map = {ele: i for i, ele in enumerate(self.user_info['id'])} + self.global_movie_id_map = {ele: i for i, ele in enumerate(self.movie_info['id'])} + print('Total user number = {}, movie number = {}'.format(len(self.global_user_id_map), + len(self.global_movie_id_map))) + self._num_user = len(self.global_user_id_map) + self._num_movie = len(self.global_movie_id_map) + + ### Generate features + if use_one_hot_fea: + self.user_feature = None + self.movie_feature = None + else: + self.user_feature = mx.nd.array(self._process_user_fea(), ctx=ctx, dtype=np.float32) + self.movie_feature = mx.nd.array(self._process_movie_fea(), ctx=ctx, dtype=np.float32) + if self.user_feature is None: + self.user_feature_shape = (self.num_user, self.num_user) + self.movie_feature_shape = (self.num_movie, self.num_movie) + else: + self.user_feature_shape = self.user_feature.shape + self.movie_feature_shape = self.movie_feature.shape + info_line = "Feature dim: " + info_line += "\nuser: {}".format(self.user_feature_shape) + info_line += "\nmovie: {}".format(self.movie_feature_shape) + print(info_line) + + all_train_rating_pairs, all_train_rating_values = self._generate_pair_value(self.all_train_rating_info) + train_rating_pairs, train_rating_values = self._generate_pair_value(self.train_rating_info) + valid_rating_pairs, valid_rating_values = self._generate_pair_value(self.valid_rating_info) + test_rating_pairs, test_rating_values = self._generate_pair_value(self.test_rating_info) + + def _make_labels(ratings): + labels = mx.nd.array(np.searchsorted(self.possible_rating_values, ratings), + ctx=ctx, dtype=np.int32) + return labels + + self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True) + self.train_dec_graph = self._generate_dec_graph(train_rating_pairs) + self.train_labels = _make_labels(train_rating_values) + self.train_truths = mx.nd.array(train_rating_values, ctx=ctx, dtype=np.float32) + + self.valid_enc_graph = self.train_enc_graph + self.valid_dec_graph = self._generate_dec_graph(valid_rating_pairs) + self.valid_labels = _make_labels(valid_rating_values) + self.valid_truths = mx.nd.array(valid_rating_values, ctx=ctx, dtype=np.float32) + + self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True) + self.test_dec_graph = self._generate_dec_graph(test_rating_pairs) + self.test_labels = _make_labels(test_rating_values) + self.test_truths = mx.nd.array(test_rating_values, ctx=ctx, dtype=np.float32) + + def _npairs(graph): + rst = 0 + for r in self.possible_rating_values: + rst += graph.number_of_edges(str(r)) + return rst + + print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( + self.train_enc_graph.number_of_nodes('user'), self.train_enc_graph.number_of_nodes('movie'), + _npairs(self.train_enc_graph))) + print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( + self.train_dec_graph.number_of_nodes('user'), self.train_dec_graph.number_of_nodes('movie'), + self.train_dec_graph.number_of_edges())) + print("Valid enc graph: 
+        def _make_labels(ratings):
+            labels = mx.nd.array(np.searchsorted(self.possible_rating_values, ratings),
+                                 ctx=ctx, dtype=np.int32)
+            return labels
+
+        self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True)
+        self.train_dec_graph = self._generate_dec_graph(train_rating_pairs)
+        self.train_labels = _make_labels(train_rating_values)
+        self.train_truths = mx.nd.array(train_rating_values, ctx=ctx, dtype=np.float32)
+
+        self.valid_enc_graph = self.train_enc_graph
+        self.valid_dec_graph = self._generate_dec_graph(valid_rating_pairs)
+        self.valid_labels = _make_labels(valid_rating_values)
+        self.valid_truths = mx.nd.array(valid_rating_values, ctx=ctx, dtype=np.float32)
+
+        self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True)
+        self.test_dec_graph = self._generate_dec_graph(test_rating_pairs)
+        self.test_labels = _make_labels(test_rating_values)
+        self.test_truths = mx.nd.array(test_rating_values, ctx=ctx, dtype=np.float32)
+
+        def _npairs(graph):
+            rst = 0
+            for r in self.possible_rating_values:
+                rst += graph.number_of_edges(str(r))
+            return rst
+
+        print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.train_enc_graph.number_of_nodes('user'), self.train_enc_graph.number_of_nodes('movie'),
+            _npairs(self.train_enc_graph)))
+        print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.train_dec_graph.number_of_nodes('user'), self.train_dec_graph.number_of_nodes('movie'),
+            self.train_dec_graph.number_of_edges()))
+        print("Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.valid_enc_graph.number_of_nodes('user'), self.valid_enc_graph.number_of_nodes('movie'),
+            _npairs(self.valid_enc_graph)))
+        print("Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.valid_dec_graph.number_of_nodes('user'), self.valid_dec_graph.number_of_nodes('movie'),
+            self.valid_dec_graph.number_of_edges()))
+        print("Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.test_enc_graph.number_of_nodes('user'), self.test_enc_graph.number_of_nodes('movie'),
+            _npairs(self.test_enc_graph)))
+        print("Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
+            self.test_dec_graph.number_of_nodes('user'), self.test_dec_graph.number_of_nodes('movie'),
+            self.test_dec_graph.number_of_edges()))
+
+    def _generate_pair_value(self, rating_info):
+        rating_pairs = (np.array([self.global_user_id_map[ele] for ele in rating_info["user_id"]],
+                                 dtype=np.int64),
+                        np.array([self.global_movie_id_map[ele] for ele in rating_info["movie_id"]],
+                                 dtype=np.int64))
+        rating_values = rating_info["rating"].values.astype(np.float32)
+        return rating_pairs, rating_values
+
+    def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
+        user_movie_R = np.zeros((self._num_user, self._num_movie), dtype=np.float32)
+        user_movie_R[rating_pairs] = rating_values
+        movie_user_R = user_movie_R.transpose()
+
+        rating_graphs = []
+        rating_row, rating_col = rating_pairs
+        for rating in self.possible_rating_values:
+            ridx = np.where(rating_values == rating)
+            rrow = rating_row[ridx]
+            rcol = rating_col[ridx]
+            bg = dgl.bipartite((rrow, rcol), 'user', str(rating), 'movie',
+                               card=(self._num_user, self._num_movie))
+            rev_bg = dgl.bipartite((rcol, rrow), 'movie', 'rev-%s' % str(rating), 'user',
+                                   card=(self._num_movie, self._num_user))
+            rating_graphs.append(bg)
+            rating_graphs.append(rev_bg)
+        graph = dgl.hetero_from_relations(rating_graphs)
+
+        # sanity check
+        assert len(rating_pairs[0]) == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2
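+
+        # The support values below are the normalization constants c_ij from
+        # the GCMC paper, precomputed per node: 'ci' is 1/sqrt(total in-degree)
+        # on the receiver side and, when symm=True, 'cj' is 1/sqrt(total
+        # out-degree) on the sender side, giving the symmetric
+        # 1/sqrt(deg_i * deg_j) factor; when symm=False, 'cj' is all ones and
+        # only the receiver-side factor applies. Degrees are summed across all
+        # rating edge types.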
+        if add_support:
+            def _calc_norm(x):
+                x = x.asnumpy().astype('float32')
+                x[x == 0.] = np.inf
+                x = mx.nd.array(1. / np.sqrt(x))
+                return x.as_in_context(self._ctx).expand_dims(1)
+            user_ci = []
+            user_cj = []
+            movie_ci = []
+            movie_cj = []
+            for r in self.possible_rating_values:
+                r = str(r)
+                user_ci.append(graph['rev-%s' % r].in_degrees())
+                movie_ci.append(graph[r].in_degrees())
+                if self._symm:
+                    user_cj.append(graph[r].out_degrees())
+                    movie_cj.append(graph['rev-%s' % r].out_degrees())
+                else:
+                    user_cj.append(mx.nd.zeros((self.num_user,)))
+                    movie_cj.append(mx.nd.zeros((self.num_movie,)))
+            user_ci = _calc_norm(mx.nd.add_n(*user_ci))
+            movie_ci = _calc_norm(mx.nd.add_n(*movie_ci))
+            if self._symm:
+                user_cj = _calc_norm(mx.nd.add_n(*user_cj))
+                movie_cj = _calc_norm(mx.nd.add_n(*movie_cj))
+            else:
+                user_cj = mx.nd.ones((self.num_user,), ctx=self._ctx)
+                movie_cj = mx.nd.ones((self.num_movie,), ctx=self._ctx)
+            graph.nodes['user'].data.update({'ci' : user_ci, 'cj' : user_cj})
+            graph.nodes['movie'].data.update({'ci' : movie_ci, 'cj' : movie_cj})
+
+        return graph
+
+    def _generate_dec_graph(self, rating_pairs):
+        ones = np.ones_like(rating_pairs[0])
+        user_movie_ratings_coo = sp.coo_matrix(
+            (ones, rating_pairs),
+            shape=(self.num_user, self.num_movie), dtype=np.float32)
+        return dgl.bipartite(user_movie_ratings_coo, 'user', 'rate', 'movie')
+
+    @property
+    def num_links(self):
+        return self.possible_rating_values.size
+
+    @property
+    def num_user(self):
+        return self._num_user
+
+    @property
+    def num_movie(self):
+        return self._num_movie
+
+    def _drop_unseen_nodes(self, orig_info, cmp_col_name, reserved_ids_set, label):
+        # print("  -----------------")
+        # print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set),
+        #                                                    len(set(orig_info[cmp_col_name].values))))
+        if reserved_ids_set != set(orig_info[cmp_col_name].values):
+            pd_rating_ids = pd.DataFrame(list(reserved_ids_set), columns=["id_graph"])
+            # print("\torig_info: ({}, {})".format(orig_info.shape[0], orig_info.shape[1]))
+            data_info = orig_info.merge(pd_rating_ids, left_on=cmp_col_name, right_on='id_graph', how='outer')
+            data_info = data_info.dropna(subset=[cmp_col_name, 'id_graph'])
+            data_info = data_info.drop(columns=["id_graph"])
+            data_info = data_info.reset_index(drop=True)
+            # print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1]))
+            return data_info
+        else:
+            orig_info = orig_info.reset_index(drop=True)
+            return orig_info
+
+    def _load_raw_rates(self, file_path, sep):
+        """In MovieLens, the ratings have the following format
+
+        ml-100k
+        user id \t movie id \t rating \t timestamp
+
+        ml-1m/10m
+        UserID::MovieID::Rating::Timestamp
+
+        timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')
+
+        Parameters
+        ----------
+        file_path : str
+
+        Returns
+        -------
+        rating_info : pd.DataFrame
+        """
+        rating_info = pd.read_csv(
+            file_path, sep=sep, header=None,
+            names=['user_id', 'movie_id', 'rating', 'timestamp'],
+            dtype={'user_id': np.int32, 'movie_id' : np.int32,
+                   'rating': np.float32, 'timestamp': np.int64}, engine='python')
+        return rating_info
+
+    def _load_raw_user_info(self):
+        """In MovieLens, the user attributes file has the following formats:
+
+        ml-100k:
+        user id | age | gender | occupation | zip code
+
+        ml-1m:
+        UserID::Gender::Age::Occupation::Zip-code
+
+        For ml-10m, there is no user information. We read the user id from the rating file.
+
+        Parameters
+        ----------
+        name : str
+
+        Returns
+        -------
+        user_info : pd.DataFrame
+        """
+        if self._name == 'ml-100k':
+            self.user_info = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None,
+                                         names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
+        elif self._name == 'ml-1m':
+            self.user_info = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None,
+                                         names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
+        elif self._name == 'ml-10m':
+            rating_info = pd.read_csv(
+                os.path.join(self._dir, 'ratings.dat'), sep='::', header=None,
+                names=['user_id', 'movie_id', 'rating', 'timestamp'],
+                dtype={'user_id': np.int32, 'movie_id': np.int32, 'rating': np.float32,
+                       'timestamp': np.int64}, engine='python')
+            self.user_info = pd.DataFrame(np.unique(rating_info['user_id'].values.astype(np.int32)),
+                                          columns=['id'])
+        else:
+            raise NotImplementedError
+
+    def _process_user_fea(self):
+        """
+
+        Parameters
+        ----------
+        user_info : pd.DataFrame
+        name : str
+            For ml-100k and ml-1m, the column names are ['id', 'gender', 'age', 'occupation', 'zip_code'].
+            We take the age, gender, and the one-hot encoding of the occupation as the user features.
+            For ml-10m, there is no user feature and we set the feature to be a single zero.
+
+        Returns
+        -------
+        user_features : np.ndarray
+        """
+        if self._name == 'ml-100k' or self._name == 'ml-1m':
+            ages = self.user_info['age'].values.astype(np.float32)
+            gender = (self.user_info['gender'] == 'F').values.astype(np.float32)
+            all_occupations = set(self.user_info['occupation'])
+            occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
+            occupation_one_hot = np.zeros(shape=(self.user_info.shape[0], len(all_occupations)),
+                                          dtype=np.float32)
+            occupation_one_hot[np.arange(self.user_info.shape[0]),
+                               np.array([occupation_map[ele] for ele in self.user_info['occupation']])] = 1
+            user_features = np.concatenate([ages.reshape((self.user_info.shape[0], 1)) / 50.0,
+                                            gender.reshape((self.user_info.shape[0], 1)),
+                                            occupation_one_hot], axis=1)
+        elif self._name == 'ml-10m':
+            user_features = np.zeros(shape=(self.user_info.shape[0], 1), dtype=np.float32)
+        else:
+            raise NotImplementedError
+        return user_features
+
+    def _load_raw_movie_info(self):
+        """In MovieLens, the movie attributes may have the following formats:
+
+        In ml-100k:
+
+        movie id | movie title | release date | video release date | IMDb URL | [genres]
+
+        In ml-1m, ml-10m:
+
+        MovieID::Title (Release Year)::Genres
+
+        Also, genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy
+
+        Parameters
+        ----------
+        name : str
+
+        Returns
+        -------
+        movie_info : pd.DataFrame
+            For ml-100k, the columns are ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]
+            For ml-1m and ml-10m, the columns are ['id', 'title'] + [GENRES (18/20)]
+        """
+        if self._name == 'ml-100k':
+            GENRES = GENRES_ML_100K
+        elif self._name == 'ml-1m':
+            GENRES = GENRES_ML_1M
+        elif self._name == 'ml-10m':
+            GENRES = GENRES_ML_10M
+        else:
+            raise NotImplementedError
+
+        if self._name == 'ml-100k':
+            file_path = os.path.join(self._dir, 'u.item')
+            self.movie_info = pd.read_csv(file_path, sep='|', header=None,
+                                          names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
+                                          engine='python')
+        elif self._name == 'ml-1m' or self._name == 'ml-10m':
+            file_path = os.path.join(self._dir, 'movies.dat')
+            movie_info = pd.read_csv(file_path, sep='::', header=None,
+                                     names=['id', 'title', 'genres'], engine='python')
+            genre_map = {ele: i for i, ele in enumerate(GENRES)}
+            genre_map['Children\'s'] = genre_map['Children']
+            genre_map['Childrens'] = genre_map['Children']
+            movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32)
+            for i, genres in enumerate(movie_info['genres']):
+                for ele in genres.split('|'):
+                    if ele in genre_map:
+                        movie_genres[i, genre_map[ele]] = 1.0
+                    else:
+                        print('genres not found, filled with unknown: {}'.format(genres))
+                        movie_genres[i, genre_map['unknown']] = 1.0
+            for idx, genre_name in enumerate(GENRES):
+                assert idx == genre_map[genre_name]
+                movie_info[genre_name] = movie_genres[:, idx]
+            self.movie_info = movie_info.drop(columns=["genres"])
+        else:
+            raise NotImplementedError
+
+    def _process_movie_fea(self):
+        """
+
+        Parameters
+        ----------
+        movie_info : pd.DataFrame
+        name : str
+
+        Returns
+        -------
+        movie_features : np.ndarray
+            Generate movie features by concatenating embedding and the year
+        """
+        if self._name == 'ml-100k':
+            GENRES = GENRES_ML_100K
+        elif self._name == 'ml-1m':
+            GENRES = GENRES_ML_1M
+        elif self._name == 'ml-10m':
+            GENRES = GENRES_ML_10M
+        else:
+            raise NotImplementedError
+
+        word_embedding = nlp.embedding.GloVe('glove.840B.300d')
+        tokenizer = nlp.data.transforms.SpacyTokenizer()
+
+        title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32)
+        release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32)
+        p = re.compile(r'(.+)\s*\((\d+)\)')
+        for i, title in enumerate(self.movie_info['title']):
+            match_res = p.match(title)
+            if match_res is None:
+                print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
+                title_context, year = title, 1950
+            else:
+                title_context, year = match_res.groups()
+            # We use the average GloVe embedding of the title tokens.
+            title_embedding[i, :] = word_embedding[tokenizer(title_context)].asnumpy().mean(axis=0)
+            release_years[i] = float(year)
+        movie_features = np.concatenate((title_embedding,
+                                         (release_years - 1950.0) / 100.0,
+                                         self.movie_info[GENRES]),
+                                        axis=1)
+        return movie_features
+
+if __name__ == '__main__':
+    MovieLens("ml-100k", ctx=mx.cpu(), symm=True)
diff --git a/examples/mxnet/gcmc/model.py b/examples/mxnet/gcmc/model.py
new file mode 100644
index 000000000000..75171b36ee75
--- /dev/null
+++ b/examples/mxnet/gcmc/model.py
@@ -0,0 +1,233 @@
+"""NN modules"""
+import math
+
+import numpy as np
+import mxnet as mx
+import mxnet.ndarray as F
+from mxnet.gluon import nn, Block
+import dgl.function as fn
+
+from utils import get_activation
+
+class GCMCLayer(Block):
+    r"""GCMC layer
+
+    .. math::
+        z_i^{(l+1)} = \sigma_{agg}\left[\mathrm{agg}\left(
+        \sum_{j\in\mathcal{N}_1(i)}\frac{1}{c_{ij}}W_1h_j, \ldots,
+        \sum_{j\in\mathcal{N}_R(i)}\frac{1}{c_{ij}}W_Rh_j
+        \right)\right]
+
+    After that, apply an extra output projection:
+
+    .. math::
+        h_i^{(l+1)} = \sigma_{out}W_oz_i^{(l+1)}
+
+    The equation is applied to both user nodes and movie nodes and the parameters
+    are not shared unless ``share_user_item_param`` is true.
+
+    Parameters
+    ----------
+    rating_vals : list of int or float
+        Possible rating values.
+    user_in_units : int
+        Size of user input feature
+    movie_in_units : int
+        Size of movie input feature
+    msg_units : int
+        Size of message :math:`W_rh_j`
+    out_units : int
+        Size of final output user and movie features
+    dropout_rate : float, optional
+        Dropout rate (Default: 0.0)
+    agg : str, optional
+        Function to aggregate messages of different ratings.
+        Could be any of the supported cross type reducers:
+        "sum", "max", "min", "mean", "stack".
+        (Default: "stack")
+    agg_act : callable, str, optional
+        Activation function :math:`\sigma_{agg}`. (Default: None)
+    out_act : callable, str, optional
+        Activation function :math:`\sigma_{out}`. (Default: None)
+    share_user_item_param : bool, optional
+        If true, user node and movie node share the same set of parameters.
+        Requires ``user_in_units`` and ``movie_in_units`` to be the same.
+        (Default: False)
+    """
+    def __init__(self,
+                 rating_vals,
+                 user_in_units,
+                 movie_in_units,
+                 msg_units,
+                 out_units,
+                 dropout_rate=0.0,
+                 agg='stack',  # or 'sum'
+                 agg_act=None,
+                 out_act=None,
+                 share_user_item_param=False):
+        super(GCMCLayer, self).__init__()
+        self.rating_vals = rating_vals
+        self.agg = agg
+        self.share_user_item_param = share_user_item_param
+        if agg == 'stack':
+            # divide the original msg unit size by number of ratings to keep
+            # the dimensionality
+            assert msg_units % len(rating_vals) == 0
+            msg_units = msg_units // len(rating_vals)
+        with self.name_scope():
+            self.dropout = nn.Dropout(dropout_rate)
+            self.W_r = {}
+            for rating in rating_vals:
+                rating = str(rating)
+                if share_user_item_param and user_in_units == movie_in_units:
+                    self.W_r[rating] = self.params.get(
+                        'W_r_%s' % rating, shape=(user_in_units, msg_units),
+                        dtype=np.float32, allow_deferred_init=True)
+                    self.W_r['rev-%s' % rating] = self.W_r[rating]
+                else:
+                    self.W_r[rating] = self.params.get(
+                        'W_r_%s' % rating, shape=(user_in_units, msg_units),
+                        dtype=np.float32, allow_deferred_init=True)
+                    self.W_r['rev-%s' % rating] = self.params.get(
+                        'revW_r_%s' % rating, shape=(movie_in_units, msg_units),
+                        dtype=np.float32, allow_deferred_init=True)
+            self.ufc = nn.Dense(out_units)
+            if share_user_item_param:
+                self.ifc = self.ufc
+            else:
+                self.ifc = nn.Dense(out_units)
+            self.agg_act = get_activation(agg_act)
+            self.out_act = get_activation(out_act)
+
+    def forward(self, graph, ufeat=None, ifeat=None):
+        """Forward function
+
+        Normalizer constant :math:`c_{ij}` is stored as two node data fields
+        "ci" and "cj".
+
+        Parameters
+        ----------
+        graph : DGLHeteroGraph
+            User-movie rating graph. It should contain two node types: "user"
+            and "movie", and one edge type per rating value.
+        ufeat : mx.nd.NDArray, optional
+            User features. If None, an identity matrix is used.
+        ifeat : mx.nd.NDArray, optional
+            Movie features. If None, an identity matrix is used.
+
+        Returns
+        -------
+        new_ufeat : mx.nd.NDArray
+            New user features
+        new_ifeat : mx.nd.NDArray
+            New movie features
+        """
+        num_u = graph.number_of_nodes('user')
+        num_i = graph.number_of_nodes('movie')
+        funcs = {}
+        for i, rating in enumerate(self.rating_vals):
+            rating = str(rating)
+            # W_r * x
+            x_u = dot_or_identity(ufeat, self.W_r[rating].data())
+            x_i = dot_or_identity(ifeat, self.W_r['rev-%s' % rating].data())
+            # left norm and dropout
+            x_u = x_u * self.dropout(graph.nodes['user'].data['cj'])
+            x_i = x_i * self.dropout(graph.nodes['movie'].data['cj'])
+            graph.nodes['user'].data['h%d' % i] = x_u
+            graph.nodes['movie'].data['h%d' % i] = x_i
+            funcs[rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
+            funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
+        # message passing
+        graph.multi_update_all(funcs, self.agg)
+        ufeat = graph.nodes['user'].data.pop('h').reshape((num_u, -1))
+        ifeat = graph.nodes['movie'].data.pop('h').reshape((num_i, -1))
+        # right norm
+        ufeat = ufeat * graph.nodes['user'].data['ci']
+        ifeat = ifeat * graph.nodes['movie'].data['ci']
+        # fc and non-linear
+        ufeat = self.agg_act(ufeat)
+        ifeat = self.agg_act(ifeat)
+        ufeat = self.dropout(ufeat)
+        ifeat = self.dropout(ifeat)
+        ufeat = self.ufc(ufeat)
+        ifeat = self.ifc(ifeat)
+        return self.out_act(ufeat), self.out_act(ifeat)
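+
+# Construction sketch (comment only, not executed): how a GCMCLayer is wired.
+# The sizes follow train.py's defaults (gcn_agg_units=500, gcn_out_units=75,
+# gcn_dropout=0.7) and agg='stack' matches the README ml-100k run; `enc_graph`,
+# `num_users`, `num_movies`, `user_feature` and `movie_feature` are
+# placeholders.
+#
+#   layer = GCMCLayer(rating_vals=[1, 2, 3, 4, 5],
+#                     user_in_units=num_users, movie_in_units=num_movies,
+#                     msg_units=500, out_units=75,
+#                     dropout_rate=0.7, agg='stack')
+#   layer.initialize(ctx=mx.cpu())
+#   new_ufeat, new_ifeat = layer(enc_graph, user_feature, movie_feature)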
+ """ + graph = graph.local_var() + ufeat = self.dropout(ufeat) + ifeat = self.dropout(ifeat) + graph.nodes['movie'].data['h'] = ifeat + basis_out = [] + for i in range(self._num_basis_functions): + graph.nodes['user'].data['h'] = F.dot(ufeat, self.Ps[i].data()) + graph.apply_edges(fn.u_dot_v('h', 'h', 'sr')) + basis_out.append(graph.edata['sr'].expand_dims(1)) + out = F.concat(*basis_out, dim=1) + out = self.rate_out(out) + return out + +def dot_or_identity(A, B): + # if A is None, treat as identity matrix + if A is None: + return B + else: + return mx.nd.dot(A, B) diff --git a/examples/mxnet/gcmc/train.py b/examples/mxnet/gcmc/train.py new file mode 100644 index 000000000000..a43a9bdc4da9 --- /dev/null +++ b/examples/mxnet/gcmc/train.py @@ -0,0 +1,242 @@ +"""Training script""" +import os, time +import argparse +import logging +import random +import string +import numpy as np +import mxnet as mx +from mxnet import gluon +from data import MovieLens +from model import GCMCLayer, BiDecoder +from utils import get_activation, parse_ctx, gluon_net_info, gluon_total_param_num, \ + params_clip_global_norm, MetricLogger +from mxnet.gluon import Block + +class Net(Block): + def __init__(self, args, **kwargs): + super(Net, self).__init__(**kwargs) + self._act = get_activation(args.model_activation) + with self.name_scope(): + self.encoder = GCMCLayer(args.rating_vals, + args.src_in_units, + args.dst_in_units, + args.gcn_agg_units, + args.gcn_out_units, + args.gcn_dropout, + args.gcn_agg_accum, + agg_act=self._act, + share_user_item_param=args.share_param) + self.decoder = BiDecoder(args.rating_vals, + in_units=args.gcn_out_units, + num_basis_functions=args.gen_r_num_basis_func) + + def forward(self, enc_graph, dec_graph, ufeat, ifeat): + user_out, movie_out = self.encoder( + enc_graph, + ufeat, + ifeat) + pred_ratings = self.decoder(dec_graph, user_out, movie_out) + return pred_ratings + +def evaluate(args, net, dataset, segment='valid'): + possible_rating_values = dataset.possible_rating_values + nd_possible_rating_values = mx.nd.array(possible_rating_values, ctx=args.ctx, dtype=np.float32) + + if segment == "valid": + rating_values = dataset.valid_truths + enc_graph = dataset.valid_enc_graph + dec_graph = dataset.valid_dec_graph + elif segment == "test": + rating_values = dataset.test_truths + enc_graph = dataset.test_enc_graph + dec_graph = dataset.test_dec_graph + else: + raise NotImplementedError + + # Evaluate RMSE + with mx.autograd.predict_mode(): + pred_ratings = net(enc_graph, dec_graph, + dataset.user_feature, dataset.movie_feature) + real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) * + nd_possible_rating_values.reshape((1, -1))).sum(axis=1) + rmse = mx.nd.square(real_pred_ratings - rating_values).mean().asscalar() + rmse = np.sqrt(rmse) + return rmse + +def train(args): + print(args) + dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm, + test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio) + print("Loading data finished ...\n") + + args.src_in_units = dataset.user_feature_shape[1] + args.dst_in_units = dataset.movie_feature_shape[1] + args.rating_vals = dataset.possible_rating_values + + ### build the net + net = Net(args=args) + net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx) + net.hybridize() + nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32) + rating_loss_net = gluon.loss.SoftmaxCELoss() + rating_loss_net.hybridize() + trainer 
diff --git a/examples/mxnet/gcmc/train.py b/examples/mxnet/gcmc/train.py
new file mode 100644
index 000000000000..a43a9bdc4da9
--- /dev/null
+++ b/examples/mxnet/gcmc/train.py
@@ -0,0 +1,242 @@
+"""Training script"""
+import os, time
+import argparse
+import logging
+import random
+import string
+import numpy as np
+import mxnet as mx
+from mxnet import gluon
+from data import MovieLens
+from model import GCMCLayer, BiDecoder
+from utils import get_activation, parse_ctx, gluon_net_info, gluon_total_param_num, \
+    params_clip_global_norm, MetricLogger
+from mxnet.gluon import Block
+
+class Net(Block):
+    def __init__(self, args, **kwargs):
+        super(Net, self).__init__(**kwargs)
+        self._act = get_activation(args.model_activation)
+        with self.name_scope():
+            self.encoder = GCMCLayer(args.rating_vals,
+                                     args.src_in_units,
+                                     args.dst_in_units,
+                                     args.gcn_agg_units,
+                                     args.gcn_out_units,
+                                     args.gcn_dropout,
+                                     args.gcn_agg_accum,
+                                     agg_act=self._act,
+                                     share_user_item_param=args.share_param)
+            self.decoder = BiDecoder(args.rating_vals,
+                                     in_units=args.gcn_out_units,
+                                     num_basis_functions=args.gen_r_num_basis_func)
+
+    def forward(self, enc_graph, dec_graph, ufeat, ifeat):
+        user_out, movie_out = self.encoder(
+            enc_graph,
+            ufeat,
+            ifeat)
+        pred_ratings = self.decoder(dec_graph, user_out, movie_out)
+        return pred_ratings
+
+def evaluate(args, net, dataset, segment='valid'):
+    possible_rating_values = dataset.possible_rating_values
+    nd_possible_rating_values = mx.nd.array(possible_rating_values, ctx=args.ctx, dtype=np.float32)
+
+    if segment == "valid":
+        rating_values = dataset.valid_truths
+        enc_graph = dataset.valid_enc_graph
+        dec_graph = dataset.valid_dec_graph
+    elif segment == "test":
+        rating_values = dataset.test_truths
+        enc_graph = dataset.test_enc_graph
+        dec_graph = dataset.test_dec_graph
+    else:
+        raise NotImplementedError
+
+    # Evaluate RMSE
+    with mx.autograd.predict_mode():
+        pred_ratings = net(enc_graph, dec_graph,
+                           dataset.user_feature, dataset.movie_feature)
+        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
+                             nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
+        rmse = mx.nd.square(real_pred_ratings - rating_values).mean().asscalar()
+        rmse = np.sqrt(rmse)
+    return rmse
+
+def train(args):
+    print(args)
+    dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
+                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
+    print("Loading data finished ...\n")
+
+    args.src_in_units = dataset.user_feature_shape[1]
+    args.dst_in_units = dataset.movie_feature_shape[1]
+    args.rating_vals = dataset.possible_rating_values
+
+    ### build the net
+    net = Net(args=args)
+    net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
+    net.hybridize()
+    nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
+    rating_loss_net = gluon.loss.SoftmaxCELoss()
+    rating_loss_net.hybridize()
+    trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
+    print("Loading network finished ...\n")
+
+    ### prepare training data
+    train_gt_labels = dataset.train_labels
+    train_gt_ratings = dataset.train_truths
+
+    ### prepare the logger
+    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
+                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
+    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
+                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
+    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
+                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))
+
+    ### declare the loss information
+    best_valid_rmse = np.inf
+    no_better_valid = 0
+    best_iter = -1
+    avg_gnorm = 0
+    count_rmse = 0
+    count_num = 0
+    count_loss = 0
+
+    print("Start training ...")
+    dur = []
+    for iter_idx in range(1, args.train_max_iter):
+        if iter_idx > 3:
+            t0 = time.time()
+        with mx.autograd.record():
+            pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
+                               dataset.user_feature, dataset.movie_feature)
+            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
+            loss.backward()
+
+        count_loss += loss.asscalar()
+        gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
+        avg_gnorm += gnorm
+        trainer.step(1.0)
+        if iter_idx > 3:
+            dur.append(time.time() - t0)
+
+        if iter_idx == 1:
+            print("Total #Param of net: %d" % (gluon_total_param_num(net)))
+            print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))
+
+        real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
+                             nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
+        rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
+        count_rmse += rmse.asscalar()
+        count_num += pred_ratings.shape[0]
+
+        if iter_idx % args.train_log_interval == 0:
+            train_loss_logger.log(iter=iter_idx,
+                                  loss=count_loss/iter_idx, rmse=count_rmse/count_num)
+            logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
+                iter_idx, avg_gnorm/args.train_log_interval,
+                count_loss/iter_idx, count_rmse/count_num,
+                np.average(dur))
+            avg_gnorm = 0
+            count_rmse = 0
+            count_num = 0
+
+        if iter_idx % args.train_valid_interval == 0:
+            valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
+            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
+            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
+
+            if valid_rmse < best_valid_rmse:
+                best_valid_rmse = valid_rmse
+                no_better_valid = 0
+                best_iter = iter_idx
+                #net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
+                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
+                best_test_rmse = test_rmse
+                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
+                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
+            else:
+                no_better_valid += 1
Stop training.") + break + if no_better_valid > args.train_decay_patience: + new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr) + if new_lr < trainer.learning_rate: + logging.info("\tChange the LR to %g" % new_lr) + trainer.set_learning_rate(new_lr) + no_better_valid = 0 + if iter_idx % args.train_log_interval == 0: + print(logging_str) + print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format( + best_iter, best_valid_rmse, best_test_rmse)) + train_loss_logger.close() + valid_loss_logger.close() + test_loss_logger.close() + + +def config(): + parser = argparse.ArgumentParser(description='Run the baseline method.') + + parser.add_argument('--seed', default=123, type=int) + parser.add_argument('--ctx', dest='ctx', default='gpu0', type=str, + help='Running Context. E.g `--ctx gpu` or `--ctx gpu0,gpu1` or `--ctx cpu`') + parser.add_argument('--save_dir', type=str, help='The saving directory') + parser.add_argument('--save_id', type=int, help='The saving log id') + parser.add_argument('--silent', action='store_true') + + parser.add_argument('--data_name', default='ml-1m', type=str, + help='The dataset name: ml-100k, ml-1m, ml-10m') + parser.add_argument('--data_test_ratio', type=float, default=0.1) ## for ml-100k the test ration is 0.2 + parser.add_argument('--data_valid_ratio', type=float, default=0.1) + parser.add_argument('--use_one_hot_fea', action='store_true', default=False) + + #parser.add_argument('--model_remove_rating', type=bool, default=False) + parser.add_argument('--model_activation', type=str, default="leaky") + + parser.add_argument('--gcn_dropout', type=float, default=0.7) + parser.add_argument('--gcn_agg_norm_symm', type=bool, default=True) + parser.add_argument('--gcn_agg_units', type=int, default=500) + parser.add_argument('--gcn_agg_accum', type=str, default="sum") + parser.add_argument('--gcn_out_units', type=int, default=75) + + parser.add_argument('--gen_r_num_basis_func', type=int, default=2) + + # parser.add_argument('--train_rating_batch_size', type=int, default=10000) + parser.add_argument('--train_max_iter', type=int, default=2000) + parser.add_argument('--train_log_interval', type=int, default=1) + parser.add_argument('--train_valid_interval', type=int, default=1) + parser.add_argument('--train_optimizer', type=str, default="adam") + parser.add_argument('--train_grad_clip', type=float, default=1.0) + parser.add_argument('--train_lr', type=float, default=0.01) + parser.add_argument('--train_min_lr', type=float, default=0.001) + parser.add_argument('--train_lr_decay_factor', type=float, default=0.5) + parser.add_argument('--train_decay_patience', type=int, default=50) + parser.add_argument('--train_early_stopping_patience', type=int, default=100) + parser.add_argument('--share_param', default=False, action='store_true') + + args = parser.parse_args() + args.ctx = parse_ctx(args.ctx)[0] + + + ### configure save_fir to save all the info + if args.save_dir is None: + args.save_dir = args.data_name+"_" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=2)) + if args.save_id is None: + args.save_id = np.random.randint(20) + args.save_dir = os.path.join("log", args.save_dir) + if not os.path.isdir(args.save_dir): + os.makedirs(args.save_dir) + + return args + + +if __name__ == '__main__': + args = config() + np.random.seed(args.seed) + mx.random.seed(args.seed, args.ctx) + train(args) diff --git a/examples/mxnet/gcmc/utils.py b/examples/mxnet/gcmc/utils.py new file mode 100644 index 
diff --git a/examples/mxnet/gcmc/utils.py b/examples/mxnet/gcmc/utils.py
new file mode 100644
index 000000000000..b56e37656d66
--- /dev/null
+++ b/examples/mxnet/gcmc/utils.py
@@ -0,0 +1,79 @@
+import ast
+import os
+import csv
+import inspect
+import logging
+import re
+import mxnet.ndarray as nd
+from mxnet import gluon
+from mxnet.gluon import nn
+import mxnet as mx
+import numpy as np
+from collections import OrderedDict
+
+class MetricLogger(object):
+    def __init__(self, attr_names, parse_formats, save_path):
+        self._attr_format_dict = OrderedDict(zip(attr_names, parse_formats))
+        self._file = open(save_path, 'w')
+        self._csv = csv.writer(self._file)
+        self._csv.writerow(attr_names)
+        self._file.flush()
+
+    def log(self, **kwargs):
+        self._csv.writerow([parse_format % kwargs[attr_name]
+                            for attr_name, parse_format in self._attr_format_dict.items()])
+        self._file.flush()
+
+    def close(self):
+        self._file.close()
+
+def parse_ctx(ctx_args):
+    ctx = re.findall(r'([a-z]+)(\d*)', ctx_args)
+    ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx]
+    ctx = [mx.Context(*ele) for ele in ctx]
+    return ctx
+
+
+def gluon_total_param_num(net):
+    return sum([np.prod(v.shape) for v in net.collect_params().values()])
+
+
+def gluon_net_info(net, save_path=None):
+    info_str = 'Total Param Number: {}\n'.format(gluon_total_param_num(net)) +\
+               'Params:\n'
+    for k, v in net.collect_params().items():
+        info_str += '\t{}: {}, {}\n'.format(k, v.shape, np.prod(v.shape))
+    info_str += str(net)
+    if save_path is not None:
+        with open(save_path, 'w') as f:
+            f.write(info_str)
+    return info_str
+
+
+def params_clip_global_norm(param_dict, clip, ctx):
+    grads = [p.grad(ctx) for p in param_dict.values()]
+    gnorm = gluon.utils.clip_global_norm(grads, clip)
+    return gnorm
+
+def get_activation(act):
+    """Get the activation based on the act string
+
+    Parameters
+    ----------
+    act: str or HybridBlock
+
+    Returns
+    -------
+    ret: HybridBlock
+    """
+    if act is None:
+        return lambda x: x
+    if isinstance(act, str):
+        if act == 'leaky':
+            return nn.LeakyReLU(0.1)
+        elif act in ['relu', 'sigmoid', 'tanh', 'softrelu', 'softsign']:
+            return nn.Activation(act)
+        else:
+            raise NotImplementedError
+    else:
+        return act
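+
+if __name__ == '__main__':
+    # Smoke test (illustrative only): exercise the helpers above with their
+    # documented inputs, mirroring how data.py ships its own __main__ check.
+    assert [c.device_type for c in parse_ctx('cpu')] == ['cpu']
+    act = get_activation('leaky')
+    print(act)  # LeakyReLU with slope 0.1
+    identity = get_activation(None)
+    x = mx.nd.array([-1., 2.])
+    print(identity(x))  # input passes through unchanged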