In [1]:
import time
import datetime
# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


In [2]:
"""MovieLens dataset"""

import os
import scipy.sparse as sp

import numpy as np
import pandas as pd
import torch as th

import dgl 
from dgl.data.utils import download, extract_archive, get_download_dir
from refex import extract_refex_feature
import utils

_urls = {
    'ml-100k' : 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
    'ml-1m' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
}

GENRES_ML_100K =\
    ['unknown', 'Action', 'Adventure', 'Animation',
     'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
     'Thriller', 'War', 'Western']
GENRES_ML_1M = GENRES_ML_100K[1:]

Using backend: pytorch


In [3]:
class MovieLens(object):
    """MovieLens dataset used by GCMC model
    """
    def __init__(self, data_name, testing=False, 
                 test_ratio=0.1, valid_ratio=0.2):
        # self._data_name = data_name

        # # 1. download and extract
        # download_dir = get_download_dir()
        # self._dir = os.path.join(download_dir, data_name, data_name)
        # if not os.path.exists(self._dir):
        #     zip_file_path = '{}/{}.zip'.format(download_dir, data_name)
        #     download(_urls[data_name], path=zip_file_path)
        #     extract_archive(zip_file_path, '{}/{}'.format(download_dir, data_name))
        
        # print("Starting processing {} ...".format(self._data_name))

        # # 2. load rating data
        # if self._data_name == 'ml-100k':
        #     train_rating_data = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t')
        #     test_rating_data = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t')
        #     all_rating_data = pd.concat([train_rating_data, test_rating_data])
        # elif self._data_name == 'ml-1m':
        #     all_rating_data = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::')
        #     num_test = int(np.ceil(all_rating_data.shape[0] * test_ratio))
        #     shuffled_idx = np.random.permutation(all_rating_data.shape[0])
        #     test_rating_data = all_rating_data.iloc[shuffled_idx[: num_test]]
        #     train_rating_data = all_rating_data.iloc[shuffled_idx[num_test: ]]
        # else:
        #     raise NotImplementedError
        # num_valid = int(np.ceil(train_rating_data.shape[0] * valid_ratio))
        # shuffled_idx = np.random.permutation(train_rating_data.shape[0])
        # valid_rating_data = train_rating_data.iloc[shuffled_idx[: num_valid]]
        # if not testing:
        #     train_rating_data = train_rating_data.iloc[shuffled_idx[num_valid:]]

        # self._rating = np.sort(np.unique(all_rating_data["rating"].values))
        
        # print("All rating pairs : {}".format(all_rating_data.shape[0]))
        # # print("\tAll train rating pairs : {}".format(self.all_train_rating_data.shape[0]))
        # print("\tTrain rating pairs : {}".format(train_rating_data.shape[0]))
        # print("\tValid rating pairs : {}".format(valid_rating_data.shape[0]))
        # print("\tTest rating pairs  : {}".format(test_rating_data.shape[0]))

        # # 2. load user and movie data, and drop those unseen in rating_data
        # user_data = self._load_raw_user_data()
        # movie_data = self._load_raw_movie_data()
        # user_data = self._drop_unseen_nodes(data_df=user_data,
        #                                     col_name="id",
        #                                     reserved_ids_set=set(all_rating_data["user_id"].values))
        # movie_data = self._drop_unseen_nodes(data_df=movie_data,
        #                                     col_name="id",
        #                                     reserved_ids_set=set(all_rating_data["movie_id"].values))

        # # 3. set user and movie feature to None
        # user_feature = None
        # movie_feature = None

        # # 4. generate rating pairs
        # # Map user/movie to the global id
        # self._global_user_id_map = {ele: i for i, ele in enumerate(user_data['id'])}
        # self._global_movie_id_map = {ele: i for i, ele in enumerate(movie_data['id'])}
        # print('Total user number = {}, movie number = {}'.format(len(self._global_user_id_map),
        #                                                          len(self._global_movie_id_map)))
        # self._num_user = len(self._global_user_id_map)
        # self._num_movie = len(self._global_movie_id_map)

        # # pair value is idx rather than id, and rating value starts from 1.0
        # # self.all_train_rating_pairs, self.all_train_rating_values = self._generate_pair_value(self.all_train_rating_data)
        # train_u_indices, train_v_indices, train_labels = self._generate_pair_value(train_rating_data)
        # val_u_indices, val_v_indices, val_labels = self._generate_pair_value(valid_rating_data)
        # test_u_indices, test_v_indices, test_labels = self._generate_pair_value(test_rating_data)

        if data_name == 'ml-100k':
            print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
            (
                u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices,
                val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, 
                test_v_indices, class_values
            ) = load_official_trainvaltest_split(
                'ml-100k', testing, None, None, 1.0
            )
        elif data_name == 'ml-1m':
            data_seed = 1234
            datasplit_path = (
                'raw_data/' + data_name + '/split_seed' + str(data_seed) + 
                '.pickle'
            )
            print("Using random dataset split ...")
            (
                u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices,
                val_labels, val_u_indices, val_v_indices, test_labels, test_u_indices, 
                test_v_indices, class_values
            ) = create_trainvaltest_split(
                'ml-1m', 1234, testing, datasplit_path, True, True, None, 
                None, 1.0
            )
            
        self._num_user = u_features.shape[0]
        self._num_movie = v_features.shape[0]

        # reindex u and v, v nodes start after u
        train_v_indices += self.num_user
        val_v_indices += self.num_user
        test_v_indices += self.num_user

        self.train_rating_pairs = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
        self.valid_rating_pairs = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
        self.test_rating_pairs = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
        self.train_rating_values = th.FloatTensor(train_labels)
        self.valid_rating_values = th.FloatTensor(val_labels)
        self.test_rating_values = th.FloatTensor(test_labels)

        print("\tTrain rating pairs : {}".format(len(train_labels)))
        print("\tValid rating pairs : {}".format(len(val_labels)))
        print("\tTest rating pairs  : {}".format(len(test_labels)))

        # build dgl graph object, which is homogeneous and bidirectional and contains only training edges
        self.train_graph = dgl.graph((th.cat([self.train_rating_pairs[0], self.train_rating_pairs[1]]), 
                                      th.cat([self.train_rating_pairs[1], self.train_rating_pairs[0]])))
        self.train_graph.edata['etype'] = th.cat([self.train_rating_values, self.train_rating_values]).to(th.long)
                    
        # # add refex feature
        # refex_feature = extract_refex_feature(self.train_graph)
        # print("refex feature shape: {}".format(refex_feature.numpy().shape))
        # self.train_graph.ndata['refex'] = refex_feature

        # # add gdv feature
        # gdv_feature = np.loadtxt('./{}.gdv'.format(data_name), dtype=np.float32)
        # print("gdv feature shape: {}".format(gdv_feature.shape))
        # gdv_feature = utils.MinMaxScaling(gdv_feature, axis=0)
        # self.train_graph.ndata['gdv'] = th.from_numpy(gdv_feature)

    @property
    def num_rating(self):
        return self._rating.size

    @property
    def num_user(self):
        return self._num_user

    @property
    def num_movie(self):
        return self._num_movie

    def _load_raw_user_data(self):
        """In MovieLens, the user attributes file have the following formats:

        ml-100k:
        user id | age | gender | occupation | zip code

        ml-1m:
        UserID::Gender::Age::Occupation::Zip-code

        Parameters
        ----------
        name : str

        Returns
        -------
        user_data : pd.DataFrame
        """
        if self._data_name == 'ml-100k':
            user_data = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None,
                                    names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
        elif self._data_name == 'ml-1m':
            user_data = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None,
                                    names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
        else:
            raise NotImplementedError
        return user_data

    def _load_raw_movie_data(self):
        """In MovieLens, the movie attributes may have the following formats:

        In ml-100k:

        movie id | movie title | release date | video release date | IMDb URL | [genres]

        In ml-1m, ml-10m:

        MovieID::Title (Release Year)::Genres

        Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy

        Parameters
        ----------
        name : str

        Returns
        -------
        movie_data : pd.DataFrame
            For ml-100k, the column name is ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]]
            For ml-1m, the column name is ['id', 'title'] + [GENRES (18/20)]]
        """
        if self._data_name == 'ml-100k':
            GENRES = GENRES_ML_100K
        elif self._data_name == 'ml-1m':
            GENRES = GENRES_ML_1M
        else:
            raise NotImplementedError

        if self._data_name == 'ml-100k':
            file_path = os.path.join(self._dir, 'u.item')
            movie_data = pd.read_csv(file_path, sep='|', header=None,
                                          names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
                                          engine='python')
        elif self._data_name == 'ml-1m':
            file_path = os.path.join(self._dir, 'movies.dat')
            movie_data = pd.read_csv(file_path, sep='::', header=None,
                                     names=['id', 'title', 'genres'], engine='python')
            genre_map = {ele: i for i, ele in enumerate(GENRES)}
            genre_map['Children\'s'] = genre_map['Children']
            genre_map['Childrens'] = genre_map['Children']
            movie_genres = np.zeros(shape=(movie_data.shape[0], len(GENRES)), dtype=np.float32)
            for i, genres in enumerate(movie_data['genres']):
                for ele in genres.split('|'):
                    if ele in genre_map:
                        movie_genres[i, genre_map[ele]] = 1.0
                    else:
                        print('genres not found, filled with unknown: {}'.format(genres))
                        movie_genres[i, genre_map['unknown']] = 1.0
            for idx, genre_name in enumerate(GENRES):
                assert idx == genre_map[genre_name]
                movie_data[genre_name] = movie_genres[:, idx]
            movie_data = movie_data.drop(columns=["genres"])
        else:
            raise NotImplementedError
        return movie_data

    def _load_raw_rates(self, file_path, sep):
        """In MovieLens, the rates have the following format

        ml-100k
        user id \t movie id \t rating \t timestamp

        ml-1m/10m
        UserID::MovieID::Rating::Timestamp

        timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')

        Parameters
        ----------
        file_path : str

        Returns
        -------
        rating_data : pd.DataFrame
        """
        rating_data = pd.read_csv(
            file_path, sep=sep, header=None,
            names=['user_id', 'movie_id', 'rating', 'timestamp'],
            dtype={'user_id': np.int32, 'movie_id' : np.int32,
                   'ratings': np.float32, 'timestamp': np.int64}, engine='python')
        return rating_data

    def _drop_unseen_nodes(self, data_df, col_name, reserved_ids_set):
        data_df = data_df[data_df[col_name].isin(reserved_ids_set)]
        data_df.reset_index(drop=True, inplace=True)
        return data_df

    def _generate_pair_value(self, rating_data):
        rating_pairs = (np.array([self._global_user_id_map[ele] for ele in rating_data["user_id"]],
                                 dtype=np.int32),
                        np.array([self._global_movie_id_map[ele] for ele in rating_data["movie_id"]],
                                 dtype=np.int32))
        # label ranges from 0. to 4.
        rating_values = rating_data["rating"].values.astype(np.float32) - 1.
        return rating_pairs[0], rating_pairs[1], rating_values

In [4]:
import os
import random
import pickle as pkl
import pandas as pd
import numpy as np
import scipy.sparse as sp

# For automatic dataset downloading
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

In [65]:
def map_data(data):
    """
    Map data to proper indices in case they are not in a continues [0, N) range
    Parameters
    ----------
    data : np.int32 arrays
    Returns
    -------
    mapped_data : np.int32 arrays
    n : length of mapped_data
    """
    uniq = list(set(data))

    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array([id_dict[x] for x in data])
    n = len(uniq)

    return data, id_dict, n

def download_dataset(dataset, files, data_dir):
    """ Downloads dataset if files are not present. """

    if not np.all([os.path.isfile(data_dir + f) for f in files]):
        url = "http://files.grouplens.org/datasets/movielens/" + dataset.replace('_', '-') + '.zip'
        request = urlopen(url)

        print('Downloading %s dataset' % dataset)

        if dataset in ['ml-100k', 'ml-1m']:
            target_dir = 'raw_data/' + dataset.replace('_', '-')
        elif dataset == 'ml-10m':
            target_dir = 'raw_data/' + 'ml-10M100K'
        else:
            raise ValueError('Invalid dataset option %s' % dataset)

        with ZipFile(BytesIO(request.read())) as zip_ref:
            zip_ref.extractall('raw_data/')

        os.rename(target_dir, data_dir)
        #shutil.rmtree(target_dir)

def load_data(fname, seed=1234, verbose=True):
    """ Loads dataset and creates adjacency matrix
    and feature matrix
    Parameters
    ----------
    fname : str, dataset
    seed: int, dataset shuffling seed
    verbose: to print out statements or not
    Returns
    -------
    num_users : int
        Number of users and items respectively
    num_items : int
    u_nodes : np.int32 arrays
        User indices
    v_nodes : np.int32 array
        item (movie) indices
    ratings : np.float32 array
        User/item ratings s.t. ratings[k] is the rating given by user u_nodes[k] to
        item v_nodes[k]. Note that that the all pairs u_nodes[k]/v_nodes[k] are unique, but
        not necessarily all u_nodes[k] or all v_nodes[k] separately.
    u_features: np.float32 array, or None
        If present in dataset, contains the features of the users.
    v_features: np.float32 array, or None
        If present in dataset, contains the features of the users.
    seed: int,
        For datashuffling seed with pythons own random.shuffle, as in CF-NADE.
    """

    u_features = None
    v_features = None

    print('Loading dataset', fname)

    data_dir = 'raw_data/' + fname

    if fname == 'ml-100k':

        # Check if files exist and download otherwise
        files = ['/u.data', '/u.item', '/u.user']

        download_dataset(fname, files, data_dir)

        sep = '\t'
        filename = data_dir + files[0]

        dtypes = {
            'u_nodes': np.int32, 'v_nodes': np.int32,
            'ratings': np.float32, 'timestamp': np.float64}

        data = pd.read_csv(
            filename, sep=sep, header=None,
            names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

        # shuffle here like cf-nade paper with python's own random class
        # make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
        data_array = data.values.tolist()
        random.seed(seed)
        random.shuffle(data_array)
        data_array = np.array(data_array)

        u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
        v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
        ratings = data_array[:, 2].astype(dtypes['ratings'])

        u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
        v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

        u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
        ratings = ratings.astype(np.float64)

        # Movie features (genres)
        sep = r'|'
        movie_file = data_dir + files[1]
        movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
                         'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
                         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                         'Thriller', 'War', 'Western']
        movie_df = pd.read_csv(movie_file, sep=sep, header=None, 
                               names=movie_headers, encoding='ISO-8859-1')

        genre_headers = movie_df.columns.values[6:]
        num_genres = genre_headers.shape[0]

        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
            # Check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                v_features[v_dict[movie_id], :] = g_vec

        # User features

        sep = r'|'
        users_file = data_dir + files[2]
        users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
        users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')

        occupation = set(users_df['occupation'].values.tolist())

        gender_dict = {'M': 0., 'F': 1.}
        occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

        num_feats = 2 + len(occupation_dict)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user id']
            if u_id in u_dict.keys():
                # age
                u_features[u_dict[u_id], 0] = row['age']
                # gender
                u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
                # occupation
                u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

        u_features = sp.csr_matrix(u_features)
        v_features = sp.csr_matrix(v_features)

    elif fname == 'ml-1m':

        # Check if files exist and download otherwise
        files = ['/ratings.dat', '/movies.dat', '/users.dat']
        download_dataset(fname, files, data_dir)

        sep = r'\:\:'
        filename = data_dir + files[0]

        dtypes = {
            'u_nodes': np.int64, 'v_nodes': np.int64,
            'ratings': np.float32, 'timestamp': np.float64}

        # use engine='python' to ignore warning about switching to python backend when using regexp for sep
        data = pd.read_csv(filename, sep=sep, header=None,
                           names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], converters=dtypes, engine='python')

        # shuffle here like cf-nade paper with python's own random class
        # make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
        data_array = data.values.tolist()
        random.seed(seed)
        random.shuffle(data_array)
        data_array = np.array(data_array)

        u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
        v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
        ratings = data_array[:, 2].astype(dtypes['ratings'])

        u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
        v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

        u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int64)
        ratings = ratings.astype(np.float32)

        # Load movie features
        movies_file = data_dir + files[1]
        movies_headers = ['movie_id', 'title', 'genre']
        movies_df = pd.read_csv(movies_file, sep=sep, header=None,
                                names=movies_headers, engine='python', encoding='ISO-8859-1')

        # Extracting all genres
        genres = []
        for s in movies_df['genre'].values:
            genres.extend(s.split('|'))

        genres = list(set(genres))
        num_genres = len(genres)

        genres_dict = {g: idx for idx, g in enumerate(genres)}

        # Creating 0 or 1 valued features for all genres
        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, s in zip(movies_df['movie_id'].values.tolist(), movies_df['genre'].values.tolist()):
            # Check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                gen = s.split('|')
                for g in gen:
                    v_features[v_dict[movie_id], genres_dict[g]] = 1.

        # Load user features
        users_file = data_dir + files[2]
        users_headers = ['user_id', 'gender', 'age', 'occupation', 'zip-code']
        users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')

        # Extracting all features
        cols = users_df.columns.values[1:]

        cntr = 0
        feat_dicts = []
        for header in cols:
            d = dict()
            feats = np.unique(users_df[header].values).tolist()
            d.update({f: i for i, f in enumerate(feats, start=cntr)})
            feat_dicts.append(d)
            cntr += len(d)

        num_feats = sum(len(d) for d in feat_dicts)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user_id']
            if u_id in u_dict.keys():
                for k, header in enumerate(cols):
                    u_features[u_dict[u_id], feat_dicts[k][row[header]]] = 1.

        u_features = sp.csr_matrix(u_features)
        v_features = sp.csr_matrix(v_features)

    elif fname == 'ml-10m':

        # Check if files exist and download otherwise
        files = ['/ratings.dat']
        download_dataset(fname, files, data_dir)

        sep = r'\:\:'

        filename = data_dir + files[0]

        dtypes = {
            'u_nodes': np.int64, 'v_nodes': np.int64,
            'ratings': np.float32, 'timestamp': np.float64}

        # use engine='python' to ignore warning about switching to python backend when using regexp for sep
        data = pd.read_csv(filename, sep=sep, header=None,
                           names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], converters=dtypes, engine='python')

        # shuffle here like cf-nade paper with python's own random class
        # make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
        data_array = data.values.tolist()
        random.seed(seed)
        random.shuffle(data_array)
        data_array = np.array(data_array)

        u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
        v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
        ratings = data_array[:, 2].astype(dtypes['ratings'])

        u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
        v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

        u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int64)
        ratings = ratings.astype(np.float32)

    else:
        raise ValueError('Dataset name not recognized: ' + fname)

    if verbose:
        print('Number of users = %d' % num_users)
        print('Number of items = %d' % num_items)
        print('Number of links = %d' % ratings.shape[0])
        print('Fraction of positive links = %.4f' % (float(ratings.shape[0]) / (num_users * num_items),))

    return num_users, num_items, u_nodes_ratings, v_nodes_ratings, ratings, u_features, v_features

# 'ml-1m'에서 사용
def create_trainvaltest_split(seed=1234, testing=False, datasplit_path=None, 
                              datasplit_from_file=False, verbose=True, rating_map=None, 
                              post_rating_map=None, ratio=1.0):
    """
    Splits data set into train/val/test sets from full bipartite adjacency matrix. Shuffling of dataset is done in
    load_data function.
    For each split computes 1-of-num_classes labels. Also computes training
    adjacency matrix.
    """

    if datasplit_from_file and os.path.isfile(datasplit_path):
        print('Reading dataset splits from file...')
        with open(datasplit_path, 'rb') as f:
            num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = pkl.load(f)

        if verbose:
            print('Number of users = %d' % num_users)
            print('Number of items = %d' % num_items)
            print('Number of links = %d' % ratings.shape[0])
            print('Fraction of positive links = %.4f' % (float(ratings.shape[0]) / (num_users * num_items),))

    else:
        num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = load_data(dataset, seed=seed,
                                                                                            verbose=verbose)

        with open(datasplit_path, 'wb') as f:
            pkl.dump([num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features], f)

    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    neutral_rating = -1

    rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])
    labels = labels.reshape([-1])

    # number of test and validation edges
    num_test = int(np.ceil(ratings.shape[0] * 0.1))
    if dataset == 'ml-100k':
        num_val = int(np.ceil(ratings.shape[0] * 0.9 * 0.05))
    else:
        num_val = int(np.ceil(ratings.shape[0] * 0.9 * 0.05))

    num_train = ratings.shape[0] - num_val - num_test

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])

    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    train_idx = idx_nonzero[0:int(num_train*ratio)]
    val_idx = idx_nonzero[num_train:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    train_pairs_idx = pairs_nonzero[0:int(num_train*ratio)]
    val_pairs_idx = pairs_nonzero[num_train:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])

    class_values = np.sort(np.unique(ratings))

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    else:
        rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

    return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
        val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

# 'ml-100k'에서 사용
def load_official_trainvaltest_split(dataset, testing=False, rating_map=None, post_rating_map=None, ratio=1.0):
    """
    Loads official train/test split and uses 10% of training samples for validaiton
    For each split computes 1-of-num_classes labels. Also computes training
    adjacency matrix. Assumes flattening happens everywhere in row-major fashion.
    """

    sep = '\t'

    # Check if files exist and download otherwise
    files = ['/u1.base', '/u1.test', '/u.item', '/u.user']
    fname = dataset
    data_dir = 'raw_data/' + fname

    download_dataset(fname, files, data_dir)

    dtypes = {
        'u_nodes': np.int32, 'v_nodes': np.int32,
        'ratings': np.float32, 'timestamp': np.float64}

    filename_train = 'raw_data/' + dataset + '/u1.base'
    filename_test = 'raw_data/' + dataset + '/u1.test'

    data_train = pd.read_csv(
        filename_train, sep=sep, header=None,
        names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

    data_test = pd.read_csv(
        filename_test, sep=sep, header=None,
        names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

    data_array_train = data_train.values.tolist()
    data_array_train = np.array(data_array_train)
    data_array_test = data_test.values.tolist()
    data_array_test = np.array(data_array_test)

    if ratio < 1.0:
        data_array_train = data_array_train[data_array_train[:, -1].argsort()[:int(ratio*len(data_array_train))]]

    data_array = np.concatenate([data_array_train, data_array_test], axis=0)

    u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
    v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
    ratings = data_array[:, 2].astype(dtypes['ratings'])
    if rating_map is not None:
        for i, x in enumerate(ratings):
            ratings[i] = rating_map[x]

    u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
    v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

    u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
    ratings = ratings.astype(np.float64)

    u_nodes = u_nodes_ratings
    v_nodes = v_nodes_ratings

    neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

    # assumes that ratings_train contains at least one example of every rating type
    rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

    labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
    labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

    for i in range(len(u_nodes)):
        assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

    labels = labels.reshape([-1])

    # number of test and validation edges, see cf-nade code

    num_train = data_array_train.shape[0]
    num_test = data_array_test.shape[0]
    num_val = int(np.ceil(num_train * 0.2))
    num_train = num_train - num_val

    pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
    idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

    for i in range(len(ratings)):
        assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

    idx_nonzero_train = idx_nonzero[0:num_train+num_val]
    idx_nonzero_test = idx_nonzero[num_train+num_val:]

    pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
    pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

    # Internally shuffle training set (before splitting off validation set)
    rand_idx = list(range(len(idx_nonzero_train)))
    np.random.seed(1234)
    np.random.shuffle(rand_idx)
    idx_nonzero_train = idx_nonzero_train[rand_idx]
    pairs_nonzero_train = pairs_nonzero_train[rand_idx]

    idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
    pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

    val_idx = idx_nonzero[0:num_val]
    train_idx = idx_nonzero[num_val:num_train + num_val]
    test_idx = idx_nonzero[num_train + num_val:]

    assert(len(test_idx) == num_test)

    val_pairs_idx = pairs_nonzero[0:num_val]
    train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
    test_pairs_idx = pairs_nonzero[num_train + num_val:]

    u_test_idx, v_test_idx = test_pairs_idx.transpose()
    u_val_idx, v_val_idx = val_pairs_idx.transpose()
    u_train_idx, v_train_idx = train_pairs_idx.transpose()

    # create labels
    train_labels = labels[train_idx]
    val_labels = labels[val_idx]
    test_labels = labels[test_idx]

    if testing:
        u_train_idx = np.hstack([u_train_idx, u_val_idx])
        v_train_idx = np.hstack([v_train_idx, v_val_idx])
        train_labels = np.hstack([train_labels, val_labels])
        # for adjacency matrix construction
        train_idx = np.hstack([train_idx, val_idx])
    
    class_values = np.sort(np.unique(ratings))

    # make training adjacency matrix
    rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
    if post_rating_map is None:
        rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
    else:
        rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

    if dataset =='ml-100k':

        # movie features (genres)
        sep = r'|'
        movie_file = 'raw_data/' + dataset + '/u.item'
        movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
                         'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
                         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                         'Thriller', 'War', 'Western']
        movie_df = pd.read_csv(movie_file, sep=sep, header=None, 
                               names=movie_headers, encoding='ISO-8859-1')

        genre_headers = movie_df.columns.values[6:]
        num_genres = genre_headers.shape[0]

        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                v_features[v_dict[movie_id], :] = g_vec

        # user features

        sep = r'|'
        users_file = 'raw_data/' + dataset + '/u.user'
        users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
        users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')

        occupation = set(users_df['occupation'].values.tolist())

        age = users_df['age'].values
        age_max = age.max()

        gender_dict = {'M': 0., 'F': 1.}
        occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

        num_feats = 2 + len(occupation_dict)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user id']
            if u_id in u_dict.keys():
                # age
                u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)
                # gender
                u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
                # occupation
                u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

    elif dataset == 'ml-1m':

        # load movie features
        movies_file = 'raw_data/' + dataset + '/movies.dat'

        movies_headers = ['movie_id', 'title', 'genre']
        movies_df = pd.read_csv(movies_file, sep=sep, header=None,
                                names=movies_headers, engine='python')

        # extracting all genres
        genres = []
        for s in movies_df['genre'].values:
            genres.extend(s.split('|'))

        genres = list(set(genres))
        num_genres = len(genres)

        genres_dict = {g: idx for idx, g in enumerate(genres)}

        # creating 0 or 1 valued features for all genres
        v_features = np.zeros((num_items, num_genres), dtype=np.float32)
        for movie_id, s in zip(movies_df['movie_id'].values.tolist(), movies_df['genre'].values.tolist()):
            # check if movie_id was listed in ratings file and therefore in mapping dictionary
            if movie_id in v_dict.keys():
                gen = s.split('|')
                for g in gen:
                    v_features[v_dict[movie_id], genres_dict[g]] = 1.

        # load user features
        users_file = 'raw_data/' + dataset + '/users.dat'
        users_headers = ['user_id', 'gender', 'age', 'occupation', 'zip-code']
        users_df = pd.read_csv(users_file, sep=sep, header=None,
                               names=users_headers, engine='python')

        # extracting all features
        cols = users_df.columns.values[1:]

        cntr = 0
        feat_dicts = []
        for header in cols:
            d = dict()
            feats = np.unique(users_df[header].values).tolist()
            d.update({f: i for i, f in enumerate(feats, start=cntr)})
            feat_dicts.append(d)
            cntr += len(d)

        num_feats = sum(len(d) for d in feat_dicts)

        u_features = np.zeros((num_users, num_feats), dtype=np.float32)
        for _, row in users_df.iterrows():
            u_id = row['user_id']
            if u_id in u_dict.keys():
                for k, header in enumerate(cols):
                    u_features[u_dict[u_id], feat_dicts[k][row[header]]] = 1.
    else:
        raise ValueError('Invalid dataset option %s' % dataset)

    u_features = sp.csr_matrix(u_features)
    v_features = sp.csr_matrix(v_features)

    print("User features shape: "+str(u_features.shape))
    print("Item features shape: "+str(v_features.shape))

    return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
        val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

# 1. Main()

In [66]:
if __name__ == '__main__':
    dataset = MovieLens("ml-100k", testing=True)

Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...
User features shape: (943, 23)
Item features shape: (1682, 18)
	Train rating pairs : 80000
	Valid rating pairs : 16000
	Test rating pairs  : 20000


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)


In [67]:
dataset

<__main__.MovieLens at 0x14d0f7bb280>

# 2. load_official_trainvaltest_split 함수(ml-100k 파트만 남김)

- 매개변수

In [68]:
dataset = 'ml-100k'
testing = False
rating_map = None
post_rating_map = None
ratio = 1.0

- Download the data 

In [10]:
sep = '\t'

# Check if files exist and download otherwise
files = ['/u1.base', '/u1.test', '/u.item', '/u.user']
fname = dataset
data_dir = 'raw_data/' + fname

download_dataset(fname, files, data_dir)

In [11]:
dtypes = {
    'u_nodes': np.int32, 'v_nodes': np.int32,
    'ratings': np.float32, 'timestamp': np.float64}

filename_train = 'raw_data/' + dataset + '/u1.base'
filename_test = 'raw_data/' + dataset + '/u1.test'

data_train = pd.read_csv(
    filename_train, sep=sep, header=None,
    names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

data_test = pd.read_csv(
    filename_test, sep=sep, header=None,
    names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

In [12]:
data_train.head()

Unnamed: 0,u_nodes,v_nodes,ratings,timestamp
0,1,1,5.0,874965758.0
1,1,2,3.0,876893171.0
2,1,3,4.0,878542960.0
3,1,4,3.0,876893119.0
4,1,5,3.0,889751712.0


In [13]:
data_array_train = data_train.values.tolist()
data_array_train = np.array(data_array_train)
data_array_test = data_test.values.tolist()
data_array_test = np.array(data_array_test)

print(data_array_train.shape)
print(data_array_test.shape)

(80000, 4)
(20000, 4)


In [14]:
# 시간에 따라 오름차순 정렬 후 비율에 따라 trainset을 나눔
if ratio < 1.0:
    data_array_train = data_array_train[data_array_train[:, -1].argsort()[:int(ratio*len(data_array_train))]]

data_array = np.concatenate([data_array_train, data_array_test], axis=0)

In [15]:
data_array.shape

(100000, 4)

In [16]:
u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
ratings = data_array[:, 2].astype(dtypes['ratings'])
if rating_map is not None:
    for i, x in enumerate(ratings):
        ratings[i] = rating_map[x]

In [17]:
u_nodes_ratings

array([  1,   1,   1, ..., 459, 460, 462])

In [18]:
# 인덱스를 0번부터 시작하게끔 모든 인덱스를 당기기  ex) 1번 -> 0번 / 456번 -> 455번
u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

In [19]:
print(num_users)
print(num_items)

943
1682


In [20]:
u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int32)
ratings = ratings.astype(np.float64)

u_nodes = u_nodes_ratings
v_nodes = v_nodes_ratings

In [21]:
np.sort(np.unique(ratings)).tolist()

[1.0, 2.0, 3.0, 4.0, 5.0]

In [22]:
neutral_rating = -1  # int(np.ceil(np.float(num_classes)/2.)) - 1

# assumes that ratings_train contains at least one example of every rating type
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
print(labels.shape)

(943, 1682)


In [23]:
labels # user,item의 rating matrix를 -1로 초기화

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]])

In [24]:
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])

In [25]:
labels

array([[ 4,  2,  3, ..., -1, -1, -1],
       [ 3, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [ 4, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1,  4, -1, ..., -1, -1, -1]])

In [26]:
len(u_nodes)

100000

In [27]:
ratings

array([5., 3., 4., ..., 3., 3., 5.])

In [28]:
rating_dict

{1.0: 0, 2.0: 1, 3.0: 2, 4.0: 3, 5.0: 4}

In [29]:
# 평점이 제대로 들어갔는지 점검
for i in range(len(u_nodes)):
    assert(labels[u_nodes[i], v_nodes[i]] == rating_dict[ratings[i]])

labels = labels.reshape([-1])

# number of test and validation edges, see cf-nade code

num_train = data_array_train.shape[0]
num_test = data_array_test.shape[0]
num_val = int(np.ceil(num_train * 0.2))
num_train = num_train - num_val

In [30]:
len(ratings)

100000

In [31]:
len(labels)

1586126

In [32]:
print(num_train)
print(num_val)
print(num_test)

64000
16000
20000


In [33]:
len(v_nodes)

100000

In [34]:
pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])
idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero]) # rating index 번호

- trainset/testset 분리

In [35]:
for i in range(len(ratings)):
    assert(labels[idx_nonzero[i]] == rating_dict[ratings[i]])

idx_nonzero_train = idx_nonzero[0:num_train+num_val]
idx_nonzero_test = idx_nonzero[num_train+num_val:]

pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]
pairs_nonzero_test = pairs_nonzero[num_train+num_val:]

In [36]:
# validset 생성 전에 trainset을 섞기
rand_idx = list(range(len(idx_nonzero_train)))
np.random.seed(1234)
np.random.shuffle(rand_idx)
idx_nonzero_train = idx_nonzero_train[rand_idx]
pairs_nonzero_train = pairs_nonzero_train[rand_idx]

idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

val_idx = idx_nonzero[0:num_val]
train_idx = idx_nonzero[num_val:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]

assert(len(test_idx) == num_test)

val_pairs_idx = pairs_nonzero[0:num_val]
train_pairs_idx = pairs_nonzero[num_val:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]

In [37]:
u_test_idx, v_test_idx = test_pairs_idx.transpose() # 2 x 20000으로 변경
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()

# create labels
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

In [38]:
testing

False

In [39]:
if testing:
    u_train_idx = np.hstack([u_train_idx, u_val_idx])
    v_train_idx = np.hstack([v_train_idx, v_val_idx])
    train_labels = np.hstack([train_labels, val_labels])
    # for adjacency matrix construction
    train_idx = np.hstack([train_idx, val_idx])

class_values = np.sort(np.unique(ratings))

In [40]:
# trainset의 인접행렬 생성
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
if post_rating_map is None:
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
else:
    rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
    
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

- Movie Features

In [41]:
# movie features (genres)
sep = r'|'
movie_file = 'raw_data/' + dataset + '/u.item'
movie_headers = ['movie id', 'movie title', 'release date', 'video release date',
                 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
                 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                 'Thriller', 'War', 'Western']
movie_df = pd.read_csv(movie_file, sep=sep, header=None, 
                       names=movie_headers, encoding='ISO-8859-1')

In [42]:
movie_df.head(1)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [43]:
genre_headers = movie_df.columns.values[6:]
num_genres = genre_headers.shape[0]

v_features = np.zeros((num_items, num_genres), dtype=np.float32)

In [44]:
v_features.shape

(1682, 18)

In [45]:
# g_vec : movie_id별 장르 벡터
for movie_id, g_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
    # check if movie_id was listed in ratings file and therefore in mapping dictionary
    if movie_id in v_dict.keys():
        v_features[v_dict[movie_id], :] = g_vec

- User features

In [46]:
# user features
sep = r'|'
users_file = 'raw_data/' + dataset + '/u.user'
users_headers = ['user id', 'age', 'gender', 'occupation', 'zip code']
users_df = pd.read_csv(users_file, sep=sep, header=None,
                       names=users_headers, engine='python')

occupation = set(users_df['occupation'].values.tolist())

In [47]:
age = users_df['age'].values
age_max = age.max()

gender_dict = {'M': 0., 'F': 1.}
occupation_dict = {f: i for i, f in enumerate(occupation, start=2)}

num_feats = 2 + len(occupation_dict)

In [48]:
# user feature 유형: age(1), gender(1), occupation(21)   총 23개
num_feats

23

In [49]:
users_df.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [50]:
u_features = np.zeros((num_users, num_feats), dtype=np.float32)
for _, row in users_df.iterrows():
    u_id = row['user id']
    if u_id in u_dict.keys():
        # age
        u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)
        # gender
        u_features[u_dict[u_id], 1] = gender_dict[row['gender']]
        # occupation
        u_features[u_dict[u_id], occupation_dict[row['occupation']]] = 1.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)


In [51]:
u_features = sp.csr_matrix(u_features)
v_features = sp.csr_matrix(v_features)

print("User features shape: "+str(u_features.shape))
print("Item features shape: "+str(v_features.shape))

User features shape: (943, 23)
Item features shape: (1682, 18)


# 3. create_trainvaltest_split (ml-1m)

In [None]:
python3 train_multi_gpu.py --data_name ml-1m --testing \
                --batch_size 32 --edge_dropout 0. --max_nodes_per_hop 100 --train_epochs 40 \
                --train_log_interval 1000 --valid_log_interval 5 --train_lr_decay_step 20 \
                --gpu 0,1,2,3

In [73]:
dataset = 'ml-1m'
seed = 1234
testing = False
datasplit_path = './data.pickle'
verbose = True
rating_map = None
post_rating_map = None
ratio = 1.0

In [103]:
"""
Splits data set into train/val/test sets from full bipartite adjacency matrix. 
Shuffling of dataset is done inload_data function.
For each split computes 1-of-num_classes labels. 
Also computes training adjacency matrix.
"""

if os.path.isfile(datasplit_path):
    print('Reading dataset splits from file...')
    with open(datasplit_path, 'rb') as f:
        num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = pkl.load(f)

    if verbose:
        print('Number of users = %d' % num_users)
        print('Number of items = %d' % num_items)
        print('Number of links = %d' % ratings.shape[0])
        print('Fraction of positive links = %.4f' % (float(ratings.shape[0]) / (num_users * num_items),))

else:
    num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = load_data(dataset, seed=seed, verbose=verbose)

#     with open(datasplit_path, 'wb') as f:
#         pkl.dump([num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features], f)

Reading dataset splits from file...
Number of users = 6040
Number of items = 3706
Number of links = 1000209
Fraction of positive links = 0.0447


In [104]:
with open(datasplit_path, 'wb') as f:
    pkl.dump([num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features], f)

In [105]:
with open(datasplit_path, 'rb') as f:
    num_users, num_items, u_nodes, v_nodes, ratings, u_features, v_features = pkl.load(f)

In [107]:
if rating_map is not None:
    for i, x in enumerate(ratings):
        ratings[i] = rating_map[x]

neutral_rating = -1

rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])
labels = labels.reshape([-1])

In [97]:
# number of test and validation edges
num_test = int(np.ceil(ratings.shape[0] * 0.1))
num_val = int(np.ceil(ratings.shape[0] * 0.9 * 0.05))
num_train = ratings.shape[0] - num_val - num_test

pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])

idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero])

train_idx = idx_nonzero[0:int(num_train*ratio)]
val_idx = idx_nonzero[num_train:num_train + num_val]
test_idx = idx_nonzero[num_train + num_val:]

train_pairs_idx = pairs_nonzero[0:int(num_train*ratio)]
val_pairs_idx = pairs_nonzero[num_train:num_train + num_val]
test_pairs_idx = pairs_nonzero[num_train + num_val:]

u_test_idx, v_test_idx = test_pairs_idx.transpose()
u_val_idx, v_val_idx = val_pairs_idx.transpose()
u_train_idx, v_train_idx = train_pairs_idx.transpose()

In [98]:
# create labels
train_labels = labels[train_idx]
val_labels = labels[val_idx]
test_labels = labels[test_idx]

if testing:
    u_train_idx = np.hstack([u_train_idx, u_val_idx])
    v_train_idx = np.hstack([v_train_idx, v_val_idx])
    train_labels = np.hstack([train_labels, val_labels])
    # for adjacency matrix construction
    train_idx = np.hstack([train_idx, val_idx])

class_values = np.sort(np.unique(ratings))

In [99]:
# make training adjacency matrix
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)
if post_rating_map is None:
    rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.
else:
    rating_mx_train[train_idx] = np.array([post_rating_map[r] for r in class_values[labels[train_idx]]]) + 1.
rating_mx_train = sp.csr_matrix(rating_mx_train.reshape(num_users, num_items))

In [100]:
rating_mx_train

<6040x3706 sparse matrix of type '<class 'numpy.float32'>'
	with 855178 stored elements in Compressed Sparse Row format>

In [101]:
u_features

<6040x3469 sparse matrix of type '<class 'numpy.float32'>'
	with 24160 stored elements in Compressed Sparse Row format>

In [None]:
return u_features, v_features, rating_mx_train, train_labels, u_train_idx, v_train_idx, \
    val_labels, u_val_idx, v_val_idx, test_labels, u_test_idx, v_test_idx, class_values

# 4. MovieLens 클래스 분석(ml-100k)

In [52]:
# MovieLens("ml-100k", testing=True)

data_name = 'ml-100k'
testing = False
test_ratio = 0.1
valid_ratio = 0.2
    
if data_name == 'ml-100k':
    print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
    (
        u_features, v_features, adj_train, 
        train_labels, train_u_indices, train_v_indices,
        val_labels, val_u_indices, val_v_indices, 
        test_labels, test_u_indices, test_v_indices, 
        class_values
    ) = load_official_trainvaltest_split(
        'ml-100k', testing, None, None, 1.0
    )

_num_user = u_features.shape[0]
_num_movie = v_features.shape[0]

Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...
User features shape: (943, 23)
Item features shape: (1682, 18)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  u_features[u_dict[u_id], 0] = row['age'] / np.float(age_max)


In [53]:
print(_num_user)
print(_num_movie)

943
1682


In [54]:
print(u_features.shape)
print(v_features.shape)

(943, 23)
(1682, 18)


In [55]:
# property 할당(동일한 이름으로 사용)
# num_rating = _rating.size  #GCMC에서 사용
num_user = _num_user
num_movie = _num_movie

In [56]:
print(train_v_indices)
print(val_v_indices)
print(test_v_indices)
print(num_user)

[166 250 204 ... 183 945 281]
[ 429  596  187 ...  209 1024  287]
[  5   9  11 ... 933   9 681]
943


In [57]:
train_v_indices + num_user

array([1109, 1193, 1147, ..., 1126, 1888, 1224], dtype=int64)

In [58]:
# reindex u and v, v nodes start after u (v노드는 u노드 다음으로 인덱스를 부여함)
train_v_indices += num_user
val_v_indices += num_user
test_v_indices += num_user

train_rating_pairs  = (th.LongTensor(train_u_indices), th.LongTensor(train_v_indices))
valid_rating_pairs  = (th.LongTensor(val_u_indices), th.LongTensor(val_v_indices))
test_rating_pairs   = (th.LongTensor(test_u_indices), th.LongTensor(test_v_indices))
train_rating_values = th.FloatTensor(train_labels)
valid_rating_values = th.FloatTensor(val_labels)
test_rating_values  = th.FloatTensor(test_labels)

print("\tTrain rating pairs : {}".format(len(train_labels)))
print("\tValid rating pairs : {}".format(len(val_labels)))
print("\tTest rating pairs  : {}".format(len(test_labels)))

	Train rating pairs : 64000
	Valid rating pairs : 16000
	Test rating pairs  : 20000


In [59]:
train_rating_pairs

(tensor([562, 269, 478,  ..., 748, 477, 715]),
 tensor([1109, 1193, 1147,  ..., 1126, 1888, 1224]))

In [60]:
# build dgl graph object, which is homogeneous and bidirectional and contains only training edges
train_graph = dgl.graph((th.cat([train_rating_pairs[0], train_rating_pairs[1]]), 
                         th.cat([train_rating_pairs[1], train_rating_pairs[0]])))
train_graph.edata['etype'] = th.cat([train_rating_values, train_rating_values]).to(th.long)

In [61]:
train_graph

Graph(num_nodes=2625, num_edges=128000,
      ndata_schemes={}
      edata_schemes={'etype': Scheme(shape=(), dtype=torch.int64)})

- class 함수

In [62]:
def _load_raw_user_data():
    """In MovieLens, the user attributes file have the following formats:

    ml-100k:
    user id | age | gender | occupation | zip code

    ml-1m:
    UserID::Gender::Age::Occupation::Zip-code

    Parameters
    ----------
    name : str

    Returns
    -------
    user_data : pd.DataFrame
    """
    if _data_name == 'ml-100k':
        user_data = pd.read_csv(os.path.join(_dir, 'u.user'), sep='|', header=None,
                                names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
    elif _data_name == 'ml-1m':
        user_data = pd.read_csv(os.path.join(_dir, 'users.dat'), sep='::', header=None,
                                names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
    else:
        raise NotImplementedError
    return user_data

def _load_raw_movie_data():
    """In MovieLens, the movie attributes may have the following formats:

    In ml-100k:

    movie id | movie title | release date | video release date | IMDb URL | [genres]

    In ml-1m, ml-10m:

    MovieID::Title (Release Year)::Genres

    Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy

    Parameters
    ----------
    name : str

    Returns
    -------
    movie_data : pd.DataFrame
        For ml-100k, the column name is ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]]
        For ml-1m, the column name is ['id', 'title'] + [GENRES (18/20)]]
    """
    if _data_name == 'ml-100k':
        GENRES = GENRES_ML_100K
    elif _data_name == 'ml-1m':
        GENRES = GENRES_ML_1M
    else:
        raise NotImplementedError

    if _data_name == 'ml-100k':
        file_path = os.path.join(_dir, 'u.item')
        movie_data = pd.read_csv(file_path, sep='|', header=None,
                                      names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
                                      engine='python')
    elif _data_name == 'ml-1m':
        file_path = os.path.join(_dir, 'movies.dat')
        movie_data = pd.read_csv(file_path, sep='::', header=None,
                                 names=['id', 'title', 'genres'], engine='python')
        genre_map = {ele: i for i, ele in enumerate(GENRES)}
        genre_map['Children\'s'] = genre_map['Children']
        genre_map['Childrens'] = genre_map['Children']
        movie_genres = np.zeros(shape=(movie_data.shape[0], len(GENRES)), dtype=np.float32)
        for i, genres in enumerate(movie_data['genres']):
            for ele in genres.split('|'):
                if ele in genre_map:
                    movie_genres[i, genre_map[ele]] = 1.0
                else:
                    print('genres not found, filled with unknown: {}'.format(genres))
                    movie_genres[i, genre_map['unknown']] = 1.0
        for idx, genre_name in enumerate(GENRES):
            assert idx == genre_map[genre_name]
            movie_data[genre_name] = movie_genres[:, idx]
        movie_data = movie_data.drop(columns=["genres"])
    else:
        raise NotImplementedError
    return movie_data

def _load_raw_rates(self, file_path, sep):
    """In MovieLens, the rates have the following format

    ml-100k
    user id \t movie id \t rating \t timestamp

    ml-1m/10m
    UserID::MovieID::Rating::Timestamp

    timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')

    Parameters
    ----------
    file_path : str

    Returns
    -------
    rating_data : pd.DataFrame
    """
    rating_data = pd.read_csv(
        file_path, sep=sep, header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        dtype={'user_id': np.int32, 'movie_id' : np.int32,
               'ratings': np.float32, 'timestamp': np.int64}, engine='python')
    return rating_data

def _drop_unseen_nodes(self, data_df, col_name, reserved_ids_set):
    data_df = data_df[data_df[col_name].isin(reserved_ids_set)]
    data_df.reset_index(drop=True, inplace=True)
    return data_df

def _generate_pair_value(self, rating_data):
    rating_pairs = (np.array([_global_user_id_map[ele] for ele in rating_data["user_id"]],
                             dtype=np.int32),
                    np.array([_global_movie_id_map[ele] for ele in rating_data["movie_id"]],
                             dtype=np.int32))
    # label ranges from 0. to 4.
    rating_values = rating_data["rating"].values.astype(np.float32) - 1.
    return rating_pairs[0], rating_pairs[1], rating_values