This code is adapted from https://github.com/dawenl/vae_cf (Variational autoencoders for collaborative filtering).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
from scipy import sparse

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

# Read in Dataset

In [2]:
# Directory that holds the ml-20m files (ratings.csv is read from here).
# Set the ML20M_DIR environment variable to use another location; when it is
# not set, the original local path is used so existing runs behave the same.
mainPath = os.environ.get('ML20M_DIR', 'C:\\Users\\nizhe\\Desktop\\python code\\ml-20m')

In [3]:
# Load the complete ratings table; the first row of ratings.csv is the header.
ratings_csv = os.path.join(mainPath, 'ratings.csv')
raw_data_original = pd.read_csv(ratings_csv, header=0)

# Select users and items

In [4]:
def preprocessing(raw_data_original, min_user_count = 5, min_movie_count = 0, min_rating = 3.5):
    '''
    Keep only positive ratings, then drop inactive users and rare movies.

    A rating is kept as a positive event when it is strictly greater than
    ``min_rating``. Users with fewer than ``min_user_count`` positive events and
    movies with fewer than ``min_movie_count`` positive events are removed; both
    thresholds are evaluated on the same rating-filtered table.

    Parameters
    ----------
    raw_data_original : pandas.DataFrame
        Needs the columns 'userId', 'movieId', 'rating' and 'timestamp'.
    min_user_count : int
        Minimum number of positive events a user must have (default 5).
    min_movie_count : int
        Minimum number of positive events a movie must have (default 0, which
        keeps every movie).
    min_rating : float
        Ratings must be strictly above this value to count (default 3.5).

    Returns
    -------
    result_df : pandas.DataFrame
        The surviving events without the 'timestamp' column, with a fresh
        0..n-1 index.
    user_activity : pandas.Series
        Number of surviving events per user, indexed by userId, named 'count'.
    item_popularity : pandas.Series
        Number of surviving events per movie, indexed by movieId, named 'count'.
    '''
    raw_data = raw_data_original[raw_data_original['rating'] > min_rating].drop('timestamp', axis = 1)

    # groupby(...).size() is indexed by the grouping key, so the ids themselves
    # (not positional row labels) are what we select on below.
    user_counts = raw_data.groupby('userId').size()
    movie_counts = raw_data.groupby('movieId').size()
    kept_users = user_counts.index[user_counts >= min_user_count]
    kept_movies = movie_counts.index[movie_counts >= min_movie_count]

    keep_mask = raw_data['userId'].isin(kept_users) & raw_data['movieId'].isin(kept_movies)
    result_df = raw_data[keep_mask].reset_index(drop = True)

    # Recount on the filtered table so the returned statistics describe exactly
    # the users and movies that are present in result_df.
    user_activity = result_df.groupby('userId').size().rename('count')
    item_popularity = result_df.groupby('movieId').size().rename('count')

    return result_df, user_activity, item_popularity

In [5]:
# Run the filtering defined above: raw_data holds the surviving events, while
# user_activity and item_popularity hold the per-user and per-movie counts.
raw_data, user_activity, item_popularity = preprocessing(raw_data_original)

In [6]:
# Share of the (user x movie) grid that actually holds a watching event.
n_events = raw_data.shape[0]
n_active_users = user_activity.shape[0]
n_kept_movies = item_popularity.shape[0]
sparsity = 1. * n_events / (n_active_users * n_kept_movies)

summary_template = "After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)"
print(summary_template % (n_events, n_active_users, n_kept_movies, sparsity * 100))

After filtering, there are 9990682 watching events from 136677 users and 20720 movies (sparsity: 0.353%)


In [7]:
# Number of users placed in each held-out group (validation and test).
n_heldout_users = 10000
# Total number of users that survived filtering.
n_users = user_activity.shape[0]

In [8]:
# The entire set of users without duplicates. The ids are read straight from the
# userId column (not from the index of user_activity) so the split never depends
# on how that Series happens to be indexed; sorting makes the shuffle below
# independent of the row order of raw_data.
unique_uid = np.sort(pd.unique(raw_data['userId']))
assert unique_uid.size == n_users, "n_users does not match the number of distinct userIds in raw_data"

np.random.seed(98765)
unique_uid = unique_uid[np.random.permutation(unique_uid.size)] # shuffle

# split all the users in 3 parts: everything except the last 2 * n_heldout_users
# goes to training, then one slice of n_heldout_users each for validation and test
tr_users = unique_uid[ : (n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2) : (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users) : ]

train_df = raw_data.loc[raw_data['userId'].isin(tr_users)]

unique_sid = pd.unique(train_df['movieId']) # the entire items from tr without duplicates

In [9]:
# Lookup tables from raw ids to consecutive integer codes (0, 1, 2, ...):
# show2id covers the movies in unique_sid, profile2id the users in unique_uid.
show2id = {sid: code for code, sid in enumerate(unique_sid)}
profile2id = {pid: code for code, pid in enumerate(unique_uid)}

In [10]:
def train_test_split(data, test_prop = 0.2, seed = 98765, min_items = 5):
    '''
    Split every user's events into a training part and a held-out part.

    For each user with at least ``min_items`` events, ``int(test_prop * n)`` of
    the user's ``n`` events are drawn at random (without replacement) for the
    held-out part and the rest go to the training part. Users with fewer events
    are placed entirely in the training part.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with a 'userId' column.
    test_prop : float
        Fraction of each eligible user's events to hold out (default 0.2).
    seed : int or None
        Seed passed to ``np.random.seed`` before sampling (default 98765).
        Pass None to leave the global NumPy random state untouched.
    min_items : int
        Minimum number of events a user needs to be split (default 5).

    Returns
    -------
    data_tr, data_te : pandas.DataFrame
        Training and held-out events. Either one is an empty frame with the
        columns of ``data`` when nothing was assigned to it.
    '''
    if seed is not None:
        np.random.seed(seed)

    tr_list, te_list = list(), list()

    for _, group in data.groupby('userId'):
        n_items_u = len(group)

        if n_items_u >= min_items:
            # Boolean mask over this user's rows: True marks a held-out event.
            idx = np.zeros(n_items_u, dtype = 'bool')
            idx[np.random.choice(n_items_u, size = int(test_prop * n_items_u), replace = False).astype('int64')] = True

            tr_list.append(group[~idx])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

    # pd.concat raises ValueError on an empty list, which happens when the input
    # is empty or no user reaches min_items; return an empty frame instead.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [11]:
def numerize(tp):
    '''
    Translate raw ids into integer codes.

    Looks up every 'userId' of ``tp`` in ``profile2id`` and every 'movieId' in
    ``show2id`` and returns a new DataFrame with the columns 'uid' and 'sid'
    (in that order). An id missing from the lookup tables raises KeyError.
    '''
    user_codes = [profile2id[raw_uid] for raw_uid in tp['userId']]
    item_codes = [show2id[raw_sid] for raw_sid in tp['movieId']]
    return pd.DataFrame({'uid': user_codes, 'sid': item_codes}, columns=['uid', 'sid'])

In [12]:
# Validation / test events: rows of the validation / test users, restricted to
# movies that also occur in the training data (unique_sid).
known_movie = raw_data['movieId'].isin(unique_sid)
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users) & known_movie]
test_plays = raw_data.loc[raw_data['userId'].isin(te_users) & known_movie]

# Split each of these users' events into a 'tr' part and a 'te' part.
vad_plays_tr, vad_plays_te = train_test_split(vad_plays)
test_plays_tr, test_plays_te = train_test_split(test_plays)

In [13]:
# Convert all five event tables from raw ids to integer codes, in a fixed order.
train_data, vad_tr, vad_te, test_tr, test_te = (
    numerize(frame)
    for frame in (train_df, vad_plays_tr, vad_plays_te, test_plays_tr, test_plays_te)
)