In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
from scipy import sparse

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

# Read in Dataset

In [2]:
# Directory that contains ratings.csv (the ml-20m folder).
# Set the ML20M_DIR environment variable to point the notebook at another copy
# of the data; when it is unset the original local path is used, so existing
# runs behave exactly as before.
mainPath = os.environ.get('ML20M_DIR', 'C:\\Users\\nizhe\\Desktop\\python code\\ml-20m')

In [3]:
# Load ratings.csv from the data directory; row 0 of the file supplies the
# column names.
raw_data_original = pd.read_csv(
    os.path.join(mainPath, 'ratings.csv'),
    header=0,
)

# Select users and items

In [4]:
def preprocessing(raw_data_original, min_user_count=5, min_movie_count=0, min_rating=3.5):
    '''
    Keep only positive ratings from sufficiently active users and popular movies.

    Steps:
      1. Keep rows whose 'rating' is strictly greater than ``min_rating`` and
         drop the 'timestamp' column.
      2. Count the remaining ratings per user and per movie.
      3. Keep rows whose user has at least ``min_user_count`` ratings and whose
         movie has at least ``min_movie_count`` ratings. Both tallies are taken
         before either filter is applied, so with a positive
         ``min_movie_count`` a kept user may end up with fewer than
         ``min_user_count`` rows.

    Parameters
    ----------
    raw_data_original : pandas.DataFrame
        Must contain the columns 'userId', 'movieId', 'rating' and 'timestamp'.
    min_user_count : int, default 5
        Minimum number of positive ratings a user needs to be kept.
    min_movie_count : int, default 0
        Minimum number of positive ratings a movie needs to be kept
        (the default of 0 keeps every movie).
    min_rating : float, default 3.5
        Ratings must be strictly above this value to count as positive.

    Returns
    -------
    result_df : pandas.DataFrame
        The surviving rows, in input order, with a fresh 0..n-1 index.
    user_activity : pandas.Series
        Number of rows in ``result_df`` per user, named 'count' and indexed by
        userId, so ``user_activity.index`` holds real user IDs.
    item_popularity : pandas.Series
        Number of rows in ``result_df`` per movie, named 'count' and indexed by
        movieId.
    '''
    positive = raw_data_original.loc[raw_data_original['rating'] > min_rating].drop('timestamp', axis=1)

    # Tallies on the positive ratings decide which users / movies survive.
    ratings_per_user = positive.groupby('userId')['rating'].count()
    ratings_per_movie = positive.groupby('movieId')['rating'].count()
    kept_users = ratings_per_user[ratings_per_user >= min_user_count].index
    kept_movies = ratings_per_movie[ratings_per_movie >= min_movie_count].index

    keep_row = positive['userId'].isin(kept_users) & positive['movieId'].isin(kept_movies)
    result_df = positive.loc[keep_row].reset_index(drop=True)

    # Recount on the filtered rows so the returned statistics describe exactly
    # the users and movies present in result_df, keyed by their IDs.
    user_activity = result_df.groupby('userId')['rating'].count().rename('count')
    item_popularity = result_df.groupby('movieId')['rating'].count().rename('count')

    return result_df, user_activity, item_popularity

In [5]:
# Run the filter once and unpack its three results: the filtered ratings
# frame plus the per-user and per-movie 'count' Series.
(raw_data,
 user_activity,
 item_popularity) = preprocessing(raw_data_original)

In [6]:
# Share of all (user, movie) combinations that actually appear as a row of
# raw_data; reported below as a percentage.
sparsity = float(len(raw_data)) / (len(user_activity) * len(item_popularity))

print(
    "After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)"
    % (len(raw_data), len(user_activity), len(item_popularity), 100 * sparsity)
)

After filtering, there are 9990682 watching events from 136677 users and 20720 movies (sparsity: 0.353%)


In [7]:
# Number of users set aside for each held-out group; the split cell below
# carves two groups of this size (validation and test) off the end of the
# shuffled user list.
n_heldout_users = 10000

# Total number of users, one entry per user in user_activity.
n_users = user_activity.shape[0]

In [10]:
# Split by user: all ratings of a given user land in exactly one of the
# train / validation / test groups.
# The IDs are read straight from the userId column so the values handed to
# isin() below are real user IDs; sorting them first makes the seeded shuffle
# independent of the row order of raw_data.
unique_uid = pd.Index(raw_data['userId'].unique()).sort_values()

if unique_uid.size != n_users:
    raise ValueError('n_users is %d but raw_data contains %d distinct userId values'
                     % (n_users, unique_uid.size))

n_train_users = n_users - n_heldout_users * 2
if n_train_users <= 0:
    raise ValueError('n_heldout_users=%d leaves no training users out of %d'
                     % (n_heldout_users, n_users))

# Fixed seed so the same users fall into the same group on every run.
np.random.seed(98765)
unique_uid = unique_uid[np.random.permutation(unique_uid.size)]

tr_users = unique_uid[ : n_train_users]
vd_users = unique_uid[n_train_users : (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users) : ]

# De-duplicate on the (user, movie) pair. De-duplicating on movieId alone would
# keep a single row per movie and throw away every other user's rating of it.
train_df = raw_data.loc[raw_data['userId'].isin(tr_users)].drop_duplicates(subset = ['userId', 'movieId'])

# Only movies that occur in the training rows are kept in the item vocabulary.
unique_sid = pd.unique(train_df['movieId'])