# Movielens SVD collaborative filtering example
This example will use the 1M Movielens dataset (http://grouplens.org/datasets/movielens/1m/).

This dataset contains ~1,000,000 ratings from ~6000 users on ~3900 movies.

## Download and extract dataset
We will use the raw text dataset. We fill first download the ZIP if not done already and then extract it

In [1]:
from urllib.request import urlretrieve
from zipfile import ZipFile
import os

srcUrl = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'

# Create folder if it doesn't exists
if not os.path.exists('./movielens-1m'):
    print("Create movielens-1m folder")
    os.makedirs('./movielens-1m')

# Check for existance of ZIP file
if not os.path.exists('./movielens-1m/ml-1m.zip'):
    print("Download %s" % srcUrl)
    urlretrieve(srcUrl, './movielens-1m/ml-1m.zip')
    
# Extract zipFile
with ZipFile('./movielens-1m/ml-1m.zip', 'r') as zipFile:
    print("Extract %d files from ml-1m.zip" % len(zipFile.namelist()))
    zipFile.extractall('./movielens-1m')

Extract 5 files from ml-1m.zip


## Create dataframes
Extract the data from the individual files and created pandas DataFrame's from them

In [2]:
import pandas as pd
import numpy as np

ratings_list = [i.strip().split("::") for i in open('./movielens-1m/ml-1m/ratings.dat', 'r', encoding='iso-8859-1').readlines()]
users_list = [i.strip().split("::") for i in open('./movielens-1m/ml-1m/users.dat', 'r', encoding='iso-8859-1').readlines()]
movies_list = [i.strip().split("::") for i in open('./movielens-1m/ml-1m/movies.dat', 'r', encoding='iso-8859-1').readlines()]

ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = int)
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = movies_df['MovieID'].apply(pd.to_numeric)

In [3]:
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Split into a training and test set
Split the data into a training and a test set. We use a split of 25% in this case

In [5]:
# Initialze a RandomState with a constant seed to make the split consistent
from numpy.random import RandomState
prng = RandomState(1)

# Split the dataset into a training and test set
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings_df, test_size = .25, random_state=prng)

## Create pivot tables
Create pivot tables for training and testset. On one axis are the UserID's on the other axis are the MovieIds. Fill in the empty values with zero's

In [6]:
R_train_df = train_data.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_test_df = test_data.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)

R_train_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## De-mean the data
Every user rates things differently, where one gives a 5 star rating if he just liked the movie some other users may never give a 4 or higher so average out the data.

In [7]:
R_train_df.as_matrix().shape
R = R_df.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

NameError: name 'R_df' is not defined

## Perform the Singular Value Decomposition
Use scipy to do the Singular Value Decomposition. Limit the number of latent factors to 50.

In [None]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 100)

sigma = np.diag(sigma)

## Use the Decomposed Matrices to make the predicted ratings
With U, sigma and Vt we can recreate the matrix using k (50) latent factors. Then re-add the user mean

In [None]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

preds_df.head()

## Compare results against our test set

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('Predictions RMSE: %0.2f' % rmse(all_user_predicted_ratings, R_test_df.as_matrix()))

## Get recommended movie list

In [None]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False)
                 )

    print('User %d has already rated %d movies.' % (userID, user_full.shape[0]))
    print('Recommending the highest %d predicted ratings movies not already rated.' % num_recommendations)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'MovieID',
               right_on = 'MovieID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations, sorted_user_predictions

# Get recommendations
already_rated, recommendations, sorted_user_predictions = recommend_movies(preds_df, 1, movies_df, ratings_df, 10)

In [None]:
already_rated.head(10)

In [None]:
recommendations.head(10)