# Movielens Collaborative filtering example
This example will use the 100K Movielens dataset https://grouplens.org/datasets/movielens/100k/.

This dataset contains 100,000 ratings from ~1000 users on ~1700 movies.

## Download and extract dataset
We will use the raw text dataset. We fill first download the ZIP if not done already and then extract it

In [1]:
from urllib.request import urlretrieve
from zipfile import ZipFile
import os

srcUrl = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

# Create folder if it doesn't exists
if not os.path.exists('./movielens-100k'):
    print("Create movielens-100k folder")
    os.makedirs('./movielens-100k')

# Check for existance of ZIP file
if not os.path.exists('./movielens-100k/ml-100k.zip'):
    print("Download %s" % srcUrl)
    urlretrieve(srcUrl, './movielens-100k/ml-100k.zip')
    
# Extract zipFile
with ZipFile('./movielens-100k/ml-100k.zip', 'r') as zipFile:
    print("Extract %d files from ml-100k.zip" % len(zipFile.namelist()))
    zipFile.extractall('./movielens-100k')

Extract 24 files from ml-100k.zip


In [2]:
import numpy as np
import pandas as pd

header = ['UserID', 'MovieID', 'Rating', 'Timestamp']
df = pd.read_csv('./movielens-100k/ml-100k/u.data', sep='\t', names=header)

n_users = df.UserID.unique().shape[0]
n_items = df.MovieID.unique().shape[0]
print("Number of users = %d" % n_users)
print("Number of items = %d" % n_items)

Number of users = 943
Number of items = 1682


In [3]:
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Initialze a RandomState with a constant seed to make the split consistent
from numpy.random import RandomState
prng = RandomState(1)

# Split the dataset into a training and test set
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size = .25, random_state=prng)

In [5]:
# Create two matrices, one for training and one for testing
train_df = train_data.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
train_data_matrix = train_df.as_matrix()

test_df = test_data.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
test_data_matrix = test_df.as_matrix()

train_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,1671,1673,1674,1675,1677,1678,1679,1680,1681,1682
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,0.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [7]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    else:
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [8]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('User-based CF RMSE: %0.2f' % rmse(user_prediction, test_data_matrix))
print('Item-based CF RMSE: %0.2f' % rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 3.27
Item-based CF RMSE: 3.44


In [9]:
# Create mapping between item_id and title
item_id_to_title = {}
with open('./movielens-100k/ml-100k/u.item', 'r', encoding='iso-8859-1') as f:
    for line in f.readlines():
        info = line.split('|')
        item_id_to_title[int(info[0])-1] = info[1]           

In [10]:
user_idx = 0
for item_idx in user_prediction[user_idx].argsort()[-25:][::-1]:
    print("[ %0.2f ] %s" % (user_prediction[user_idx][item_idx], item_id_to_title[item_idx]))    

[ 2.10 ] Star Wars (1977)
[ 1.88 ] Fargo (1996)
[ 1.86 ] Contact (1997)
[ 1.72 ] Return of the Jedi (1983)
[ 1.64 ] English Patient, The (1996)
[ 1.64 ] Godfather, The (1972)
[ 1.61 ] Scream (1996)
[ 1.60 ] Air Force One (1997)
[ 1.57 ] Raiders of the Lost Ark (1981)
[ 1.56 ] Silence of the Lambs, The (1991)
[ 1.55 ] Toy Story (1995)
[ 1.51 ] Liar Liar (1997)
[ 1.42 ] Titanic (1997)
[ 1.42 ] Pulp Fiction (1994)
[ 1.39 ] Jerry Maguire (1996)
[ 1.38 ] Twelve Monkeys (1995)
[ 1.37 ] Independence Day (ID4) (1996)
[ 1.34 ] Empire Strikes Back, The (1980)
[ 1.31 ] Rock, The (1996)
[ 1.29 ] Schindler's List (1993)
[ 1.29 ] Full Monty, The (1997)
[ 1.28 ] Back to the Future (1985)
[ 1.26 ] Star Trek: First Contact (1996)
[ 1.26 ] L.A. Confidential (1997)
[ 1.25 ] Fugitive, The (1993)
