##### Collaborative Filtering application on Movie lens dataset from https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
!curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip
!rm ml-100k.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4808k  100 4808k    0     0  2760k      0  0:00:01  0:00:01 --:--:-- 2758k
Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test       

In [22]:
!cd ml-100k/
!ls ml-100k/

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [29]:
!head ml-100k/u.data
!echo
!wc -l ml-100k/u.data 

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013

100000 ml-100k/u.data


In [30]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [32]:
n_users , n_items = df.user_id.nunique(), df.item_id.nunique()
print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))


Number of users: 943
Number of items: 1682


In [40]:
ratings = np.zeros((n_users, n_items))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [41]:
sparcity = len(ratings.nonzero()[0]) / (ratings.shape[0] * ratings.shape[1])
print('Sparcity: {:4.2f}%'.format(sparcity*100))

Sparcity: 6.30%


In [53]:
# split train and test by 10 ratings to 0 test the model
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], size=10, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0))
    return train, test

In [56]:
train, test = train_test_split(ratings)


In [62]:
def fast_similarity(ratings, kind='user',eps=1e-9):
    if kind == 'user':
        sim = ratings.dot(ratings.T) + eps
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + eps
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [63]:
%%timeit 
fast_similarity(ratings, kind='user')

42.8 ms ± 4.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [89]:
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')
print(item_similarity[:4, :4])

[[1.         0.40501493 0.31529634 0.44683971]
 [0.40501493 1.         0.28116995 0.47440782]
 [0.31529634 0.28116995 1.         0.3218065 ]
 [0.44683971 0.47440782 0.3218065  1.        ]]


In [90]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [79]:
%timeit predict_fast_simple(ratings, user_similarity, kind='user')

59.9 ms ± 8.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [82]:
from sklearn.metrics import mean_squared_error
def get_mse(pred,actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)


In [92]:
item_prediction = predict_fast_simple(ratings, item_similarity, kind='item')
user_prediction = predict_fast_simple(ratings, user_similarity, kind='user')
print('user_based_prediction_mse: {}'.format(get_mse(user_prediction, test)))
print('item_based_prediction_mse: {}'.format(get_mse(item_prediction, test)))


user_based_prediction_mse: 7.636607497169519
item_based_prediction_mse: 11.16182022068182
