## Memory-based Collaborative Filtering
https://surprise.readthedocs.io/en/stable/getting_started.html

In [1]:
# From Surprise docs getting started

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

In [2]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate

# Use the SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

# Run 5-fold cross-validation and compare
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

RMSE: 0.9490
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9396  0.9438  0.9285  0.9281  0.9412  0.9362  0.0066  
MAE (testset)     0.7399  0.7422  0.7308  0.7321  0.7422  0.7374  0.0050  
Fit time          7.25    7.11    7.46    7.08    7.30    7.24    0.14    
Test time         0.52    0.37    0.34    0.39    0.46    0.42    0.07    


{'test_rmse': array([0.93962043, 0.94379347, 0.92854628, 0.92806653, 0.94118684]),
 'test_mae': array([0.7398714 , 0.74223589, 0.73079158, 0.73210398, 0.74223826]),
 'fit_time': (7.247158765792847,
  7.106967449188232,
  7.459359407424927,
  7.08176851272583,
  7.299118757247925),
 'test_time': (0.521958589553833,
  0.36715006828308105,
  0.3427605628967285,
  0.3929867744445801,
  0.462648868560791)}

In [3]:
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.52   {'was_impossible': False}


In [4]:
from surprise import KNNBasic

# Load the movielens-100k dataset
#data = Dataset.load_builtin('ml-100k')

# Retrieve the trainset.
#trainset = data.build_full_trainset()

# Use KNNBasic
algo = KNNBasic()
algo.fit(trainset)

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 3.98   {'actual_k': 40, 'was_impossible': False}


In [5]:
# Build our movie id, movie names arrays
from surprise import get_dataset_dir

rid_to_name = {}
name_to_rid = {}

file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
with open(file_name, 'r', encoding='ISO-8859-1') as f:
    for line in f:
        line = line.split('|')
        rid_to_name[line[0]] = line[1]
        name_to_rid[line[1]] = line[0]

In [6]:
rid_to_name['302']

'L.A. Confidential (1997)'

In [7]:
name_to_rid['L.A. Confidential (1997)']

'302'

In [8]:
# Function to get name of movie based on id
def getMovieName(movieID):
    if movieID in rid_to_name:
        return rid_to_name[movieID]
    else:
        return ""


In [9]:
# Build our user data
from surprise import Reader

file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.data'
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_name, reader=reader)
trainSet = data.build_full_trainset()

In [10]:
# Dictionary of movies already seen by uid
testUserInnerID = trainSet.to_inner_uid(uid)
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1

In [11]:
# Item-based collaborative filtering
from surprise import KNNBasic

sim_options = {'name': 'cosine',
               'user_based': False
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [12]:
# Define similarityRow
similarityRow = simsMatrix[testUserInnerID]

In [13]:
# Build testUserRatings
testUserRatings = trainSet.ur[testUserInnerID]
# https://surprise.readthedocs.io/en/stable/trainset.html

In [14]:
# Build kNeighbors
import heapq
# https://docs.python.org/2/library/heapq.html

k = 10
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

In [15]:
# Get similar items weighted by rating
from collections import defaultdict

similarItems = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        similarItems[innerID] += score * (rating / 5.0)

In [16]:
# Print top-rated items from similar items
from operator import itemgetter
# https://docs.python.org/3/library/operator.html

pos = 0
for itemID, ratingSum in sorted(similarItems.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break

1330 An Unforgettable Summer (1994) 9.569725782787199
973 Grateful Dead (1995) 9.55040898169926
1506 Nelly & Monsieur Arnaud (1995) 9.545208434259099
314 3 Ninjas: High Noon At Mega Mountain (1998) 9.526303138153162
1083 Albino Alligator (1996) 9.522541737502978
935 Paradise Road (1997) 9.482367222711115
909 Dangerous Beauty (1998) 9.482275440776151
1379 Love and Other Catastrophes (1996) 9.479044527813514
1324 Loaded (1994) 9.459044077172537
1523 Good Man in Africa, A (1994) 9.456642682550967
1024 Mrs. Dalloway (1997) 9.45608397969789


In [17]:
# Aternative Tuning
from collections import defaultdict
from operator import itemgetter

kNeighbors = []
for rating in testUserRatings:
    if rating[1] >= 4.0:
        kNeighbors.append(rating)
    
similarItemsAlt = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        similarItemsAlt[innerID] += score * (rating / 5.0)
        
pos = 0
for itemID, ratingSum in sorted(similarItemsAlt.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break

935 Paradise Road (1997) 18.911889486207357
1068 Star Maker, The (Uomo delle stelle, L') (1995) 18.748924268405958
1169 Fresh (1994) 18.723948426699796
1451 Foreign Correspondent (1940) 18.723657710503176
587 Hour of the Pig, The (1993) 18.70882631460672
1141 War Room, The (1993) 18.67243715253662
691 Dark City (1998) 18.66730871995214
936 Brassed Off (1996) 18.658482192086183
1628 Lamerica (1994) 18.64334619219852
1379 Love and Other Catastrophes (1996) 18.631059087664006
114 Wallace & Gromit: The Best of Aardman Animation (1996) 18.617731088188748


In [18]:
# User-based collaborative filtering
sim_options = {'name': 'cosine',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
# Define similarityRow
similarityRow = simsMatrix[testUserInnerID]

In [20]:
# Top K users
import heapq

k = 10
TopKUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        TopKUsers.append( (innerID, score) )

kNeighbors = heapq.nlargest(k, TopKUsers, key=lambda t: t[1])

In [21]:
# Add up ratings weighted by user similarity
similarUsers = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        similarUsers[rating[0]] += (rating[1] / 5.0) * userSimilarityScore


In [22]:
# Print top-rated items from similar users
pos = 0
for itemID, ratingSum in sorted(similarUsers.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break


288 Scream (1996) 4.999999999999999
294 Liar Liar (1997) 3.4000000000000004
50 Star Wars (1977) 3.0
300 Air Force One (1997) 2.8
258 Contact (1997) 2.6
237 Jerry Maguire (1996) 2.6
98 Silence of the Lambs, The (1991) 2.6
243 Jungle2Jungle (1997) 2.4
343 Alien: Resurrection (1997) 2.2
748 Saint, The (1997) 2.2
268 Chasing Amy (1997) 2.2


In [23]:
# Aternative Tuning
from collections import defaultdict
from operator import itemgetter

kNeighbors = []
for rating in testUserRatings:
    if rating[1] >= .95:
        kNeighbors.append(rating)
    
similarUsers = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        similarUsers[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
        
pos = 0
for itemID, ratingSum in sorted(similarUsers.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break


50 Star Wars (1977) 61.79999999999999
181 Return of the Jedi (1983) 55.4
100 Fargo (1996) 54.4
258 Contact (1997) 45.2
117 Rock, The (1996) 43.00000000000001
127 Godfather, The (1972) 42.800000000000004
174 Raiders of the Lost Ark (1981) 40.4
423 E.T. the Extra-Terrestrial (1982) 40.0
300 Air Force One (1997) 39.8
294 Liar Liar (1997) 39.00000000000001
172 Empire Strikes Back, The (1980) 38.800000000000004
