## Collaborative Filtering
https://surprise.readthedocs.io/en/stable/getting_started.html<br />
https://sundog-education.com/

In [1]:
# From Surprise docs getting started

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
fullTrainSet = data.build_full_trainset()
trainset, testset = train_test_split(data, test_size=.25)

uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

In [2]:
# Build our movie id, movie names arrays
from surprise import get_dataset_dir

rid_to_name = {}
name_to_rid = {}

file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
with open(file_name, 'r', encoding='ISO-8859-1') as f:
    for line in f:
        line = line.split('|')
        rid_to_name[line[0]] = line[1]
        name_to_rid[line[1]] = line[0]

In [3]:
rid_to_name['302']

'L.A. Confidential (1997)'

In [4]:
name_to_rid['L.A. Confidential (1997)']

'302'

In [5]:
# Function to get name of movie based on id
def getMovieName(movieID):
    if movieID in rid_to_name:
        return rid_to_name[movieID]
    else:
        return ""


In [6]:
# Build our user data
from surprise import Reader

file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.data'
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_name, reader=reader)

In [7]:
import csv
from collections import defaultdict

ratings = defaultdict(int)
rankings = defaultdict(int)
with open(file_name, newline='') as csvfile:
    ratingReader = csv.reader(csvfile)
    next(ratingReader)
    for row in ratingReader:
        rowArray = str(row[0]).split('\t')
        movieID = int(rowArray[1])
        ratings[movieID] += 1
rank = 1
for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
    rankings[movieID] = rank
    rank += 1

In [8]:
# Dictionary of movies already seen by uid
testUserInnerID = trainset.to_inner_uid(uid)
watched = {}
for itemID, rating in trainset.ur[testUserInnerID]:
    watched[itemID] = 1

In [9]:
# Item-based collaborative filtering
from surprise import KNNBasic

sim_options = {'name': 'cosine',
               'user_based': False
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
predictions = model.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [10]:
def getTopN(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)

    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if (estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN


In [11]:
def hitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0

    # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    # Compute overall precision
    return hits/total


In [12]:
def cumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
    hits = 0
    total = 0

    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Only look at ability to recommend things the users actually liked...
        if (actualRating >= ratingCutoff):
            # Is it in the predicted top 10 for this user?
            hit = False
            for movieID, predictedRating in topNPredicted[int(userID)]:
                if (int(leftOutMovieID) == movieID):
                    hit = True
                    break
            if (hit) :
                hits += 1

            total += 1

    # Compute overall precision
    return hits/total


In [13]:
def ratingHitRate(topNPredicted, leftOutPredictions):
    hits = defaultdict(float)
    total = defaultdict(float)

    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == movieID):
                hit = True
                break
        if (hit) :
            hits[actualRating] += 1

        total[actualRating] += 1

    # Compute overall precision
    for rating in sorted(hits.keys()):
        print (rating, hits[rating] / total[rating])


In [14]:
def averageReciprocalHitRank(topNPredicted, leftOutPredictions):
    summation = 0
    total = 0
    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hitRank = 0
        rank = 0
        for movieID, predictedRating in topNPredicted[int(userID)]:
            rank = rank + 1
            if (int(leftOutMovieID) == movieID):
                hitRank = rank
                break
        if (hitRank > 0) :
            summation += 1.0 / hitRank

        total += 1

    return summation / total


In [15]:
def userCoverage(topNPredicted, numUsers, ratingThreshold=0):
    hits = 0
    for userID in topNPredicted.keys():
        hit = False
        for movieID, predictedRating in topNPredicted[userID]:
            if (predictedRating >= ratingThreshold):
                hit = True
                break
        if (hit):
            hits += 1

    return hits / numUsers


In [16]:
import itertools
# https://docs.python.org/2/library/itertools.html

def diversity(topNPredicted, simsAlgo):
    n = 0
    total = 0
    simsMatrix = simsAlgo.compute_similarities()
    for userID in topNPredicted.keys():
        pairs = itertools.combinations(topNPredicted[userID], 2)
        for pair in pairs:
            movie1 = pair[0][0]
            movie2 = pair[1][0]
            innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
            innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
            similarity = simsMatrix[innerID1][innerID2]
            total += similarity
            n += 1

    S = total / n
    return (1-S)


In [17]:
def novelty(topNPredicted, rankings):
    n = 0
    total = 0
    for userID in topNPredicted.keys():
        for rating in topNPredicted[userID]:
            movieID = rating[0]
            rank = rankings[movieID]
            total += rank
            n += 1
    return total / n


In [18]:
from surprise.model_selection import LeaveOneOut

LOOCV = LeaveOneOut(n_splits=1, random_state=1)

for trainSet, testSet in LOOCV.split(data):
    model.fit(trainSet)
    leftOutPredictions = model.test(testSet)
    bigTestSet = trainSet.build_anti_testset()
    allPredictions = model.test(bigTestSet)


Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
# Evaluation metrics
from collections import defaultdict
from surprise import accuracy
from surprise.model_selection import LeaveOneOut

print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))
topNPredicted = getTopN(allPredictions, n=10)
print('Hit Rate:', hitRate(topNPredicted, leftOutPredictions))
print('Rating Hit Rate:')
ratingHitRate(topNPredicted, leftOutPredictions)
print('Cumulative Hit Rate:', cumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))
print('Average Reciprocal Hit Rate:', averageReciprocalHitRank(topNPredicted, leftOutPredictions))
print('User Coverage:', userCoverage(topNPredicted, fullTrainSet.n_users, ratingThreshold=4.0))
print('Diversity:', diversity(topNPredicted, model))
print('Novelty:', novelty(topNPredicted, rankings))

RMSE: 1.0257
1.0256932211196677
MAE:  0.8129
0.8129226009037279
Hit Rate: 0.0042417815482502655
Rating Hit Rate:
2.0 0.009174311926605505
3.0 0.004424778761061947
5.0 0.008849557522123894
Cumulative Hit Rate: 0.003703703703703704
Average Reciprocal Hit Rate: 0.0006480499587604572
User Coverage: 0.9045599151643691
Computing the cosine similarity matrix...
Done computing similarity matrix.
Diversity: 0.5569158930082343
Novelty: 1291.7522355207043


In [20]:
# Run 5-fold cross-validation and compare
from surprise.model_selection import cross_validate

cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0321  1.0238  1.0265  1.0256  1.0282  1.0272  0.0028  
MAE (testset)     0.8149  0.8086  0.8121  0.8106  0.8142  0.8121  0.0023  
Fit time          3.84    3.84    3.83    3.97    3.84    3.86    0.05    
Test time         7.35    7.95    8.43    8.46    12.31   8.90    1.75    


{'test_rmse': array([1.03211842, 1.02378482, 1.02647282, 1.02558141, 1.0282013 ]),
 'test_mae': array([0.81494059, 0.8085966 , 0.81209358, 0.81056179, 0.81422078]),
 'fit_time': (3.8368287086486816,
  3.836512565612793,
  3.8317837715148926,
  3.965705633163452,
  3.8419413566589355),
 'test_time': (7.354872465133667,
  7.945512533187866,
  8.429388284683228,
  8.46420931816101,
  12.307350397109985)}

In [21]:
# Define matrix and similarityRow
simsMatrix = model.compute_similarities()
similarityRow = simsMatrix[testUserInnerID]

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [22]:
# Build testUserRatings
testUserRatings = trainset.ur[testUserInnerID]

In [23]:
# Build kNeighbors
import heapq
# https://docs.python.org/2/library/heapq.html

k = 10
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

In [24]:
# Get similar items weighted by rating
from collections import defaultdict

similarItems = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        similarItems[innerID] += score * (rating / 5.0)

In [25]:
# Print top-rated items from similar items
from operator import itemgetter
# https://docs.python.org/3/library/operator.html

pos = 0
for itemID, ratingSum in sorted(similarItems.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainset.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break

618 Picnic (1955) 9.039175091983846
370 Mary Reilly (1996) 9.038191493346249
672 Candyman (1992) 8.978452627580609
305 Ice Storm, The (1997) 8.97136541681037
1058 War, The (1994) 8.970819518792984
499 Cat on a Hot Tin Roof (1958) 8.96708000230137
768 Casper (1995) 8.965292743436809
510 Magnificent Seven, The (1954) 8.95716246707514
942 What's Love Got to Do with It (1993) 8.954475667135616
1357 For the Moment (1994) 8.947485625254105
725 Exit to Eden (1994) 8.945884759369367


In [26]:
# Aternative Tuning
from collections import defaultdict
from operator import itemgetter

kNeighbors = []
for rating in testUserRatings:
    if rating[1] >= 4.0:
        kNeighbors.append(rating)
    
similarItemsAlt = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        similarItemsAlt[innerID] += score * (rating / 5.0)
        
pos = 0
for itemID, ratingSum in sorted(similarItemsAlt.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainset.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break

748 Saint, The (1997) 16.0002160873711
618 Picnic (1955) 15.990649640672036
1184 Endless Summer 2, The (1994) 15.949333677769713
573 Body Snatchers (1993) 15.839895216412046
484 Maltese Falcon, The (1941) 15.823959057704087
288 Scream (1996) 15.80484953861616
1255 Broken English (1996) 15.788503592811475
120 Striptease (1996) 15.784897714443504
746 Real Genius (1985) 15.758042964077784
625 Sword in the Stone, The (1963) 15.72287151851038
708 Sex, Lies, and Videotape (1989) 15.719226107760083


In [27]:
# User-based collaborative filtering
sim_options = {'name': 'cosine',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
predictions = model.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [28]:
# Define similarityRow
simsMatrix = model.compute_similarities()
similarityRow = simsMatrix[testUserInnerID]

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [29]:
# Top K users
import heapq

k = 10
TopKUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        TopKUsers.append( (innerID, score) )

kNeighbors = heapq.nlargest(k, TopKUsers, key=lambda t: t[1])

In [30]:
# Add up ratings weighted by user similarity
similarUsers = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainset.ur[innerID]
    for rating in theirRatings:
        similarUsers[rating[0]] += (rating[1] / 5.0) * userSimilarityScore


In [31]:
# Print top-rated items from similar users
pos = 0
for itemID, ratingSum in sorted(similarUsers.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainset.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break


258 Contact (1997) 4.6000000000000005
181 Return of the Jedi (1983) 4.6
300 Air Force One (1997) 4.0
127 Godfather, The (1972) 3.5999999999999996
288 Scream (1996) 3.4000000000000004
50 Star Wars (1977) 3.4
323 Dante's Peak (1997) 3.2
100 Fargo (1996) 3.0
343 Alien: Resurrection (1997) 3.0
405 Mission: Impossible (1996) 2.8000000000000003
271 Starship Troopers (1997) 2.8


In [32]:
# Aternative Tuning
from collections import defaultdict
from operator import itemgetter

kNeighbors = []
for rating in testUserRatings:
    if rating[1] >= .95:
        kNeighbors.append(rating)
    
similarUsers = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainset.ur[innerID]
    for rating in theirRatings:
        similarUsers[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
        
pos = 0
for itemID, ratingSum in sorted(similarUsers.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainset.to_raw_iid(itemID)
        print(movieID, getMovieName(movieID), ratingSum)
        pos += 1
        if (pos > 10):
            break


181 Return of the Jedi (1983) 55.60000000000001
174 Raiders of the Lost Ark (1981) 55.6
100 Fargo (1996) 55.6
204 Back to the Future (1985) 50.00000000000001
50 Star Wars (1977) 49.99999999999999
172 Empire Strikes Back, The (1980) 47.2
1 Toy Story (1995) 46.4
79 Fugitive, The (1993) 45.4
98 Silence of the Lambs, The (1991) 42.4
11 Seven (Se7en) (1995) 42.199999999999996
237 Jerry Maguire (1996) 42.199999999999996


In [33]:
# Run 5-fold cross-validation and compare
from surprise.model_selection import cross_validate

cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0200  1.0218  1.0170  1.0086  1.0151  1.0165  0.0046  
MAE (testset)     0.8053  0.8101  0.8054  0.7944  0.8039  0.8038  0.0052  
Fit time          2.34    2.00    2.93    3.27    2.03    2.51    0.50    
Test time         6.74    7.71    7.91    7.69    7.49    7.51    0.41    


{'test_rmse': array([1.01996277, 1.02184076, 1.01701131, 1.00859677, 1.01505836]),
 'test_mae': array([0.80534471, 0.81014095, 0.80543123, 0.79441833, 0.80388233]),
 'fit_time': (2.3406484127044678,
  2.0002520084381104,
  2.9340782165527344,
  3.2667782306671143,
  2.0318381786346436),
 'test_time': (6.738163471221924,
  7.712613105773926,
  7.91057276725769,
  7.6912925243377686,
  7.494473695755005)}

In [34]:
# Single prediction
pred = model.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.00   {'actual_k': 40, 'was_impossible': False}
