In [1]:
import pandas as pd
import numpy as np
import math

<h3> Movies Dataset </h3>

<img src="Capture.png" style = "width:500px" align = "left"/>

<h3> Collaborative Filtering - User Based Filtering - Manual Calculation</h3>

<h4> Based on other SIMILAR users' scores on movies, predict the scores a user would give to movies they have not watched yet </h4>

In [2]:
# Toy ratings table used for the manual walkthrough:
# user name -> {movie title -> rating given by that user}.
# Not every user has rated every movie; those gaps are what gets predicted.
users_feedback = {
    'Mary': {
        'The Hobbit': 2.5,
        'The Lord of the Rings': 3.5,
        'Star Trek': 3.0,
        'The Terminator': 3.5,
        'Norbit': 2.5,
        'Star Wars': 3.0,
    },
    'Peter': {
        'The Hobbit': 3.0,
        'The Lord of the Rings': 3.5,
        'Star Trek': 1.5,
        'The Terminator': 5.0,
        'Norbit': 3.0,
        'Star Wars': 3.5,
    },
    'Stuart': {
        'The Hobbit': 2.5,
        'The Lord of the Rings': 3.0,
        'The Terminator': 3.5,
        'Star Wars': 4.0,
    },
    'Jessica': {
        'The Lord of the Rings': 3.5,
        'Star Trek': 3.0,
        'The Terminator': 4.0,
        'Norbit': 2.5,
        'Star Wars': 4.5,
    },
    'Paul': {
        'The Hobbit': 3.0,
        'The Lord of the Rings': 4.0,
        'Star Trek': 2.0,
        'The Terminator': 3.0,
        'Norbit': 2.0,
        'Star Wars': 3.0,
    },
    'Suzane': {
        'The Hobbit': 3.0,
        'The Lord of the Rings': 4.0,
        'The Terminator': 5.0,
        'Norbit': 3.5,
        'Star Wars': 3.0,
    },
    'Fred': {
        'The Lord of the Rings': 4.5,
        'The Terminator': 4.0,
        'Norbit': 1.0,
    },
}

<img src="Capture3.png" style = "width:500px" align = "left"/>

<h4> Calculating a Euclidean-distance-based similarity between 2 users</h4>

In [3]:
def similarity_test_bet_2users(dataset, user1, user2):
    """Return a Euclidean-distance-based similarity score for two users.

    The distance is computed only over the items that both users have rated
    and is then mapped to a score with ``1 / (1 + distance)``, so identical
    ratings give 1.0 and the score shrinks towards 0 as ratings diverge.

    Parameters
    ----------
    dataset : dict
        Nested mapping ``{user: {item: rating}}``.
    user1, user2 : hashable
        Keys of ``dataset`` identifying the two users to compare.

    Returns
    -------
    float
        Similarity in ``(0, 1]`` when the users share at least one rated item,
        or ``0.0`` when they share none.

    Raises
    ------
    KeyError
        If ``user1`` or ``user2`` is not a key of ``dataset``.
    """
    # Only items rated by both users can be compared.
    shared_items = [item for item in dataset[user1] if item in dataset[user2]]

    # Without any overlap the distance would stay at 0 and the score would come
    # out as 1.0, i.e. a "perfect match" between users with nothing in common.
    # Report no similarity instead so callers can skip such users.
    if not shared_items:
        return 0.0

    sum_of_squares = 0
    for item in shared_items:
        sum_of_squares += (dataset[user1][item] - dataset[user2][item]) ** 2

    euclidean_distance = math.sqrt(sum_of_squares)

    return 1 / (1 + euclidean_distance)

In [4]:
similarity_test_bet_2users(users_feedback, 'Mary', 'Fred')

0.3483314773547883

<h4> Calculating the Euclidean-distance-based similarity between 1 user and ALL other users </h4>

In [5]:
def get_similar_users(dataset, user_select):
    """Rank every other user in ``dataset`` by similarity to ``user_select``.

    Returns a list of ``(similarity, user)`` tuples sorted in descending
    order, so the most similar user comes first.
    """
    scored = [
        (similarity_test_bet_2users(dataset, user_select, candidate), candidate)
        for candidate in dataset
        if candidate != user_select
    ]
    # Tuples sort on the similarity first, then on the user key.
    scored.sort(reverse=True)
    return scored

In [6]:
get_similar_users(users_feedback, 'Fred')

[(0.4, 'Paul'),
 (0.38742588672279304, 'Stuart'),
 (0.3567891723253309, 'Jessica'),
 (0.3483314773547883, 'Mary'),
 (0.28989794855663564, 'Peter'),
 (0.2674788903885893, 'Suzane')]

<h4> Predicting scores for movies that a user has not watched yet, as the similarity-weighted average of the scores given by other users </h4>

<img src="Capture1.png" style = "width:500px" align = "left"/>

In [7]:
def user_recommendation(dataset, user_select):
    """Predict scores for the items ``user_select`` has not rated yet.

    Each prediction is the similarity-weighted average of the ratings that
    the other users gave to that item.

    Returns a list of ``(predicted_score, item)`` tuples sorted in descending
    order of predicted score.
    """
    weighted_totals = {}   # item -> sum of (rating * similarity)
    weight_sums = {}       # item -> sum of similarity

    for neighbour, neighbour_ratings in dataset.items():
        if neighbour == user_select:
            continue

        weight = similarity_test_bet_2users(dataset, user_select, neighbour)
        # Users with zero similarity contribute nothing to the averages.
        if weight == 0:
            continue

        for title, score in neighbour_ratings.items():
            # Only items the selected user has not rated need a prediction.
            if title in dataset[user_select]:
                continue
            weighted_totals[title] = weighted_totals.get(title, 0) + score * weight
            weight_sums[title] = weight_sums.get(title, 0) + weight

    ranked = [
        (weighted_total / weight_sums[title], title)
        for title, weighted_total in weighted_totals.items()
    ]
    ranked.sort(reverse=True)
    return ranked

In [8]:
user_recommendation(users_feedback, 'Fred')

[(3.5207797678329342, 'Star Wars'),
 (2.7827232588048805, 'The Hobbit'),
 (2.4015514030492744, 'Star Trek')]

<h4> Setting a score threshold and printing only the recommended movies for a specific user </h4>

In [9]:
def print_movies_recommendation(dataset, user_select, min_score = 3):
    """Print the recommended items whose predicted score reaches ``min_score``.

    Items are printed one per line, in the order returned by
    ``user_recommendation`` (highest predicted score first).
    """
    for predicted_score, title in user_recommendation(dataset, user_select):
        if predicted_score >= min_score:
            print(title)

In [10]:
print_movies_recommendation(users_feedback, 'Fred')

Star Wars


<h3> Using MovieLens Dataset </h3>

<p> Data source can be found here in GroupLens: <a href="https://files.grouplens.org/datasets/movielens/ml-100k.zip">Link</a></p> 

<h4> Based on similar users, what would a user prefer to watch? </h4>

In [11]:
def load_movie_lens(item_path="u.item", data_path="u.data", encoding="latin-1"):
    """Load MovieLens ratings grouped by user.

    Parameters
    ----------
    item_path : str
        Path of the pipe-separated movie file; the first two fields of every
        row are read as movie id and movie title.
    data_path : str
        Path of the whitespace-separated ratings file; the first three fields
        of every row are read as user id, movie id and rating.
    encoding : str
        Text encoding used to read both files. ``latin-1`` can decode any byte
        sequence, so titles with accented characters do not depend on the
        platform's default encoding.

    Returns
    -------
    dict
        ``{user_id: {movie_title: rating}}`` with string ids and float ratings.

    Raises
    ------
    KeyError
        If the ratings file references a movie id missing from the movie file.
    """
    movies = {}
    user_ratings = {}

    # ``with`` closes the file handles even if a row fails to parse.
    with open(item_path, "r", encoding=encoding) as item_file:
        for row in item_file:
            if not row.strip():
                continue  # tolerate blank lines such as a trailing newline
            movie_id, name = row.split('|')[0:2]
            movies[movie_id] = name

    with open(data_path, "r", encoding=encoding) as data_file:
        for row in data_file:
            if not row.strip():
                continue
            user_id, movie_id, rating = row.split()[0:3]
            user_ratings.setdefault(user_id, {})[movies[movie_id]] = float(rating)

    return user_ratings

In [12]:
ds = load_movie_lens()

In [13]:
print_movies_recommendation(ds, '1', 4.9)

Saint of Fort Washington, The (1993)
They Made Me a Criminal (1939)
Someone Else's America (1995)
Santa with Muscles (1996)
Prefontaine (1997)
Marlene Dietrich: Shadow and Light (1996) 
Great Day in Harlem, A (1994)
Entertaining Angels: The Dorothy Day Story (1996)
Aiqing wansui (1994)
Star Kid (1997)


In [15]:
print_movies_recommendation(ds, '2', 4.9)

Prefontaine (1997)
They Made Me a Criminal (1939)
Star Kid (1997)
Someone Else's America (1995)
Santa with Muscles (1996)
Saint of Fort Washington, The (1993)
Marlene Dietrich: Shadow and Light (1996) 
Great Day in Harlem, A (1994)
Entertaining Angels: The Dorothy Day Story (1996)
Aiqing wansui (1994)


<h4> Recommend a movie to potential audiences: users who have not rated it yet and whose predicted score is 4.5 or above</h4>

In [16]:
def load_movie_lens2(item_path="u.item", data_path="u.data", encoding="latin-1"):
    """Load MovieLens ratings grouped by movie (the transpose of the per-user view).

    Parameters
    ----------
    item_path : str
        Path of the pipe-separated movie file; the first two fields of every
        row are read as movie id and movie title.
    data_path : str
        Path of the whitespace-separated ratings file; the first three fields
        of every row are read as user id, movie id and rating.
    encoding : str
        Text encoding used to read both files. ``latin-1`` can decode any byte
        sequence, so titles with accented characters do not depend on the
        platform's default encoding.

    Returns
    -------
    dict
        ``{movie_title: {user_id: rating}}`` with string ids and float ratings.

    Raises
    ------
    KeyError
        If the ratings file references a movie id missing from the movie file.
    """
    movies = {}
    user_ratings = {}

    # ``with`` closes the file handles even if a row fails to parse.
    with open(item_path, "r", encoding=encoding) as item_file:
        for row in item_file:
            if not row.strip():
                continue  # tolerate blank lines such as a trailing newline
            movie_id, name = row.split('|')[0:2]
            movies[movie_id] = name

    with open(data_path, "r", encoding=encoding) as data_file:
        for row in data_file:
            if not row.strip():
                continue
            user_id, movie_id, rating = row.split()[0:3]
            user_ratings.setdefault(movies[movie_id], {})[user_id] = float(rating)

    return user_ratings

In [17]:
ds2 = load_movie_lens2()

In [18]:
print_movies_recommendation(ds2, 'Star Wars (1977)', 4.5)

688
849
628
242
928
427
118
519
469
225
565
440
260
810


<H3> Using LibRecommender Package</H3>

<p> Documentation can be found here: <a href = "https://pypi.org/project/LibRecommender/"> Link </a></p>

In [26]:
def load_movie_lens(item_path="u.item", encoding="latin-1"):
    """Load the MovieLens movie-id -> title lookup table.

    NOTE: this reuses the name ``load_movie_lens`` that an earlier cell of this
    notebook defines with a different return value (ratings per user). Once
    this cell has run, the earlier definition is replaced.

    Parameters
    ----------
    item_path : str
        Path of the pipe-separated movie file; the first two fields of every
        row are read as movie id and movie title.
    encoding : str
        Text encoding used to read the file. ``latin-1`` can decode any byte
        sequence, so titles with accented characters do not depend on the
        platform's default encoding.

    Returns
    -------
    dict
        ``{movie_id: movie_title}`` with string keys.
    """
    movies = {}

    # ``with`` closes the file handle even if a row fails to parse.
    with open(item_path, "r", encoding=encoding) as item_file:
        for row in item_file:
            if not row.strip():
                continue  # tolerate blank lines such as a trailing newline
            movie_id, name = row.split('|')[0:2]
            movies[movie_id] = name

    return movies

In [37]:
movies = load_movie_lens()

In [38]:
import numpy as np
import pandas as pd
from libreco.data import DatasetPure
from libreco.algorithms import UserCF, ItemCF

In [49]:
# Read the tab-separated ratings file; it has no header row, so the column
# names are supplied here.
# NOTE(review): 'user' / 'item' / 'label' presumably follow the column naming
# that LibRecommender's DatasetPure expects - confirm against its docs.
dataset = pd.read_csv('u.data', sep='\t', names = ['user', 'item', 'label', 'time'])

In [50]:
dataset

Unnamed: 0,user,item,label,time
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [51]:
data, data_info = DatasetPure.build_trainset(dataset)

In [52]:
print(data_info)      

# Item-based filtering tends to perform better with higher data density.

n_users: 943, n_items: 1682, data density: 6.3047 %


<h4> User-Based Filtering using LibReco </h4> 

In [60]:
# Build a user-based collaborative-filtering model for the rating task, using
# Pearson correlation as the user-to-user similarity, then fit it on the
# training set built above.
# NOTE(review): neg_sampling is presumably only relevant to ranking tasks on
# implicit feedback - confirm against the LibRecommender docs.
user_cf = UserCF(task='rating', data_info=data_info, sim_type='pearson')
user_cf.fit(data, neg_sampling = False)

Training start time: [35m2023-08-06 22:20:23[0m
Final block size and num: (943, 1)
sim_matrix elapsed: 0.061s
sim_matrix, shape: (943, 943), num_elements: 855194, density: 96.1704 %


top_k: 100%|███████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 5695.89it/s]


In [61]:
print('Score prediction: ', user_cf.predict(user=1, item=10), movies['10'])

Score prediction:  3.710552453994751 Richard III (1995)


In [66]:
print('Recommendations: ', user_cf.recommend_user(user=1, n_rec=10))

Recommendations:  {1: array([ 487,  516, 1514,  874, 1143,  868,  865,  414,  811,  814],
      dtype=int64)}


In [80]:
user_cf.recommend_user(user=1, n_rec=10)[1]

array([ 487,  516, 1514,  874, 1143,  868,  865,  414,  811,  814],
      dtype=int64)

In [85]:
def print_user_recommendation_lr(movie_dataset, cf, user, number_rec):
    """Print the titles of the items a fitted model recommends to ``user``.

    ``cf.recommend_user`` is asked for ``number_rec`` items; its result is
    indexed by ``user`` to get the item ids, and each id is converted to a
    string to look up its title in ``movie_dataset``. Titles are printed one
    per line.
    """
    recommended_by_user = cf.recommend_user(user=user, n_rec=number_rec)
    for item_id in recommended_by_user[user]:
        # movie_dataset is keyed by string ids, hence the str() conversion.
        print(movie_dataset[str(item_id)])

In [86]:
print_user_recommendation_lr(movies, user_cf, 1, 10)

Roman Holiday (1953)
Local Hero (1983)
Dream With the Fishes (1997)
Career Girls (1997)
Hard Eight (1996)
Hearts and Minds (1996)
Ice Storm, The (1997)
My Favorite Year (1982)
Thirty-Two Short Films About Glenn Gould (1993)
Great Day in Harlem, A (1994)


In [87]:
# The titles printed here differ from the LibRecommender output above: this
# manual version weights users by the Euclidean-distance similarity, whereas
# the UserCF model was built with sim_type='pearson'.
print_movies_recommendation(ds, '1', 4.8)

Saint of Fort Washington, The (1993)
They Made Me a Criminal (1939)
Someone Else's America (1995)
Santa with Muscles (1996)
Prefontaine (1997)
Marlene Dietrich: Shadow and Light (1996) 
Great Day in Harlem, A (1994)
Entertaining Angels: The Dorothy Day Story (1996)
Aiqing wansui (1994)
Star Kid (1997)
