In [55]:
import pandas as pd
import numpy as np

In [56]:
links = pd.read_csv('ml-latest-small/links.csv', sep=',')
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',', encoding='latin-1')
ratings = pd.read_csv('ml-latest-small/ratings.csv', sep=',')
tags = pd.read_csv('ml-latest-small/tags.csv', sep=',', encoding='latin-1')

In [57]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [58]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [60]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [61]:
user_movie_map = ratings.groupby('userId')['movieId'].apply(list).to_dict()
movie_user_map = ratings.groupby('movieId')['userId'].apply(list).to_dict()
user_review_map = ratings.groupby('userId')['rating'].apply(list).to_dict()
movie_title_map = movies.set_index('movieId')['title'].to_dict()
print("Movies reviewed by each user: " + str(user_movie_map))
print("Users that reviewed each movie: " + str(movie_user_map))
print("Reviews given by each user: " + str(user_review_map))
print("Title of each movie: " + str(movie_title_map))


Movies reviewed by each user: {1: [31, 1029, 1061, 1129, 1172, 1263, 1287, 1293, 1339, 1343, 1371, 1405, 1953, 2105, 2150, 2193, 2294, 2455, 2968, 3671], 2: [10, 17, 39, 47, 50, 52, 62, 110, 144, 150, 153, 161, 165, 168, 185, 186, 208, 222, 223, 225, 235, 248, 253, 261, 265, 266, 272, 273, 292, 296, 300, 314, 317, 319, 339, 349, 350, 356, 357, 364, 367, 370, 371, 372, 377, 382, 405, 410, 454, 457, 468, 474, 480, 485, 497, 500, 508, 509, 515, 527, 537, 539, 550, 551, 552, 585, 586, 587, 588, 589, 590, 592, 593, 616, 661, 720], 3: [60, 110, 247, 267, 296, 318, 355, 356, 377, 527, 588, 592, 593, 595, 736, 778, 866, 1197, 1210, 1235, 1271, 1378, 1580, 1721, 1884, 2028, 2318, 2513, 2694, 2702, 2716, 2762, 2841, 2858, 2959, 3243, 3510, 3949, 5349, 5669, 6377, 7153, 7361, 8622, 8636, 27369, 44191, 48783, 50068, 58559, 84236], 4: [10, 34, 112, 141, 153, 173, 185, 260, 289, 296, 329, 349, 356, 357, 364, 367, 380, 410, 431, 434, 435, 440, 442, 464, 480, 541, 588, 589, 590, 594, 596, 610, 616, 85

# Comparing Users - Pearson Correlation

Pearsons Correlation

In [62]:
def pearson_correlation(a, u, significance_weighting=True):
    m = list(set(user_movie_map[a]) & set(user_movie_map[u]))
    if len(m) == 0 or len(m) == 1:
        return 0.0
    upper_sum = 0.0
    lower_sum_a = 0.0
    lower_sum_u = 0.0
    for i in m:
        ra_i = ratings[(ratings['userId'] == a) & (ratings['movieId'] == i)]['rating'].tolist()
        ru_i = ratings[(ratings['userId'] == u) & (ratings['movieId'] == i)]['rating'].tolist()
        ra = ratings[(ratings['userId'] == a)]['rating'].tolist()
        ru = ratings[(ratings['userId'] == u)]['rating'].tolist()
        ra_avg = sum(ra)/len(ra)
        ru_avg = sum(ru)/len(ru)
        upper_sum += (float(ra_i[0]) - ra_avg) * (float(ru_i[0]) - ru_avg)
        lower_sum_a += (float(ra_i[0]) - ra_avg)**2
        lower_sum_u += (float(ru_i[0]) - ru_avg)**2
    if lower_sum_a == 0 or lower_sum_u == 0:
        return 0.0
    if significance_weighting:
        factor = min(10.0, len(m))/10.0
        return (upper_sum / (np.sqrt(lower_sum_a) * np.sqrt(lower_sum_u))) * factor
    else:
        return (upper_sum / (np.sqrt(lower_sum_a) * np.sqrt(lower_sum_u)))


In [63]:
for i in range(2, 20):
    print("Pearson correlation between user 1 and user " + str(i) + ": " + str(pearson_correlation(1, i)))

Pearson correlation between user 1 and user 2: 0.0
Pearson correlation between user 1 and user 3: 0.0
Pearson correlation between user 1 and user 4: 0.021068404187955424
Pearson correlation between user 1 and user 5: 0.0
Pearson correlation between user 1 and user 6: 0.0
Pearson correlation between user 1 and user 7: -0.3762133759528981
Pearson correlation between user 1 and user 8: 0.0
Pearson correlation between user 1 and user 9: 0.0
Pearson correlation between user 1 and user 10: 0.0
Pearson correlation between user 1 and user 11: 0.0
Pearson correlation between user 1 and user 12: 0.0
Pearson correlation between user 1 and user 13: 0.0
Pearson correlation between user 1 and user 14: 0.0
Pearson correlation between user 1 and user 15: 0.04377334731080121
Pearson correlation between user 1 and user 16: 0.0
Pearson correlation between user 1 and user 17: -0.19445617383602742
Pearson correlation between user 1 and user 18: 0.0
Pearson correlation between user 1 and user 19: 0.04811232

In [64]:
print("Pearsons correlation between user 1 and 4 without significance weighting is: " + str(pearson_correlation(1, 4, False)))

Pearsons correlation between user 1 and 4 without significance weighting is: 0.04213680837591085


In [65]:
print("Pearsons correlation between user 1 and 4 with significance weighting is: " + str(pearson_correlation(1, 4, True)))

Pearsons correlation between user 1 and 4 with significance weighting is: 0.021068404187955424


In [66]:
print("The ratio between the two is: " + str(pearson_correlation(1, 4, False)/pearson_correlation(1, 4, True)))

The ratio between the two is: 2.0


Recommendations Generator

In [67]:
def rating_predictions_pc(a,u, k, print_bool=True):
    users = movie_user_map[u]
    users = [x for x in users if x != a]
    pc = [pearson_correlation(a, x, False) for x in users]
    pc = list(zip(pc, users))
    pc = [x for x in pc if x[0] > 0]
    if print_bool: print("Number of nn with positive correlation: " + str(len(pc)))
    if len(pc) < k:
        k = len(pc)
    pc = sorted(pc, key=lambda x: x[0], reverse=True)[:k]
    if print_bool: print("Top " + str(k) + " most similar users that rated movie " + str(movie_title_map[u]) + ": " + str(pc))
    ra = ratings[(ratings['userId'] == a)]['rating'].tolist()
    ra_avg = sum(ra)/len(ra)
    upper_sum = 0.0
    lower_sum = 0.0
    for i in range(len(pc)):
        ru_i = ratings[(ratings['userId'] == pc[i][1]) & (ratings['movieId'] == u)]['rating'].tolist()
        ru_avg = sum(user_review_map[pc[i][1]])/len(user_review_map[pc[i][1]])
        upper_sum += (float(ru_i[0]) - ru_avg) * pc[i][0]
        lower_sum += pc[i][0]
    if lower_sum == 0:
        return ra_avg
    if print_bool: print("Weighted average of deviation from mean rating: " + str(upper_sum/lower_sum))
    return ra_avg + (upper_sum/lower_sum)

In [68]:
def top_N_recommendations_pc(a, N):
    movies = user_movie_map[a]
    movies_not_rated = [x for x in movie_user_map if x not in movies]
    ratings = [rating_predictions_pc(a, x, 10, False) for x in movies_not_rated]
    ratings = list(zip(ratings, movies_not_rated))
    ratings = sorted(ratings, key=lambda x: x[0], reverse=True)[:N]
    print("Top " + str(N) + " recommendations for user " + str(a) + ": " + str(ratings))
    recommendations = [movie_title_map[x[1]] for x in ratings]
    recommendations = list(zip(recommendations, [x[0] for x in ratings]))
    return recommendations

In [69]:
print("Rating prediction for user 1 and movie 10: " + str(rating_predictions_pc(1, 2, 10)))

Number of nn with positive correlation: 22
Top 10 most similar users that rated movie Jumanji (1995): [(0.955305259453512, 353), (0.9097023498182183, 428), (0.8849182223819824, 537), (0.8767729530225884, 177), (0.8694675056363698, 312), (0.8527860684644969, 561), (0.7988380406254074, 262), (0.7000261312823834, 466), (0.6625472742003929, 580), (0.6246950475544246, 641)]
Weighted average of deviation from mean rating: -0.45414895008883316
Rating prediction for user 1 and movie 10: 2.0958510499111664


In [70]:
print("Predicted rating for user 1 and movie 10: " + str(rating_predictions_pc(1, 10, 20)))

Number of nn with positive correlation: 30
Top 20 most similar users that rated movie GoldenEye (1995): [(0.999990790158363, 458), (0.955305259453512, 353), (0.9506054564909802, 236), (0.9494052936344851, 390), (0.9097023498182183, 428), (0.8849182223819826, 430), (0.8767729530225884, 177), (0.8694675056363698, 312), (0.8527860684644969, 561), (0.8137424950051069, 243), (0.7148990460385661, 574), (0.6647567066357397, 295), (0.6246950475544246, 641), (0.6201704336850951, 592), (0.6099352537868741, 247), (0.6041840090001795, 384), (0.5818865313144344, 516), (0.5681907170826419, 268), (0.46695691879603357, 608), (0.4047989180396508, 405)]
Weighted average of deviation from mean rating: -0.30464357350401816
Predicted rating for user 1 and movie 10: 2.2453564264959818


In [71]:
print("Predicted rating for user 1 and movie 260: " + str(rating_predictions_pc(1, 260, 20)))

Number of nn with positive correlation: 63
Top 20 most similar users that rated movie Star Wars: Episode IV - A New Hope (1977): [(0.955305259453512, 353), (0.9506054564909802, 236), (0.9494052936344851, 390), (0.9389661601733291, 394), (0.9224489154790747, 510), (0.9131865405858417, 594), (0.9097023498182183, 428), (0.8980852912291383, 197), (0.8849182223819826, 430), (0.8849182223819824, 537), (0.8694675056363698, 312), (0.8527860684644969, 561), (0.8137424950051069, 243), (0.8005948847861308, 242), (0.7148990460385661, 574), (0.7011965475750411, 22), (0.7000261312823834, 466), (0.6934749693237189, 439), (0.6647567066357397, 295), (0.6625472742003929, 580)]
Weighted average of deviation from mean rating: 0.4159146934894326
Predicted rating for user 1 and movie 260: 2.9659146934894323


In [72]:
#Takes a lot of time
print("Top 20 recommendations for user 1: " + str(top_N_recommendations_pc(1, 20)))

Top 20 recommendations for user 1: [(4.936834319526627, 3216), (4.936834319526627, 40412), (4.936834319526627, 92494), (4.928235294117647, 3320), (4.928235294117647, 4302), (4.928235294117647, 4731), (4.928235294117647, 5071), (4.928235294117647, 86781), (4.785040800791269, 97957), (4.69986349122835, 26974), (4.6777731671557845, 6214), (4.543621197252207, 3414), (4.468987045835008, 4754), (4.456057711228224, 121231), (4.428235294117647, 7034), (4.428235294117647, 91653), (4.428235294117647, 107910), (4.428235294117647, 111781), (4.428235294117647, 116897), (4.360820895522387, 7096)]
Top 20 recommendations for user 1: [('Vampyros Lesbos (Vampiras, Las) (1971)', 4.936834319526627), ("Dead Man's Shoes (2004)", 4.936834319526627), ('Dylan Moran: Monster (2004)', 4.936834319526627), ("Mifune's Last Song (Mifunes sidste sang) (1999)", 4.928235294117647), ('King Is Alive, The (2000)', 4.928235294117647), ('Innocence (2000)', 4.928235294117647), ('MaelstrÃ¶m (2000)', 4.928235294117647), ('Ince

In [73]:
# Takes a lot of time
print("Top 20 recommendations for user 522: " + str(top_N_recommendations_pc(522, 20)))

Top 20 recommendations for user 522: [(6.180285714285715, 565), (6.180285714285715, 1450), (6.180285714285715, 1563), (6.180285714285715, 1819), (6.180285714285715, 4076), (6.180285714285715, 4591), (6.180285714285715, 4796), (6.180285714285715, 4930), (6.180285714285715, 5427), (5.552834319526627, 3216), (5.552834319526627, 92494), (5.544235294117647, 4302), (5.544235294117647, 4731), (5.544235294117647, 5071), (5.512176190620671, 40412), (5.4319686172099475, 4518), (5.420711548631234, 3892), (5.411901639344262, 3879), (5.318977412731006, 3437), (5.318977412731006, 5765)]
Top 20 recommendations for user 522: [('Cronos (1993)', 6.180285714285715), ('Prisoner of the Mountains (Kavkazsky plennik) (1996)', 6.180285714285715), ('Dream With the Fishes (1997)', 6.180285714285715), ('Storefront Hitchcock (1997)', 6.180285714285715), ('Two Ninas (1999)', 6.180285714285715), ('Erik the Viking (1989)', 6.180285714285715), ('Grass Is Greener, The (1960)', 6.180285714285715), ('Funeral in Berlin (

# Item Item CF

Cosine Similarity

In [74]:
def cosine_similarity(i, j):
    intersect = list(set(movie_user_map[i]) & set(movie_user_map[j]))
    if len(intersect) == 0 or len(intersect) == 1:
        return 0.0
    upper_sum = 0.0
    lower_sum_i = 0.0
    lower_sum_j = 0.0
    for m in intersect:
        ru_i = ratings[(ratings['userId'] == m) & (ratings['movieId'] == i)]['rating'].tolist()
        ru_j = ratings[(ratings['userId'] == m) & (ratings['movieId'] == j)]['rating'].tolist()
        upper_sum += float(ru_i[0]) * float(ru_j[0])
        for n in movie_user_map[i]:
            lower_sum_i += float(ru_i[0])**2
            lower_sum_j += float(ru_j[0])**2
    if lower_sum_i == 0 or lower_sum_j == 0:
        return 0.0
    return (upper_sum / (np.sqrt(lower_sum_i) * np.sqrt(lower_sum_j)))

In [76]:
"""cosine_similarity_matrix = pd.DataFrame(index=movie_user_map.keys(), columns=movie_user_map.keys())
for i in movie_user_map.keys():
    for j in movie_user_map.keys():
        if i == j:
            cosine_similarity_matrix[i][j] = 1.0
        else:
            cosine_similarity_matrix[i][j] = cosine_similarity(i, j)"""

'cosine_similarity_matrix = pd.DataFrame(index=movie_user_map.keys(), columns=movie_user_map.keys())\nfor i in movie_user_map.keys():\n    for j in movie_user_map.keys():\n        if i == j:\n            cosine_similarity_matrix[i][j] = 1.0\n        else:\n            cosine_similarity_matrix[i][j] = cosine_similarity(i, j)'

In [77]:
print("Cosine similarity between movie 1 and 2: " + str(cosine_similarity(1, 2)))

Cosine similarity between movie 1 and 2: 0.0039000285847543012


In [78]:
print("Cosine similarity between movie 594 and 596: " + str(cosine_similarity(594, 596)))

Cosine similarity between movie 594 and 596: 0.013858566497411047


Recommendations Generator

In [79]:
def rating_predictions_cs(i, j, k):
    upper_sum = 0.0
    lower_sum = 0.0
    movies = list(movie_user_map.keys())
    movies = sorted(movies, key=lambda x: cosine_similarity(i, x), reverse=True)[:k]
    for m in movies:
        ru_m = ratings[(ratings['userId'] == j) & (ratings['movieId'] == m)]['rating'].tolist()
        if len(ru_m) == 0:
            continue
        upper_sum += cosine_similarity(i, m) * float(ru_m[0])
        lower_sum += abs(cosine_similarity(i, m))
    if lower_sum == 0:
        return 0.0
    return upper_sum / lower_sum

In [None]:
def top_N_recommendations_cs(u, N):
    movies = user_movie_map[u]
    movies_not_rated = [x for x in movie_user_map if x not in movies]
    ratings = []
    for m in movies_not_rated:
        ratings.append(rating_predictions_cs(m, u, 10))
    ratings = [x for x in ratings if x > 0]
    ratings = list(zip(ratings, movies_not_rated))
    ratings = sorted(ratings, key=lambda x: x[0], reverse=True)[:N]
    print("Top " + str(N) + " recommendations for user " + str(u) + ": " + str(ratings))
    recommendations = [movie_title_map[x[1]] for x in ratings]
    recommendations = list(zip(recommendations, [x[0] for x in ratings]))
    return recommendations

In [81]:
user_522_reviews = user_movie_map[522]
print("Number of movies rated by user 522: " + str(len(user_522_reviews)))
print("Number of movies rated by user 522 with positive cosine similarity with movie 25: " + str(len([x for x in user_522_reviews if cosine_similarity(25, x) > 0])))
similar_movies = {}
for i in user_522_reviews:
    similar_movies[i] = cosine_similarity(25, i)
similar_movies = sorted(similar_movies.items(), key=lambda x: x[1], reverse=True)[:20]
print("Top 20 most similar movies to movie 25 for user 522: " + str(similar_movies))
similar_movies = [movie_title_map[x[0]] for x in similar_movies]
print("Top 20 most similar movies to movie 25 for user 522: " + str(similar_movies))

Number of movies rated by user 522: 250
Number of movies rated by user 522 with positive cosine similarity with movie 25: 207
Top 20 most similar movies to movie 25 for user 522: [(65130, 0.00989584271623465), (87869, 0.009893776805596043), (98961, 0.009885750854445482), (59900, 0.009885746071340715), (88405, 0.009885186433550392), (97304, 0.00988456366252192), (99145, 0.009883904684900949), (90405, 0.009879060968877317), (8958, 0.009863501208530537), (81932, 0.009863415048471505), (64614, 0.00986183590526028), (91529, 0.009859750955749353), (95307, 0.009845006108500027), (6888, 0.009840433016570485), (61132, 0.009837191233245298), (106100, 0.009835283358628035), (97921, 0.009831818961012507), (97923, 0.009830248400814284), (1537, 0.009827180870909388), (4844, 0.009811478402011131)]
Top 20 most similar movies to movie 25 for user 522: ['Revolutionary Road (2008)', 'Horrible Bosses (2011)', 'Zero Dark Thirty (2012)', "You Don't Mess with the Zohan (2008)", 'Friends with Benefits (2011)'

In [None]:
# Takes a lot of time
q24 = top_N_recommendations_cs(522, 10)
print("Top 20 recommendations for user 522: " + str(q24))

KeyboardInterrupt: 

# Basket Recommendations

In [83]:
def basket_recommendations(basket, use_pos_similarities=True):
    movies = list(movie_user_map.keys())
    movies = [x for x in movies if x not in basket]
    if len(basket) == 1:
        movies = sorted(movies, key=lambda x: cosine_similarity(basket[0], x), reverse=True)[:10]
        if use_pos_similarities:
            movies = [x for x in movies if cosine_similarity(basket[0], x) > 0]
        return movies
    similar_movies = {}
    for i in movies:
        similar_movies[i] = 0
        for j in basket:
            if use_pos_similarities:
                if cosine_similarity(i, j) > 0:
                    similar_movies[i] += cosine_similarity(i, j)
            else:
                similar_movies[i] += cosine_similarity(i, j)
    similar_movies = sorted(similar_movies.items(), key=lambda x: x[1], reverse=True)[:10]
    return [x[0] for x in similar_movies]

In [84]:
q25 = basket_recommendations([1], True)
print("Recommendations for basket [1]: " + str(q25))

Recommendations for basket [1]: [511, 937, 2169, 2775, 5963, 7068, 8195, 31193, 43917, 78266]


In [85]:
q26 = basket_recommendations([1, 48, 239], True)
print("Top 10 recommendations for basket [1, 48, 239], with only positive: " + str(q26))

Top 10 recommendations for basket [1, 48, 239], with only positive: [1812, 78266, 840, 2898, 3029, 3048, 2751, 2464, 2786, 3962]


In [86]:
q27 = basket_recommendations([1, 48, 239], False)
print("Top 10 recommendations for basket [1, 48, 239]: " + str(q26))

Top 10 recommendations for basket [1, 48, 239]: [1812, 78266, 840, 2898, 3029, 3048, 2751, 2464, 2786, 3962]


# Hybrid Recommendations

In [91]:
def hybrid_recommendation(a, u , k):
    # Returns a combination of rating predictions combining pearson and cosine similarity
    return (0.5 * rating_predictions_pc(a, u, k, False)) + (0.5 * rating_predictions_cs(a, u, k))

In [94]:
def top_N_recommendations_hr(u, N):
    movies = user_movie_map[u]
    movies_not_rated = [x for x in movie_user_map if x not in movies]
    #ratings = [hybrid_recommendation(x, u, 10) for x in movies_not_rated]
    rating = []
    for x in movies_not_rated:
        rating.append(hybrid_recommendation(x, u, 10))
    ratings = [x for x in ratings if x > 0]
    ratings = list(zip(ratings, movies_not_rated))
    ratings = sorted(ratings, key=lambda x: x[0], reverse=True)[:N]
    print("Top " + str(N) + " recommendations for user " + str(u) + ": " + str(ratings))
    recommendations = [movie_title_map[x[1]] for x in ratings]
    recommendations = list(zip(recommendations, [x[0] for x in ratings]))
    return recommendations

In [95]:
# Takes a lot of time
q29 = top_N_recommendations_hr(522, 10)
print("Top 10 hybrid recommendations for user 522: " + str(q29))

KeyError: 674