In [None]:
# x is the user id of the user the recommendations are generated for
x = 73

## Getting Top 10 recommendations for user x using SVD collaborative filtering

In [None]:
# installs surprise library
! pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 28.0 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619435 sha256=325c610be04ab098a404424f0506a1d5c788613a515de87cf7ea87ef0b851788
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
# imports needed modules
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.prediction_algorithms.matrix_factorization import SVD
import random
from collections import defaultdict

In [None]:
# defines top_n_recommendations function that returns a dictionary with keys of user_ids and values of the top n recommended movies for each user_id
def top_n_recommendations(recommendations, n=10):
  """Group predictions by user and keep each user's n highest estimates.

  Args:
    recommendations: iterable of 5-tuples (uid, iid, r_ui, est, details);
      only uid, iid and est are used.
    n: maximum number of items kept per user (default 10).

  Returns:
    defaultdict(list) mapping uid -> list of (iid, est) tuples sorted by
    est in descending order and truncated to n entries.
  """
  top_n = defaultdict(list)

  for uid, iid, _r_ui, est, _ in recommendations:
    top_n[uid].append((iid, est))

  # Re-assigning existing keys while iterating is safe: the dict size does not change.
  for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda rating: rating[1], reverse=True)
    # honour the n argument instead of a hard-coded 10
    top_n[uid] = user_ratings[:n]

  return top_n

In [None]:
# pre-processes data
data = Dataset.load_builtin('ml-1m')
raw_dataset = data.raw_ratings
# Shuffle in place with a dedicated, seeded generator so the rating order is
# the same on every Restart & Run All and the global `random` state is untouched.
random.Random(42).shuffle(raw_dataset)
train_set = data.build_full_trainset()

Dataset ml-1m could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...
Done! Dataset ml-1m has been saved to /root/.surprise_data/ml-1m


In [None]:
# creates and trains algorithm
# random_state fixes the random initialisation of the factors so that
# re-running the notebook trains the same model.
algorithm_svd = SVD(n_epochs=30, reg_all=0.03, random_state=42)
algorithm_svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f06086694d0>

In [None]:
# finds predictions for content that users have not watched
# NOTE(review): the anti-testset is built from the whole training set, i.e. for
# every user, while only user x's predictions are read afterwards. On ml-1m this
# is presumably very large in time and memory - consider restricting it to user x.
test_set = train_set.build_anti_testset()
predictions = algorithm_svd.test(test_set)

In [None]:
# finds top 10 recommendations for user x
collaborative_top_recommendations = top_n_recommendations(predictions)
# Surprise keeps the raw ids of datasets read from a file (such as the built-in
# ml-1m) as strings, so the integer x has to be converted before the lookup.
# Indexing the defaultdict with the int would silently return an empty list.
# Result: list of (raw item id, estimated rating) tuples, best first.
user_x_collaborative_recommendations = collaborative_top_recommendations.get(str(x), [])

## Getting Top 10 recommendations for user x using content filtering

In [None]:
# imports libraries
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
# reads movies_1m.csv (no header row): column 0 = movie ids, column 1 = titles, column 2 = genres
movies_dataset = pd.read_csv('movies_1m.csv', header=None, encoding='latin-1')
movie_ids_array, movie_titles_array, movie_genres = (
    movies_dataset.iloc[:, position].values for position in range(3)
)
# plain-list copies of the id and title columns
movie_ids_list, movie_titles_list = movie_ids_array.tolist(), movie_titles_array.tolist()

In [None]:
# every genre label that can occur in the genres column; the order fixes the encoding positions
possible_genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

In [None]:
# one-hot encodes each movie's '|'-separated genre string against possible_genres
# (1 where the genre is present, 0 otherwise), one list per movie
movie_genres_list = [
    [int(genre in labels) for genre in possible_genres]
    for labels in (entry.split('|') for entry in movie_genres)
]

In [None]:
# loads data from genome_scores_1m.csv (movie ids duplicated to correspond with tag relevances, tag ids duplicated, and tag relevances)
tags_dataset = pd.read_csv('genome_scores_1m.csv', header=None)
# NOTE(review): iloc[k, :] selects ROW k across all columns. That is only correct
# if the CSV stores movie ids, tag ids and relevances as three rows; if they are
# three columns this should be iloc[:, k] - confirm against the file.
movie_ids_from_tags = tags_dataset.iloc[0, :].values
tags_ids = tags_dataset.iloc[1, :].values
tags_relevance = tags_dataset.iloc[2, :].values

In [None]:
# Puts tags' relevances for each movie into list with items of format [relevance_1, relevance_2, ...]
tags_ids_list = tags_ids.tolist()
movie_ids_from_tags_list = movie_ids_from_tags.tolist()
tags_relevance_list = tags_relevance.tolist()
individual_movie_relevances = []
# NOTE(review): as written, the inner scan restarts at j = 0 for every i and stops
# at the first entry whose movie id differs from i. Consequences:
#   * only an i equal to the movie id of the FIRST entry can collect relevances;
#     every other i appends an empty list straight away;
#   * if every entry matches i, the else branch never runs and nothing is appended;
#   * i runs over 0..3951 and is compared to the movie id directly, whereas later
#     cells index this list by row position / movie_id - 1.
# This looks unintended (grouping by movie id was presumably the goal), but later
# cells rely on the vector lengths this produces, so fix both together.
for i in range(3952):
  movie_i_relevances = []
  for j in range(len(tags_relevance_list)):
    if i == movie_ids_from_tags_list[j]:
      movie_i_relevances.append(tags_relevance_list[j])
    else:
      individual_movie_relevances.append(movie_i_relevances)
      break

In [None]:
# reads ratings.csv (first row is the header): column 0 = user ids, column 1 = movie ids, column 2 = ratings
ratings_dataset = pd.read_csv('ratings.csv')
user_id_array, movies_watched_array, ratings_array = (
    ratings_dataset.iloc[:, position].values for position in range(3)
)

In [None]:
# plain-list copies of the three ratings_dataset arrays
user_id_list, movies_watched_list, ratings_list = (
    column.tolist() for column in (user_id_array, movies_watched_array, ratings_array)
)

In [None]:
# builds a mapping user_id -> [movies that user rated 3 or higher]
all_liked_movies = defaultdict(list)
# the three lists come from the same rows, so they are walked in lockstep
for rater_id, watched_id, score in zip(user_id_list, movies_watched_list, ratings_list):
  if score >= 3:
    all_liked_movies[rater_id].append(watched_id)

In [None]:
# creates list of movies that user x liked
# (all_liked_movies is a defaultdict, so a user with no liked movies yields an empty list)
x_liked_movies = all_liked_movies[x]

In [None]:
# master list with one feature vector per movie:
# the movie's encoded genre flags followed by its tag relevance values
movie_comparison_points = [
    genre_flags + individual_movie_relevances[position]
    for position, genre_flags in enumerate(movie_genres_list)
]

In [None]:
# creates list of all movies that would be recommended for each movie liked by user x
def _most_similar_movie_ids(chosen_index, feature_vectors, feature_norms, candidate_ids, k=5):
  """Return the ids of up to k movies most cosine-similar to the movie at chosen_index.

  Args:
    chosen_index: position of the reference movie in feature_vectors.
    feature_vectors: list of 1-D numpy arrays, one per movie.
    feature_norms: Euclidean norm of each entry of feature_vectors.
    candidate_ids: movie id for each position of feature_vectors.
    k: maximum number of ids to return.

  Returns:
    List of movie ids ordered from most to least similar. Skipped candidates:
    the reference movie itself, zero vectors, vectors whose length differs
    from the reference vector (they cannot be compared), and candidates whose
    similarity is NaN or not positive. Fewer than k ids may be returned.
  """
  chosen_vector = feature_vectors[chosen_index]
  chosen_norm = feature_norms[chosen_index]
  if chosen_norm == 0:
    return []

  scored = []
  for index, vector in enumerate(feature_vectors):
    if index == chosen_index or feature_norms[index] == 0 or vector.shape != chosen_vector.shape:
      continue
    similarity = np.dot(chosen_vector, vector) / (chosen_norm * feature_norms[index])
    if pd.isna(similarity) or similarity <= 0:
      continue
    scored.append((similarity, candidate_ids[index]))

  # A real sort replaces the manual five-slot update, which overwrote a slot
  # instead of shifting the lower entries down and so lost earlier matches.
  scored.sort(key=lambda pair: pair[0], reverse=True)
  return [candidate_id for _, candidate_id in scored[:k]]


# Built once, because the feature vectors are the same for every liked movie.
# Candidates use the same feature space as the chosen movie (movie_comparison_points);
# nothing is popped, so movie_genres_list stays intact and aligned with movie_ids_list.
feature_vectors = [np.array(points) for points in movie_comparison_points]
feature_norms = [np.linalg.norm(vector) for vector in feature_vectors]
# position of each movie id, looked up instead of assuming position == movie_id - 1
movie_index_by_id = {known_id: position for position, known_id in enumerate(movie_ids_list)}

long_recommended_movies_for_x = []
for movie_id in x_liked_movies:
  chosen_movie_index = movie_index_by_id.get(movie_id)
  if chosen_movie_index is None or chosen_movie_index >= len(feature_vectors):
    # liked movie is not present in the movies table: nothing to compare against
    continue
  # only real movie ids are added (no 0 placeholders when fewer than five match)
  long_recommended_movies_for_x.extend(
      _most_similar_movie_ids(chosen_movie_index, feature_vectors, feature_norms, movie_ids_list)
  )

In [None]:
# finds the movie ids that appear most on long_recommended_movies_for_x (the ones that are most recommended)
# most_common(10) returns a list of (movie_id, count) tuples, most frequent first - not bare ids
user_x_content_recommendations = Counter(long_recommended_movies_for_x).most_common(10)

## Putting it all together

In [None]:
# finds "top recommended movies" (ones that are recommended both by collaborative and content) and "recommended movies" (ones that appear on exactly one recommendation list)
# Both recommendation lists hold (movie id, score) pairs, so only the ids are
# kept. They are normalised to int because the collaborative ids are raw Surprise
# ids, while the content ids come from the movies table. Without this the two
# sets could never share an element.
collaborative_movies = {int(movie_id) for movie_id, _ in user_x_collaborative_recommendations}
content_movies = {int(movie_id) for movie_id, _ in user_x_content_recommendations}
# 0 is presumably not a real movie id; drop it in case it was carried through
# as an unfilled placeholder from the content-based step.
content_movies.discard(0)
top_recommended_movie_ids = collaborative_movies.intersection(content_movies)
recommended_movie_ids = collaborative_movies.symmetric_difference(content_movies)

In [None]:
# converts movie ids into movie titles (movie_titles_list)
def _titles_for(entries, title_by_id):
  """Return the title for every entry, in iteration order.

  Args:
    entries: iterable of movie ids; an entry may be a bare id (int or numeric
      string) or a tuple whose first element is the id.
    title_by_id: dict mapping int movie id -> title.

  Returns:
    List of titles; an id missing from title_by_id yields a placeholder string
    rather than an IndexError or a wrong title.
  """
  titles = []
  for entry in entries:
    movie_id = int(entry[0] if isinstance(entry, tuple) else entry)
    titles.append(title_by_id.get(movie_id, 'Unknown movie (id ' + str(movie_id) + ')'))
  return titles


# ids and titles come from the same rows of the movies table, so pairing them
# is reliable even if ids are not consecutive (unlike indexing with movie_id - 1)
movie_title_by_id = dict(zip(movie_ids_list, movie_titles_list))
top_recommended_movies = _titles_for(top_recommended_movie_ids, movie_title_by_id)
# recommended_movies was previously appended to without ever being created
recommended_movies = _titles_for(recommended_movie_ids, movie_title_by_id)

In [None]:
# prints results
# the title lists are joined into text first: concatenating a str and a list raises TypeError
print('Top recommended movies: ' + ', '.join(map(str, top_recommended_movies)))
print('Recommended movies: ' + ', '.join(map(str, recommended_movies)))