# Aux class to load dataset

In [None]:
import csv
from surprise import Dataset
from surprise import Reader
from collections import defaultdict

class MovieLens:
    """Loader for the MovieLens "latest small" dataset.

    Reads the ratings file into a Surprise ``Dataset`` and builds lookup
    tables between movie ids and movie titles from the movies file.

    Attributes:
        ratings_path: Path of the ratings CSV, relative to the working directory.
        movies_path: Path of the movies CSV, relative to the working directory.
        movieID_to_name: Maps an integer movie id to its title.
        name_to_movieID: Maps a movie title to its integer movie id.
    """

    ratings_path = 'ml-latest-small/ratings.csv'
    movies_path = 'ml-latest-small/movies.csv'

    def __init__(self):
        # Lookup tables live on the instance: mutable class-level dicts would
        # be shared (and silently mutated) across every MovieLens object.
        self.movieID_to_name = {}
        self.name_to_movieID = {}

    def load_movie_lens_latest_small(self):
        """Load the ratings and (re)build the movie id <-> title lookups.

        Returns:
            The Surprise ``Dataset`` parsed from ``ratings_path``.

        Raises:
            OSError: If ``movies_path`` cannot be opened.
            ValueError: If a movie id in ``movies_path`` is not an integer.
        """
        # Start from empty tables so a second call does not keep stale entries.
        self.movieID_to_name = {}
        self.name_to_movieID = {}

        # Describe the ratings file layout for Surprise: comma-separated
        # columns in the order user, item, rating, timestamp, with one header
        # line to skip.
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        # Load ratings dataset
        ratings_dataset = Dataset.load_from_file(self.ratings_path, reader=reader)

        # Create mappings from movie name to movie id and vice versa.
        # The file is decoded as UTF-8, the same encoding pandas uses by
        # default when this notebook reads movies.csv; decoding it as
        # ISO-8859-1 would garble titles that contain non-ASCII characters.
        with open(self.movies_path, newline='', encoding='utf-8') as csv_file:
            movie_reader = csv.reader(csv_file)
            next(movie_reader)  # Skip header line
            for row in movie_reader:
                movie_id = int(row[0])
                movie_name = row[1]
                self.movieID_to_name[movie_id] = movie_name
                # If two movies share a title, the last one read wins.
                self.name_to_movieID[movie_name] = movie_id

        return ratings_dataset

    def get_movie_name(self, movieID):
        """Return the title for ``movieID``, or ``""`` when the id is unknown.

        Args:
            movieID: Movie id as an ``int`` (the lookup keys are integers).
        """
        return self.movieID_to_name.get(movieID, "")


# Exploring the dataset
Let's find a user that has a similar taste to ours.
1. Find a user that watched some movies that we also like.
2. Filter these movies by rating 4 or 5.
3. Check again if we like these movies.
4. Generate recommendations for that similar user and check whether they would also be good recommendations for us.


In [None]:
import pandas as pd

# Read every rating in the dataset into a DataFrame
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Raw id of the user we are considering as a stand-in for our own taste
potential_test_subject = 2

# Keep only this user's ratings of 4 or higher
ratings_user_me = ratings.loc[
    (ratings['userId'] == potential_test_subject) & (ratings['rating'] >= 4)
]
print('shape:', ratings_user_me.shape)
ratings_user_me


In [None]:
# Load the movies dataset (provides the 'movieId' and 'title' columns used below)
movies = pd.read_csv('ml-latest-small/movies.csv')

print('Ids of selected movies:', ratings_user_me['movieId'].values)

# Collect the titles of the selected movies, in the order they appear in
# `movies`. A vectorised `isin` replaces a row-by-row `iterrows()` loop that
# re-scanned the id array for every movie.
names_of_movies_user_me = movies.loc[
    movies['movieId'].isin(ratings_user_me['movieId']), 'title'
].tolist()

print('Amount of movie names:', len(names_of_movies_user_me))
names_of_movies_user_me

# Loading dataset
In this step we load the ratings and build the training set of user–item (movie) interactions.

In [None]:
# How many movie recommendations to produce for the user
number_of_recommendations = 10

# Raw id of the user we want recommendations for.
# It is a string here, unlike the integer id used in the pandas exploration above.
test_subject = '2'

# Helper that reads the MovieLens files and keeps the movie id <-> title lookups
ml = MovieLens()

# Parse the ratings file into a Surprise dataset
data = ml.load_movie_lens_latest_small()

# Train on every available rating (no hold-out split)
train_set = data.build_full_trainset()
print("Dataset built")

# Computing the cosine similarity matrix

In [None]:
from surprise import KNNBasic

# Similarity configuration:
#   name='cosine'    -> cosine similarity ('MSD' and 'pearson' are alternatives)
#   user_based=True  -> compare users with users (user-based collaborative
#                       filtering); False would compare items with items.
sim_options = dict(name='cosine', user_based=True)

# KNNBasic is a basic collaborative filtering algorithm from the Surprise library
model = KNNBasic(sim_options=sim_options)

# Attach the training data to the model
model.fit(train_set)

# Pairwise similarity matrix; with user_based=True, entry [u][v] is the
# similarity between the users with inner ids u and v.
# NOTE(review): fit() presumably already computes this matrix, in which case
# this call repeats the work — confirm against the Surprise docs before reusing it.
sim_matrix = model.compute_similarities()

# Finding the most similar users to our test subject user

In [None]:
import heapq

# Surprise works with its own "inner" ids; translate the raw user id first.
test_user_inner_id = train_set.to_inner_uid(test_subject)

# Row of the similarity matrix holding the test user's similarity to every user
similarity_row = sim_matrix[test_user_inner_id]

# Pair every other user's inner id with its similarity score, leaving out the
# test user (who would otherwise be their own closest match).
similar_users = [
    (other_inner_id, similarity)
    for other_inner_id, similarity in enumerate(similarity_row)
    if other_inner_id != test_user_inner_id
]

# Keep only the N most similar users, N being `number_of_recommendations`
k_neighbors = heapq.nlargest(
    number_of_recommendations,
    similar_users,
    key=lambda pair: pair[1],
)

# Extract a list of candidate items

In [None]:
# Candidate items for recommendation, keyed by item inner id, each holding a
# cumulative similarity-weighted score (missing keys start at 0.0)
candidates = defaultdict(float)

# Iterate over the k nearest neighbors of the test user
for neighbor_inner_id, user_similarity in k_neighbors:
    # train_set.ur maps a user inner id to that user's (item inner id, rating) pairs.
    # The loop variables are deliberately not called `ratings`, so the
    # `ratings` DataFrame loaded earlier in the notebook is not overwritten.
    for item_inner_id, rating in train_set.ur[neighbor_inner_id]:
        # Add the neighbor's rating, weighted by how similar they are to the test user.
        # The rating is normalized by dividing by 5.0 (assuming the top of the rating scale is 5)
        candidates[item_inner_id] += (rating / 5.0) * user_similarity

# Filtering step
In this case, we discard movies that were already watched by the user.

In [None]:
# Build a dictionary of the items the test user has already rated, keyed by
# item inner id and marked as seen (1).
# A comprehension keeps the loop variables local, so the `ratings` DataFrame
# loaded earlier in the notebook is not overwritten.
watched = {
    item_inner_id: 1
    for item_inner_id, _rating in train_set.ur[test_user_inner_id]
}

# Ranking step
Sort the recommendation candidates

In [None]:
from operator import itemgetter

# Print the best-scoring candidates that the test user has not watched yet.
pos = 0  # number of recommendations printed so far
# Walk the candidates from highest to lowest cumulative score
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    # Stop as soon as the requested number of recommendations has been printed.
    # Checking before printing gives exactly `number_of_recommendations` lines
    # (a post-increment `pos > 10` test let an 11th movie through).
    if pos >= number_of_recommendations:
        break
    # Skip movies the user has already seen
    if itemID in watched:
        continue
    # Convert the inner id of the item back to the raw id used in the dataset files
    movie_id = train_set.to_raw_iid(itemID)
    # Print the name of the movie and its cumulative score
    print(ml.get_movie_name(int(movie_id)), ratingSum)
    pos += 1