In [1]:
import numpy as np
import pandas as pd

In [2]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

In [3]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
num_users = ratings_df['userId'].nunique()
num_items = ratings_df['movieId'].nunique()

In [5]:
user_ids = sorted(ratings_df['userId'].unique())
movie_ids = sorted(ratings_df['movieId'].unique())

In [6]:
user_map = {user_id: index for index, user_id in enumerate(user_ids)}
movie_map = {movie_id: index for index, movie_id in enumerate(movie_ids)}

In [7]:
R = np.zeros((num_users, num_items))
for _, row in ratings_df.iterrows():
    i, j = user_map[row['userId']], movie_map[row['movieId']]
    R[i, j] = row['rating']

In [8]:
class MatrixFactorization:
    def __init__(self, R, K, alpha, beta, iterations):
        
        # The input user-item interaction matrix
        self.R = R
        # The dimensions of the interaction matrix
        self.num_users, self.num_items = R.shape
        # The number of latent factors
        self.K = K
        # The learning rate for stochastic gradient descent
        self.alpha = alpha
        # The regularization parameter
        self.beta = beta
        # The number of iterations for the optimization
        self.iterations = iterations

    def train(self):
        # Initialize the user and item factor matrices with random normal values
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the user bias vector
        self.b_u = np.zeros(self.num_users)
        # Initialize the item bias vector
        self.b_i = np.zeros(self.num_items)
        # Compute the global bias
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # Create a list of samples with non-zero entries in the interaction matrix
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        # Run stochastic gradient descent for the specified number of iterations
        for _ in range(self.iterations):
            # Randomly shuffle the samples
            np.random.shuffle(self.samples)
            self.sgd()

        return self.P,self.Q
    
    def sgd(self):
        # Update the biases and factors for each sample using stochastic gradient descent
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            error = r - prediction
            #print(i,j,r)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (error - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        # Compute the predicted rating for user i and item j
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)

    def full_matrix(self):
        # Compute the full matrix of predicted ratings
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
    def cosine_similarity(self,u, v):
        # Compute the dot product between the two vectors
        dot_product = np.dot(u, v)
        # Compute the L2-norm of each vector
        norm_u = np.linalg.norm(u)
        norm_v = np.linalg.norm(v)
        # Compute the cosine similarity
        cosine_sim = dot_product / (norm_u * norm_v)
        return cosine_sim


    def recommend_movies(self, movie_title, titles, titles_inv, movie_map, item_factors, top_n=10):
        if movie_title not in titles:
            print(f"{movie_title} not found in the dataset.")
            return None
        # Get the index of the movie in the item factors matrix
        matched_index = titles[movie_title]
        # Calculate the cosine similarity between the movie and all other movies
        similarity = np.array([self.cosine_similarity(item_factors[matched_index], item_factors[i]) for i in range(len(item_factors))])
        # Get the indices of the top_n most similar movies, excluding the movie itself
        top_indices = np.argsort(similarity)[-top_n-1:-1][::-1]
        # Map the indices back to movie titles using the titles_inv dictionary
        recommended_movies = [(similarity[idx], titles_inv[idx]) for idx in top_indices]
        return recommended_movies

In [9]:
mf = MatrixFactorization(R, K=20, alpha=0.01, beta=0.1, iterations=500)
P,Q = mf.train()
predictions=mf.full_matrix()
print("Predicted ratings:\n", predictions)

Predicted ratings:
 [[4.48391122 4.04404174 3.91814034 ... 4.10115401 4.12607125 4.43846034]
 [3.92673642 3.43896772 3.82671967 ... 3.50496807 3.49030529 3.84859282]
 [2.46162867 2.43985346 1.56232356 ... 2.15576088 2.11369958 2.51314095]
 ...
 [3.23100904 2.96727804 2.51719639 ... 3.24391457 3.28548367 3.55326045]
 [3.49547222 3.2149266  3.0219905  ... 3.19573297 3.19654437 3.55387188]
 [4.25585909 3.50594819 3.51275385 ... 3.6735029  3.67097726 4.14510355]]


In [10]:
def load_movie_titles(movie_map):
    # Load the movies dataframe
    movies_df = pd.read_csv('ml-latest-small/movies.csv')
    # Create a mapping from movie titles to indices and vice versa
    titles = {}
    titles_inv = {}
    for _, row in movies_df.iterrows():
        movie_id, title = row['movieId'], row['title']
        if movie_id in movie_map:
            index = movie_map[movie_id]
            titles[title] = index
            titles_inv[index] = title
    return titles, titles_inv

def print_top10(recommended_movies):
    if recommended_movies is not None:
        print(f"Top recommendations for {movie_title}:")
        for i, (similarity, title) in enumerate(recommended_movies, start=1):
            print(f"{i}. {title} (similarity: {similarity:.4f})")
    else:
        print(f"No recommendations found for '{movie_title}'.")


In [11]:
# Create a mapping from movie titles to their corresponding movie vectors in the item factor matrix
movie_map = {movie_id: index for index, movie_id in enumerate(movie_ids)}

# Load movie titles
titles, titles_inv = load_movie_titles(movie_map)

# Get movie recommendations for a given movie title
item_factors = mf.Q

In [12]:
# All movie titles
#print(titles)

In [13]:
# Choose a movie Title here. Format: 'Title ({release date})'
movie_title = 'The Lego Movie (2014)'
print(movie_title in titles)
recommended_movies = mf.recommend_movies(movie_title, titles, titles_inv, movie_map, item_factors, top_n=10)
print_top10(recommended_movies)

True
Top recommendations for The Lego Movie (2014):
1. The Night Is Short, Walk on Girl (2017) (similarity: 0.7197)
2. Logan (2017) (similarity: 0.7069)
3. Hell in the Pacific (1968) (similarity: 0.6925)
4. Dead Alive (Braindead) (1992) (similarity: 0.6755)
5. Mother (1996) (similarity: 0.6723)
6. Silver Spoon (2014) (similarity: 0.6687)
7. Fight Club (1999) (similarity: 0.6638)
8. Dudley Do-Right (1999) (similarity: 0.6446)
9. Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976) (similarity: 0.6435)
10. Memento (2000) (similarity: 0.6393)


# GUI to input a movie title for top 10 recommendations

In [None]:
import tkinter as tk
from tkinter import ttk
from ttkthemes import ThemedTk

def on_recommend_button_click():
    movie_title = entry.get()
    recommended_movies = mf.recommend_movies(movie_title, titles, titles_inv, movie_map, item_factors, top_n=10)
    result_text.delete(1.0, tk.END)
    if recommended_movies is not None:
        result_text.insert(tk.END, f"Top recommendations for {movie_title}:\n")
        for i, (similarity, title) in enumerate(recommended_movies, start=1):
            result_text.insert(tk.END, f"{i}. {title} (similarity: {similarity:.4f})\n")
    else:
        result_text.insert(tk.END, f"No recommendations found for '{movie_title}'.\n")

root = ThemedTk(theme="arc")
root.title("Movie Recommendation System")

mainframe = ttk.Frame(root, padding="30 30 30 30")
mainframe.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
root.columnconfigure(0, weight=1)
root.rowconfigure(0, weight=1)

label = ttk.Label(mainframe, text="Enter a movie title:")
label.grid(row=0, column=0, sticky=tk.W, pady=10)

entry = ttk.Entry(mainframe, width=50)
entry.grid(row=1, column=0, sticky=(tk.W, tk.E), pady=10)

button = ttk.Button(mainframe, text="Recommend", command=on_recommend_button_click)
button.grid(row=2, column=0, sticky=tk.W, pady=10)

result_text = tk.Text(mainframe, wrap=tk.WORD, width=50, height=12, relief=tk.SUNKEN, borderwidth=1)
result_text.grid(row=3, column=0, sticky=(tk.W, tk.E), pady=10)

entry.focus()
root.mainloop()