In [82]:
import pandas as pd
import numpy as np
import os
import time
import re

from scipy.sparse import load_npz

In [83]:
user_movie_matrix = pd.read_csv(os.path.join("..","output","collbarfilter.csv"))
user_similarity = pd.read_csv(os.path.join("..","output","UtU_similarityMatrix.csv")).iloc[:,1:]
movies = pd.read_csv(os.path.join("..","data","movies.csv"))
ratings = pd.read_csv(os.path.join("..","data","ratings.csv"))

In [84]:
userids = ratings["userId"].unique()
user_similarity.columns = userids
user_similarity.index = userids
user_similarity

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,0.000000,0.000000,0.0,0.932810,0.970770,0.939243,0.951037,0.978123,0.000000,0.000000,...,0.992413,0.958830,0.927490,0.954033,0.937549,0.967379,0.958247,0.956573,0.000000,0.976881
2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.981042,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.990501
3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.932810,0.000000,0.0,0.000000,0.901137,0.899828,0.945977,0.000000,0.000000,0.000000,...,0.879535,0.918902,0.912397,0.000000,0.952565,0.933792,0.913567,0.867322,0.000000,0.921235
5,0.970770,0.000000,0.0,0.901137,0.000000,0.932712,0.000000,0.943665,0.000000,0.000000,...,0.000000,0.939417,0.933872,0.954653,0.930918,0.960953,0.954314,0.928047,0.979824,0.960010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.967379,0.000000,0.0,0.933792,0.960953,0.947985,0.931303,0.967077,0.977616,0.902706,...,0.989826,0.946998,0.951495,0.980066,0.966624,0.000000,0.952440,0.950909,0.985496,0.975492
607,0.958247,0.000000,0.0,0.913567,0.954314,0.961672,0.972062,0.963498,0.000000,0.000000,...,0.994381,0.955249,0.923803,0.958146,0.932317,0.952440,0.000000,0.952135,0.979129,0.959225
608,0.956573,0.000000,0.0,0.867322,0.928047,0.911192,0.922770,0.959250,0.939601,0.852762,...,0.979492,0.946875,0.915620,0.926622,0.929948,0.950909,0.952135,0.000000,0.972303,0.957767
609,0.000000,0.000000,0.0,0.000000,0.979824,0.965591,0.000000,0.967007,0.000000,0.000000,...,0.000000,0.968857,0.945054,0.987983,0.000000,0.985496,0.979129,0.972303,0.000000,0.963104


In [85]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","", title)

In [86]:
def personRating(user1, user2):
    return user_similarity.loc[user1, user2]

def movieSimilarity(movie_id, user_id):    
    usersSeen= ratings.loc[ratings["movieId"] == movie_id, ["userId", "rating"]]
    
    if usersSeen.shape[0] < 20:
        return
    
    if usersSeen["userId"].isin([user_id]).any():
        print(f"User {user_id} has already seen movie {movie_id}")
        return
    
    similarityScores = []
    
    for user in usersSeen["userId"]:
        similarityScores.append(personRating(user_id, user))
    
    total_similarity = np.sum(similarityScores)
    
    predicted_score = np.dot(usersSeen["rating"], similarityScores)/total_similarity
    
    return predicted_score

In [87]:
class recommendation():
    def __init__(self, SimilarityMatrix, ratings, movies) -> None:
        self.SimilarityMatrix = SimilarityMatrix
        self.ratings = ratings
        self.user = None
        self.movie_id = None
        self.movie_title = None
        self.relevantUsers = None
    
    def run(self, user_id):
        self.user = user_id
        movie_recs = {}
        
        unseen_movies = set(self.ratings["userId"].unique()) - set(self.ratings[ratings["userId"] == self.user]["movieId"])
        for movie in unseen_movies:
            score = movieSimilarity(movie, self.user)
            if score is None or score < 4:
                continue
            movie_recs[movie] = score
        
        movie_recs_pd = pd.DataFrame.from_dict(movie_recs, orient = "index", columns = ["predictedRating"])
        movie_recs_pd = movie_recs_pd.reset_index().rename(columns={'index': 'movieId'})
        movie_recs_pd['movieId'] = movie_recs_pd['movieId'].astype(movies['movieId'].dtype)

        merged_df = pd.merge(movies, movie_recs_pd, on='movieId', how='right')

        return merged_df.sort_values("predictedRating", ascending=False)

In [88]:
recs = recommendation(SimilarityMatrix= user_similarity, ratings = ratings, movies = movies)
movie_recs = recs.run(user_id=1)

In [89]:
movie_recs

Unnamed: 0,movieId,title,genres,predictedRating
8,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.440455
4,246,Hoop Dreams (1994),Documentary,4.309346
9,475,In the Name of the Father (1993),Drama,4.298309
2,111,Taxi Driver (1976),Crime|Drama|Thriller,4.117272
10,541,Blade Runner (1982),Action|Sci-Fi|Thriller,4.084966
3,215,Before Sunrise (1995),Drama|Romance,4.043885
0,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,4.03301
7,308,Three Colors: White (Trzy kolory: Bialy) (1994),Comedy|Drama,4.027271
1,58,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance,4.019925
6,307,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,4.018938
