In [98]:
import pandas as pd
import numpy as np
import base64
import requests
import datetime

In [99]:
# Load the raw ratings file; only the three columns below are used downstream.
data = pd.read_csv('test_book.csv')

# Upper bound on the number of rating rows kept for the rest of the notebook.
MAX_ROWS = 1000

# Take an explicit copy so `df` is an independent frame rather than a slice of `data`.
df = data.loc[:, ['userId', 'movieId', 'rating']].iloc[:MAX_ROWS].copy()

df.head(10)

Unnamed: 0,userId,movieId,rating
0,1,2,1
1,1,3,2
2,1,4,2
3,1,5,5
4,1,7,4
5,1,8,3
6,1,9,5
7,2,1,1
8,2,2,5
9,2,3,3


In [100]:
# Preview: for every movie, the list of users that rated it.
(
    df.groupby('movieId')['userId']
    .agg(list)
    .reset_index()
)

Unnamed: 0,movieId,userId
0,1,"[2, 3, 4, 5]"
1,2,"[1, 2, 3, 4, 5]"
2,3,"[1, 2, 3, 4, 5]"
3,4,"[1, 4, 5]"
4,5,"[1, 2, 3, 5]"
5,6,"[2, 3, 4, 5]"
6,7,"[1, 2, 3, 4, 5]"
7,8,"[1, 2, 3, 4, 5]"
8,9,[1]


In [101]:
# One row per user; the `movieId` column holds the list of movies that user rated.
user_rated_movies_df = (
    df.groupby('userId')['movieId']
    .agg(list)
    .reset_index()
)
user_rated_movies_df



Unnamed: 0,userId,movieId
0,1,"[2, 3, 4, 5, 7, 8, 9]"
1,2,"[1, 2, 3, 5, 6, 7, 8]"
2,3,"[1, 2, 3, 5, 6, 7, 8]"
3,4,"[1, 2, 3, 4, 6, 7, 8]"
4,5,"[1, 2, 3, 4, 5, 6, 7, 8]"


In [102]:
# Nested lookup {movieId: {'userId': [users who rated that movie]}}.
users_that_rated_movie = (
    df.groupby('movieId')['userId']
    .agg(list)
    .to_frame()      # movieId stays as the index, 'userId' is the only column
    .T
    .to_dict()
)
users_that_rated_movie

{1: {'userId': [2, 3, 4, 5]},
 2: {'userId': [1, 2, 3, 4, 5]},
 3: {'userId': [1, 2, 3, 4, 5]},
 4: {'userId': [1, 4, 5]},
 5: {'userId': [1, 2, 3, 5]},
 6: {'userId': [2, 3, 4, 5]},
 7: {'userId': [1, 2, 3, 4, 5]},
 8: {'userId': [1, 2, 3, 4, 5]},
 9: {'userId': [1]}}

In [103]:
# MATRIX FOR RATING OF EACH USER AND MOVIE PAIR

# `matrix`: users as the index, movies as the columns, missing ratings left as real NaN.
matrix = df.pivot(index='userId', columns='movieId')
matrix.columns = matrix.columns.droplevel(0)

# `newf`: same table with `userId` as a regular column and missing ratings
# replaced by the string sentinel 'NaN' (compute_likelihoood compares against it).
newf = matrix.reset_index().fillna('NaN')

newf

movieId,userId,1,2,3,4,5,6,7,8,9
0,1,,1.0,2.0,2.0,5.0,,4.0,3.0,5.0
1,2,1.0,5.0,3.0,,2.0,3.0,4.0,3.0,
2,3,1.0,1.0,2.0,,2.0,4.0,4.0,5.0,
3,4,3.0,2.0,2.0,3.0,,1.0,3.0,2.0,
4,5,5.0,1.0,5.0,5.0,4.0,4.0,5.0,2.0,


In [104]:
def get_alpha():
    """Return the additive smoothing constant used by the probability estimates.

    compute_prior and compute_likelihoood add this value to their numerators
    and add it once per possible rating to their denominators.
    """
    smoothing_constant = 0.01
    return smoothing_constant

In [105]:
# Interactive inputs. Both values are stored as module-level globals:
# `test_userId` is read by compute_posterior_probs and `unrated_movieId`
# by compute_likelihoood and compute_posterior_probs.
# int() raises ValueError if the entered text is not an integer.
print('Enter user id that wants a recommendation:')
test_userId = int(input())
print('Enter movie id for unrated movie: ')
unrated_movieId = int(input())

Enter user id that wants a recommendation:
Enter movie id for unrated movie: 


In [106]:
# Distinct user ids present in `df`, in ascending order.
all_users = sorted(set(df['userId']))


def compute_prior(list_of_movies, possible_ratings):
    """Estimate the smoothed prior P(r_i = y) for every (movie, rating) pair.

    For movie i and rating y:

        (# users who rated i as y) + alpha
        ---------------------------------------------------
        (# users who rated i) + len(possible_ratings) * alpha

    Reads the module-level `df` (columns `movieId`, `rating`),
    `users_that_rated_movie` and `get_alpha`.

    Parameters:
        list_of_movies: movie ids to compute priors for.
        possible_ratings: rating values to consider for each movie.

    Returns:
        dict mapping (movie, rating) tuples to the prior probability.
    """
    smoothing = get_alpha()

    def _times_rated_as(movie_id, value):
        # Number of rows in `df` where this movie received exactly this rating.
        matches = (df['movieId'] == movie_id) & (df['rating'] == value)
        return int(matches.sum())

    return {
        (movie_id, value): (
            (_times_rated_as(movie_id, value) + smoothing)
            / (len(users_that_rated_movie[movie_id]['userId']) + len(possible_ratings) * smoothing)
        )
        for movie_id in list_of_movies
        for value in possible_ratings
    }

# Distinct movie ids (in set iteration order) and the sorted distinct rating values found in `df`.
list_of_movies = [*set(df['movieId'])]
possible_ratings = sorted(set(df['rating']))

# Smoothed prior for every (movie, rating) combination.
prior_probs = compute_prior(list_of_movies, possible_ratings)
prior_probs

{(1, 1): 0.4962962962962963,
 (1, 2): 0.0024691358024691358,
 (1, 3): 0.24938271604938272,
 (1, 4): 0.0024691358024691358,
 (1, 5): 0.24938271604938272,
 (2, 1): 0.596039603960396,
 (2, 2): 0.2,
 (2, 3): 0.0019801980198019802,
 (2, 4): 0.0019801980198019802,
 (2, 5): 0.2,
 (3, 1): 0.0019801980198019802,
 (3, 2): 0.596039603960396,
 (3, 3): 0.2,
 (3, 4): 0.0019801980198019802,
 (3, 5): 0.2,
 (4, 1): 0.0032786885245901644,
 (4, 2): 0.3311475409836066,
 (4, 3): 0.3311475409836066,
 (4, 4): 0.0032786885245901644,
 (4, 5): 0.3311475409836066,
 (5, 1): 0.0024691358024691358,
 (5, 2): 0.4962962962962963,
 (5, 3): 0.0024691358024691358,
 (5, 4): 0.24938271604938272,
 (5, 5): 0.24938271604938272,
 (6, 1): 0.24938271604938272,
 (6, 2): 0.0024691358024691358,
 (6, 3): 0.24938271604938272,
 (6, 4): 0.4962962962962963,
 (6, 5): 0.0024691358024691358,
 (7, 1): 0.0019801980198019802,
 (7, 2): 0.0019801980198019802,
 (7, 3): 0.2,
 (7, 4): 0.596039603960396,
 (7, 5): 0.2,
 (8, 1): 0.0019801980198019802

In [116]:
from numpy import NaN


def compute_likelihoood(list_of_movies, possible_ratings, list_of_users, target_movie_id=None):
    """Build, per user, a table of smoothed likelihoods P(r_j = k | r_i = y).

    i is the target (unrated) movie and y a candidate rating for it; j is a
    movie the user has rated and k the rating that user gave to j. Each cell is

        (# users with r_j = k and r_i = y) + alpha
        ------------------------------------------------------------------
        (# users who rated j and have r_i = y) + len(possible_ratings) * alpha

    Reads the module-level `user_rated_movies_df`, `matrix`, `newf` (where
    missing ratings are the string 'NaN') and `get_alpha`.

    Parameters:
        list_of_movies: unused; kept so existing callers keep working.
        possible_ratings: candidate rating values y for the target movie.
        list_of_users: user ids to build a likelihood table for.
        target_movie_id: id of the movie to predict. Defaults to the
            module-level `unrated_movieId` when omitted.

    Returns:
        dict mapping each user id to a DataFrame with one row per movie that
        user rated and one column per possible rating.

    Raises:
        KeyError: if the target movie (or a rated movie) is not a column of `newf`.
        IndexError: if a user id has no row in `user_rated_movies_df`.
    """
    if target_movie_id is None:
        # Backward-compatible default: the movie id entered interactively.
        target_movie_id = unrated_movieId

    alpha = get_alpha()
    smoothing_total = len(possible_ratings) * alpha

    # The "target movie rated as y" masks do not depend on the user or on the
    # rated movie, so compute each of them once.
    target_ratings = newf[target_movie_id]
    target_masks = {y: target_ratings == y for y in possible_ratings}

    df_dict = {}
    for user in list_of_users:
        is_user = user_rated_movies_df['userId'] == user
        user_rated_movies = user_rated_movies_df.loc[is_user, 'movieId'].iloc[0]
        print(f"Movies that user {user} rated {user_rated_movies}")
        likelihood_df = pd.DataFrame(columns=possible_ratings)

        # Likelihoods are only computed for movies this user actually rated.
        for rated_movieId in user_rated_movies:
            rated_ratings = newf[rated_movieId]
            user_rating = matrix.loc[user, rated_movieId]
            gave_same_rating = rated_ratings == user_rating
            has_rating = rated_ratings != 'NaN'

            row = []
            for possible_rating in possible_ratings:
                target_mask = target_masks[possible_rating]
                numerator = int((gave_same_rating & target_mask).sum()) + alpha
                denominator = int((has_rating & target_mask).sum()) + smoothing_total
                row.append(numerator / denominator)
            likelihood_df.loc[rated_movieId] = row

        df_dict[user] = likelihood_df

    print(df_dict)

    return df_dict
    

    

# Distinct user ids in set iteration order. Note that this list is not the one
# passed below: the call uses the sorted `all_users` instead.
list_of_users = list(set(df['userId']))

# Likelihood tables for every user, keyed by user id.
df_dict = compute_likelihoood(list_of_movies, possible_ratings, all_users)

Movies that user 1 rated [2, 3, 4, 5, 7, 8, 9]
Movies that user 2 rated [1, 2, 3, 5, 6, 7, 8]
Movies that user 3 rated [1, 2, 3, 5, 6, 7, 8]
Movies that user 4 rated [1, 2, 3, 4, 6, 7, 8]
Movies that user 5 rated [1, 2, 3, 4, 5, 6, 7, 8]
{1:           1    2         3    4         5
2  0.492683  0.2  0.009524  0.2  0.961905
3  0.492683  0.2  0.961905  0.2  0.009524
4  0.200000  0.2  0.009524  0.2  0.009524
5  0.004878  0.2  0.200000  0.2  0.009524
7  0.980488  0.2  0.009524  0.2  0.009524
8  0.492683  0.2  0.009524  0.2  0.009524
9  0.200000  0.2  0.200000  0.2  0.200000, 2:           1    2         3    4         5
1  0.980488  0.2  0.009524  0.2  0.009524
2  0.492683  0.2  0.009524  0.2  0.009524
3  0.492683  0.2  0.009524  0.2  0.009524
5  0.980488  0.2  0.200000  0.2  0.009524
6  0.492683  0.2  0.009524  0.2  0.009524
7  0.980488  0.2  0.009524  0.2  0.009524
8  0.492683  0.2  0.009524  0.2  0.009524, 3:           1    2         3    4         5
1  0.980488  0.2  0.009524  0.2  0.0

In [117]:
def compute_posterior_probs(df_dict, prior_probs, user_id=None, movie_id=None):
    """Combine priors and likelihoods into unnormalised posterior scores.

    For each candidate rating y (a column of the user's likelihood table) the
    score is prior_probs[(movie_id, y)] multiplied by the product of that
    column, i.e. the product of the likelihoods over every movie the user rated.

    Parameters:
        df_dict: dict of user id -> likelihood DataFrame (one column per rating).
        prior_probs: dict of (movie id, rating) -> prior probability.
        user_id: user to score. Defaults to the module-level `test_userId`.
        movie_id: movie to score. Defaults to the module-level `unrated_movieId`.

    Returns:
        Single-row DataFrame (index 0) whose columns are the candidate ratings
        and whose values are the posterior scores. The frame is also printed.

    Raises:
        KeyError: if the user is missing from `df_dict` or a (movie, rating)
            pair is missing from `prior_probs`.
    """
    if user_id is None:
        user_id = test_userId
    if movie_id is None:
        movie_id = unrated_movieId

    # Product of each column (one value per candidate rating).
    likelihood_df = df_dict[user_id]
    column_products = likelihood_df.product()

    # Scale each product by the prior of the matching (movie, rating) class.
    posterior_probs = [
        prior_probs[(movie_id, rating)] * product
        for rating, product in column_products.items()
    ]

    posterior_probs_df = pd.DataFrame([posterior_probs], columns=column_products.index)

    print(posterior_probs_df)

    return posterior_probs_df

# Posterior scores for the selected user / movie, one column per candidate rating.
posterior_probs_df = compute_posterior_probs(df_dict, prior_probs)

          1             2             3             4             5
0  0.000011  3.160494e-08  7.894074e-11  3.160494e-08  3.759083e-12


In [109]:
def arg_max(posterior_probs_df):
    """Return the rating whose posterior score is the highest.

    The rating is read from the column label of the winning score rather than
    derived as "position + 1", so the result stays correct when the rating
    columns are not exactly 1..K (for example when some rating value never
    occurs in the data). Ties resolve to the first such column.

    Parameters:
        posterior_probs_df: DataFrame whose columns are candidate ratings and
            whose values are posterior scores.

    Returns:
        The column label (rating) holding the largest score.

    Raises:
        ValueError: if the frame contains no values.
    """
    if posterior_probs_df.empty:
        raise ValueError("posterior_probs_df must contain at least one probability")

    # Collapse rows first so the lookup is by column label regardless of row count.
    best_score_per_rating = posterior_probs_df.max(axis=0)
    return best_score_per_rating.idxmax()

# Predicted rating for the selected user / movie: the value returned by arg_max
# for the posterior scores.
rating = arg_max(posterior_probs_df)

def recommended(rating):
    """Translate a predicted rating into a short recommendation sentence.

    Ratings of 3 or lower produce the "not enjoy" message, ratings above 3 the
    "definitely watch" message. A value that satisfies neither comparison
    (such as NaN) yields an empty string.
    """
    if rating <= 3:
        return "User will most likely not enjoy this"
    if rating > 3:
        return "User should definitely watch this"
    return ""

# Store the message under its own name. Rebinding `recommended` to the returned
# string would shadow the function and make this cell fail with
# "'str' object is not callable" when it is run a second time.
recommendation = recommended(rating)

print(f'The most likely rating for movie {unrated_movieId} based on users similar to user {test_userId} is a rating of {rating}. {recommendation}')

The most likely rating for movie 1 based on users similar to user 1 is a rating of 1. User will most likely not enjoy this


In [110]:
# Reference links kept as bare string literals; only the last expression of a
# cell is echoed, so just the final URL shows up in the cell output.
'https://www.youtube.com/watch?v=3I8oX3OUL6I'
'https://www.youtube.com/watch?v=lFJbZ6LVxN8'
'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8787761'

'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8787761'