In [1]:
import pandas as pd

base_path = './ml-100k/'

# 1. Load ratings (u.data: tab-separated userId | movieId | rating | timestamp)
ratings_cols = ['userId','movieId','rating','timestamp']
ratings = pd.read_csv(
    f'{base_path}u.data',
    sep='\t',
    names=ratings_cols,
    engine='python'
)

# 2. Load movies (u.item: pipe-separated; first 5 cols + 19 genre flags)
item_cols = ['movieId','title','release_date','video_release_date','IMDb_URL']
genre_cols = [
    'unknown','Action','Adventure','Animation',"Children's",'Comedy','Crime',
    'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
    'Romance','Sci-Fi','Thriller','War','Western'
]
movies = pd.read_csv(
    f'{base_path}u.item',
    sep='|',
    names=item_cols + genre_cols,
    encoding='latin-1',
    engine='python'
)

# 3. Quick sanity-check
print(ratings.head())
print(movies.head())


   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
   movieId              title release_date  video_release_date  \
0        1   Toy Story (1995)  01-Jan-1995                 NaN   
1        2   GoldenEye (1995)  01-Jan-1995                 NaN   
2        3  Four Rooms (1995)  01-Jan-1995                 NaN   
3        4  Get Shorty (1995)  01-Jan-1995                 NaN   
4        5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://

## Model Implementation

In [2]:
# mean rating of movies dataframe, converted to integer as to be a categorical feature
mean_ratings = ratings.groupby(ratings.movieId).mean()[['rating']].astype(int)

# merge movies dataframe on mean_ratings dataframe by movieId
# final_df contains genres as features (True or False) and Ratings as a feature (1-5)
final_df = pd.merge(mean_ratings, movies.set_index('movieId'), left_index= True, right_index=True)

def p_X_given_y(watched_df, row): #returns P(x_1 | y), P(x_2 | y), P(x_3 | y)...
    row = row.drop(['title', 'release_date', 'video_release_date', 'IMDb_URL']) #drop non-features
    matches = (watched_df[row.index] == row.values).sum() #count(X, y)
    return (1 + matches) / (1 + len(watched_df))  #P(X | y) = count(X, y) / count(y), with laplace smoothing


def p_y_given_X(row, watched_df, final_df): #P(y | X) ∝ P(Y) * ∏ P(X | y)
    p_y = len(watched_df) / len(final_df) #P(Y) = count(Y) / count(All)
    p_X_given_y_vector = p_X_given_y(watched_df, row) #P(x_1 | y), P(x_2 | y), P(x_3 | y)...
    return p_y * p_X_given_y_vector.prod() #P(y | X) ∝ P(Y) * ∏ P(x_i | y)


# compute P(y|X) for all unwatched movies
# accepts as input the whole dataframe of all movies(final_df), and the dataframe of all watched movies by the user(watched_df)
def naive_bayes(final_df, watched_df):
    unwatched_df = pd.concat([final_df, watched_df, watched_df]).drop_duplicates(keep=False) #unwatched_movies df
    unwatched_df['p_y_given_X'] = unwatched_df.apply(p_y_given_X, axis = 1, args = (watched_df, final_df)) # P(y | X) for all unwatched movies
    return unwatched_df.sort_values(by = 'p_y_given_X', ascending = False)['title'].head(10) # returns 10 movies with highest probability


naive_bayes(final_df, final_df.sample(frac=0.2))  # Randomly select 20% of the DataFrame as watched

movieId
1682    Scream of Stone (Schrei aus Stein) (1991)
909                       Dangerous Beauty (1998)
961                                Orlando (1993)
958                       To Live (Huozhe) (1994)
956                          Nobody's Fool (1994)
310                         Rainmaker, The (1997)
1381                          Losing Chase (1996)
317              In the Name of the Father (1993)
942          What's Love Got to Do with It (1993)
1558                             Aparajito (1956)
Name: title, dtype: object

In [3]:
merged_df = ratings.merge(movies, on="movieId")
merged_df.sort_values(by = 'userId')#[merged_df['userId'] == 1]
#merged_df[merged_df['userId'] == 1]
hidden_df = merged_df[merged_df['userId'] == 1].sample(frac=0.2)
watched_df = merged_df[merged_df['userId'] == 1].drop(hidden_df.index)
result_df = naive_bayes(final_df, watched_df)


In [4]:
# import random
# #print(random.randint(3, 9))
# def validator(result_df, hidden_df):
#   value = 0
#   for i in result_df.index:
#     if i in hidden_df.index:
#       value+=1
#   percentage = value/len(result_df)
#   return percentage

# for i in range(10):
#   ID = random.randint(1, 943)
#   hidden_df = merged_df[merged_df['userId'] == ID].sample(frac=0.2)
#   watched_df = merged_df[merged_df['userId'] == ID].drop(hidden_df.index)
#   result_df = naive_bayes(final_df, watched_df)
#   print(validator(result_df, hidden_df))


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [5]:
def p_X_given_y(row, good_reviews): #returns P(x_1 | y), P(x_2 | y), P(x_3 | y)...
    matches = (good_reviews.values == row.values).sum(axis=0)
    return (1 + matches) / (1 + len(good_reviews))  #P(X | y) = count(X, y) / count(y), with laplace smoothing

def p_y_given_X(row, movies_final, ratings_final): #P(y | X) ∝ P(Y) * ∏ P(X | y)
    p_y = ratings_final['rating'].mean() #P(Y) = count(Y) / count(All)
    good_review_ids = ratings_final[ratings_final['rating']]['movieId']
    p_X_given_y_vector = p_X_given_y(row, movies_final.loc[good_review_ids]) #P(x_1 | y), P(x_2 | y), P(x_3 | y)...
    return p_y * p_X_given_y_vector.prod() #P(y | X) ∝ P(Y) * ∏ P(x_i | y)

# compute P(y|X) for all unwatched movies
# accepts as input the whole dataframe of all movies(final_df), and the dataframe of all watched movies by the user(watched_df)
def naive_bayes(movies, ratings):
    movies_final = movies.drop(['title', 'release_date', 'video_release_date', 'IMDb_URL'], axis = 1).set_index('movieId')
    ratings_final = ratings.reset_index()[['movieId', 'rating']]
    ratings_final['rating'] = ratings_final['rating'].isin([4,5])
    


    unwatched_df = movies_final[~movies_final.index.isin(ratings_final['movieId'])].copy()
    unwatched_df['p_y_given_X'] = unwatched_df.apply(p_y_given_X, axis = 1, args = (movies_final, ratings_final))
    return unwatched_df.sort_values(by = 'p_y_given_X', ascending = False)[['p_y_given_X']]
    
naive_bayes(movies, ratings[ratings['userId'] == 1])  # Randomly select 20% of the DataFrame as watched

Unnamed: 0_level_0,p_y_given_X
movieId,Unnamed: 1_level_1
1682,4.878166e-02
1005,4.878166e-02
1564,4.878166e-02
985,4.878166e-02
981,4.878166e-02
...,...
993,1.783629e-07
426,1.502854e-07
560,7.570921e-08
820,4.939761e-08
