In [574]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
from IPython.display import display

tqdm.pandas()

In [575]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [576]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

In [577]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [578]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
# only for debug
movies = movies[0:100]
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [579]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [580]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(20)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [581]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [582]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [583]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [584]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Pearson correlation function

In [585]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(user1_ratings['rating'])
    mean_user2_ratings = np.mean(user2_ratings['rating'])

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return NaN
        return float('NaN')
    else:
        correlation = numerator / denominator
        return correlation


# User-based prediction function

In [586]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return [float('NaN'), float('NaN')]
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings

    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return [numerator, denominator]

In [587]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # if user already rated the item, return the rating
    existing_rating = ratings.loc[(ratings['user id'] == user_id) & (ratings['item id'] == item_id)]
    if not existing_rating.empty:
        return item_id, movies.at[item_id - 1, 'movie title'], existing_rating['rating'].values[0]
        
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [588]:
def get_predicted_ratings(user_id):
    movies_ratings = movies.progress_apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings.columns = ["movie id", "movie title", "pred_rating"]

    return movies_ratings

In [589]:
# result = get_predicted_ratings(2)
# new_result = result.sort_values(by=['pred_rating'], ascending=False).head(20)
# display(new_result.reset_index(drop=True))

# Group recommender

In [590]:
def average_aggregation(users, users_ratings):
    # create dataframe with all 3 users' predicted ratings
    data = [users_ratings[0]["movie id"], users_ratings[0]["movie title"], users_ratings[0]["pred_rating"], users_ratings[1]["pred_rating"], users_ratings[2]["pred_rating"]]
    headers = ["movie id", "movie title", "user{} rating".format(str(users[0])), "user{} rating".format(str(users[1])), "user{} rating".format(str(users[2]))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)

    # remove ratings under threshold
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[0])} rating"] >= 2]
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[1])} rating"] >= 2]
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[2])} rating"] >= 2]

    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)

    return all_users_ratings

In [591]:
def group_recommender(users_list, movies_num=20):
    users_ratings = []
    for user in users_list:
        user_ratings = get_predicted_ratings(user)
        users_ratings.append(user_ratings)
    
    group_ratings = average_aggregation(users_list, users_ratings)
    
    return group_ratings.sort_values(by=['average'], ascending=False).head(movies_num)

In [592]:
# result = group_recommender([2, 17, 35])
# display(result)

# WNCF Algorithm

In [593]:
def wncf_granularity(all_movies, user_id, question_list, ratings, user_ratings, min_rated_num, closest_peers, movie_num=20):
    reasons = []
    movie_name, desired_position = question_list
    
    if all_movies.loc[all_movies['movie title'] == movie_name].shape[0] == 0:
        reasons.append("The movie is not in the database")
        return reasons

    movie_id = all_movies.loc[all_movies['movie title'] == movie_name]['movie id'].iloc[0]    
    print(f"Movie id: {movie_id}")
    movie_rating = user_ratings.loc[user_ratings['movie id'] == movie_id]['pred_rating'].iloc[0]
    print(f"Movie rating: {movie_rating}")
    same_rated_movies = user_ratings[user_ratings['pred_rating'] == movie_rating]
    print("Similarly rated movies")
    display(same_rated_movies)

    first_same_rated_movie_id = same_rated_movies.iloc[0]['movie id']
    reindexed_user_ratings = user_ratings.reset_index(drop=True)

    same_rated_movie_rating_index = reindexed_user_ratings.loc[reindexed_user_ratings['movie id'] == first_same_rated_movie_id].index[0]
    if same_rated_movies.shape[0] > 1 and same_rated_movie_rating_index <= movie_num:
        reasons.append("This movie's predicted rating ties with many others who made the list")
        return reasons
    
    movie_rating_index = reindexed_user_ratings.loc[reindexed_user_ratings['movie id'] == movie_id].index[0]
    print(f"Movie rating index: {movie_rating_index}")
    if movie_num < movie_rating_index <= 2*movie_num:
        reasons.append(f"This movie is ranked at position {movie_rating_index}, just a bit too far down")
        return reasons

    user_rating_movie_row = ratings[(ratings['item id'] == movie_id) & (ratings['user id'] == user_id)]
    if user_rating_movie_row.shape[0] > 0:
        user_rating_movie = user_rating_movie_row['rating'].iloc[0]
        reasons.append(f"You already rated this movie, and it's only rated {user_rating_movie}")
        return reasons

    movie_ratings = ratings.loc[ratings['item id'] == movie_id]
    print(f"Ratings of this movie: {movie_ratings}")
    display(movie_ratings)
    if movie_ratings.shape[0] == 0:
        reasons.append("This movie is not rated")
        return reasons
    if movie_ratings.shape[0] > 0:
        most_similar_peers_rated = movie_ratings.loc[movie_ratings['user id'].isin(closest_peers["user id"])]
        if most_similar_peers_rated.shape[0] < min_rated_num:
            reasons.append(f"Not enough similar peers rated this movie, only {most_similar_peers_rated.shape[0]} out of {min_rated_num} did")
        if movie_ratings.shape[0] < min_rated_num:
            reasons.append(f"Not enough peers rated this movie, only {movie_ratings.shape[0]} out of {min_rated_num} did")
        if len(reasons) > 0:
            return reasons

    return ["This movie should be in your recommended list"]
    

In [594]:
def why_not_question(movie_name, user_id, desired_position=-1, closest_peers_num=50, min_rated_num=5, movie_num=20):
    question_list = [movie_name, desired_position]
    user_ratings = get_predicted_ratings(user_id).sort_values(by=['pred_rating'], ascending=False)
    display(user_ratings.sort_values(by=['pred_rating'], ascending=False).head(20))
    
    # calculate pearson correlation for every user
    peers = users.loc[users['user id'] != user_id]
    peers["similarity"] = peers.apply(lambda row: pearson_correlation(user_id, row["user id"]), axis=1)
    closest_peers = peers.sort_values(by=['similarity'], ascending=False).head(closest_peers_num)

    reasons = wncf_granularity(movies, user_id, question_list, ratings, user_ratings, min_rated_num, closest_peers, movie_num)
    print("***********************************************************")
    for reason in reasons:
        print(reason)

In [595]:
why_not_question("Mighty Aphrodite (1995)", 2)

100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Unnamed: 0,movie id,movie title,pred_rating
99,100,Fargo (1996),5.0
49,50,Star Wars (1977),5.0
63,64,"Shawshank Redemption, The (1994)",4.550331
11,12,"Usual Suspects, The (1995)",4.542483
97,98,"Silence of the Lambs, The (1991)",4.407805
58,59,Three Colors: Red (1994),4.29838
22,23,Taxi Driver (1976),4.292006
88,89,Blade Runner (1982),4.269149
44,45,Eat Drink Man Woman (1994),4.264728
59,60,Three Colors: Blue (1993),4.252207


Movie id: 13
Movie rating: 4.0
Similarly rated movies


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peers["similarity"] = peers.apply(lambda row: pearson_correlation(user_id, row["user id"]), axis=1)


Unnamed: 0,movie id,movie title,pred_rating
0,1,Toy Story (1995),4.0
13,14,"Postino, Il (1994)",4.0
24,25,"Birdcage, The (1996)",4.0
12,13,Mighty Aphrodite (1995),4.0


Movie rating index: 27
***********************************************************
This movie is ranked at position 27, just a bit too far down


 - 942 peers (943 total users - 1)
 - 50 closest peers
 - At least 5 peers have rated the movie