In [22]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
from IPython.display import display

tqdm.pandas()

In [23]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [24]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

In [25]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [26]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
# only for debug
movies = movies[0:100]
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [30]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [31]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(20)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [32]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [33]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [34]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [35]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Pearson correlation function

In [36]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(user1_ratings['rating'])
    mean_user2_ratings = np.mean(user2_ratings['rating'])

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return NaN
        return float('NaN'), user1_id, user2_id
    else:
        correlation = numerator / denominator
        return correlation, user1_id, user2_id


# User-based prediction function

In [37]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return [float('NaN'), float('NaN')]
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation, user1_id, user2_id = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings

    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return [numerator, denominator]

In [38]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # if user already rated the item, return the rating
    existing_rating = ratings.loc[(ratings['user id'] == user_id) & (ratings['item id'] == item_id)]
    if not existing_rating.empty:
        return item_id, movies.at[item_id - 1, 'movie title'], existing_rating['rating'].values[0]
        
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [39]:
def get_predicted_ratings(user_id):
    movies_ratings = movies.progress_apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings.columns = ["movie id", "movie title", "pred_rating"]

    return movies_ratings

# Group recommender

In [40]:
def average_aggregation(users, users_ratings):
    # create dataframe with all 3 users' predicted ratings
    data = [users_ratings[0]["movie id"], users_ratings[0]["movie title"], users_ratings[0]["pred_rating"], users_ratings[1]["pred_rating"], users_ratings[2]["pred_rating"]]
    headers = ["movie id", "movie title", "user{} rating".format(str(users[0])), "user{} rating".format(str(users[1])), "user{} rating".format(str(users[2]))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)

    # remove ratings under threshold
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[0])} rating"] >= 2]
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[1])} rating"] >= 2]
    all_users_ratings = all_users_ratings[all_users_ratings[f"user{str(users[2])} rating"] >= 2]

    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)

    return all_users_ratings

In [41]:
def group_recommender(users_list, movies_num=20):
    users_ratings = []
    for user in users_list:
        user_ratings = get_predicted_ratings(user)
        users_ratings.append(user_ratings)
    
    group_ratings = average_aggregation(users_list, users_ratings)
    
    return group_ratings.sort_values(by=['average'], ascending=False).head(movies_num)

In [42]:
result = group_recommender([2, 17, 35])
display(result)

100%|██████████| 100/100 [00:20<00:00,  4.77it/s]
100%|██████████| 100/100 [00:21<00:00,  4.60it/s]
100%|██████████| 100/100 [00:22<00:00,  4.40it/s]
  all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)


Unnamed: 0,movie id,movie title,user2 rating,user17 rating,user35 rating,average
49,50,Star Wars (1977),5.0,4.060897,3.26754,4.530449
99,100,Fargo (1996),5.0,4.0,3.855796,4.5
11,12,"Usual Suspects, The (1995)",4.542483,4.126286,2.994006,4.334384
63,64,"Shawshank Redemption, The (1994)",4.550331,3.874734,3.221284,4.212532
97,98,"Silence of the Lambs, The (1991)",4.407805,3.745008,3.839236,4.076406
22,23,Taxi Driver (1976),4.292006,3.763017,2.218471,4.027512
44,45,Eat Drink Man Woman (1994),4.264728,3.762569,3.406358,4.013649
0,1,Toy Story (1995),4.0,4.0,2.549545,4.0
59,60,Three Colors: Blue (1993),4.252207,3.735299,2.909408,3.993753
6,7,Twelve Monkeys (1995),3.973975,4.0,3.903087,3.986987


# WNCF Algorithm

In [None]:
def wncf_granularity(all_movies, all_users, user_id, question, ratings, user_ratings, numPI, numP, peers, movie_num):
    movie_name = question[0]
    desired_position = -1
    if len(question) > 1:
        desired_position = question[1]
    
    if all_movies.loc[all_movies['movie title'] == movie_name].shape[0] == 0:
        return "MOVIE_NOT_FOUND"
    
    movie_id = all_movies.loc[all_movies['movie title'] == movie_name]['movie id'].iloc[0]

    movie_rating = user_ratings.loc[user_ratings['movie id'] == movie_id]['pred_rating'].iloc[0]
    same_rated_movies = user_ratings[user_ratings['pred_rating'] == movie_rating]
    if same_rated_movies.shape[0] > 1 and same_rated_movies.index[0] <= movie_num:
        return "TIE"

    user_ratings.reset_index(drop=True)
    movie_rating_index = user_ratings.loc[user_ratings['movie id'] == movie_id].index[0]
    if movie_rating_index <= 2*movie_num:
        return f"TOO FAR: {movie_rating_index}"

    if ratings.loc[ratings['item id'] == movie_id].shape[0] == 0:
        return "MOVIE_NOT_RATED"

    # TODO: checking peers

    return "NO_PEER_RATED"
    