In [248]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
from IPython.display import display

tqdm.pandas()

In [249]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [250]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

In [251]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [252]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
# only for debug
movies = movies[0:100]
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [253]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [254]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(5)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [255]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [256]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [257]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [258]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Pearson correlation function

In [259]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(user1_ratings['rating'])
    mean_user2_ratings = np.mean(user2_ratings['rating'])

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return NaN
        return float('NaN'), user1_id, user2_id
    else:
        correlation = numerator / denominator
        return correlation, user1_id, user2_id


# User-based prediction function

In [260]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return [float('NaN'), float('NaN')]
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation, user1_id, user2_id = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings

    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return [numerator, denominator]

In [261]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # if user already rated the item, return the rating
    existing_rating = ratings.loc[(ratings['user id'] == user_id) & (ratings['item id'] == item_id)]
    if not existing_rating.empty:
        return item_id, movies.at[item_id - 1, 'movie title'], existing_rating['rating'].values[0]
        
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [262]:
def get_predicted_ratings(user_id):
    movies_ratings = movies.progress_apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings.columns = ["movie id", "movie title", "pred_rating"]

    return movies_ratings

# Average aggregation method

In [263]:
def avg_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user{} rating".format(str(user1)), "user{} rating".format(str(user2)), "user{} rating".format(str(user3))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)

    return all_users_ratings

In [264]:
def top_avg_movies(user1, user2, user3, movies_num):
    # get top movies with average rating
    all_users_ratings = avg_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['average'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [265]:
# This took about 7.5 min to run
# top_movies_avg = top_avg_movies(2, 17, 35, 20)
# top_movies_avg.head(20)

# Least-misery aggregation method

In [266]:
def least_misery_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user{} rating".format(str(user1)), "user{} rating".format(str(user2)), "user{} rating".format(str(user3))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['minimum'] = all_users_ratings.iloc[:, 1:4].min(axis=1)

    return all_users_ratings

In [267]:
def top_least_misery_movies(user1, user2, user3, movies_num):
    # get top movies with least-misery rating
    all_users_ratings = least_misery_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['minimum'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [268]:
# This took about 7.5 min to run
# top_movies_least_misery = top_least_misery_movies(2, 17, 35, 20)
# top_movies_least_misery.head(20)

# Both aggregation methods

In [269]:
def both_aggregation(users, users_ratings):
    # create dataframe with all 3 users' predicted ratings
    data = [users_ratings[0]["movie title"], users_ratings[0]["pred_rating"], users_ratings[1]["pred_rating"], users_ratings[2]["pred_rating"]]
    headers = ["movie title", "user{} rating".format(str(users[0])), "user{} rating".format(str(users[1])), "user{} rating".format(str(users[2]))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)
    all_users_ratings['minimum'] = all_users_ratings.iloc[:, 1:4].min(axis=1)

    return all_users_ratings

# Disagreements function

In [270]:
def compute_disagreements(user1_ratings, user2_ratings):
    user1_sorted_rating = np.argsort(user1_ratings["pred_rating"])
    user2_sorted_rating = np.argsort(user2_ratings["pred_rating"])
    pairs = itertools.combinations(range(0, len(user1_sorted_rating)), 2)
    distance = 0
    for x, y in pairs:
        a = user1_sorted_rating[x] - user1_sorted_rating[y]
        b = user2_sorted_rating[x] - user2_sorted_rating[y]
        if a * b < 0:
            distance += 1
    return distance

In [271]:
def disagreements_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)
    
    # get pairwise disagreements score
    dis_1_2 = compute_disagreements(user1_ratings, user2_ratings)
    dis_1_3 = compute_disagreements(user1_ratings, user3_ratings)
    dis_2_3 = compute_disagreements(user2_ratings, user3_ratings)
    
    disagreements = [dis_1_2, dis_1_3, dis_2_3]
    
    # normalize the distance score range 1 - 5
    dis_1_2_norm = ((dis_1_2 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    dis_1_3_norm = ((dis_1_3 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    dis_2_3_norm = ((dis_2_3 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    
    # add disagreements as the penalty 
    user1_ratings["pred_rating"] = user1_ratings["pred_rating"] * (1/(dis_1_2_norm + dis_1_3_norm))
    user2_ratings["pred_rating"] = user2_ratings["pred_rating"] * (1/(dis_1_2_norm + dis_2_3_norm))
    user3_ratings["pred_rating"] = user3_ratings["pred_rating"] * (1/(dis_1_3_norm + dis_2_3_norm))
    
    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user{} rating".format(str(user1)), "user{} rating".format(str(user2)), "user{} rating".format(str(user3))]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['mean'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)
    
    return all_users_ratings

In [272]:
def top_disagreements_movies(user1, user2, user3, movies_num):
    # get top movies with least-misery rating
    all_users_ratings = disagreements_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['mean'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [273]:
# top_movies_disagreements = top_disagreements_movies(2, 17, 35, 20)
# top_movies_disagreements.head(20)

# Sequential recommendation

In [274]:
def user_satisfaction(user, user_ratings, group_ratings, movies_num):
    top_user_ratings = user_ratings.sort_values(by=['pred_rating'], ascending=False).head(movies_num)

    user_filtered_group_ratings = group_ratings[group_ratings.index.isin(top_user_ratings.index)]
    group_list_satisfaction = user_filtered_group_ratings["user{} rating".format(str(user))].sum()

    user_list_satisfaction = top_user_ratings["pred_rating"].sum()

    return group_list_satisfaction/user_list_satisfaction

In [275]:
def group_satisfaction(users_list, users_ratings, group_ratings, movies_num):
    user_satisfactions = []
    for (index, user) in enumerate(users_list):
        user_satisfactions.append(user_satisfaction(user, users_ratings[index], group_ratings, movies_num))
    
    # return average satisfaction and weight (alpha)
    return [sum(user_satisfactions)/len(user_satisfactions), max(user_satisfactions) - min(user_satisfactions)]

In [276]:
def sequential_iteration(users_list, users_ratings, rated_movies, group_ratings, iteration, movies_num):    
    filtered_group_ratings = group_ratings[group_ratings.index.isin(rated_movies.index)]

    alpha = 0 
    if iteration > 1:
        alpha = group_satisfaction(users_list, users_ratings, group_ratings, movies_num)[1]
    filtered_group_ratings["score"] = (1-alpha)*filtered_group_ratings["average"] + alpha*filtered_group_ratings["minimum"]

    filtered_group_ratings = filtered_group_ratings.sort_values(by=['score'], ascending=False)
    return filtered_group_ratings

In [277]:
def sequential_recommender(users_list, sequences_num, movies_num):
    users_ratings = []
    for user in users_list:
        user_ratings = get_predicted_ratings(user)
        users_ratings.append(user_ratings)

    rated_movies = pd.concat(users_ratings, axis=0)
    rated_movies = rated_movies[~rated_movies.index.duplicated(keep='first')]
    
    group_ratings = both_aggregation(users_list, users_ratings)
    recommended_sequence = []
    for i in range(1, sequences_num+1):
        group_ratings_iteration = sequential_iteration(users_list, users_ratings, rated_movies, group_ratings, i, movies_num)
        recommended_sequence.append(group_ratings_iteration.head(movies_num))
        group_ratings = group_ratings_iteration

    return recommended_sequence

In [278]:
recommended_sequence = sequential_recommender([2, 17, 35], 5, 20)
display(recommended_sequence[0])
display(recommended_sequence[1])
display(recommended_sequence[2])
display(recommended_sequence[3])
display(recommended_sequence[4])

100%|██████████| 100/100 [00:24<00:00,  4.02it/s]
100%|██████████| 100/100 [00:23<00:00,  4.17it/s]
100%|██████████| 100/100 [00:24<00:00,  4.16it/s]


Unnamed: 0,movie title,user2 rating,user17 rating,user35 rating,average,minimum,score
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254,3.253117,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427,2.665797,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761,2.548432,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054,1.920805,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641,2.352569,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763,2.28391,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392,3.390964,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819,2.725617,4.010819


Unnamed: 0,movie title,user2 rating,user17 rating,user35 rating,average,minimum,score
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254,3.253117,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427,2.665797,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761,2.548432,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054,1.920805,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641,2.352569,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763,2.28391,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392,3.390964,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819,2.725617,4.010819


Unnamed: 0,movie title,user2 rating,user17 rating,user35 rating,average,minimum,score
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254,3.253117,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427,2.665797,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761,2.548432,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054,1.920805,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641,2.352569,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763,2.28391,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392,3.390964,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819,2.725617,4.010819


Unnamed: 0,movie title,user2 rating,user17 rating,user35 rating,average,minimum,score
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254,3.253117,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427,2.665797,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761,2.548432,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054,1.920805,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641,2.352569,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763,2.28391,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392,3.390964,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819,2.725617,4.010819


Unnamed: 0,movie title,user2 rating,user17 rating,user35 rating,average,minimum,score
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254,3.253117,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427,2.665797,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761,2.548432,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054,1.920805,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641,2.352569,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763,2.28391,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392,3.390964,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819,2.725617,4.010819
