In [38]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools

tqdm.pandas()

  from pandas import Panel


In [2]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [3]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

In [4]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [80]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
# only for debug
movies = movies[0:100]
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [6]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(5)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [8]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [9]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [10]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [11]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Pearson correlation function

In [12]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(user1_ratings['rating'])
    mean_user2_ratings = np.mean(user2_ratings['rating'])

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return NaN
        return float('NaN'), user1_id, user2_id
    else:
        correlation = numerator / denominator
        return correlation, user1_id, user2_id


# User-based prediction function

In [13]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return [float('NaN'), float('NaN')]
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation, user1_id, user2_id = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings

    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return [numerator, denominator]

In [14]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # if user already rated the item, return the rating
    existing_rating = ratings.loc[(ratings['user id'] == user_id) & (ratings['item id'] == item_id)]
    if not existing_rating.empty:
        return item_id, movies.at[item_id - 1, 'movie title'], existing_rating['rating'].values[0]
        
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [75]:
def get_predicted_ratings(user_id):
    movies_ratings = movies.progress_apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings.columns = ["movie id", "movie title", "pred_rating"]

    return movies_ratings

# Task 1: Average aggregation method

In [90]:
def avg_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user1 rating", "user2 rating", "user3 rating"]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)

    return all_users_ratings

In [91]:
def top_avg_movies(user1, user2, user3, movies_num):
    # get top movies with average rating
    all_users_ratings = avg_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['average'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [92]:
# This took about 7.5 min to run
top_movies_avg = top_avg_movies(2, 17, 35, 20)
top_movies_avg.head(20)

100%|██████████| 100/100 [01:11<00:00,  1.40it/s]
100%|██████████| 100/100 [01:15<00:00,  1.33it/s]
100%|██████████| 100/100 [01:17<00:00,  1.30it/s]


Unnamed: 0,movie title,user1 rating,user2 rating,user3 rating,average
10,Seven (Se7en) (1995),3.905996,3.253117,41.657649,16.272254
48,I.Q. (1994),3.485273,2.665797,29.361209,11.837427
53,Outbreak (1995),3.321702,2.548432,26.070148,10.646761
28,Batman Forever (1995),2.819234,1.920805,12.786122,5.842054
20,Muppet Treasure Island (1996),2.968338,2.352569,9.608323,4.97641
26,Bad Boys (1995),3.308049,2.28391,7.724331,4.438763
86,Searching for Bobby Fischer (1993),4.04444,3.390964,5.585773,4.340392
99,Fargo (1996),5.0,4.0,3.855796,4.285265
49,Star Wars (1977),5.0,4.060897,3.26754,4.109479
38,Strange Days (1995),3.50097,2.725617,5.805871,4.010819


# Task 2: Least-misery aggregation method

In [28]:
def least_misery_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user1 rating", "user2 rating", "user3 rating"]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['minimum'] = all_users_ratings.iloc[:, 1:4].min(axis=1)

    return all_users_ratings

In [29]:
def top_least_misery_movies(user1, user2, user3, movies_num):
    # get top movies with least-misery rating
    all_users_ratings = least_misery_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['minimum'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [89]:
# This took about 7.5 min to run
top_movies_least_misery = top_least_misery_movies(2, 17, 35, 20)
top_movies_least_misery.head(20)

100%|██████████| 100/100 [01:13<00:00,  1.35it/s]
100%|██████████| 100/100 [01:21<00:00,  1.23it/s]
100%|██████████| 100/100 [01:17<00:00,  1.29it/s]


Unnamed: 0,movie title,user1 rating,user2 rating,user3 rating,minimum
6,Twelve Monkeys (1995),3.973975,4.0,3.903087,3.903087
99,Fargo (1996),5.0,4.0,3.855796,3.855796
97,"Silence of the Lambs, The (1991)",4.407805,3.745008,3.839236,3.745008
13,"Postino, Il (1994)",4.0,3.741435,3.781842,3.741435
55,Pulp Fiction (1994),4.206394,3.71083,3.859936,3.71083
74,Brother Minister: The Assassination of Malcolm...,3.594802,3.600329,4.467304,3.594802
88,Blade Runner (1982),4.269149,3.515706,4.203631,3.515706
7,Babe (1995),4.027053,3.460437,3.541116,3.460437
90,"Nightmare Before Christmas, The (1993)",3.943459,3.408689,3.950274,3.408689
44,Eat Drink Man Woman (1994),4.264728,3.762569,3.406358,3.406358


## Task 3: Disagreements function

In [85]:
def compute_disagreements(user1_ratings, user2_ratings):
    user1_sorted_rating = np.argsort(user1_ratings["pred_rating"])
    user2_sorted_rating = np.argsort(user2_ratings["pred_rating"])
    pairs = itertools.combinations(range(0, len(user1_sorted_rating)), 2)
    distance = 0
    for x, y in pairs:
        a = user1_sorted_rating[x] - user1_sorted_rating[y]
        b = user2_sorted_rating[x] - user2_sorted_rating[y]
        if a * b < 0:
            distance += 1
    return distance

In [96]:
def disagreements_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)
    
    # get pairwise disagreements score
    dis_1_2 = compute_disagreements(user1_ratings, user2_ratings)
    dis_1_3 = compute_disagreements(user1_ratings, user3_ratings)
    dis_2_3 = compute_disagreements(user2_ratings, user3_ratings)
    
    disagreements = [dis_1_2, dis_1_3, dis_2_3]
    
    # normalize the distance score range 1 - 5
    dis_1_2_norm = ((dis_1_2 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    dis_1_3_norm = ((dis_1_3 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    dis_2_3_norm = ((dis_2_3 - min(disagreements))*4)/(max(disagreements) - min(disagreements))+1
    
    # add disagreements as the penalty 
    user1_ratings["pred_rating"] = user1_ratings["pred_rating"] * (1/(dis_1_2_norm + dis_1_3_norm))
    user2_ratings["pred_rating"] = user2_ratings["pred_rating"] * (1/(dis_1_2_norm + dis_2_3_norm))
    user3_ratings["pred_rating"] = user3_ratings["pred_rating"] * (1/(dis_1_3_norm + dis_2_3_norm))
    
    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user1 rating", "user2 rating", "user3 rating"]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['mean'] = all_users_ratings.iloc[:, 1:4].mean(axis=1)
    
    return all_users_ratings

In [97]:
def top_disagreements_movies(user1, user2, user3, movies_num):
    # get top movies with least-misery rating
    all_users_ratings = disagreements_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['mean'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [98]:
top_movies_disagreements = top_disagreements_movies(2, 17, 35, 20)
top_movies_disagreements.head(20)

100%|██████████| 100/100 [01:05<00:00,  1.52it/s]
100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
100%|██████████| 100/100 [01:10<00:00,  1.41it/s]


Unnamed: 0,movie title,user1 rating,user2 rating,user3 rating,mean
10,Seven (Se7en) (1995),0.76289,0.356701,6.942941,2.687511
48,I.Q. (1994),0.680717,0.292302,4.893535,1.955518
53,Outbreak (1995),0.64877,0.279433,4.345025,1.757743
28,Batman Forever (1995),0.550632,0.210615,2.13102,0.964089
20,Muppet Treasure Island (1996),0.579753,0.257957,1.601387,0.813033
26,Bad Boys (1995),0.646103,0.250429,1.287388,0.727973
86,Searching for Bobby Fischer (1993),0.78993,0.371816,0.930962,0.697569
99,Fargo (1996),0.976562,0.438596,0.642633,0.685931
49,Star Wars (1977),0.976562,0.445274,0.54459,0.655475
38,Strange Days (1995),0.683783,0.298861,0.967645,0.650097
